In [59]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
from transformers import AutoTokenizer, AutoModelForMaskedLM
import seaborn as sns
import matplotlib.pyplot as plt
import altair as alt
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
from sklearn.linear_model import LogisticRegression
from sklearn.linear_model import LinearRegression

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Print the full path of every file found anywhere below /kaggle/input.
for root, _subdirs, names in os.walk('/kaggle/input'):
    for name in names:
        print(os.path.join(root, name))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [60]:
# Do not truncate the column list when a frame is displayed.
pd.set_option('display.max_columns', None)

# Load the rental-offer listings and preview the first ten rows.
csv_path = '/kaggle/input/apartment-rental-offers-in-germany/immo_data.csv'
data = pd.read_csv(csv_path)
data.head(10)

In [61]:
# (rows, columns) of the data as loaded from the CSV
data.shape

In [62]:
# Number of rows that repeat an earlier row once the listing id and the
# scraping date are ignored. Summing the boolean mask yields a single count
# instead of displaying one True/False entry per row.
data.drop(columns=['scoutId', 'date']).duplicated().sum()

### Column descriptions:
**regio1:** Bundesland

**serviceCharge:** ancillary costs such as electricity or internet in €

**heatingType:** Type of heating

**telekomTvOffer:** is pay TV included, and if so, which offer

**telekomHybridUploadSpeed:** how fast is the hybrid internet upload speed

**newlyConst:** is the building newly constructed

**balcony:** does the object have a balcony

**picturecount:** how many pictures were uploaded to the listing

**pricetrend:** price trend as calculated by Immoscout

**telekomUploadSpeed:** how fast is the internet upload speed

**totalRent:** total rent (usually a sum of base rent, service charge and heating cost)

**yearConstructed:** construction year

**scoutId:** immoscout Id

**noParkSpaces:** number of parking spaces

**firingTypes:** main energy sources, separated by colon

**hasKitchen:** has a kitchen

**geo_bln:** bundesland (state), same as regio1

**cellar:** has a cellar

**yearConstructedRange:** binned construction year, 1 to 9

**baseRent:** base rent without electricity and heating

**houseNumber:** house number

**livingSpace:** living space in sqm

**geo_krs:** district, above ZIP code

**condition:** condition of the flat

**interiorQual:** interior quality

**petsAllowed:** are pets allowed, can be yes, no or negotiable

**street:** street name

**streetPlain:** street name (plain, different formatting)

**lift:** is elevator available

**baseRentRange:** binned base rent, 1 to 9

**typeOfFlat:** type of flat

**geo_plz:** ZIP code

**noRooms:** number of rooms

**thermalChar:** energy need in kWh/(m^2a), defines the energy efficiency class

**floor:** which floor is the flat on

**numberOfFloors:** number of floors in the building

**noRoomsRange:** binned number of rooms, 1 to 5

**garden:** has a garden

**livingSpaceRange:** binned living space, 1 to 7

**regio2:** District or Kreis, same as geo_krs

**regio3:** City/town

**description:** free text description of the object

**facilities:** free text description about available facilities

**heatingCosts:** monthly heating costs in €

**energyEfficiencyClass:** energy efficiency class (based on binned thermalChar, deprecated since Feb 2020)

**lastRefurbish:** year of last renovation

**electricityBasePrice:** monthly base price for electricity in € (deprecated since Feb 2020)

**electricityKwhPrice:** electricity price per kwh (deprecated since Feb 2020)

**date:** time of scraping

# **Data pre-processing**

#### Drop columns which do not provide useful information

In [63]:
# Redundant pairs according to the column descriptions (one of each pair is kept):
#   'geo_bln'     duplicates 'regio1'
#   'streetPlain' duplicates 'street' (both are dropped here)
#   'geo_krs'     duplicates 'regio2'
# The remaining names are columns this analysis does not use.
unused_columns = [
    'description', 'facilities', 'scoutId', 'geo_plz', 'geo_bln', 'geo_krs',
    'houseNumber', 'street', 'yearConstructedRange', 'pricetrend', 'picturecount',
    'streetPlain', 'firingTypes', 'date',
]
data = data.drop(columns=unused_columns)

In [64]:
# Column dtypes and non-null counts of the columns that remain
data.info()

In [65]:
# Summary statistics; the min/max values are used below to spot implausible rows
data.describe()

#### Delete data that do not make any sense

In [66]:
# The summary statistics show a minimum livingSpace of 0, which is not a usable
# flat size: remove those rows. Rows with a missing livingSpace are kept.
zero_space_rows = data.index[data['livingSpace'] == 0]
data = data.drop(index=zero_space_rows)

In [67]:
# The summary statistics show construction years as late as 2090. The data is
# taken to have been collected in 2019, so any later year is treated as an
# error and the row is removed. Rows with a missing yearConstructed are kept.
future_year_rows = data.index[data['yearConstructed'] > 2019]
data = data.drop(index=future_year_rows)

In [68]:
# The summary statistics show a minimum totalRent of 0: remove those rows.
# Rows with a missing totalRent are kept here.
zero_rent_rows = data.index[data['totalRent'] == 0]
data = data.drop(index=zero_rent_rows)

In [69]:
# (rows, columns) after removing the implausible rows
data.shape

#### Checking the percentage of null values

In [70]:
# Percentage of missing values in each column
data.isna().mean() * 100

#### Filling null values of totalRent using baseRent, serviceCharge, heatingCosts

In [71]:
# Where totalRent is missing, rebuild it as baseRent + serviceCharge + heatingCosts.
# The sum is NaN whenever one of the three components is missing, so those
# rows stay unfilled.
# The result is assigned back to the column: calling fillna(inplace=True) on
# `data.totalRent` is a chained in-place update, which pandas warns about and
# which does not modify `data` once Copy-on-Write is enabled.
rent_from_parts = data['baseRent'] + data['serviceCharge'] + data['heatingCosts']
data['totalRent'] = data['totalRent'].fillna(rent_from_parts)

In [72]:
# Percentage of rows that still have no totalRent
data['totalRent'].isna().mean() * 100

In [73]:
# totalRent is the prediction target, so rows without it cannot be used.
data = data.dropna(subset=['totalRent'])

#### Drop columns with more than 35% null values

In [74]:
# Keep only the columns in which fewer than 35% of the values are missing.
null_pct = data.isna().mean() * 100
data = data.loc[:, null_pct < 35]
data.shape

In [75]:
# Summary statistics of the columns that survived the 35% null threshold
data.describe()

In [76]:
# Percentage of missing values left in each remaining column
data.isna().mean() * 100

#### Fill null values in condition, telekomTvOffer, heatingType, typeOfFlat columns

In [77]:
# Distinct values of `condition`, inspected before choosing a fill value
data.condition.unique()

In [78]:
# Missing condition -> 'other'. The filled column is assigned back because an
# in-place fillna on `data.condition` is a chained update that pandas warns
# about and that does not reach `data` under Copy-on-Write.
data['condition'] = data['condition'].fillna('other')

In [79]:
# How often each telekomTvOffer value occurs, inspected before choosing a fill value
data.telekomTvOffer.value_counts()

In [80]:
# Missing telekomTvOffer -> 'unk' (unknown). Assigned back rather than filled
# in place through the column accessor, which is a chained update that does
# not modify `data` under Copy-on-Write.
data['telekomTvOffer'] = data['telekomTvOffer'].fillna('unk')

In [81]:
# Missing heatingType -> 'unk' (unknown). Assigned back rather than filled in
# place through the column accessor, which is a chained update that does not
# modify `data` under Copy-on-Write.
data['heatingType'] = data['heatingType'].fillna('unk')

In [82]:
# Missing typeOfFlat -> 'other'. Assigned back rather than filled in place
# through the column accessor, which is a chained update that does not modify
# `data` under Copy-on-Write.
data['typeOfFlat'] = data['typeOfFlat'].fillna('other')

#### Fill null values in serviceCharge, telekomUploadSpeed, yearConstructed and floor columns

In [83]:
# Missing serviceCharge -> column mean. Assigned back rather than filled in
# place through the column accessor, which is a chained update that does not
# modify `data` under Copy-on-Write.
data['serviceCharge'] = data['serviceCharge'].fillna(data['serviceCharge'].mean())

In [84]:
# Missing telekomUploadSpeed -> column mean. Assigned back rather than filled
# in place through the column accessor, which is a chained update that does
# not modify `data` under Copy-on-Write.
data['telekomUploadSpeed'] = data['telekomUploadSpeed'].fillna(data['telekomUploadSpeed'].mean())

In [85]:
# Missing yearConstructed -> column mean (which may be a fractional year).
# Assigned back rather than filled in place through the column accessor,
# which is a chained update that does not modify `data` under Copy-on-Write.
data['yearConstructed'] = data['yearConstructed'].fillna(data['yearConstructed'].mean())

In [86]:
# Missing floor -> column mean (which may be a fractional floor number).
# Assigned back rather than filled in place through the column accessor,
# which is a chained update that does not modify `data` under Copy-on-Write.
data['floor'] = data['floor'].fillna(data['floor'].mean())

#### Drop duplicated rows

In [87]:
# Remove rows that are exact repeats of an earlier row, then report the new shape.
data = data.drop_duplicates()
data.shape

#### Delete outliers

In [88]:
# Remove outliers column by column using the 1.5 * IQR rule.
# Only int64/float64 columns are examined. The columns are processed one after
# another on the already-reduced frame, so each column's quartiles are computed
# from the rows that survived the previous columns. Rows whose value is missing
# in the examined column are never removed by that column.
numeric_names = [name for name in data.columns
                 if data[name].dtype in ('int64', 'float64')]

for name in numeric_names:
    first_quartile = data[name].quantile(0.25)
    third_quartile = data[name].quantile(0.75)
    spread = third_quartile - first_quartile
    low_fence = first_quartile - 1.5 * spread
    high_fence = third_quartile + 1.5 * spread

    is_outlier = (data[name] > high_fence) | (data[name] < low_fence)
    data = data.drop(index=data.index[is_outlier])

data.shape

#### Performing sentiment analysis on description column

In [89]:
#tokenizer = AutoTokenizer.from_pretrained("bert-base-german-cased")

#model = AutoModelForMaskedLM.from_pretrained("bert-base-german-cased")

##### **To be continued ...**

# **Data visualization**

In [90]:
# First ten rows of the cleaned data
data.head(10)

In [91]:
# Number of distinct condition categories
data.condition.nunique()

In [92]:
# Scatter plot with a fitted regression line: totalRent (x) against livingSpace (y)
plt.figure(figsize=(8, 6))
sns.regplot(data=data, x="totalRent", y="livingSpace")

In [93]:
# Scatter plot with a fitted regression line: livingSpace (x) against noRooms (y)
plt.figure(figsize=(8, 6))
sns.regplot(data=data, x="livingSpace", y="noRooms")

In [94]:
# Construction year of flats with and without a lift (points drawn without jitter)
sns.catplot(data=data, x="lift", y="yearConstructed", jitter=False)

In [95]:
# Distribution of construction year, split by whether the flat has a lift
plt.figure(figsize=(6, 8))
sns.set(style="darkgrid")
sns.boxplot(data=data, x="lift", y="yearConstructed", color='pink')
plt.show()

In [97]:
# Distribution of total rent, split by whether the flat has a kitchen
plt.figure(figsize=(6, 8))
sns.set(style="darkgrid")
sns.boxplot(data=data, x="hasKitchen", y="totalRent", color='pink')
plt.show()

In [98]:
# Distribution of total rent, split by whether the flat has a garden
plt.figure(figsize=(6, 8))
sns.set(style="darkgrid")
sns.boxplot(data=data, x="garden", y="totalRent", color='pink')
plt.show()

In [99]:
# Distribution of total rent, split by whether the flat has a cellar
plt.figure(figsize=(6, 8))
sns.set(style="darkgrid")
sns.boxplot(data=data, x="cellar", y="totalRent", color='pink')
plt.show()

In [100]:
# Distribution of total rent for each flat condition; the category labels are
# rotated so they do not overlap.
plt.figure(figsize=(8, 10))
sns.set(style="darkgrid")
condition_ax = sns.boxplot(data=data, x="condition", y="totalRent", color='pink')
condition_ax.tick_params(axis='x', rotation=90)
plt.show()

In [101]:
# Distribution of total rent for each type of flat; the category labels are
# rotated so they do not overlap.
plt.figure(figsize=(8, 10))
sns.set(style="darkgrid")
flat_type_ax = sns.boxplot(data=data, x="typeOfFlat", y="totalRent", color='pink')
flat_type_ax.tick_params(axis='x', rotation=90)
plt.show()

In [102]:
# One horizontal box plot row per floor value: total rent for flats with and
# without a lift, drawn on a logarithmic rent axis.
g = sns.catplot(
    data=data,
    x="totalRent", y="lift", row="floor",
    kind="box", orient="h",
    height=1.5, aspect=4,
)
g.set(xscale="log")

In [103]:
# Number of listings per federal state (regio1), smallest count first
data['regio1'].value_counts().sort_values().plot.barh()

In [104]:
# Distribution of total rent for each federal state (regio1); the state names
# are rotated so they do not overlap.
plt.figure(figsize=(8, 10))
sns.set(style="darkgrid")
state_ax = sns.boxplot(data=data, x="regio1", y="totalRent", color='pink')
state_ax.tick_params(axis='x', rotation=90)
plt.show()

# **Model**

#### One-hot encode categorical data

In [105]:
# Column dtypes before encoding, to see which columns are non-numeric
data.info()

In [106]:
# Drop regio2 (district) and regio3 (city/town); of the region columns only
# regio1 is kept for encoding below.
data = data.drop(columns=['regio2', 'regio3'])

In [107]:
# Column dtypes after dropping regio2 and regio3
data.info()

In [108]:
# Columns handed to pd.get_dummies in the next cell.
# NOTE(review): `floor` was imputed with its mean earlier, so it is presumably a
# numeric column; pd.get_dummies leaves numeric columns unencoded by default -
# confirm whether `floor` is meant to be one-hot encoded.
categorical_cols = ['regio1', 'heatingType', 'telekomTvOffer', 'condition', 'typeOfFlat', 'floor']

In [109]:
# Turn the selected columns into indicator (dummy) columns and preview them.
categorical_subset = pd.get_dummies(data[categorical_cols])
categorical_subset.head()

In [110]:
# Work on a copy so that `data` keeps the cleaned, un-encoded columns
df = data.copy()

In [111]:
# Swap the original categorical columns for their encoded counterparts:
# remove them from df and append the columns of categorical_subset instead.
remaining_part = df.drop(columns=categorical_cols)
df = pd.concat([remaining_part, categorical_subset], axis=1)
df.shape

#### Split data

In [112]:
# Positional 80/20 split: the first 80% of the rows become the training set and
# the rest the test set. The rows are not shuffled.
# NOTE(review): if the row order carries meaning (for example region or scrape
# date), the two parts may differ systematically - consider shuffling first.
sep = int(0.8 * len(df))

# Explicit copies: `train` and `test` are modified afterwards, and modifying a
# slice of `df` raises SettingWithCopyWarning.
train = df.iloc[:sep].copy()
test = df.iloc[sep:].copy()

In [113]:
# Separate the target from the features for each part of the split.
# The labels are taken from `train` / `test` (not from the full `df`) so that
# each label vector has exactly one entry per row of its feature frame.
# NOTE(review): baseRent and serviceCharge remain among the features while
# totalRent is described as their sum plus heating costs - check whether this
# leaks the target into the features.
l_train = train['totalRent']
l_test = test['totalRent']

train = train.drop(columns='totalRent')
test = test.drop(columns='totalRent')

#### Scaling

In [114]:
# Standardise the features. The scaler is fitted on the training features only
# and the same fitted transformation is then applied to the test features.
trans = StandardScaler()

# Wrap the scaled arrays in DataFrames again so the column names are preserved.
s_train = pd.DataFrame(trans.fit_transform(train), columns=train.columns)
s_test = pd.DataFrame(trans.transform(test), columns=test.columns)

s_train

In [115]:
# Reduce dimensionality with PCA. A fractional n_components asks for as many
# components as are needed to explain that share (here 80%) of the variance.
# The projection is learned on the training data and reused for the test data.
explained_variance_target = 0.8
pca = PCA(n_components=explained_variance_target)

pca_s_train = pca.fit_transform(s_train)
pca_s_test = pca.transform(s_test)

In [116]:
# totalRent is a continuous amount, so this is a regression task.
# LogisticRegression is a classifier and rejects a continuous target, so an
# ordinary least-squares LinearRegression is fitted instead.
# The variable keeps the name `logreg` because the evaluation cell refers to it.
logreg = LinearRegression()

# Pick the labels of exactly the rows in `train` (matched by index) so the
# feature matrix and the target always have the same number of samples.
logreg.fit(pca_s_train, l_train.loc[train.index])

In [None]:
# Evaluate on the test set. The model was fitted on scaled, PCA-projected
# features, so it must be given the test features in that same form
# (`pca_s_test`), not the raw `test` frame.
lgr_predict = logreg.predict(pca_s_test)

# Pick the labels of exactly the rows in `test` (matched by index) so the
# predictions and the true values have the same number of samples.
logreg.score(pca_s_test, l_test.loc[test.index])