In [51]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
from transformers import AutoTokenizer, AutoModelForMaskedLM
import seaborn as sns
import matplotlib.pyplot as plt
import altair as alt
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import mean_squared_error
from sklearn.metrics import classification_report

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Collect the full path of every file below the Kaggle input directory, then print one per line.
input_files = [
    os.path.join(root, name)
    for root, _, names in os.walk('/kaggle/input')
    for name in names
]
for path in input_files:
    print(path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [52]:
# Lift the column display limit so wide frames are shown in full.
pd.options.display.max_columns = None

# Read the rental-offer listings and preview the first ten rows.
data = pd.read_csv('/kaggle/input/apartment-rental-offers-in-germany/immo_data.csv')
data.head(n=10)

In [53]:
# (rows, columns) of the frame as loaded from the CSV.
data.shape

In [54]:
# Count listings that are identical once the per-scrape identifiers (scoutId, date)
# are ignored. Summing the boolean mask gives a single number instead of a long
# True/False Series that cannot be read at a glance.
data.drop(columns=['scoutId', 'date']).duplicated().sum()

### Column descriptions:
**regio1:** Bundesland

**serviceCharge:** ancillary costs such as electricity or internet in €

**heatingType:** Type of heating

**telekomTvOffer:** is pay TV included and, if so, which offer

**telekomHybridUploadSpeed:** how fast is the hybrid internet upload speed

**newlyConst:** is the building newly constructed

**balcony:** does the object have a balcony

**picturecount:** how many pictures were uploaded to the listing

**pricetrend:** price trend as calculated by Immoscout

**telekomUploadSpeed:** how fast is the internet upload speed

**totalRent:** total rent (usually a sum of base rent, service charge and heating cost)

**yearConstructed:** construction year

**scoutId:** immoscout Id

**noParkSpaces:** number of parking spaces

**firingTypes:** main energy sources, separated by colon

**hasKitchen:** has a kitchen

**geo_bln:** bundesland (state), same as regio1

**cellar:** has a cellar

**yearConstructedRange:** binned construction year, 1 to 9

**baseRent:** base rent without electricity and heating

**houseNumber:** house number

**livingSpace:** living space in sqm

**geo_krs:** district, above ZIP code

**condition:** condition of the flat

**interiorQual:** interior quality

**petsAllowed:** are pets allowed, can be yes, no or negotiable

**street:** street name

**streetPlain:** street name (plain, different formatting)

**lift:** is elevator available

**baseRentRange:** binned base rent, 1 to 9

**typeOfFlat:** type of flat

**geo_plz:** ZIP code

**noRooms:** number of rooms

**thermalChar:** energy need in kWh/(m^2a), defines the energy efficiency class

**floor:** which floor is the flat on

**numberOfFloors:** number of floors in the building

**noRoomsRange:** binned number of rooms, 1 to 5

**garden:** has a garden

**livingSpaceRange:** binned living space, 1 to 7

**regio2:** District or Kreis, same as geo_krs

**regio3:** City/town

**description:** free text description of the object

**facilities:** free text description about available facilities

**heatingCosts:** monthly heating costs in €

**energyEfficiencyClass:** energy efficiency class (based on binned thermalChar, deprecated since Feb 2020)

**lastRefurbish:** year of last renovation

**electricityBasePrice:** monthly base price for electricity in € (deprecated since Feb 2020)

**electricityKwhPrice:** electricity price per kwh (deprecated since Feb 2020)

**date:** time of scraping

# **Data pre-processing**

#### Drop columns which do not provide useful information

In [55]:
# Columns removed before modelling. Notes on the duplicated ones:
#   - 'geo_bln' repeats 'regio1' (see the column descriptions)
#   - 'street' and 'streetPlain' carry the same information
#   - 'geo_krs' repeats 'regio2'
redundant_columns = [
    'description', 'facilities', 'scoutId', 'geo_plz', 'geo_bln', 'geo_krs',
    'houseNumber', 'street', 'yearConstructedRange', 'pricetrend', 'picturecount',
    'streetPlain', 'firingTypes', 'date',
]
data.drop(columns=redundant_columns, inplace=True)

In [56]:
# Dtypes and non-null counts of the columns that remain.
data.info()

In [57]:
# Summary statistics of the numeric columns; the min/max values here motivate the row filters below.
data.describe()

#### Delete data that do not make any sense

In [58]:
# The summary statistics show a minimum livingSpace of 0; remove those listings.
zero_space_idx = data.index[data.livingSpace == 0]
data.drop(index=zero_space_idx, inplace=True)

In [59]:
# The summary statistics show a maximum yearConstructed of 2090.
# The test date was in 2019, so any construction year after 2019 is treated as invalid.
future_build_idx = data.index[data.yearConstructed > 2019]
data.drop(index=future_build_idx, inplace=True)

In [60]:
# The summary statistics show a minimum totalRent of 0; remove those listings.
zero_rent_idx = data.index[data.totalRent == 0]
data.drop(index=zero_rent_idx, inplace=True)

In [61]:
# (rows, columns) left after removing the implausible rows.
data.shape

#### Checking the percentage of null values

In [62]:
# Percentage of missing values in each column.
data.isnull().mean()*100

#### Filling null values of totalRent using baseRent, serviceCharge, heatingCosts

In [63]:
# Where totalRent is missing, reconstruct it as baseRent + serviceCharge + heatingCosts.
# The result is assigned back to the column rather than calling fillna(inplace=True)
# on `data.totalRent`: the chained in-place form works on an intermediate Series and
# leaves `data` unchanged when pandas Copy-on-Write is active.
# If any of the three components is NaN, the sum is NaN and the row stays unfilled.
rent_components = data['baseRent'] + data['serviceCharge'] + data['heatingCosts']
data['totalRent'] = data['totalRent'].fillna(rent_components)

In [64]:
# Percentage of rows whose totalRent is still missing after the fill.
data.totalRent.isnull().mean()*100

In [65]:
# totalRent is the prediction target, so rows that still lack it are removed.
data.dropna(axis='index', subset=['totalRent'], inplace=True)

#### Drop columns with more than 35% null values

In [66]:
# Keep only the columns whose share of missing values is below 35 %.
null_pct = data.isnull().mean() * 100
data = data.loc[:, null_pct < 35]
data.shape

In [67]:
# Summary statistics of the numeric columns that survived the 35 % threshold.
data.describe()

In [68]:
# Percentage of missing values per remaining column; these are filled in the cells below.
data.isnull().mean()*100

#### Fill null values in condition, telekomTvOffer, heatingType, typeOfFlat columns

In [69]:
# Distinct values of `condition`, inspected before choosing a fill value.
data.condition.unique()

In [70]:
# Label missing `condition` values as 'other'.
# Assign back to the column: a chained `data.condition.fillna(..., inplace=True)`
# does not update `data` when pandas Copy-on-Write is active.
data['condition'] = data['condition'].fillna('other')

In [71]:
# Frequency of each telekomTvOffer value, inspected before choosing a fill value.
data.telekomTvOffer.value_counts()

In [72]:
# Label missing `telekomTvOffer` values as 'unk' (unknown).
# Assign back to the column: a chained `data.telekomTvOffer.fillna(..., inplace=True)`
# does not update `data` when pandas Copy-on-Write is active.
data['telekomTvOffer'] = data['telekomTvOffer'].fillna('unk')

In [73]:
# Label missing `heatingType` values as 'unk' (unknown).
# Assign back to the column: a chained `data.heatingType.fillna(..., inplace=True)`
# does not update `data` when pandas Copy-on-Write is active.
data['heatingType'] = data['heatingType'].fillna('unk')

In [74]:
# Label missing `typeOfFlat` values as 'other'.
# Assign back to the column: a chained `data.typeOfFlat.fillna(..., inplace=True)`
# does not update `data` when pandas Copy-on-Write is active.
data['typeOfFlat'] = data['typeOfFlat'].fillna('other')

#### Fill null values in serviceCharge, telekomUploadSpeed, yearConstructed and floor columns

In [75]:
# Replace missing serviceCharge values with the column mean.
# Assign back to the column: a chained `data.serviceCharge.fillna(..., inplace=True)`
# does not update `data` when pandas Copy-on-Write is active.
data['serviceCharge'] = data['serviceCharge'].fillna(data['serviceCharge'].mean())

In [76]:
# Replace missing telekomUploadSpeed values with the column mean.
# Assign back to the column: a chained `data.telekomUploadSpeed.fillna(..., inplace=True)`
# does not update `data` when pandas Copy-on-Write is active.
data['telekomUploadSpeed'] = data['telekomUploadSpeed'].fillna(data['telekomUploadSpeed'].mean())

In [77]:
# Replace missing yearConstructed values with the column mean.
# Assign back to the column: a chained `data.yearConstructed.fillna(..., inplace=True)`
# does not update `data` when pandas Copy-on-Write is active.
data['yearConstructed'] = data['yearConstructed'].fillna(data['yearConstructed'].mean())

In [78]:
# Replace missing floor values with the column mean (the result may be fractional).
# Assign back to the column: a chained `data.floor.fillna(..., inplace=True)`
# does not update `data` when pandas Copy-on-Write is active.
data['floor'] = data['floor'].fillna(data['floor'].mean())

#### Drop duplicated rows

In [79]:
# Remove fully identical rows, keeping the first occurrence of each, then report the new size.
data.drop_duplicates(keep='first', inplace=True)
data.shape

#### Delete outliers

In [80]:
# Tukey-fence outlier removal, applied to one int64/float64 column after another.
# Rows are dropped immediately, so each column's quartiles are computed on the rows
# that survived the previous columns.
for column in data.columns:
    if data[column].dtype not in ('int64', 'float64'):
        continue

    q1, q3 = data[column].quantile([0.25, 0.75])
    spread = q3 - q1

    # Rows with NaN in this column compare False on both sides and are kept.
    too_low = data[column] < q1 - 1.5 * spread
    too_high = data[column] > q3 + 1.5 * spread
    data.drop(data.index[too_high | too_low], inplace=True)
data.shape

#### Performing sentiment analysis on description column

In [81]:
#tokenizer = AutoTokenizer.from_pretrained("bert-base-german-cased")

#model = AutoModelForMaskedLM.from_pretrained("bert-base-german-cased")

##### **To be continued ...**

# **Model**

In [82]:
# Columns used for modelling: three features plus the target (totalRent).
model_columns = ['serviceCharge', 'heatingType', 'telekomUploadSpeed', 'totalRent']
new_data = data[model_columns]

In [83]:
# Display the modelling subset.
new_data

#### One-hot encode categorical data

In [84]:
# (rows, columns) of the modelling subset.
new_data.shape

In [85]:
# Dtypes and non-null counts of the modelling subset; heatingType is the column that needs encoding.
new_data.info()

In [86]:
# NOTE(review): identical to the previous cell — this repeated call can be removed.
new_data.info()

In [87]:
# Columns of new_data that are one-hot encoded in the next cell.
categorical_cols = ['heatingType']

In [88]:
# One-hot encode the categorical columns and preview the resulting indicator columns.
categorical_subset = pd.get_dummies(new_data[categorical_cols])
categorical_subset.head()

In [89]:
# Work on an independent copy so that new_data itself is left untouched.
df = new_data.copy()

In [90]:
# Swap the raw categorical columns for their one-hot encoded counterparts.
numeric_part = df.drop(columns=categorical_cols)
df = pd.concat([numeric_part, categorical_subset], axis=1)
df.shape

#### Split data

In [91]:
# Positional 80/20 split without shuffling: the first 80 % of rows form the
# training set, the remaining rows the test set.
sep = int(len(df) * 0.8)
train, test = df.iloc[:sep], df.iloc[sep:]

In [92]:
# Target vectors (total rent) for the two partitions.
l_train = train.totalRent
l_test = test.totalRent

# Build the feature frames as new objects. `train` and `test` are slices of `df`,
# and an in-place drop on a slice raises SettingWithCopyWarning.
train = train.drop(columns='totalRent')
test = test.drop(columns='totalRent')

#### Scaling

In [93]:
# Standardise the features. The scaler is fitted on the training set only and then
# applied to the test set, so no test-set statistics leak into training.
trans = StandardScaler()

# Keep the original row labels on the scaled frames so they stay aligned with
# l_train / l_test; without `index=` the frames get a fresh 0..n-1 index that no
# longer matches the label Series.
s_train = pd.DataFrame(trans.fit_transform(train), columns=train.columns, index=train.index)
s_test = pd.DataFrame(trans.transform(test), columns=test.columns, index=test.index)

# Preview only the first rows instead of dumping the whole frame.
s_train.head()

#### Multivariate linear regression

In [94]:
class LinearRegression:
  """Linear regression with an intercept, fitted by full-batch gradient descent on the MSE.

  Parameters
  ----------
  learning_rate : float
      Step size of each gradient-descent update.
  epochs : int
      ``fit`` performs ``epochs + 1`` weight updates and records one history
      entry before each of them.
  accuracy_rate : float
      Relative tolerance used by ``calculate_accuracy``: a prediction counts as
      correct when it is within ``accuracy_rate * |true value|`` of the true value.
  random_state : int or None
      Seed for the random weight initialisation. ``None`` (the default) draws
      from NumPy's global random state.

  Attributes
  ----------
  weights : numpy.ndarray
      Intercept followed by one coefficient per feature; set by ``fit``.
  history : dict
      ``{'loss': [...], 'accuracy': [...]}`` measured on the validation data
      during the most recent ``fit``.
  """

  def __init__(self, learning_rate=0.1, epochs=100, accuracy_rate=0.1, random_state=None):
    self.learning_rate = learning_rate
    self.epochs = epochs
    self.accuracy_rate = accuracy_rate
    self.random_state = random_state
    self.history = {'loss': [], 'accuracy': []}

  @staticmethod
  def _add_bias(X):
    """Return X as a 2-D float array with a leading column of ones (the intercept term)."""
    X = np.asarray(X, dtype=float)
    return np.hstack([np.ones((X.shape[0], 1)), X])

  def gradient(self, X, y):
    """Gradient of the mean squared error with respect to the weights.

    X must already contain the leading bias column.
    """
    return 2/X.shape[0] * np.dot(X.T, (np.dot(X, self.weights) - y))

  def fit(self, X, y, Xval, yval):
    """Fit the weights on (X, y) and track loss/accuracy on (Xval, yval).

    Raises
    ------
    ValueError
        If X contains no rows.
    """
    design = self._add_bias(X)
    if design.shape[0] == 0:
      raise ValueError("Cannot fit on an empty training set.")
    # Plain arrays: avoids index-alignment surprises when y is a pandas Series.
    y = np.asarray(y, dtype=float)
    yval = np.asarray(yval, dtype=float)

    if self.random_state is None:
      self.weights = np.random.rand(design.shape[1])
    else:
      self.weights = np.random.RandomState(self.random_state).rand(design.shape[1])

    # Start from a clean history so a refit does not append to the previous run.
    self.history = {'loss': [], 'accuracy': []}
    for _ in range(self.epochs + 1):
      # Metrics are recorded before the update, so entry 0 is the random
      # initialisation and the final weights are not evaluated here.
      pred = self.predict(Xval)
      self.history['loss'].append(float(np.mean((yval - pred) ** 2)))
      self.history['accuracy'].append(self.calculate_accuracy(pred, yval))
      self.weights = self.weights - self.learning_rate * self.gradient(design, y)

  def predict(self, X):
    """Return the predictions for the feature rows in X (without bias column)."""
    return np.dot(self._add_bias(X), self.weights)

  def calculate_accuracy(self, pred, val):
    """Fraction of predictions within ``accuracy_rate * |val|`` of the true value.

    Returns 0.0 for empty input.
    """
    pred = np.asarray(pred, dtype=float)
    val = np.asarray(val, dtype=float)
    if pred.size == 0:
      return 0.0
    # The tolerance uses |val| so that negative targets can be matched as well.
    within = np.abs(pred - val) <= np.abs(val) * self.accuracy_rate
    return float(np.mean(within))

In [95]:
# The model draws its initial weights from NumPy's global random state, so seed it
# first to make the fitted weights and the reported accuracy reproducible.
np.random.seed(42)

model = LinearRegression()
# The test split doubles as validation data for the per-epoch loss/accuracy history.
model.fit(s_train, l_train, s_test, l_test)
y_pred = model.predict(s_test)

In [96]:
# Share of test predictions that fall within the model's relative tolerance (accuracy_rate) of the true rent.
print(model.calculate_accuracy(y_pred, l_test))

In [97]:
# Import under an alias so the hand-written LinearRegression class defined earlier
# in this notebook is not shadowed; otherwise re-running the cell that builds
# `model` would silently pick up scikit-learn's class and fail.
from sklearn.linear_model import LinearRegression as SklearnLinearRegression

# Reference fit with scikit-learn; score() returns R^2 on the test split.
reg = SklearnLinearRegression().fit(s_train, l_train)
reg.score(s_test, l_test)

In [98]:
from sklearn.linear_model import Lasso

# L1-regularised linear model; score() returns R^2 on the test split.
reg_lasso = Lasso(alpha=1.0).fit(s_train, l_train)
reg_lasso.score(s_test, l_test)

In [99]:
from sklearn.linear_model import Ridge

# L2-regularised linear model; score() returns R^2 on the test split.
reg_ridge = Ridge(alpha=1.0).fit(s_train, l_train)
reg_ridge.score(s_test, l_test)