In [61]:
import  numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from scipy import stats

%matplotlib inline
# Set the style and the figure size in a single call. A bare sns.set(rc=...)
# re-applies seaborn's default style ("darkgrid"), so calling it after
# sns.set_style("dark") would silently undo the requested "dark" style.
sns.set(style="dark", rc={'figure.figsize': (12, 8)})
pd.set_option('display.max_columns', None)

import warnings
warnings.filterwarnings('ignore')

In [62]:
# Read the listings CSV into a DataFrame and preview its first two rows.
df = pd.read_csv(filepath_or_buffer="data/immo_data.csv")
df.head(n=2)

Unnamed: 0,regio1,serviceCharge,heatingType,telekomTvOffer,telekomHybridUploadSpeed,newlyConst,balcony,picturecount,pricetrend,telekomUploadSpeed,totalRent,yearConstructed,scoutId,noParkSpaces,firingTypes,hasKitchen,geo_bln,cellar,yearConstructedRange,baseRent,houseNumber,livingSpace,geo_krs,condition,interiorQual,petsAllowed,street,streetPlain,lift,baseRentRange,typeOfFlat,geo_plz,noRooms,thermalChar,floor,numberOfFloors,noRoomsRange,garden,livingSpaceRange,regio2,regio3,description,facilities,heatingCosts,energyEfficiencyClass,lastRefurbish,electricityBasePrice,electricityKwhPrice,date
0,Nordrhein_Westfalen,245.0,central_heating,ONE_YEAR_FREE,,False,False,6,4.62,10.0,840.0,1965.0,96107057,1.0,oil,False,Nordrhein_Westfalen,True,2.0,595.0,244.0,86.0,Dortmund,well_kept,normal,,Sch&uuml;ruferstra&szlig;e,Schüruferstraße,False,4,ground_floor,44269,4.0,181.4,1.0,3.0,4,True,4,Dortmund,Schüren,Die ebenerdig zu erreichende Erdgeschosswohnun...,Die Wohnung ist mit Laminat ausgelegt. Das Bad...,,,,,,May19
1,Rheinland_Pfalz,134.0,self_contained_central_heating,ONE_YEAR_FREE,,False,True,8,3.47,10.0,,1871.0,111378734,2.0,gas,False,Rheinland_Pfalz,False,1.0,800.0,,89.0,Rhein_Pfalz_Kreis,refurbished,normal,no,no_information,,False,5,ground_floor,67459,3.0,,,,3,False,4,Rhein_Pfalz_Kreis,Böhl_Iggelheim,Alles neu macht der Mai – so kann es auch für ...,,,,2019.0,,,May19


In [63]:
from preprocess import data_cleaner, handle_categorical_vars, reduce_mem_usage, drop_str_vars

# Run the preprocessing helpers in order; each one receives the frame
# returned by the previous step and its result is bound back to `df`.
for step in (data_cleaner, handle_categorical_vars, reduce_mem_usage, drop_str_vars):
    df = step(df)

# Summarise the resulting columns, dtypes and non-null counts.
df.info()

Shape dataframe before drop section:  (268850, 49)
Shape dataframe after drop section:  (255782, 37)
13068 instances were droped.           (4.860702994234703 % of the whole dataset.)           15 columns were droped, and 2 were added.
Memory usage of dataframe is 61.71 MB
Memory usage after optimization is: 31.467 MB
Decreased by 49.0%
<class 'pandas.core.frame.DataFrame'>
Int64Index: 255782 entries, 96107057 to 110938302
Data columns (total 57 columns):
 #   Column                                      Non-Null Count   Dtype  
---  ------                                      --------------   -----  
 0   serviceCharge                               255782 non-null  float32
 1   telekomTvOffer                              255782 non-null  int8   
 2   newlyConst                                  255782 non-null  bool   
 3   balcony                                     255782 non-null  bool   
 4   picturecount                                255782 non-null  int8   
 5   pricetrend       

In [64]:
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression, Ridge, Lasso
import linreg
from sklearn.metrics import mean_squared_error, mean_absolute_error
from sklearn.preprocessing import StandardScaler 

In [65]:
def epsilon_sensitive(value, epsilon):
    """Sum the entries of ``value`` that are not below ``epsilon``.

    Entries strictly smaller than ``epsilon`` contribute zero; every other
    entry (including one exactly equal to ``epsilon``) contributes its full
    value. The input is never modified: the thresholding is applied to a
    masked copy, so callers can keep using the array they passed in.

    Parameters
    ----------
    value : array-like
        Values to threshold and sum (e.g. absolute residuals). Lists,
        ``ndarray`` and ``np.matrix`` inputs are accepted.
    epsilon : scalar
        Threshold below which an entry is treated as zero.

    Returns
    -------
    scalar
        Sum of the thresholded values (``0.0`` for empty input).
    """
    value = np.asarray(value)
    # np.where builds a new array instead of zeroing entries of the caller's data.
    return np.sum(np.where(value < epsilon, 0, value))

In [66]:
# Feature columns used by every model in this comparison.
cols = [
    'serviceCharge',
    'heatingType_central_heating',
    'heatingType_district_heating',
    'heatingType_floor_heating',
    'heatingType_gas_heating',
    'heatingType_no_heating',
    'heatingType_other',
    'heatingType_self_contained_central_heating',
    'telekomUploadSpeed',
]

# Features and regression target.
X, y = df[cols], df['totalRent']

# Hold out 20% of the rows for testing; fixed random_state keeps the split stable.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Fit the scaler on the training features only, then apply it to both splits.
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

# From here on the splits are plain NumPy arrays.
X_train = pd.DataFrame(X_train_scaled, columns=X_train.columns).to_numpy()
X_test = pd.DataFrame(X_test_scaled, columns=X_test.columns).to_numpy()
y_train = y_train.to_numpy()
y_test = y_test.to_numpy()

# Empty table that will receive one row of metrics per model.
results_cols = ['Model', 'Train MSE', 'Test MSE', 'Train MAE', 'Test MAE']
results = pd.DataFrame(columns=results_cols)

# Draw the starting weights from a dedicated, seeded generator so the custom
# models start from the same point on every run (the global np.random state is
# unseeded here, which made their results non-reproducible).
init_rng = np.random.RandomState(42)

# One weight per feature plus one extra, matching the column of ones that is
# prepended to the features before the custom models are fitted.
n_params = X_train.shape[1] + 1


def _random_theta():
    """Return a fresh (n_params, 1) np.matrix of standard-normal starting weights."""
    return np.matrix(init_rng.randn(n_params)).T


# Three custom linreg models that differ only in their cost function, followed
# by the scikit-learn estimators with default settings.
models = [
        linreg.LinearRegression(n_iter=1000, init_theta=_random_theta(),
                                cost=lambda h, y: ((np.power((h - y), 2)).mean()) / 2, name="Linear Regression custom MSE"),
        linreg.LinearRegression(n_iter=1000, init_theta=_random_theta(),
                                cost=lambda h, y: (np.sum(np.abs((y - h)))) / 2, name="Linear Regression custom AE"),
        linreg.LinearRegression(n_iter=1000, init_theta=_random_theta(),
                                cost=lambda h, y: epsilon_sensitive(np.abs(y - h), 1), name="Linear Regression custom Epsilon sensitive"),
        LinearRegression(),
        Ridge(),
        Lasso()
        ]

# Inputs for the custom linreg models: the features get a leading column of
# ones and the targets become column vectors. These do not depend on the model,
# so they are built once here instead of on every loop iteration.
X_train_linreg = np.c_[np.ones((X_train.shape[0], 1)), X_train]
X_test_linreg = np.c_[np.ones((X_test.shape[0], 1)), X_test]
y_train_linreg = np.expand_dims(y_train, axis=-1)
y_test_linreg = np.expand_dims(y_test, axis=-1)

predictions_train, predictions_test = [], []
for index, model in enumerate(models):
    # Pick the label and the matching train/test inputs for this kind of model.
    if isinstance(model, linreg.LinearRegression):
        label = model.name
        fit_X, fit_y = X_train_linreg, y_train_linreg
        eval_X, eval_y = X_test_linreg, y_test_linreg
    else:
        label = model.__class__.__name__
        fit_X, fit_y = X_train, y_train
        eval_X, eval_y = X_test, y_test

    # Fit on the training split, then predict both splits.
    model.fit(fit_X, fit_y)
    train_pred = model.predict(fit_X)
    test_pred = model.predict(eval_X)
    predictions_train.append(train_pred)
    predictions_test.append(test_pred)

    # Record this model's row in the results table.
    results.loc[index, 'Model'] = label
    results.loc[index, 'Train MSE'] = mean_squared_error(fit_y, train_pred)
    results.loc[index, 'Train MAE'] = mean_absolute_error(fit_y, train_pred)
    results.loc[index, 'Test MSE'] = mean_squared_error(eval_y, test_pred)
    results.loc[index, 'Test MAE'] = mean_absolute_error(eval_y, test_pred)

In [70]:
# Show the per-model train/test MSE and MAE collected by the fitting loop.
results

Unnamed: 0,Model,Train MSE,Test MSE,Train MAE,Test MAE
0,Linear Regression custom MSE,192197.922092,189528.16983,318.174305,318.792085
1,Linear Regression custom AE,192197.922095,189528.17365,318.174295,318.792074
2,Linear Regression custom Epsilon sensitive,192197.922093,189528.150508,318.174284,318.792063
3,LinearRegression,192197.90625,189527.9375,318.185638,318.80365
4,Ridge,192197.9375,189527.828125,318.183441,318.801117
5,Lasso,192205.28125,189672.1875,318.394501,319.03183
