# Downloading dataset

In [1]:
!mkdir -p ~/.kaggle
!cp /content/drive/MyDrive/kaggle.json ~/.kaggle/
!chmod 600 ~/.kaggle/kaggle.json

In [2]:
!kaggle datasets download -d corrieaar/apartment-rental-offers-in-germany

Downloading apartment-rental-offers-in-germany.zip to /content
 95% 88.0M/93.0M [00:04<00:00, 28.1MB/s]
100% 93.0M/93.0M [00:04<00:00, 21.9MB/s]


In [3]:
!unzip apartment-rental-offers-in-germany.zip

Archive:  apartment-rental-offers-in-germany.zip
  inflating: immo_data.csv           


# Loading Dataset 

In [4]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline

In [5]:
dataset = pd.read_csv('immo_data.csv')

In [6]:
dataset.head()

Unnamed: 0,regio1,serviceCharge,heatingType,telekomTvOffer,telekomHybridUploadSpeed,newlyConst,balcony,picturecount,pricetrend,telekomUploadSpeed,...,regio2,regio3,description,facilities,heatingCosts,energyEfficiencyClass,lastRefurbish,electricityBasePrice,electricityKwhPrice,date
0,Nordrhein_Westfalen,245.0,central_heating,ONE_YEAR_FREE,,False,False,6,4.62,10.0,...,Dortmund,Schüren,Die ebenerdig zu erreichende Erdgeschosswohnun...,Die Wohnung ist mit Laminat ausgelegt. Das Bad...,,,,,,May19
1,Rheinland_Pfalz,134.0,self_contained_central_heating,ONE_YEAR_FREE,,False,True,8,3.47,10.0,...,Rhein_Pfalz_Kreis,Böhl_Iggelheim,Alles neu macht der Mai – so kann es auch für ...,,,,2019.0,,,May19
2,Sachsen,255.0,floor_heating,ONE_YEAR_FREE,10.0,True,True,8,2.72,2.4,...,Dresden,Äußere_Neustadt_Antonstadt,Der Neubau entsteht im Herzen der Dresdner Neu...,"* 9 m² Balkon\n* Bad mit bodengleicher Dusche,...",,,,,,Oct19
3,Sachsen,58.15,district_heating,ONE_YEAR_FREE,,False,True,9,1.53,40.0,...,Mittelsachsen_Kreis,Freiberg,Abseits von Lärm und Abgasen in Ihre neue Wohn...,,87.23,,,,,May19
4,Bremen,138.0,self_contained_central_heating,,,False,True,19,2.46,,...,Bremen,Neu_Schwachhausen,Es handelt sich hier um ein saniertes Mehrfami...,Diese Wohnung wurde neu saniert und ist wie fo...,,,,,,Feb20


In [7]:
dataset.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 268850 entries, 0 to 268849
Data columns (total 49 columns):
 #   Column                    Non-Null Count   Dtype  
---  ------                    --------------   -----  
 0   regio1                    268850 non-null  object 
 1   serviceCharge             261941 non-null  float64
 2   heatingType               223994 non-null  object 
 3   telekomTvOffer            236231 non-null  object 
 4   telekomHybridUploadSpeed  45020 non-null   float64
 5   newlyConst                268850 non-null  bool   
 6   balcony                   268850 non-null  bool   
 7   picturecount              268850 non-null  int64  
 8   pricetrend                267018 non-null  float64
 9   telekomUploadSpeed        235492 non-null  float64
 10  totalRent                 228333 non-null  float64
 11  yearConstructed           211805 non-null  float64
 12  scoutId                   268850 non-null  int64  
 13  noParkSpaces              93052 non-null   f

#Preprocessing

We use preprocessing steps from previous assignment

In [8]:
dataset.isnull().sum()/len(dataset)

regio1                      0.000000
serviceCharge               0.025698
heatingType                 0.166844
telekomTvOffer              0.121328
telekomHybridUploadSpeed    0.832546
newlyConst                  0.000000
balcony                     0.000000
picturecount                0.000000
pricetrend                  0.006814
telekomUploadSpeed          0.124077
totalRent                   0.150705
yearConstructed             0.212182
scoutId                     0.000000
noParkSpaces                0.653889
firingTypes                 0.211880
hasKitchen                  0.000000
geo_bln                     0.000000
cellar                      0.000000
yearConstructedRange        0.212182
baseRent                    0.000000
houseNumber                 0.264155
livingSpace                 0.000000
geo_krs                     0.000000
condition                   0.254748
interiorQual                0.419063
petsAllowed                 0.426160
street                      0.000000
s

In [9]:
bad_cols = dataset.columns[dataset.isnull().sum()/len(dataset) > 0.5]

In [10]:
dataset.drop(columns=bad_cols,inplace=True)

In [11]:
unuseful_cols = ['date','description','scoutId','picturecount',"facilities",'houseNumber',
                  'livingSpaceRange','yearConstructedRange','baseRentRange','noRoomsRange']
dataset.drop(columns=unuseful_cols,inplace=True)

In [12]:
dataset.drop(['street','streetPlain'],axis=1,inplace=True)

In [13]:
dataset.drop_duplicates(keep='last',inplace=True)

In [14]:
dataset.duplicated().sum()

0

In [15]:
dataset.totalRent.fillna(dataset.totalRent.median(),inplace=True)

In [16]:
dataset.drop(dataset[dataset.baseRent >= dataset.totalRent].index,inplace=True)

In [17]:
dataset.drop(dataset[dataset.totalRent == 0 ].index,inplace=True)

In [18]:
dataset.drop(dataset[dataset['totalRent']>10000].index,inplace=True)

In [19]:
dataset.drop(dataset[dataset['livingSpace'] == 0].index,inplace=True)

In [20]:
numerical_cols = dataset.select_dtypes(exclude=['bool','object'])

In [21]:
dataset.fillna(numerical_cols.mean(),inplace=True)

In [22]:
categorical_cols = dataset.select_dtypes(include=['bool','object'])

In [23]:
dataset.fillna(categorical_cols.mode().iloc[0],inplace= True)

In [24]:
def detect_outlier(df,col_name):
  """
  Find indexes of outliers by computing z-score

  Parameters
  ----------
  df : pd.DataFrame
       dataframe of dataset 
  col_name : str
             name of column you want to find outliers in
  Returns
  -------
  indexes : pd.Index
            indexes of outliers
  """

  upper = df[col_name].mean() + 3 * df[col_name].std()
  lower = df[col_name].mean() - 3 * df[col_name].std()

  indexes = df[(df[col_name] > upper) | (df[col_name] < lower)].index
  return indexes

In [25]:
for col in dataset.columns:
  if dataset[col].dtype == 'int64' or dataset[col].dtype == 'float64':
    outlier_indexes = detect_outlier(dataset,col)
    dataset.drop(outlier_indexes,inplace=True)

In [26]:
dataset.drop(columns=['firingTypes','geo_krs','regio2','regio3','geo_bln'],inplace=True)

In [27]:
def new_label_regio1(col):
  """
  Return new label of regio1 column for instance

  Parameters
  ----------
  col : pd.Series
        regio1 column of dataset
  
  Returns
  -------
  int
      return other if regio1  of instance is one of selected names otherwise col
  """
  if col in ['Berlin','Brandenburg','Mecklenburg_Vorpommern',
             'Schleswig_Holstein','Hamburg','Bremen','Saarland']:
    return 'other'
  else:
    return col

dataset['new_regio1'] = dataset.regio1.apply(new_label_regio1)
dataset.drop('regio1',axis=1,inplace=True)

In [28]:
def new_label_heatingType(col):
  """
  Return new label of heatingType column for instance

  Parameters
  ----------
  col : pd.Series
        heatingType column of dataset
  
  Returns
  -------
  int
      return other if heatingType of instance is not one of selected names otherwise col
  """
  if col not in ['central_heating','district_heating','gas_heating','self_contained_central_heating','floor_heating','oil_heating']:
    return 'other'
  else:
    return col


dataset['new_heatingType'] = dataset.heatingType.apply(new_label_heatingType)
dataset.drop(columns='heatingType',inplace=True)

In [29]:
def new_label_condition(col):
  """
  Return new label of condition column for instance

  Parameters
  ----------
  col : pd.Series
        condition column of dataset
  
  Returns
  -------
  int
      return other if condition of instance is one of selected names otherwise col
  """
  if col in ['negotiable','need_of_renovation','ripe_for_demolition']:
    return 'other'
  else:
    return col

dataset['new_condition'] = dataset.condition.apply(new_label_condition)
dataset.drop(columns='condition',inplace=True)

In [30]:
def new_label_typeOfFlat(col):
  """
  Return new label of typeOfFlat column for instance

  Parameters
  ----------
  col : pd.Series
        typeOfFlat column of dataset
  
  Returns
  -------
  int
      return other if typeOfFlat of instance is one of selected names otherwise col
  """
  if col  in ['penthouse','half_basement','loft']:
    return 'other'
  else:
    return col
dataset['new_typeOfFlat'] = dataset.typeOfFlat.apply(new_label_typeOfFlat)
dataset.drop(columns='typeOfFlat',inplace=True)

In [31]:
dataset.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 230510 entries, 0 to 268848
Data columns (total 25 columns):
 #   Column              Non-Null Count   Dtype  
---  ------              --------------   -----  
 0   serviceCharge       230510 non-null  float64
 1   telekomTvOffer      230510 non-null  object 
 2   newlyConst          230510 non-null  bool   
 3   balcony             230510 non-null  bool   
 4   pricetrend          230510 non-null  float64
 5   telekomUploadSpeed  230510 non-null  float64
 6   totalRent           230510 non-null  float64
 7   yearConstructed     230510 non-null  float64
 8   hasKitchen          230510 non-null  bool   
 9   cellar              230510 non-null  bool   
 10  baseRent            230510 non-null  float64
 11  livingSpace         230510 non-null  float64
 12  interiorQual        230510 non-null  object 
 13  petsAllowed         230510 non-null  object 
 14  lift                230510 non-null  bool   
 15  geo_plz             230510 non-nul

# Linear Regression

In [55]:
class LinearRegression():
  """
    Linear Regression model with mean squared error cost function

    Parameters
    ----------

    learning_rate : float
                    learning rate of gradient descent algorithm
    n_iter : int
             number of iterations or epoch of gradient descent algorithm
    Attributes
    ----------
    lr : float
         learning rate of gradient descent algorithm
    n_iter : int
             number of iterations or epoch of gradient descent algorithm
    weights : numpy.array
              weights of our model are initialized with random numbers in [0,1]  
    bias : float
           bias of our model which is initialized with 0


    Methods
    -------
    fit(X,y)
        training the weights with regards to dataset (X and y)
    predict(X)
            predict target values corresponding to X
    _compute_gradient(X,y_true,y_predicted)
                      computing gradients of mse cost function

  """

  def __init__(self,learning_rate = 0.01,n_iter=2000):
    self.lr = learning_rate
    self.n_iter = n_iter
    self.weights = None
    self.bias = None
  # computing gradients of mse cost function
  def _compute_gradient(self,X,y_true,y_predicted):
    n = X.shape[0]
    dw = (1/n) * np.dot(X.T,(y_predicted-y_true))
    db = (1/n) * np.sum(y_predicted-y_true)
    return dw , db

  def predict(self,X):
    return np.dot(X,self.weights) + self.bias

  def fit(self,X,y):
    # initializing weights and bias
    self.weights = np.random.rand(X.shape[1])
    
    self.bias = 0 

    for _ in range(self.n_iter):

      y_predicted = self.predict(X)

      dw , db = self._compute_gradient(X,y,y_predicted)
      # updating weights and bias
      self.weights -= self.lr * dw
      self.bias -= self.lr * db

    

Train-test split

In [33]:
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
scaler = StandardScaler()
my_lr = LinearRegression(n_iter=2500)
data = dataset.loc[:,['serviceCharge',"telekomUploadSpeed",'new_heatingType']]
target = dataset['totalRent']

data_train, data_test, target_train, target_test = train_test_split(data,target,test_size=0.2,random_state=42)

train_cat_feature = pd.get_dummies(data_train[['new_heatingType']])
train_numeric_features = scaler.fit_transform(data_train[['serviceCharge',"telekomUploadSpeed"]])
train_numeric_features = pd.DataFrame(train_numeric_features,columns=['serviceCharge',"telekomUploadSpeed"],index= data_train.index)
data_train  = pd.concat([train_numeric_features,train_cat_feature],axis=1)

my_lr.fit(data_train,target_train)

In [34]:
test_cat_feature = pd.get_dummies(data_test[['new_heatingType']])
test_numeric_features = scaler.transform(data_test[['serviceCharge',"telekomUploadSpeed"]])
test_numeric_features = pd.DataFrame(test_numeric_features,columns=['serviceCharge',"telekomUploadSpeed"],index= data_test.index)
data_test  = pd.concat([test_numeric_features,test_cat_feature],axis=1)


In [35]:
from sklearn.metrics import mean_squared_error, mean_absolute_error
y_predicted = my_lr.predict(data_test)

my_mse = mean_squared_error(target_test,y_predicted)

print('MSE of my Linear Regression:',my_mse)

MSE of my Linear Regression: 56289.23199168802


In [36]:
from sklearn.linear_model import LinearRegression

sklearn_lr = LinearRegression()
sklearn_lr.fit(data_train,target_train)

LinearRegression()

In [37]:
y_predicted_sklearn = sklearn_lr.predict(data_test)
sklearn_mse = mean_squared_error(target_test,y_predicted_sklearn)
print('MSE of sklearn Linear Regression:',sklearn_mse)

MSE of sklearn Linear Regression: 56007.67574790318


In [38]:
y_predicted_sklearn = sklearn_lr.predict(data_test)
sklearn_mae = mean_absolute_error(target_test,y_predicted_sklearn)
print('MAE of sklearn Linear Regression:',sklearn_mae)

MAE of sklearn Linear Regression: 180.5081399105849


# Lasso and Ridge

We train Ridge and Lasso with previous selected features

In [39]:
from sklearn.linear_model import Ridge,Lasso
from sklearn.compose import ColumnTransformer,make_column_selector
from sklearn.pipeline import Pipeline,make_pipeline
from sklearn.model_selection import GridSearchCV
from sklearn.preprocessing import OneHotEncoder

data = dataset[['serviceCharge',"telekomUploadSpeed",'new_heatingType']]
target = dataset['totalRent']

data_train, data_test, target_train, target_test = train_test_split(data,target,test_size=0.2,random_state=42)
cat_selector = make_column_selector(dtype_include=['bool','object'])
num_selector = make_column_selector(dtype_exclude=['bool','object'])
cat_cols = cat_selector(data_train)
numeric_cols = num_selector(data_train)
preprocessor  = ColumnTransformer([('cat_preprocessor',OneHotEncoder(),cat_cols),
                                   ('num_preprocessor',StandardScaler(),numeric_cols)])


In [40]:
ridge = Pipeline([('preprocessor',preprocessor), ('ridge',Ridge())])
param_grid = {
    'ridge__alpha' : [0.001,0.01,0.1,0,1,10,100,1000]
}
grid_search = GridSearchCV(ridge,param_grid= param_grid,cv=5,n_jobs=-1,scoring = 'neg_mean_squared_error')
grid_search.fit(data_train,target_train)

GridSearchCV(cv=5,
             estimator=Pipeline(steps=[('preprocessor',
                                        ColumnTransformer(transformers=[('cat_preprocessor',
                                                                         OneHotEncoder(),
                                                                         ['new_heatingType']),
                                                                        ('num_preprocessor',
                                                                         StandardScaler(),
                                                                         ['serviceCharge',
                                                                          'telekomUploadSpeed'])])),
                                       ('ridge', Ridge())]),
             n_jobs=-1,
             param_grid={'ridge__alpha': [0.001, 0.01, 0.1, 0, 1, 10, 100,
                                          1000]},
             scoring='neg_mean_squared_error')

In [41]:
cv_res = pd.DataFrame(grid_search.cv_results_)
cv_res = cv_res[['mean_fit_time','mean_score_time','param_ridge__alpha', 'params','mean_test_score', 'std_test_score',
       'rank_test_score']]
cv_res.loc[:,'mean_test_score'] = -cv_res['mean_test_score']
cv_res.sort_values('rank_test_score')

Unnamed: 0,mean_fit_time,mean_score_time,param_ridge__alpha,params,mean_test_score,std_test_score,rank_test_score
5,0.162869,0.030336,10.0,{'ridge__alpha': 10},56464.231923,455.784461,1
4,0.152851,0.03258,1.0,{'ridge__alpha': 1},56464.234478,455.952778,2
2,0.152358,0.028897,0.1,{'ridge__alpha': 0.1},56464.235144,455.969668,3
1,0.164439,0.029428,0.01,{'ridge__alpha': 0.01},56464.235215,455.971357,4
0,0.184107,0.031259,0.001,{'ridge__alpha': 0.001},56464.235222,455.971526,5
3,0.160194,0.029816,0.0,{'ridge__alpha': 0},56464.235223,455.971545,6
6,0.154461,0.030451,100.0,{'ridge__alpha': 100},56464.60371,454.157089,7
7,0.155313,0.027861,1000.0,{'ridge__alpha': 1000},56497.956724,442.132535,8


In [42]:
ridge = Pipeline([('preprocessor',preprocessor), ('ridge',Ridge(alpha=10))])
ridge.fit(data_train,target_train)
predicted = ridge.predict(data_test)

mse = mean_squared_error(predicted,target_test)
print('MSE of Ridge Regression:', mse)

MSE of Ridge Regression: 56007.82104130895


In [43]:
lasso = ridge = Pipeline([('preprocessor',preprocessor), ('lasso',Lasso())])
param_grid = {
    'lasso__alpha' : [0.001,0.01,0.1,0,1,10,100,1000]
}
grid_search = GridSearchCV(lasso,param_grid= param_grid,cv=5,n_jobs=-1,scoring = 'neg_mean_squared_error')
grid_search.fit(data_train,target_train)

  coef_, l1_reg, l2_reg, X, y, max_iter, tol, rng, random, positive


GridSearchCV(cv=5,
             estimator=Pipeline(steps=[('preprocessor',
                                        ColumnTransformer(transformers=[('cat_preprocessor',
                                                                         OneHotEncoder(),
                                                                         ['new_heatingType']),
                                                                        ('num_preprocessor',
                                                                         StandardScaler(),
                                                                         ['serviceCharge',
                                                                          'telekomUploadSpeed'])])),
                                       ('lasso', Lasso())]),
             n_jobs=-1,
             param_grid={'lasso__alpha': [0.001, 0.01, 0.1, 0, 1, 10, 100,
                                          1000]},
             scoring='neg_mean_squared_error')

In [44]:
cv_res_lasso = pd.DataFrame(grid_search.cv_results_)
cv_res_lasso = cv_res_lasso[['mean_fit_time','mean_score_time','param_lasso__alpha', 'params','mean_test_score', 'std_test_score',
       'rank_test_score']]
cv_res_lasso.loc[:,'mean_test_score'] = -cv_res_lasso['mean_test_score']
cv_res_lasso.sort_values('rank_test_score')

Unnamed: 0,mean_fit_time,mean_score_time,param_lasso__alpha,params,mean_test_score,std_test_score,rank_test_score
0,8.662095,0.035639,0.001,{'lasso__alpha': 0.001},56464.23508,455.937331,1
3,7.859488,0.036539,0.0,{'lasso__alpha': 0},56464.235223,455.971545,2
1,6.535426,0.050592,0.01,{'lasso__alpha': 0.01},56464.245811,455.573732,3
2,1.001757,0.046152,0.1,{'lasso__alpha': 0.1},56465.499217,452.4835,4
4,0.299431,0.037512,1.0,{'lasso__alpha': 1},56554.185449,427.207615,5
5,0.207455,0.037382,10.0,{'lasso__alpha': 10},58985.136201,409.204724,6
6,0.182432,0.036785,100.0,{'lasso__alpha': 100},70523.161455,629.770646,7
7,0.149584,0.027895,1000.0,{'lasso__alpha': 1000},115019.34426,801.192882,8


In [45]:
lasso = Pipeline([('preprocessor',preprocessor), ('lasso',Lasso(alpha=0.001))])
lasso.fit(data_train,target_train)
predicted = lasso.predict(data_test)

mse = mean_squared_error(predicted,target_test)
print('MSE of Lasso Regression:', mse)

MSE of Lasso Regression: 56007.67331458834


  coef_, l1_reg, l2_reg, X, y, max_iter, tol, rng, random, positive


# Feature selection using RFE method

In [46]:
data = dataset.drop('totalRent',axis=1)
target = dataset['totalRent']

data_train, data_test, target_train, target_test = train_test_split(data,target,test_size=0.2,random_state=42)
cat_selector = make_column_selector(dtype_include=['bool','object'])
num_selector = make_column_selector(dtype_exclude=['bool','object'])
cat_cols = cat_selector(data_train)
numeric_cols = num_selector(data_train)
preprocessor  = ColumnTransformer([('cat_preprocessor',OneHotEncoder(),cat_cols),
                                   ('num_preprocessor',StandardScaler(),numeric_cols)])

In [47]:
from sklearn.feature_selection import RFE
ridge = Ridge(alpha=10)
rfe = RFE(ridge,n_features_to_select=10)
data_train_tr = preprocessor.fit_transform(data_train)
rfe.fit(data_train_tr,target_train)


RFE(estimator=Ridge(alpha=10), n_features_to_select=10)

In [48]:
data_test_tr = preprocessor.transform(data_test)
predicted = rfe.predict(data_test_tr)
mse_rfe = mean_squared_error(predicted,target_test)

In [49]:
print('MSE of Ridge model with 10 selected features:',mse_rfe)

MSE of Ridge model with 10 selected features: 4390.583933806413


In [50]:
lasso = Lasso(alpha=0.001)
rfe_lasso = RFE(lasso ,n_features_to_select=10)
data_train_tr = preprocessor.fit_transform(data_train)
rfe_lasso.fit(data_train_tr,target_train)


RFE(estimator=Lasso(alpha=0.001), n_features_to_select=10)

In [51]:
data_test_tr = preprocessor.transform(data_test)
predicted = rfe_lasso.predict(data_test_tr)
mse_rfe_lasso = mean_squared_error(predicted,target_test)
print('MSE of Lasso model with 10 selected features:',mse_rfe_lasso)

MSE of Lasso model with 10 selected features: 4404.404262522767


#EXTRA

In [56]:
class LinearRegression_MAE():
  """
    Linear Regression model with mean absolute error cost function

    Parameters
    ----------

    learning_rate : float
                    learning rate of gradient descent algorithm
    n_iter : int
             number of iterations or epoch of gradient descent algorithm
    Attributes
    ----------
    lr : float
         learning rate of gradient descent algorithm
    n_iter : int
             number of iterations or epoch of gradient descent algorithm
    weights : numpy.array
              weights of our model are initialized with random numbers in [0,1]  
    bias : float
           bias of our model which is initialized with 0


    Methods
    -------
    fit(X,y)
        training the weights with regards to dataset (X and y)
    predict(X)
            predict target values corresponding to X
    _compute_gradient(X,y_true,y_predicted)
                      computing gradients of mae cost function

  """

  def __init__(self,learning_rate = 1,n_iter=5000):
    self.lr = learning_rate
    self.n_iter = n_iter
    self.weights = None
    self.bias = None
    
  #computing gradients of mae cost function
  def _compute_gradient(self,X,y_true,y_predicted):
    n = X.shape[0]
    signs = np.array(np.sign(y_predicted-y_true)).reshape(-1,1)
    dw = (1/n) * np.sum(X * signs, axis=0)
    
    db = (1/n) * np.sum(np.sign(y_predicted-y_true))
    
    return dw , db

  def predict(self,X):
    return np.dot(X,self.weights) + self.bias

  def fit(self,X,y):
    # initializing weights and bias
    self.weights = np.random.rand(X.shape[1])
    
    self.bias = 0 

    for _ in range(self.n_iter):

      y_predicted = self.predict(X)

      dw , db = self._compute_gradient(X,y,y_predicted)
      
      # updating weights and bias
      self.weights -= self.lr * dw
      self.bias -= self.lr * db
      # print(self.weights,self.bias)


In [53]:
mae_lr  = LinearRegression_MAE()
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
scaler = StandardScaler()
data = dataset.loc[:,['serviceCharge',"telekomUploadSpeed",'new_heatingType']]
target = dataset['totalRent']

data_train, data_test, target_train, target_test = train_test_split(data,target,test_size=0.2,random_state=42)

train_cat_feature = pd.get_dummies(data_train[['new_heatingType']])
train_numeric_features = scaler.fit_transform(data_train[['serviceCharge',"telekomUploadSpeed"]])
train_numeric_features = pd.DataFrame(train_numeric_features,columns=['serviceCharge',"telekomUploadSpeed"],index= data_train.index)
data_train  = pd.concat([train_numeric_features,train_cat_feature],axis=1)
mae_lr.fit(data_train,target_train)

In [54]:
from sklearn.metrics import mean_absolute_error
test_cat_feature = pd.get_dummies(data_test[['new_heatingType']])
test_numeric_features = scaler.transform(data_test[['serviceCharge',"telekomUploadSpeed"]])
test_numeric_features = pd.DataFrame(test_numeric_features,columns=['serviceCharge',"telekomUploadSpeed"],index= data_test.index)
data_test  = pd.concat([test_numeric_features,test_cat_feature],axis=1)

predicted =mae_lr.predict(data_test)
mae = mean_absolute_error(predicted,target_test)
print('MAE of MY Linear Regression:',mae)

MAE of MY Linear Regression: 180.07788212291203
