In [73]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here are several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
from transformers import AutoTokenizer, AutoModelForMaskedLM
import seaborn as sns
import matplotlib.pyplot as plt
import altair as alt
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.linear_model import Lasso
from sklearn.linear_model import Ridge
from sklearn.metrics import mean_squared_error
from sklearn.metrics import classification_report
from sklearn.compose import ColumnTransformer

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Collect the full path of every file below the read-only Kaggle input
# directory, then print them one per line.
input_paths = [
    os.path.join(dirname, filename)
    for dirname, _, filenames in os.walk('/kaggle/input')
    for filename in filenames
]
for input_path in input_paths:
    print(input_path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [74]:
# Read the raw rental-offer table from the Kaggle input directory and display it.
IMMO_CSV_PATH = "/kaggle/input/apartment-rental-offers-in-germany/immo_data.csv"
real_states = pd.read_csv(IMMO_CSV_PATH)
real_states

In [75]:
# Summarise the raw frame: column names, dtypes and non-null counts.
real_states.info()

## Data Cleaning

In [76]:
# Keep only the columns used by the models: three predictors plus the target
# (totalRent).
# .copy() turns the subset into an independent frame, so the cleaning cells
# below modify it directly rather than a slice of the raw data (this is what
# triggers pandas' SettingWithCopyWarning).
MODEL_COLUMNS = ['serviceCharge', 'heatingType', 'telekomUploadSpeed', 'totalRent']
real_states = real_states[MODEL_COLUMNS].copy()
real_states

In [77]:
# Count the missing values in each selected column before imputation.
real_states.isnull().sum()

In [78]:
# Impute missing values for all four columns in a single pass.
# Calling `real_states.<column>.fillna(..., inplace=True)` is chained
# assignment: it emits a FutureWarning in pandas 2.x and stops updating the
# frame once copy-on-write is enabled, so the filled frame is assigned back
# to real_states instead.
# NOTE(review): totalRent is the prediction target; filling it with the median
# fabricates labels - consider dropping those rows instead.
fill_values = {
    # numeric columns -> median
    'serviceCharge': real_states['serviceCharge'].median(),
    'telekomUploadSpeed': real_states['telekomUploadSpeed'].median(),
    'totalRent': real_states['totalRent'].median(),
    # categorical column -> most frequent value (mode)
    'heatingType': real_states['heatingType'].mode()[0],
}
real_states = real_states.fillna(value=fill_values)

In [79]:
# Re-count missing values per column to confirm the imputation left none.
real_states.isnull().sum()

Outliers

In [80]:
# Open a new figure, then draw a box plot of serviceCharge to expose outliers.
plt.figure()
sns.boxplot(data=real_states, x='serviceCharge')

In [81]:
# Box plot of telekomUploadSpeed to expose outliers.
sns.boxplot(data=real_states, x='telekomUploadSpeed')

In [82]:
# Box plot of the target column totalRent to expose outliers.
sns.boxplot(data=real_states, x='totalRent')

Removing Outliers

In [83]:
def remove_iqr_outliers(frame, columns=None, whisker=1.5):
    """Return a copy of ``frame`` without rows that fall outside the IQR fences.

    Columns are processed one after another, so the quartiles of each column
    are computed on the rows that survived the filtering of earlier columns.

    Parameters
    ----------
    frame : pandas.DataFrame
        Data to filter. It is not modified.
    columns : iterable of str, optional
        Columns to filter on. Defaults to every ``float64`` column of ``frame``.
    whisker : float, default 1.5
        Multiple of the interquartile range that defines the lower and upper
        fences (``Q1 - whisker * IQR`` and ``Q3 + whisker * IQR``).

    Returns
    -------
    pandas.DataFrame
        New frame holding only rows whose values lie within the fences.
        Rows with NaN in a filtered column are kept, because NaN compares
        False against both fences.
    """
    if columns is None:
        columns = [col for col in frame.columns if frame[col].dtype == 'float64']

    kept = frame
    for col in columns:
        # First and third quartile of the rows that are still present.
        q1 = kept[col].quantile(0.25)
        q3 = kept[col].quantile(0.75)
        iqr = q3 - q1
        lower_range = q1 - whisker * iqr
        upper_range = q3 + whisker * iqr

        is_outlier = (kept[col] > upper_range) | (kept[col] < lower_range)
        kept = kept[~is_outlier]
    return kept.copy()


# Re-assign instead of dropping in place, so the frame shown by earlier cells
# is not mutated behind the reader's back.
real_states = remove_iqr_outliers(real_states)
real_states.shape

Splitting the dataset into training and test sets

In [84]:
# Display the cleaned frame after outlier removal.
real_states

In [85]:
# Separate the target (totalRent) from the predictors without mutating
# real_states. DataFrame.pop removes the column in place, so re-running the
# cell would raise KeyError because the column is already gone.
y = real_states['totalRent']
X = real_states.drop(columns=['totalRent'])

In [86]:
# Display the target series (totalRent).
y

In [87]:
# Display the predictor frame.
X

In [88]:
# Hold out 20% of the rows for testing. A fixed random_state makes the split,
# and therefore every score computed from it, reproducible across runs.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, train_size=0.8, random_state=42
)

In [89]:
# Wrap both target splits in single-column frames named 'totalRent', so they
# can be passed to the column-name based transformer for the target.
target_columns = ['totalRent']
y_train = pd.DataFrame(y_train, columns=target_columns)
y_test = pd.DataFrame(y_test, columns=target_columns)
y_train

In [90]:
# Display the test-split target frame.
y_test

In [91]:
# Display the test-split predictors.
X_test

In [92]:
# Display the training-split predictors.
X_train

Preprocessing & Scaling

In [93]:
# One-hot encoder with dense output; categories unseen during fit are encoded
# as all zeros (handle_unknown='ignore').
# scikit-learn 1.2 renamed `sparse` to `sparse_output` and 1.4 removed the old
# keyword, so try the new name first and fall back for older installations.
try:
    OHE = OneHotEncoder(handle_unknown='ignore', sparse_output=False)
except TypeError:
    OHE = OneHotEncoder(handle_unknown='ignore', sparse=False)
sScaler = StandardScaler()

In [94]:
# Predictors: standardise the two numeric columns and one-hot encode heatingType.
feature_transformers = [
    ('serviceCharge', sScaler, ['serviceCharge']),
    ('telekomUploadSpeed', sScaler, ['telekomUploadSpeed']),
    ('heatingType', OHE, ['heatingType']),
]
preprocessor_x = ColumnTransformer(transformers=feature_transformers)

# Target: standardise totalRent.
target_transformers = [
    ('totalRent', sScaler, ['totalRent']),
]
preprocessor_y = ColumnTransformer(transformers=target_transformers)

In [95]:
# Fit both preprocessors on the training split only, so no statistics from the
# test split enter the scaling or encoding.
fitter_x = preprocessor_x.fit(X_train)
fitter_y = preprocessor_y.fit(y_train)

In [96]:
# Apply the fitted preprocessors to both splits. The results replace the
# original frames under the same names.
X_train, X_test = fitter_x.transform(X_train), fitter_x.transform(X_test)
y_train, y_test = fitter_y.transform(y_train), fitter_y.transform(y_test)

In [97]:
# Flatten the single-column target arrays from shape (m, 1) to (m,).
# ravel() replaces unpacking `.shape` followed by reshape: it needs no
# throwaway variables and still works when the cell is re-run on arrays that
# are already one-dimensional.
y_train = y_train.ravel()
y_test = y_test.ravel()

In [101]:
# Print the shapes of the four prepared arrays, in the order
# X_train, y_train, X_test, y_test.
for prepared_array in (X_train, y_train, X_test, y_test):
    print(prepared_array.shape)

The data is now ready for fitting our models.

Linear Regression - MSE (from scratch)

In [102]:
class LinearRegression():
    """Linear regression fitted by batch gradient descent on the mean squared error.

    The model is ``h(x) = x . W + b``. ``fit`` starts from ``W = 0``, ``b = 0``
    and performs ``iterations`` full-batch gradient steps of size
    ``learning_rate``.

    NOTE(review): this class has the same name as
    ``sklearn.linear_model.LinearRegression`` imported at the top of the
    notebook and shadows it from this cell onwards.

    Parameters
    ----------
    learning_rate : float, default 0.01
        Step size of each gradient-descent update.
    iterations : int, default 1000
        Number of gradient-descent updates performed by ``fit``.

    Attributes
    ----------
    W : numpy.ndarray of shape (n_features,)
        Learned weights (set by ``fit``).
    b : float
        Learned intercept (set by ``fit``).
    """

    def __init__(self, learning_rate=0.01, iterations=1000):
        self.learning_rate = learning_rate
        self.iterations = iterations

    def fit(self, X, Y):
        """Train the model on ``X`` (m samples x n features) and targets ``Y``.

        ``Y`` may be one-dimensional or a single column; it is flattened so a
        column vector cannot broadcast ``Y - Y_pred`` into an (m, m) matrix.

        Returns
        -------
        self

        Raises
        ------
        ValueError
            If ``X`` is not two-dimensional, is empty, or its number of rows
            differs from the number of targets.
        """
        # Densify scipy-style sparse matrices before converting to an array.
        if hasattr(X, "toarray"):
            X = X.toarray()
        X = np.asarray(X, dtype=float)
        Y = np.asarray(Y, dtype=float).ravel()

        if X.ndim != 2:
            raise ValueError("X must be 2-dimensional (n_samples, n_features)")
        if X.shape[0] == 0:
            raise ValueError("X must contain at least one sample")
        if X.shape[0] != Y.shape[0]:
            raise ValueError("X and Y must contain the same number of samples")

        # no_of_training_examples, no_of_features
        self.m, self.n = X.shape

        # weight initialization
        self.W = np.zeros(self.n)
        self.b = 0.0
        self.X = X
        self.Y = Y

        # gradient descent learning
        for _ in range(self.iterations):
            self.update_weights()

        return self

    def update_weights(self):
        """Perform one gradient-descent step on the stored training data."""
        residual = self.Y - self.predict(self.X)

        # Gradients of the mean squared error with respect to W and b.
        dW = -(2 * self.X.T.dot(residual)) / self.m
        db = -2 * np.sum(residual) / self.m

        # update weights
        self.W = self.W - self.learning_rate * dW
        self.b = self.b - self.learning_rate * db

        return self

    def predict(self, X):
        """Return the hypothesis ``h(x) = X . W + b`` for every row of ``X``."""
        return X.dot(self.W) + self.b

In [105]:
# Train the from-scratch gradient-descent model on the prepared training split.
# The cell ends with the fitted model so it is shown as the cell output.
gd_settings = {'iterations': 1000, 'learning_rate': 0.01}
model = LinearRegression(**gd_settings)
model.fit(X_train, y_train)

In [106]:
# Predict the (standardised) target for the test split with the from-scratch model.
# NOTE(review): Y_pred is not referenced again in the visible cells; the next
# cell recomputes the same predictions as Y_prediction.
Y_pred = model.predict( X_test )

In [107]:
# Predict on the test split, then print the first three predictions next to
# the true values, followed by the first learned weight and the intercept.
Y_prediction = model.predict(X_test)
first_predictions = np.round(Y_prediction[:3], 2)
first_targets = y_test[:3]
first_weight = round(model.W[0], 2)
intercept = round(model.b, 2)
print("Predicted values ", first_predictions)
print("Real values      ", first_targets)
print("Trained W        ", first_weight)
print("Trained b        ", intercept)

In [115]:
# scikit-learn baseline. The import is repeated here because the from-scratch
# class defined earlier in the notebook rebinds the name LinearRegression.
from sklearn.linear_model import LinearRegression
lReg = LinearRegression()
lReg.fit(X_train, y_train)
# R^2 on the held-out test split.
lReg.score(X_test, y_test)

Lasso

In [127]:
# Fit an L1-regularised linear model and report R^2 on the test split.
# NOTE(review): the target was standardised above; with alpha=1.0 the L1
# penalty may shrink every coefficient to zero - check reg_lasso.coef_.
lasso_alpha = 1.0
reg_lasso = Lasso(alpha=lasso_alpha).fit(X_train, y_train)
reg_lasso.score(X_test, y_test)

In [128]:
# Store the Lasso predictions for the test split, then display the fitted
# coefficients (one per transformed feature column).
y_pred_test_lasso = reg_lasso.predict(X_test)
reg_lasso.coef_

In [122]:
# R^2 of the Lasso model on the training split, for comparison with the test score.
reg_lasso.score(X_train, y_train)

Ridge

In [125]:
# Fit an L2-regularised linear model and report R^2 on the test split.
ridge_alpha = 1.0
reg_ridge = Ridge(alpha=ridge_alpha).fit(X_train, y_train)
reg_ridge.score(X_test, y_test)