## 0. Utilities

In [45]:
import numpy as np 
import pandas as pd 
from sklearn.preprocessing import LabelEncoder 

def NullClearner(df):
    """Fill the missing values of a single column and return the result.

    Parameters
    ----------
    df : pd.Series or any other object
        The column to clean. Despite the name this is expected to be a
        Series; anything that is not a ``pd.Series`` is returned unchanged.

    Returns
    -------
    pd.Series or the original object
        * NumPy integer / unsigned / float columns: missing values replaced
          by the column mean.
        * Every other Series: missing values replaced by the most frequent
          value (the first entry of ``Series.mode()``).
        * A Series with no non-missing value to derive a mode from is
          returned as is instead of raising.

    Notes
    -----
    The input Series is not modified; a filled copy is returned, so callers
    must use the return value (``col = NullClearner(col)``).
    """
    if not isinstance(df, pd.Series):
        return df

    # Accept every NumPy int/uint/float width, not only the literal
    # "float64"/"int64" names, so float32/int32 columns also get mean-filled.
    # Extension dtypes (e.g. nullable Int64) keep falling through to the mode
    # branch, as they did before.
    dtype = df.dtype
    if isinstance(dtype, np.dtype) and dtype.kind in "iuf":
        return df.fillna(df.mean())

    modes = df.mode()
    if modes.empty:
        # Nothing but missing values: there is no mode to fill with.
        return df
    return df.fillna(modes.iloc[0])
    
    
def EncodeX(df):
    """Return the feature table passed through ``pd.get_dummies``.

    The call uses pandas' default settings, so which columns are expanded
    into indicator columns is decided by ``pd.get_dummies`` itself.

    Parameters
    ----------
    df : pd.DataFrame
        Feature table to encode.

    Returns
    -------
    pd.DataFrame
        The encoded table produced by ``pd.get_dummies``.
    """
    encoded_features = pd.get_dummies(df)
    return encoded_features

def EncodeY(df):
    """Label-encode a target column that has more than two distinct values.

    Parameters
    ----------
    df : pd.Series
        Target column.

    Returns
    -------
    pd.Series or np.ndarray
        The input itself when it holds at most two distinct values;
        otherwise the output of ``LabelEncoder().fit_transform``. In the
        second case the sorted original labels and their integer codes are
        printed.
    """
    # Targets with one or two distinct values are passed through untouched.
    if len(df.unique()) <= 2:
        return df

    original_labels = np.sort(pd.unique(df), axis=-1, kind='mergesort')
    encoded_target = LabelEncoder().fit_transform(df)
    integer_codes = list(range(len(original_labels)))
    print("Encoded Target: {} to {}".format(original_labels, integer_codes))
    return encoded_target

## 1. Linear Regression

In [46]:
import warnings
import numpy as np 
import pandas as pd 
import seaborn as se 
import matplotlib.pyplot as plt 
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split 
from sklearn.metrics import r2_score, mean_absolute_error, mean_squared_error 
from sklearn.linear_model import LinearRegression
warnings.filterwarnings('ignore')


def linearregression(path,features,target):
    """Train and evaluate a ``LinearRegression`` model on a CSV dataset.

    Parameters
    ----------
    path : str
        Location of the CSV file, passed to ``pd.read_csv``.
    features : list of str
        Column names used as model inputs.
    target : str
        Column name of the value to predict.

    Returns
    -------
    tuple
        ``(accuracy, r2, score2, score3, model)`` where
        ``accuracy`` is ``model.score(x_test, y_test) * 100`` (for
        scikit-learn regressors ``score`` is the R^2 coefficient, so this is
        the same quantity as ``r2``), ``r2`` is ``r2_score * 100``,
        ``score2`` is the mean absolute error, ``score3`` is the mean squared
        error, and ``model`` is the fitted estimator.
    """
    # -- Data Preprocessing --
    df=pd.read_csv(path)
    # Take explicit copies so the column assignments below write to an
    # independent frame rather than to a selection of `df`.
    X=df[features].copy()
    Y=df[target].copy()

    # -- Calling preprocessing functions on the feature and target set. --
    # NOTE(review): fill values are computed on the full dataset, i.e. before
    # the train/test split, so test rows influence the imputation.
    for column in X.columns.to_list():
        X[column]=NullClearner(X[column])
    X=EncodeX(X)
    Y=NullClearner(Y)

    # -- Splitting the dataset into the Training set and Test set --
    x_train,x_test,y_train,y_test=train_test_split(X,Y,test_size=0.2,random_state=123)

    # -- Fitting Linear Regression to the Training set --
    model=LinearRegression()
    model.fit(x_train,y_train)

    # -- Predicting the Test set results --
    y_pred=model.predict(x_test)

    # -- Evaluation metrics between (y_test,y_pred) --
    accuracy = model.score(x_test,y_test)*100   # estimator's own score, in percent
    r2 = r2_score(y_test,y_pred)*100            # R^2, in percent
    score2 = mean_absolute_error(y_test,y_pred) # mean absolute error
    score3 = mean_squared_error(y_test,y_pred)  # mean squared error

    return accuracy, r2, score2, score3, model

## 2. Decision Tree Regressor

In [47]:
import warnings
import numpy as np
import pandas as pd
import seaborn as se
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeRegressor,plot_tree
from sklearn.metrics import r2_score,mean_squared_error,mean_absolute_error
warnings.filterwarnings('ignore')



def DTRegressor(path,features,target):
    """Train and evaluate a ``DecisionTreeRegressor`` on a CSV dataset.

    Parameters
    ----------
    path : str
        Location of the CSV file, passed to ``pd.read_csv``.
    features : list of str
        Column names used as model inputs.
    target : str
        Column name of the value to predict.

    Returns
    -------
    tuple
        ``(accuracy, r2, score2, score3, model)`` where
        ``accuracy`` is ``model.score(x_test, y_test) * 100`` (for
        scikit-learn regressors ``score`` is the R^2 coefficient, so this is
        the same quantity as ``r2``), ``r2`` is ``r2_score * 100``,
        ``score2`` is the mean absolute error, ``score3`` is the mean squared
        error, and ``model`` is the fitted estimator.
    """
    # -- Data Preprocessing --
    df=pd.read_csv(path)
    # Take explicit copies so the column assignments below write to an
    # independent frame rather than to a selection of `df`.
    X=df[features].copy()
    Y=df[target].copy()

    # -- Calling preprocessing functions on the feature and target set. --
    # NOTE(review): fill values are computed on the full dataset, i.e. before
    # the train/test split, so test rows influence the imputation.
    for column in X.columns.to_list():
        X[column]=NullClearner(X[column])
    X=EncodeX(X)
    Y=NullClearner(Y)

    # -- Splitting the dataset into the Training set and Test set --
    x_train,x_test,y_train,y_test=train_test_split(X,Y,test_size=0.2,random_state=123)

    # -- Fitting the Decision Tree Regressor to the Training set --
    model=DecisionTreeRegressor(random_state=123)
    model.fit(x_train,y_train)

    # -- Predicting the Test set results --
    y_pred=model.predict(x_test)

    # -- Evaluation metrics between (y_test,y_pred) --
    accuracy = model.score(x_test,y_test)*100   # estimator's own score, in percent
    r2 = r2_score(y_test,y_pred)*100            # R^2, in percent
    score2 = mean_absolute_error(y_test,y_pred) # mean absolute error
    score3 = mean_squared_error(y_test,y_pred)  # mean squared error

    return accuracy, r2, score2, score3, model

## 3. Gradient Boosting Regressor

In [48]:
import warnings
import numpy as np 
import pandas as pd 
import matplotlib.pyplot as plt 
import seaborn as se 
from sklearn.model_selection import train_test_split
from sklearn.ensemble import GradientBoostingRegressor 
from sklearn.metrics import r2_score, mean_absolute_error, mean_squared_error 
warnings.filterwarnings('ignore')



def GBRegressor(path,features,target):
    """Train and evaluate a ``GradientBoostingRegressor`` on a CSV dataset.

    Parameters
    ----------
    path : str
        Location of the CSV file, passed to ``pd.read_csv``.
    features : list of str
        Column names used as model inputs.
    target : str
        Column name of the value to predict.

    Returns
    -------
    tuple
        ``(accuracy, r2, score2, score3, model)`` where
        ``accuracy`` is ``model.score(x_test, y_test) * 100`` (for
        scikit-learn regressors ``score`` is the R^2 coefficient, so this is
        the same quantity as ``r2``), ``r2`` is ``r2_score * 100``,
        ``score2`` is the mean absolute error, ``score3`` is the mean squared
        error, and ``model`` is the fitted estimator.
    """
    # -- Data Preprocessing --
    df=pd.read_csv(path)
    # Take explicit copies so the column assignments below write to an
    # independent frame rather than to a selection of `df`.
    X=df[features].copy()
    Y=df[target].copy()

    # -- Calling preprocessing functions on the feature and target set. --
    # NOTE(review): fill values are computed on the full dataset, i.e. before
    # the train/test split, so test rows influence the imputation.
    for column in X.columns.to_list():
        X[column]=NullClearner(X[column])
    X=EncodeX(X)
    Y=NullClearner(Y)

    # -- Splitting the dataset into the Training set and Test set --
    x_train,x_test,y_train,y_test=train_test_split(X,Y,test_size=0.2,random_state=123)

    # -- Fitting the Gradient Boosting Regressor to the Training set --
    model=GradientBoostingRegressor(random_state=123)
    model.fit(x_train,y_train)

    # -- Predicting the Test set results --
    y_pred=model.predict(x_test)

    # -- Evaluation metrics between (y_test,y_pred) --
    accuracy = model.score(x_test,y_test)*100   # estimator's own score, in percent
    r2 = r2_score(y_test,y_pred)*100            # R^2, in percent
    score2 = mean_absolute_error(y_test,y_pred) # mean absolute error
    score3 = mean_squared_error(y_test,y_pred)  # mean squared error

    return accuracy, r2, score2, score3, model

## 4. KNeighbors regression

In [49]:
import warnings 
import numpy as np 
import pandas as pd
import seaborn as se  
import matplotlib.pyplot as plt 
from sklearn.model_selection import train_test_split 
from sklearn.metrics import r2_score, mean_absolute_error, mean_squared_error 
from sklearn.neighbors import KNeighborsRegressor
warnings.filterwarnings('ignore')



def KNRegressor(path,features,target):
    """Train and evaluate a ``KNeighborsRegressor`` on a CSV dataset.

    Parameters
    ----------
    path : str
        Location of the CSV file, passed to ``pd.read_csv``.
    features : list of str
        Column names used as model inputs.
    target : str
        Column name of the value to predict.

    Returns
    -------
    tuple
        ``(accuracy, r2, score2, score3, model)`` where
        ``accuracy`` is ``model.score(x_test, y_test) * 100`` (for
        scikit-learn regressors ``score`` is the R^2 coefficient, so this is
        the same quantity as ``r2``), ``r2`` is ``r2_score * 100``,
        ``score2`` is the mean absolute error, ``score3`` is the mean squared
        error, and ``model`` is the fitted estimator.
    """
    # -- Data Preprocessing --
    df=pd.read_csv(path)
    # Take explicit copies so the column assignments below write to an
    # independent frame rather than to a selection of `df`.
    X=df[features].copy()
    Y=df[target].copy()

    # -- Calling preprocessing functions on the feature and target set. --
    # NOTE(review): fill values are computed on the full dataset, i.e. before
    # the train/test split, so test rows influence the imputation.
    for column in X.columns.to_list():
        X[column]=NullClearner(X[column])
    X=EncodeX(X)
    Y=NullClearner(Y)

    # -- Splitting the dataset into the Training set and Test set --
    x_train,x_test,y_train,y_test=train_test_split(X,Y,test_size=0.2,random_state=123)

    # -- Fitting the K-Neighbors Regressor to the Training set --
    model=KNeighborsRegressor(n_jobs=-1)
    model.fit(x_train,y_train)

    # -- Predicting the Test set results --
    y_pred=model.predict(x_test)

    # -- Evaluation metrics between (y_test,y_pred) --
    accuracy = model.score(x_test,y_test)*100   # estimator's own score, in percent
    r2 = r2_score(y_test,y_pred)*100            # R^2, in percent
    score2 = mean_absolute_error(y_test,y_pred) # mean absolute error
    score3 = mean_squared_error(y_test,y_pred)  # mean squared error

    return accuracy, r2, score2, score3, model

## 5. Lasso Regressor

In [50]:
import warnings
import numpy as np 
import pandas as pd 
import matplotlib.pyplot as plt 
import seaborn as se 
from sklearn.linear_model import Lasso
from sklearn.model_selection import train_test_split 
from sklearn.metrics import r2_score, mean_absolute_error, mean_squared_error 
warnings.filterwarnings('ignore')



def LassoRegressor(path,features,target):
    """Train and evaluate a ``Lasso`` regression model on a CSV dataset.

    Parameters
    ----------
    path : str
        Location of the CSV file, passed to ``pd.read_csv``.
    features : list of str
        Column names used as model inputs.
    target : str
        Column name of the value to predict.

    Returns
    -------
    tuple
        ``(accuracy, r2, score2, score3, model)`` where
        ``accuracy`` is ``model.score(x_test, y_test) * 100`` (for
        scikit-learn regressors ``score`` is the R^2 coefficient, so this is
        the same quantity as ``r2``), ``r2`` is ``r2_score * 100``,
        ``score2`` is the mean absolute error, ``score3`` is the mean squared
        error, and ``model`` is the fitted estimator.
    """
    # -- Data Preprocessing --
    df=pd.read_csv(path)
    # Take explicit copies so the column assignments below write to an
    # independent frame rather than to a selection of `df`.
    X=df[features].copy()
    Y=df[target].copy()

    # -- Calling preprocessing functions on the feature and target set. --
    # NOTE(review): fill values are computed on the full dataset, i.e. before
    # the train/test split, so test rows influence the imputation.
    for column in X.columns.to_list():
        X[column]=NullClearner(X[column])
    X=EncodeX(X)
    Y=NullClearner(Y)

    # -- Splitting the dataset into the Training set and Test set --
    x_train,x_test,y_train,y_test=train_test_split(X,Y,test_size=0.2,random_state=123)

    # -- Fitting the Lasso model to the Training set --
    model=Lasso(random_state=123)
    model.fit(x_train,y_train)

    # -- Predicting the Test set results --
    y_pred=model.predict(x_test)

    # -- Evaluation metrics between (y_test,y_pred) --
    accuracy = model.score(x_test,y_test)*100   # estimator's own score, in percent
    r2 = r2_score(y_test,y_pred)*100            # R^2, in percent
    score2 = mean_absolute_error(y_test,y_pred) # mean absolute error
    score3 = mean_squared_error(y_test,y_pred)  # mean squared error

    return accuracy, r2, score2, score3, model

## 6. Random Forest Regressor

In [51]:
import warnings 
import numpy as np 
import pandas as pd 
import matplotlib.pyplot as plt
import seaborn as se 
from sklearn.model_selection import train_test_split 
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import r2_score, mean_absolute_error, mean_squared_error
warnings.filterwarnings('ignore')

def RFRegressor(path,features,target):
    """Train and evaluate a ``RandomForestRegressor`` on a CSV dataset.

    Parameters
    ----------
    path : str
        Location of the CSV file, passed to ``pd.read_csv``.
    features : list of str
        Column names used as model inputs.
    target : str
        Column name of the value to predict.

    Returns
    -------
    tuple
        ``(accuracy, r2, score2, score3, model)`` where
        ``accuracy`` is ``model.score(x_test, y_test) * 100`` (for
        scikit-learn regressors ``score`` is the R^2 coefficient, so this is
        the same quantity as ``r2``), ``r2`` is ``r2_score * 100``,
        ``score2`` is the mean absolute error, ``score3`` is the mean squared
        error, and ``model`` is the fitted estimator.
    """
    # -- Data Preprocessing --
    df=pd.read_csv(path)
    # Take explicit copies so the column assignments below write to an
    # independent frame rather than to a selection of `df`.
    X=df[features].copy()
    Y=df[target].copy()

    # -- Calling preprocessing functions on the feature and target set. --
    # NOTE(review): fill values are computed on the full dataset, i.e. before
    # the train/test split, so test rows influence the imputation.
    for column in X.columns.to_list():
        X[column]=NullClearner(X[column])
    X=EncodeX(X)
    Y=NullClearner(Y)

    # -- Splitting the dataset into the Training set and Test set --
    x_train,x_test,y_train,y_test=train_test_split(X,Y,test_size=0.2,random_state=123)

    # -- Fitting the Random Forest Regressor to the Training set --
    model=RandomForestRegressor(random_state=123)
    model.fit(x_train,y_train)

    # -- Predicting the Test set results --
    y_pred=model.predict(x_test)

    # -- Evaluation metrics between (y_test,y_pred) --
    accuracy = model.score(x_test,y_test)*100   # estimator's own score, in percent
    r2 = r2_score(y_test,y_pred)*100            # R^2, in percent
    score2 = mean_absolute_error(y_test,y_pred) # mean absolute error
    score3 = mean_squared_error(y_test,y_pred)  # mean squared error

    return accuracy, r2, score2, score3, model


## 7. Testing

In [55]:
# Dataset location and the columns used for this comparison run.
path="../../4. Test Data/Regression/House prices/House prices.csv"
features=[
    'MSSubClass','MSZoning','LotFrontage','LotArea','Street','Neighborhood',
    'HouseStyle','YearBuilt','RoofStyle','Foundation','BedroomAbvGr',
    'Functional','Fireplaces','GarageType','GarageYrBlt','GarageFinish',
    'GarageCars','GarageArea','GarageQual','GarageCond','PoolArea','PoolQC',
]
target='SalePrice'

# (printed label, training function) pairs, evaluated in this order.
regressors=[
    ('linear regression :\n\t\t\t',linearregression),
    ('Decision tree regressor :\n\t\t\t',DTRegressor),
    ('Gradient boosting regressor :\n\t\t\t',GBRegressor),
    ('lasso regressor :\n\t\t\t',LassoRegressor),
    ('kNeighbors regressor :\n\t\t\t',KNRegressor),
    ('Randomgorest :\n\t\t\t',RFRegressor),
]

# Train each model on the same data and report its R^2 (in percent).
for label,trainer in regressors:
    accuracy, r2, score2, score3, model = trainer(path,features,target)
    print(label,r2)

linear regression :
			 71.62702092769365
Decision tree regressor :
			 51.08622864836636
Gradient boosting regressor :
			 79.2222859112197
lasso regressor :
			 71.52068781576462
kNeighbors regressor :
			 34.86393145924204
Randomgorest :
			 75.60427963737598
