In [1]:
import pandas as pd
import numpy as np

In [2]:
# Load the dataset from data.csv and show a preview of the frame
df = pd.read_csv(filepath_or_buffer="data.csv")
df

Unnamed: 0,Feature1,Feature2,Feature3,Feature4,Feature5,Feature6,Feature7,Feature8,Feature9,Feature10,Feature11,Feature12,Feature13,Feature14,Target
0,1.0,0.0,0.0,0.0,0.0,1.0,1.0,-1.519714,0.983562,1.247335,-0.000276,-1.324259,-1.263352,-0.403022,120000.0
1,1.0,0.0,0.0,0.0,0.0,1.0,1.0,-0.225693,-0.343933,-0.690016,-0.192071,-0.554718,-0.432571,-0.403022,550000.0
2,1.0,0.0,0.0,0.0,0.0,1.0,1.0,1.536377,1.647309,0.084924,-0.647583,-0.554718,-0.479113,-0.403022,215000.0
3,1.0,0.0,0.0,0.0,0.0,1.0,1.0,-1.519714,0.983562,-0.360667,0.292211,-0.936610,-0.779312,-0.403022,226000.0
4,0.0,0.0,1.0,0.0,0.0,0.0,1.0,-0.666211,-0.012060,-0.496281,0.735736,0.022918,-0.046502,-0.403022,570000.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
15406,0.0,0.0,0.0,0.0,0.0,1.0,1.0,1.508844,0.983562,-0.869744,0.026096,-0.767733,-0.757204,-0.403022,250000.0
15407,0.0,0.0,0.0,0.0,0.0,1.0,1.0,-0.556082,-1.339555,-0.728763,-0.527711,-0.216964,-0.220803,2.073444,925000.0
15408,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.407551,-0.012060,0.220539,0.344954,0.022918,0.068225,-0.403022,425000.0
15409,0.0,0.0,1.0,0.0,0.0,0.0,1.0,1.426247,-0.343933,72.541850,-0.887326,1.329794,0.917158,2.073444,1225000.0


In [3]:
# Separate the dependent variable (Target) from the independent features
y = df['Target']
X = df.drop(columns='Target')

In [4]:
# Hold out 25% of the rows as a test set; the fixed seed keeps the split repeatable
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.25,
    random_state=42,
)
# Show the shape of each partition as a sanity check
tuple(part.shape for part in (X_train, X_test, y_train, y_test))

((11558, 14), (3853, 14), (11558,), (3853,))

In [6]:
# Candidate regressors to compare, plus the metrics used to score them
from sklearn.ensemble import RandomForestRegressor
from sklearn.linear_model import LinearRegression, Ridge, Lasso
from sklearn.neighbors import KNeighborsRegressor
from sklearn.tree import DecisionTreeRegressor
from sklearn.metrics import r2_score, mean_absolute_error, mean_squared_error

In [10]:
# Function to evaluate model
def model_evaluate(true , predicted):
    """Score a set of predictions against the true target values.

    Parameters
    ----------
    true
        Ground-truth values, passed as the first argument to each metric.
    predicted
        Model predictions, passed as the second argument to each metric.

    Returns
    -------
    tuple
        ``(mae, r2score, rmse)`` in that order: mean absolute error,
        R^2 score, and the square root of the mean squared error.
    """
    # RMSE is derived from MSE so it is in the same units as the target
    rmse = np.sqrt(mean_squared_error(true, predicted))
    return (
        mean_absolute_error(true, predicted),
        r2_score(true, predicted),
        rmse,
    )

In [13]:
# model training
# Baseline comparison: fit every candidate with default hyperparameters and
# print train vs. test metrics so over-/under-fitting is visible side by side.
models = {
    'LinearRegression' : LinearRegression(),
    'LassoRegression' : Lasso(),
    'RidgeRegression' : Ridge(),
    'KNeighborsRegressor' : KNeighborsRegressor(),
    # Tree-based estimators are randomized; seed them so reruns give the same scores
    'DecisionTreeRegressor' : DecisionTreeRegressor(random_state=42),
    'RandomForestRegressor' : RandomForestRegressor(random_state=42),
}

# Iterate over (name, estimator) pairs directly instead of indexing rebuilt lists
for model_name, model in models.items():
    model.fit(X_train, y_train)   # Train model

    # Make predictions
    y_train_pred = model.predict(X_train)
    y_test_pred = model.predict(X_test)

    # Evaluate the model (model_evaluate returns mae, r2, rmse in that order)
    model_train_mae, model_train_r2score, model_train_rmse = model_evaluate(y_train, y_train_pred)

    model_test_mae, model_test_r2score, model_test_rmse = model_evaluate(y_test,y_test_pred)

    print(model_name)

    print('Model Performance For Training Set')
    print('- Root Mean Squared Error : {:.4f}'.format(model_train_rmse))
    print('- Mean Absolute Error : {:.4f}'.format(model_train_mae))
    print('- R2 Score : {:.4f}'.format(model_train_r2score))

    print('-----------------------------------------')

    print('Model Performance For Test Set')
    print('- Root Mean Squared Error : {:.4f}'.format(model_test_rmse))
    print('- Mean Absolute Error : {:.4f}'.format(model_test_mae))
    print('- R2 Score : {:.4f}'.format(model_test_r2score))

    print('='*35)
    print('\n')

LinearRegression
Model Performance For Training Set
- Root Mean Squared Error : 552154.2495
- Mean Absolute Error : 266675.1076
- R2 Score : 0.6220
-----------------------------------------
Model Performance For Test Set
- Root Mean Squared Error : 519891.2635
- Mean Absolute Error : 284283.4460
- R2 Score : 0.6525


LassoRegression
Model Performance For Training Set
- Root Mean Squared Error : 552154.2607
- Mean Absolute Error : 266674.0472
- R2 Score : 0.6220
-----------------------------------------
Model Performance For Test Set
- Root Mean Squared Error : 519890.5730
- Mean Absolute Error : 284283.7890
- R2 Score : 0.6525


RidgeRegression
Model Performance For Training Set
- Root Mean Squared Error : 552154.8775
- Mean Absolute Error : 266635.3662
- R2 Score : 0.6220
-----------------------------------------
Model Performance For Test Set
- Root Mean Squared Error : 519880.3842
- Mean Absolute Error : 284241.1129
- R2 Score : 0.6525


KNeighborsRegressor
Model Performance For Tra

In [None]:
# Initialize parameters for hyperparameter tuning
# Knn and random forest show better results than the others
knn_params = {'n_neighbors' : [2,3,10,20,40,50]}
rfft_params = {
    'max_depth' : [5,8,15,None,10],
    # 1.0 (a float fraction = use every feature) replaces 'auto', which meant the
    # same thing for regressors but was deprecated in scikit-learn 1.1 and removed
    # in 1.3; on those versions every sampled candidate using 'auto' fails to fit.
    'max_features' : [5,7,1.0,8],
    'min_samples_split' : [2,8,15,20],
    'n_estimators' : [100,200,500,1000]
}

In [15]:
# Model list for hyperparameter tuning
# Each entry is (display name, unfitted estimator, parameter search space).
randomcv_model = [
    ('KNN',KNeighborsRegressor(),knn_params),
    # Seed the forest so cross-validation scores (and therefore the selected
    # parameters) do not change from one run of the search to the next
    ('Random Forest',RandomForestRegressor(random_state=42), rfft_params),
]

In [17]:
# Hyperparameter tuning
import warnings
# NOTE(review): this silences every warning for the rest of the session, which
# also hides any fit-failure warnings raised during the search — confirm intended.
warnings.filterwarnings("ignore")
from sklearn.model_selection import RandomizedSearchCV

# Maps each model's display name to the best parameter set found for it
model_param = {}
for name, model, params in randomcv_model:
    # n_iter=100 is an upper bound: the KNN space has only 6 values, and the
    # recorded run shows the search fitting just those 6 candidates.
    # random_state fixes which candidates are sampled so the reported best
    # parameters can be reproduced on a rerun.
    random = RandomizedSearchCV(estimator=model,
                                param_distributions=params,
                                cv=3, n_iter=100, n_jobs=-1, verbose=2,
                                random_state=42)
    random.fit(X_train,y_train)
    model_param[name] = random.best_params_

# Report the winning parameter set per model
for model_name, best_params in model_param.items():
    print(f'---------------- Best Parameters for {model_name} ------------------------')
    print(best_params)

Fitting 3 folds for each of 6 candidates, totalling 18 fits
Fitting 3 folds for each of 100 candidates, totalling 300 fits
---------------- Best Parameters for KNN ------------------------
{'n_neighbors': 10}
---------------- Best Parameters for Random Forest ------------------------
{'n_estimators': 100, 'min_samples_split': 2, 'max_features': 8, 'max_depth': None}


In [18]:
# Refit the two strongest model families with the hyperparameters reported by
# the randomized search, then compare train vs. test metrics.
models = {
    'KNeighborsRegressor' : KNeighborsRegressor(n_neighbors=10),
    # Seeded so the final scores are the same on every rerun
    'RandomForestRegressor' : RandomForestRegressor(
        n_estimators=100, min_samples_split=2, max_features=8, max_depth=None,
        random_state=42,
    ),
}

# Iterate over (name, estimator) pairs directly instead of indexing rebuilt lists
for model_name, model in models.items():
    model.fit(X_train, y_train)   # Train model

    # Make predictions
    y_train_pred = model.predict(X_train)
    y_test_pred = model.predict(X_test)

    # Evaluate the model (model_evaluate returns mae, r2, rmse in that order)
    model_train_mae, model_train_r2score, model_train_rmse = model_evaluate(y_train, y_train_pred)

    model_test_mae, model_test_r2score, model_test_rmse = model_evaluate(y_test,y_test_pred)

    print(model_name)

    print('Model Performance For Training Set')
    print('- Root Mean Squared Error : {:.4f}'.format(model_train_rmse))
    print('- Mean Absolute Error : {:.4f}'.format(model_train_mae))
    print('- R2 Score : {:.4f}'.format(model_train_r2score))

    print('-----------------------------------------')

    print('Model Performance For Test Set')
    print('- Root Mean Squared Error : {:.4f}'.format(model_test_rmse))
    print('- Mean Absolute Error : {:.4f}'.format(model_test_mae))
    print('- R2 Score : {:.4f}'.format(model_test_r2score))

    print('='*35)
    print('\n')

KNeighborsRegressor
Model Performance For Training Set
- Root Mean Squared Error : 365573.3616
- Mean Absolute Error : 102934.2620
- R2 Score : 0.8343
-----------------------------------------
Model Performance For Test Set
- Root Mean Squared Error : 296634.6850
- Mean Absolute Error : 121769.3550
- R2 Score : 0.8869


RandomForestRegressor
Model Performance For Training Set
- Root Mean Squared Error : 131882.4585
- Mean Absolute Error : 39232.5629
- R2 Score : 0.9784
-----------------------------------------
Model Performance For Test Set
- Root Mean Squared Error : 233431.6973
- Mean Absolute Error : 102541.8243
- R2 Score : 0.9299


