In [8]:
import pandas as pd
import numpy as np
from sklearn.metrics import mean_squared_error,mean_absolute_error,r2_score,explained_variance_score
from sklearn.tree import DecisionTreeRegressor
from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import GridSearchCV
from sklearn.linear_model import LinearRegression
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.impute import SimpleImputer
from xgboost import XGBRegressor
from sklearn.svm import SVR
from sklearn.ensemble import AdaBoostRegressor, GradientBoostingRegressor
from sklearn.metrics import accuracy_score, classification_report

In [2]:
# data inialize
from google.colab import drive
drive.mount('/content/drive')
data = pd.read_pickle('/content/drive/My Drive/ass3.pickle')

train = data['train']
dev = data['dev']
test = data['test']


Mounted at /content/drive


In [3]:
# preliminary data analysis
for part, d in data.items():
    print(part, "set:\n")
    print(f"Number of samples: {train.shape[0]}")
    print(f"Number of features: {train.shape[1] -1}\n")
    print("HEAD:")
    print(d.head(),"\n") # prints first 5 rows of the data
    print("STATISTICS:")
    print(d.describe(),"\n") # prints descriptive statistics (count, mean, std, min, 25%, 50%, 75%, max)
    print("IS NULL:")
    print(d.isnull().sum()) # checks if there are NaN or None in the columns
    print("-----------------------------\n")

train set:

Number of samples: 12384
Number of features: 8

HEAD:
           f0    f1        f2        f3      f4        f5     f6      f7  \
14981  4.0391  15.0  6.297710  0.992366   334.0  2.549618  32.72 -116.99   
6614   4.7241  46.0  5.375758  0.954545   753.0  2.281818  34.17 -118.10   
14233  3.3553   7.0  5.229213  1.101124  1304.0  2.930337  32.70 -117.01   
1802   1.3929  52.0  5.000000  0.953488   126.0  2.930233  37.92 -122.36   
6030   1.6006  52.0  4.427083  1.017361  1246.0  2.163194  34.07 -117.75   

       target  
14981   1.695  
6614    2.796  
14233   1.322  
1802    1.042  
6030    1.462   

STATISTICS:
                 f0            f1            f2            f3            f4  \
count  12210.000000  12244.000000  12226.000000  12228.000000  12215.000000   
mean       3.872771     28.630595      5.420978      1.096626   1426.830618   
std        1.919183     12.566127      2.382548      0.471398   1103.528284   
min        0.499900      1.000000      0.846154    

In [4]:
# Split the data to X and y
X_train = train.iloc[:, :-1]
y_train = train.iloc[:, -1]
X_dev = dev.iloc[:, :-1]
y_dev = dev.iloc[:, -1]
X_test = test.iloc[:, :-1]
y_test = test.iloc[:, -1]

# **preprocessing**

---



In [5]:
# Imputation for NULL values
X_train_imputed = X_train.copy()
for col in X_train_imputed.columns:
    if (X_train_imputed[col].isnull().sum() > 0):
       imputer = SimpleImputer(missing_values=np.nan, strategy='mean')
       imputer = imputer.fit(X_train_imputed[[col]])
       X_train_imputed[col] = imputer.transform(X_train_imputed[[col]])

# Standardization
scaler = MinMaxScaler()
scaler.fit(X_train_imputed) # fit scaler using only train data to avoid data leakage

X_train_scaled = scaler.transform(X_train_imputed)
X_dev_scaled = scaler.transform(X_dev)
X_test_scaled = scaler.transform(X_test)


# **Decision Tree Regressor**

---
### **hyperparameter search**

**'criterion'** - which function to use to measure the impurity of a split.  
**'splitter'** - which strategy to split each node. best - best split, random - the best random split. gives randomness into process.  
**'max_depth'** - maximum depth of the tree.  
**'min_samples_split**' - minimum number of samples required to split an internal node.  
**'min_samples_leaf'** - minimum number of samples required to be at a leaf node.  



In [6]:
# both models are after Imputation - DecisionTreeRegressor can't run with null values
model_default = DecisionTreeRegressor()
model_default.fit(X_train_imputed, y_train)
y_pred = model_default.predict(X_dev)
mse = mean_squared_error(y_dev, y_pred)

print("Dev mse before hyperparameter search:")
print(f'Without Standardization : {mse}')

model_default = DecisionTreeRegressor()
model_default.fit(X_train_scaled, y_train)
y_pred = model_default.predict(X_dev_scaled)
mse_scaled = mean_squared_error(y_dev, y_pred)
print(f'With Standardization : {mse_scaled}')

""" ---------------------------------------------------------------------------------------------------"""
# hyperparameter search

if (mse_scaled < mse):
  X_train_grid = X_train_scaled.copy()
  x_dev_grid = X_dev_scaled.copy()
  print("Using Standardization made the performance better\n")
else:
  X_train_grid = X_train_scaled.copy()
  x_dev_grid = X_dev_scaled.copy()
  print("Using Standardization made the performance worse\n")


model = DecisionTreeRegressor()
params = {
    'criterion': ['squared_error', 'friedman_mse'],
    'splitter': ['best', 'random'],
    'max_depth': [None, 10],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 5],
}
grid_search = GridSearchCV(estimator=model, param_grid=params,n_jobs=-1)
grid_search.fit(X_train_grid, y_train)
print(f"The best Parameters:{grid_search.best_params_}\n")
grid_model = grid_search.best_estimator_
y_pred = grid_model.predict(x_dev_grid)

# Evaluate model
mse = mean_squared_error(y_dev, y_pred)
mae = mean_absolute_error(y_dev, y_pred)
r2 = r2_score(y_dev, y_pred)
evs = explained_variance_score(y_dev, y_pred)

print("Metrics evaluate after hyperparameter search :")
print(f"mse: {mse}")
print(f"mae: {mae}")
print(f"r2: {r2}")
print(f"evs: {evs}")


Dev mse before hyperparameter search:
Without Standardization : 0.5865978097325582
With Standardization : 0.5906748645632509
Using Standardization made the performance worse

The best Parameters:{'criterion': 'squared_error', 'max_depth': 10, 'min_samples_leaf': 5, 'min_samples_split': 2, 'splitter': 'best'}

Metrics evaluate after hyperparameter search :
mse: 0.42858973803745487
mae: 0.4428844099098646
r2: 0.6728364477937049
evs: 0.6728742270482152


# Linear Regression

In [9]:
print("Linear Regression")

model = LinearRegression()
model.fit(X_train_imputed, y_train)
y_pred = model.predict(X_dev)

mse = mean_squared_error(y_dev, y_pred)

print("Dev mse before hyperparameter search:")
print(f'Without Standardization : {mse}')

model = LinearRegression()
model.fit(X_train_scaled, y_train)
y_pred = model.predict(X_dev_scaled)

mse_scaled = mean_squared_error(y_dev, y_pred)
print(f'With Standardization : {mse_scaled}')

if (mse_scaled < mse):
  X_train_grid = X_train_scaled.copy()
  x_dev_grid = X_dev_scaled.copy()
  print("Using Standardization made the performance better\n")
else:
  X_train_grid = X_train_imputed.copy()
  x_dev_grid = X_dev.copy()
  print("Using Standardization made the performance worse\n")

mse = mean_squared_error(y_dev, y_pred)
mae = mean_absolute_error(y_dev, y_pred)
r2 = r2_score(y_dev, y_pred)
evs = explained_variance_score(y_dev, y_pred)

print("Metrics evaluate after hyperparameter search :")
print(f"mse: {mse}")
print(f"mae: {mae}")
print(f"r2: {r2}")
print(f"evs: {evs}")

Linear Regression
Dev mse before hyperparameter search:
Without Standardization : 0.5229043349016068
With Standardization : 0.5229043349016071
Using Standardization made the performance worse

Metrics evaluate after hyperparameter search :
mse: 0.5229043349016071
mae: 0.5231571750489639
r2: 0.6008414936534725
evs: 0.6008586157921586


# SVR

In [10]:
print("SVR")

model = SVR()
model.fit(X_train_imputed, y_train)
y_pred = model.predict(X_dev)

mse = mean_squared_error(y_dev, y_pred)

print("Dev MSE before hyperparameter search:")
print(f'Without Standardization: {mse}')

model = SVR()
model.fit(X_train_scaled, y_train)
y_pred = model.predict(X_dev_scaled)

mse_scaled = mean_squared_error(y_dev, y_pred)
print(f'With Standardization: {mse_scaled}')

if mse_scaled < mse:
    X_train_grid = X_train_scaled.copy()
    X_dev_grid = X_dev_scaled.copy()
    print("Using Standardization made the performance better\n")
else:
    X_train_grid = X_train_imputed.copy()
    X_dev_grid = X_dev.copy()
    print("Using Standardization made the performance worse\n")

param_grid = {
    'C': [0.1, 1, 10, 100],
    'gamma': ['scale', 'auto'],
    'kernel': ['linear', 'rbf']
}

grid_search = GridSearchCV(SVR(), param_grid, cv=5, n_jobs=-1)
grid_search.fit(X_train_grid, y_train)

print(f"Best parameters found: {grid_search.best_params_}")

best_model = grid_search.best_estimator_

y_dev_pred = best_model.predict(X_dev_grid)
mse = mean_squared_error(y_dev, y_dev_pred)
mae = mean_absolute_error(y_dev, y_dev_pred)
r2 = r2_score(y_dev, y_dev_pred)
evs = explained_variance_score(y_dev, y_dev_pred)

print("Metrics evaluate after hyperparameter search:")
print(f"mse: {mse}")
print(f"mae: {mae}")
print(f"r2: {r2}")
print(f"evs: {evs}")

y_test_pred = best_model.predict(X_test_scaled)
mse_test = mean_squared_error(y_test, y_test_pred)
mae_test = mean_absolute_error(y_test, y_test_pred)
r2_test = r2_score(y_test, y_test_pred)
evs_test = explained_variance_score(y_test, y_test_pred)

print("Final evaluation on the test set:")
print(f"mse: {mse_test}")
print(f"mae: {mae_test}")
print(f"r2: {r2_test}")
print(f"evs: {evs_test}")

SVR
Dev MSE before hyperparameter search:
Without Standardization: 1.351774899648952
With Standardization: 0.4728811519237644
Using Standardization made the performance better

Best parameters found: {'C': 10, 'gamma': 'scale', 'kernel': 'rbf'}
Metrics evaluate after hyperparameter search:
mse: 0.4401560430608194
mae: 0.4406136707664648
r2: 0.6640073203053216
evs: 0.6753909057952072
Final evaluation on the test set:
mse: 0.4366549802026933
mae: 0.4418450137195807
r2: 0.6869719943721015
evs: 0.6983829328518527


In [11]:
def evaluate_model(model, param_grid, model_name):
    print(model_name)

    model.fit(X_train_imputed, y_train)
    y_pred = model.predict(X_dev)
    mse = mean_squared_error(y_dev, y_pred)
    print(f"Dev MSE before hyperparameter search without Standardization: {mse}")

    model.fit(X_train_scaled, y_train)
    y_pred = model.predict(X_dev_scaled)
    mse_scaled = mean_squared_error(y_dev, y_pred)
    print(f"Dev MSE before hyperparameter search with Standardization: {mse_scaled}")

    if mse_scaled < mse:
        X_train_grid = X_train_scaled.copy()
        X_dev_grid = X_dev_scaled.copy()
        print("Using Standardization made the performance better\n")
    else:
        X_train_grid = X_train_imputed.copy()
        X_dev_grid = X_dev.copy()
        print("Using Standardization made the performance worse\n")

    grid_search = GridSearchCV(model, param_grid, cv=5, n_jobs=-1)
    grid_search.fit(X_train_grid, y_train)
    print(f"Best parameters found for {model_name}: {grid_search.best_params_}")

    # Evaluate the best model on the dev set
    best_model = grid_search.best_estimator_
    y_dev_pred = best_model.predict(X_dev_grid)
    mse = mean_squared_error(y_dev, y_dev_pred)
    mae = mean_absolute_error(y_dev, y_dev_pred)
    r2 = r2_score(y_dev, y_dev_pred)
    evs = explained_variance_score(y_dev, y_dev_pred)

    print(f"Metrics evaluated after hyperparameter search for {model_name}:")
    print(f"mse: {mse}")
    print(f"mae: {mae}")
    print(f"r2: {r2}")
    print(f"evs: {evs}")

    y_test_pred = best_model.predict(X_test_scaled)
    mse_test = mean_squared_error(y_test, y_test_pred)
    mae_test = mean_absolute_error(y_test, y_test_pred)
    r2_test = r2_score(y_test, y_test_pred)
    evs_test = explained_variance_score(y_test, y_test_pred)

    print(f"Final evaluation on the test set for {model_name}:")
    print(f"mse: {mse_test}")
    print(f"mae: {mae_test}")
    print(f"r2: {r2_test}")
    print(f"evs: {evs_test}\n")


In [12]:
param_grid_adaboost = {
    'n_estimators': [50, 100, 200],
    'learning_rate': [0.01, 0.1, 1]
}

evaluate_model(AdaBoostRegressor(), param_grid_adaboost, "AdaBoost")

AdaBoost
Dev MSE before hyperparameter search without Standardization: 0.6423567302346261
Dev MSE before hyperparameter search with Standardization: 0.7140307081710622
Using Standardization made the performance worse

Best parameters found for AdaBoost: {'learning_rate': 0.1, 'n_estimators': 50}
Metrics evaluated after hyperparameter search for AdaBoost:
mse: 0.562585808051947
mae: 0.5855966469444533
r2: 0.5705506804107406
evs: 0.5822001431844692
Final evaluation on the test set for AdaBoost:
mse: 1.4324266232792904
mae: 0.9819416759091022
r2: -0.02687400676222884
evs: 0.0





In [13]:
param_grid_gradientboosting = {
    'n_estimators': [50, 100, 200],
    'learning_rate': [0.01, 0.1, 1],
    'max_depth': [3, 5, 7]
}

evaluate_model(GradientBoostingRegressor(), param_grid_gradientboosting, "GradientBoosting")


GradientBoosting
Dev MSE before hyperparameter search without Standardization: 0.29576214702115255
Dev MSE before hyperparameter search with Standardization: 0.2993886071180408
Using Standardization made the performance worse

Best parameters found for GradientBoosting: {'learning_rate': 0.1, 'max_depth': 7, 'n_estimators': 200}
Metrics evaluated after hyperparameter search for GradientBoosting:
mse: 0.22885056351991087
mae: 0.31168939073898055
r2: 0.8253071489102165
evs: 0.8253456224339
Final evaluation on the test set for GradientBoosting:
mse: 1.6177562061542734
mae: 1.093863752279205
r2: -0.1597325617803731
evs: 0.004603967012443144





In [14]:
param_grid_xgboost = {
    'n_estimators': [50, 100, 200],
    'learning_rate': [0.01, 0.1, 0.3],
    'max_depth': [3, 5, 7]
}

evaluate_model(XGBRegressor(objective='reg:squarederror'), param_grid_xgboost, "XGBoost")

XGBoost
Dev MSE before hyperparameter search without Standardization: 0.23387224399644888
Dev MSE before hyperparameter search with Standardization: 0.23387224399644888
Using Standardization made the performance worse

Best parameters found for XGBoost: {'learning_rate': 0.1, 'max_depth': 7, 'n_estimators': 200}
Metrics evaluated after hyperparameter search for XGBoost:
mse: 0.21462622481063584
mae: 0.3026530530052014
r2: 0.8361652837811574
evs: 0.836166190593622
Final evaluation on the test set for XGBoost:
mse: 1.608914983314919
mae: 0.9111525648254386
r2: -0.1533944905841389
evs: -0.0005290625524645431



The best estimator was XGBoost, The results :

Final evaluation on the test set for XGBoost:

mse: 1.608914983314919

mae: 0.9111525648254386

r2: -0.1533944905841389

evs: -0.0005290625524645431