# DSL Winter Project - ExtraTreesRegressor

To view the code for the graphs presented in the report, please refer to the other file "RandomForest_and_GraphCodes".

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import time
from datetime import datetime

In [None]:
# Seed shared across the notebook: it is passed as random_state to the first
# train/test split and to every ExtraTreesRegressor built below.
rs = 42

In [None]:
# Load the development set (the "x"/"y" targets plus the input features) from the
# working directory; the trailing expression displays its (rows, columns) shape.
df_full = pd.read_csv("development.csv")
df_full.shape

## Preprocessing

#### Domain constraints

In [None]:
# Domain constraint: discard every event in which at least one of the 18
# negpmax readings is strictly positive.
negpmax_cols = [f'negpmax[{i}]' for i in range(18)]
has_positive_negpmax = (df_full[negpmax_cols] > 0).any(axis=1)
df_full = df_full.loc[~has_positive_negpmax]

df_full.shape

#### Noise features

To understand how we detected the noise features, please refer to the other file "RandomForest_and_GraphCodes".

In [None]:
# Channels flagged as noise: their pmax/negpmax/area/tmax/rms columns are removed.
noisy_channels = [0, 7, 12, 15, 16, 17]
noise_columns = [
    f"{feature}[{channel}]"
    for feature in ("pmax", "negpmax", "area", "tmax", "rms")
    for channel in noisy_channels
]
df_nonoise = df_full.drop(columns=noise_columns)

In [None]:
# `df` is another name for the noise-free frame (no copy is made); the trailing
# expression displays its shape.
df = df_nonoise
df.shape

In [None]:
# dividing df in X (inputs) and y (target variables)
# The targets are selected by name and every remaining column becomes an input,
# so the split does not depend on "x" and "y" being the first two columns and a
# target can never end up inside the feature matrix.
y = df.loc[:, ["x", "y"]]
X = df.drop(columns=["x", "y"])

In [None]:
# Hold out 20% of the events as a test set: shuffled, stratified on the (x, y)
# target and seeded with rs.
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.20,
    shuffle=True,
    stratify=y,
    random_state=rs,
)

In [None]:
# definition of the evaluation metric
def euclidean_metric(y_true, y_pred):
    """Return the mean Euclidean distance between true and predicted points.

    Parameters
    ----------
    y_true, y_pred : array-like of shape (n_samples, n_outputs)
        True and predicted coordinates. Rows are compared by position, so the
        two inputs may be any mix of DataFrames, arrays or nested lists.

    Returns
    -------
    float
        Average over the samples of the per-row Euclidean distance.
    """
    # Convert to plain arrays before subtracting: pandas aligns operands on their
    # labels, so two DataFrames with different indexes would subtract to all-NaN
    # and the NaN-skipping sum would then silently report a distance of 0.
    true_pts = np.asarray(y_true, dtype=float)
    pred_pts = np.asarray(y_pred, dtype=float)
    return np.mean(np.sqrt(np.sum((true_pts - pred_pts) ** 2, axis=1)))

#### Feature importances

In [None]:
from sklearn.ensemble import ExtraTreesRegressor
from sklearn.metrics import r2_score

# model for feature extraction
# Baseline forest trained on all remaining features; it is used below to rank the
# feature importances.
etr0 = ExtraTreesRegressor(n_estimators=100, max_features=1.0, random_state=rs, n_jobs=-1, verbose=5)
etr0.fit(X_train, y_train)
# Predict once and reuse the result for both metrics: calling predict() per
# metric doubles the inference time and the verbose log output.
y_pred_etr0 = etr0.predict(X_test)
print(f"r2: {r2_score(y_test, y_pred_etr0)} avg_euclidean:{euclidean_metric(y_test, y_pred_etr0)}")

In [None]:
# Pair each feature with its importance and list the pairs from most to least important.
feature_names = X.columns
importance_by_feature = zip(feature_names, etr0.feature_importances_)
sorted(importance_by_feature, key=lambda pair: pair[1], reverse=True)
# It seems that rms and tmax are not important features

In [None]:
# Bar chart of the feature importances in decreasing order, saved as a PNG in the
# working directory.
fig, ax = plt.subplots(figsize=(12, 6))
feature_importance = pd.Series(etr0.feature_importances_, index=feature_names)
feature_importance_sorted = feature_importance.sort_values(ascending=False)
feature_importance_sorted.plot(kind='bar', ax=ax)
# Optional threshold line, currently disabled:
#ax.axhline(y=1e-03, color='r', linestyle='--', label='Threshold at 1e-03')
fig.savefig('feature_importance2.png')

In [None]:
# rms and tmax columns of the channels that survived the noise filter.
r = [f"rms[{i}]" for i in range(18) if i not in [0, 7, 12, 15, 16, 17]]
t = [f"tmax[{i}]" for i in range(18) if i not in [0, 7, 12, 15, 16, 17]]
c = r+t
# Remove rms and tmax features
# Split with the notebook seed `rs`, the same seed used for the split on which the
# feature importances were computed. With identical rows, stratification labels and
# seed, the held-out rows are the same, so no test row has already been seen by the
# model that drove the feature selection.
X_train, X_test, y_train, y_test = train_test_split(X.drop(columns=c), y, test_size=0.20, stratify=y, shuffle=True, random_state=rs)
# Column names (and order) of the reduced feature matrix, reused to select the
# evaluation-set columns later on.
features_used = X_train.columns

## Validation

### ETR - GridSearch

In [None]:
import time
from datetime import datetime
from sklearn.metrics import make_scorer
from sklearn.ensemble import ExtraTreesRegressor
from sklearn.metrics import r2_score
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import RandomizedSearchCV

# Hyper-parameter grid: 3 x 3 x 2 x 4 = 72 candidate configurations
# (n_jobs and random_state are fixed, single-value entries).
params_etr = {
    'n_estimators': [100, 200, 300],
    'max_features': ["sqrt", 0.33, 0.5],
    'criterion': ["squared_error", "poisson"],
    'n_jobs': [-1],
    'max_depth': [None, 10, 30, 50],
    'random_state': [rs],
}

# greater_is_better=False makes the scorer return the negated mean distance, so
# the search (which maximises the score) minimises the Euclidean error.
neg_euclidean_scorer = make_scorer(euclidean_metric, greater_is_better=False)

print(f"start: {datetime.now()}")
# 3-fold cross-validation over the grid; error_score='raise' aborts on the first failing fit.
gs_etr = GridSearchCV(
    estimator=ExtraTreesRegressor(),
    param_grid=params_etr,
    scoring=neg_euclidean_scorer,
    n_jobs=-1,
    cv=3,
    verbose=2,
    error_score='raise',
)
gs_etr.fit(X_train, y_train)
print(f"end: {datetime.now()}")

In [None]:
# Hyper-parameter combination with the best mean cross-validated score.
gs_etr.best_params_

{'criterion': 'squared_error', 'max_depth': None, 'max_features': 0.5, 'n_estimators': 300, 'n_jobs': -1, 'random_state': 42}

In [None]:
# Mean cross-validated score of the best combination. The scorer was created with
# greater_is_better=False, so this value is the negated average Euclidean distance.
gs_etr.best_score_

In [None]:
# Score the grid search's selected model on the held-out test set. Predict once and
# reuse the result for both metrics instead of running inference twice.
y_pred_gs = gs_etr.predict(X_test)
print(f"r2: {r2_score(y_test, y_pred_gs)} avg_euclidean:{euclidean_metric(y_test, y_pred_gs)}")

## Results

### Result on test set

In [None]:
from sklearn.metrics import r2_score

# Fit the configuration selected by the grid search on the training split.
etr1 = ExtraTreesRegressor(n_estimators=300, max_features=0.5, criterion = 'squared_error', max_depth = None, random_state=rs, n_jobs=-1) #---> optimal configuration
etr1.fit(X_train, y_train)
# Predict once and reuse the result for both metrics instead of running inference twice.
y_pred_etr1 = etr1.predict(X_test)
print(f"r2: {r2_score(y_test, y_pred_etr1)} avg_euclidean:{euclidean_metric(y_test, y_pred_etr1)}") #---> result on test set

r2: 0.999191109880434 avg_euclidean: 3.9232207897958213

### Conclusions

**Final ExtraTreesRegressor**

In [None]:
# Load the evaluation set and keep only the columns listed in features_used (the
# columns of the reduced training matrix), in that same order.
df_eval = pd.read_csv("evaluation.csv")
df_eval = df_eval[features_used] # drop noise, tmax, rms

In [None]:
# Final training data: every development event kept by the domain-constraint filter.
y_full = df_full.loc[:,['x','y']]
# Build the feature matrix with one non-mutating drop, selected by column name.
# This does not rely on "x"/"y" being the first two columns, and it avoids an
# in-place drop on a slice of df_full, which can trigger SettingWithCopyWarning.
X_full = df_full.drop(columns=['x', 'y'] + noise_columns + c) # drop targets, noise, tmax, rms

In [None]:
from sklearn.ensemble import ExtraTreesRegressor
from sklearn.metrics import r2_score

# Train the selected configuration on the full development set, predict the
# evaluation set and write the submission file.
final_etr = ExtraTreesRegressor(
    n_estimators=300,
    max_features=0.5,
    criterion='squared_error',
    max_depth=None,
    random_state=rs,
    n_jobs=-1,
)  #---> optimal configuration

print(f"start fitting: {datetime.now()}")
final_etr.fit(X_full, y_full)
print(f"end fitting: {datetime.now()}")

print(f"start predicting: {datetime.now()}")
predictions = final_etr.predict(df_eval)
print(f"end predicting: {datetime.now()}")

# One row per evaluation sample: a 0-based running Id and the prediction as "x|y".
data = {
    'Id': np.arange(len(predictions)),
    'Predicted': [f"{x_hat}|{y_hat}" for x_hat, y_hat in predictions],
}
submission = pd.DataFrame(data)
submission.to_csv("output_etrFINAL.csv", index=False)