## 2. MODEL APPLICATION

Here, we try different __Regression models__ (the variable to predict is continous instead of discrete) to test which one could return a more accurate prediction.

In [17]:
#Data management
import pandas as pd
import numpy as np

#Machine learning models
from sklearn.model_selection import train_test_split

from sklearn.svm import SVR
from sklearn.ensemble import RandomForestRegressor


from sklearn.feature_selection import RFECV
from sklearn.metrics import *


from sklearn import model_selection
from sklearn import ensemble

from sklearn.model_selection import cross_val_score
from sklearn.metrics import classification_report, confusion_matrix
from sklearn.model_selection import RandomizedSearchCV

#### 2.1. Pre-training

In [10]:
clean_data = pd.read_csv("output/clean_data.csv")

In [11]:
X = clean_data.iloc[:,:-1]
y = clean_data['price']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.4)

### Model #1: SVR (Support Vector Regressor)

In [12]:
regressor = SVR(kernel = 'rbf')
regressor.fit(X_train, y_train)

y_pred = regressor.predict(X_test)

print("SVR RMSE:", np.sqrt(mean_squared_error(y_test, y_pred)))

SVR RMSE: 4323.475209144236


### Model #2: Random forest Regressor

In [13]:
rf_reg = RandomForestRegressor(n_estimators=500, max_depth=20, min_samples_leaf=3, random_state=111)
selector = RFECV(rf_reg, step=1, cv=5)

rf_reg.fit(X_train, y_train)

print('Random forest RMSE', np.sqrt(mean_squared_error(y_test, rf_reg.predict(X_test))))

Random forest RMSE 568.756089746702


### Model #3: Gradient Boosting Regressor

#### 2.2. Model fitting

In [14]:
params = {'n_estimators': 800, 'max_depth': 8, 'min_samples_split': 25, 'min_samples_leaf':10,
          'learning_rate': 0.01, 'loss': 'ls'}

clf = ensemble.GradientBoostingRegressor(**params)

clf.fit(X_train, y_train)

GradientBoostingRegressor(alpha=0.9, ccp_alpha=0.0, criterion='friedman_mse',
                          init=None, learning_rate=0.01, loss='ls', max_depth=8,
                          max_features=None, max_leaf_nodes=None,
                          min_impurity_decrease=0.0, min_impurity_split=None,
                          min_samples_leaf=10, min_samples_split=25,
                          min_weight_fraction_leaf=0.0, n_estimators=800,
                          n_iter_no_change=None, presort='deprecated',
                          random_state=None, subsample=1.0, tol=0.0001,
                          validation_fraction=0.1, verbose=0, warm_start=False)

#### 2.3. Error calculation

In [15]:
y_pred = clf.predict(X_test)
print("GBR RMSE:", np.sqrt(mean_squared_error(y_test, y_pred)))

GBR RMSE: 543.172239550555


### Gradient Boosting Regressor turned out to be the best model out of these three that predicts the price of diamonds. Let's test it on kaggle ➡️ 🚀  [3. Submitting the dataset with the model applied](https://github.com/breogann/Project-5.Machine-learning-algorithm-selection/blob/master/3.%20Submitting%20the%20dataset%20with%20the%20model%20applied.ipynb)