## 2. MODEL APPLICATION

Here, we try different __Regression models__ (the variable to predict is continous instead of discrete) to test which one could return a more accurate prediction.

In [1]:
#Data management
import pandas as pd
import numpy as np
import joblib

#Machine learning models
from sklearn.model_selection import train_test_split

from sklearn.svm import SVR
from sklearn.ensemble import RandomForestRegressor


from sklearn.feature_selection import RFECV
from sklearn.metrics import *


from sklearn import model_selection
from sklearn import ensemble

from sklearn.model_selection import cross_val_score
from sklearn.metrics import classification_report, confusion_matrix
from sklearn.model_selection import RandomizedSearchCV

#### 2.1. Pre-training

In [2]:
clean_diamonds_train = pd.read_csv("output/clean_diamonds_train.csv")

In [3]:
X = clean_diamonds_train.iloc[:,:-1]
y = clean_diamonds_train['price']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.4)

### Model #1: SVR (Support Vector Regressor)

__1.1. Model fitting & prediction__

In [4]:
regressor = SVR(kernel = 'rbf')
regressor.fit(X_train, y_train)

y_pred = regressor.predict(X_test)

__1.2. Error calculation__

In [5]:
print("SVR RMSE:", np.sqrt(mean_squared_error(y_test, y_pred)))

SVR RMSE: 4274.1017619004


### Model #2: Random forest Regressor

__2.1. Model fitting__

In [6]:
rf_reg = RandomForestRegressor(n_estimators=500, max_depth=20, min_samples_leaf=3, random_state=111)
selector = RFECV(rf_reg, step=1, cv=5)

rf_reg.fit(X_train, y_train)

RandomForestRegressor(bootstrap=True, ccp_alpha=0.0, criterion='mse',
                      max_depth=20, max_features='auto', max_leaf_nodes=None,
                      max_samples=None, min_impurity_decrease=0.0,
                      min_impurity_split=None, min_samples_leaf=3,
                      min_samples_split=2, min_weight_fraction_leaf=0.0,
                      n_estimators=500, n_jobs=None, oob_score=False,
                      random_state=111, verbose=0, warm_start=False)

__2.2. Error calculation__

In [7]:
print('Random forest RMSE:', np.sqrt(mean_squared_error(y_test, rf_reg.predict(X_test))))

Random forest RMSE: 565.6761073123371


### Model #3: Gradient Boosting Regressor

#### 3.1. Model fitting  & prediction

In [8]:
params = {'n_estimators': 800, 'max_depth': 8, 'min_samples_split': 25, 'min_samples_leaf':10,
          'learning_rate': 0.01, 'loss': 'ls'}

clf = ensemble.GradientBoostingRegressor(**params)

clf.fit(X_train, y_train)
y_pred = clf.predict(X_test)

#### 3.2. Error calculation

In [9]:
print("GBR RMSE:", np.sqrt(mean_squared_error(y_test, y_pred)))

GBR RMSE: 546.1997894095379


### The lower the error, the better the model, so the best one is __Gradient boosting Regressor__, so we export it to use it in the test dataset:

In [10]:
joblib.dump(clf, 'output/trained_models/GBR.sav') #Saving the model

['output/trained_models/GBR.sav']

### Gradient Boosting Regressor turned out to be the best model out of these three that predicts the price of diamonds. Let's test it on kaggle ➡️ 🚀 [3. Submitting the dataset with the model applied](https://github.com/breogann/Project-5.Machine-learning-algorithm-selection/blob/master/3.%20Submitting%20the%20dataset%20with%20the%20model%20applied.ipynb)