# Training the Model

In [56]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.tree import DecisionTreeRegressor

# Lets import the clean data:

In [31]:
df = pd.read_csv("DATA/clean_data.csv")
df.head()

Unnamed: 0,id,carat,cut,color,clarity,depth,table,price
0,0,1.14,5,4,9,61.0,56.0,9013
1,1,0.76,5,3,7,62.7,57.0,2692
2,2,0.84,5,4,8,61.4,56.0,4372
3,3,1.55,5,3,8,62.0,57.0,13665
4,4,0.3,5,4,5,61.9,57.0,422


# First, we split the df for train and test:

In [32]:
y=df["price"]
X=df.drop(columns=["price","id"])
X_train, X_test, y_train, y_test = train_test_split(X,y, test_size=0.2)

In [33]:
X.head()

Unnamed: 0,carat,cut,color,clarity,depth,table
0,1.14,5,4,9,61.0,56.0
1,0.76,5,3,7,62.7,57.0
2,0.84,5,4,8,61.4,56.0
3,1.55,5,3,8,62.0,57.0
4,0.3,5,4,5,61.9,57.0


In [34]:
y.head()

0     9013
1     2692
2     4372
3    13665
4      422
Name: price, dtype: int64

In [35]:
X.shape

(40455, 6)

In [36]:
y.shape

(40455,)

# First we make a simple Logistic Regression:

In [65]:
log = LinearRegression()

In [66]:
log.fit(X_train,y_train)

LinearRegression()

In [67]:
y_pred = log.predict(X_test)

In [68]:
print("MSE: ", mean_squared_error(y_test,y_pred))
print("RMSE: ", np.sqrt(mean_squared_error(y_test,y_pred)))
print("R2: ", r2_score(y_test,y_pred))

MSE:  1539466.868557476
RMSE:  1240.7525412254759
R2:  0.9048837282871169


# Now lets try a Decission Tree:

In [69]:
for i in range (1,20):
    tree = DecisionTreeRegressor(max_depth=i)
    tree.fit(X_train,y_train)
    y_pred = tree.predict(X_test)
    print(f"Max Depth = {i}  -->  RMSE: ", np.sqrt(mean_squared_error(y_test,y_pred)))

Max Depth = 1  -->  RMSE:  2556.2418321272166
Max Depth = 2  -->  RMSE:  1692.594825562079
Max Depth = 3  -->  RMSE:  1377.3895045281097
Max Depth = 4  -->  RMSE:  1173.179041558432
Max Depth = 5  -->  RMSE:  1012.8938469309877
Max Depth = 6  -->  RMSE:  875.768671943671
Max Depth = 7  -->  RMSE:  795.6774213425058
Max Depth = 8  -->  RMSE:  704.7347077638515
Max Depth = 9  -->  RMSE:  654.5659225141459
Max Depth = 10  -->  RMSE:  621.9986729169274
Max Depth = 11  -->  RMSE:  604.0750135796088
Max Depth = 12  -->  RMSE:  610.1329406828772
Max Depth = 13  -->  RMSE:  625.8646195919637
Max Depth = 14  -->  RMSE:  631.6650055727374
Max Depth = 15  -->  RMSE:  663.9732013999609
Max Depth = 16  -->  RMSE:  679.7677126490364
Max Depth = 17  -->  RMSE:  684.6858897622647
Max Depth = 18  -->  RMSE:  675.5756733339491
Max Depth = 19  -->  RMSE:  691.5205666039923


the best RMSE comes with max_depth=11

In [70]:
tree = DecisionTreeRegressor(max_depth=11)
tree.fit(X_train,y_train)
y_pred = tree.predict(X_test)
print("MSE: ", mean_squared_error(y_test,y_pred))
print("RMSE: ", np.sqrt(mean_squared_error(y_test,y_pred)))
print("R2: ", r2_score(y_test,y_pred))

MSE:  365147.23857142986
RMSE:  604.274141902026
R2:  0.9774393040418509


# lets try other models:

In [71]:
from sklearn.ensemble import RandomForestRegressor

In [81]:
for i in range(12,25):
    forest = RandomForestRegressor(max_depth=i)
    forest.fit(X_train,y_train)
    y_pred = forest.predict(X_test)
    print(f"Max Depth = {i}  -->  RMSE: ", np.sqrt(mean_squared_error(y_test,y_pred)))

Max Depth = 12  -->  RMSE:  540.7544380257997
Max Depth = 13  -->  RMSE:  536.8425996690071
Max Depth = 14  -->  RMSE:  537.8256841050809
Max Depth = 15  -->  RMSE:  539.2720903620158
Max Depth = 16  -->  RMSE:  539.4178061900482
Max Depth = 17  -->  RMSE:  539.4740312642746
Max Depth = 18  -->  RMSE:  538.8441729204425
Max Depth = 19  -->  RMSE:  541.2236025978418
Max Depth = 20  -->  RMSE:  542.7489124731181
Max Depth = 21  -->  RMSE:  542.6827399020762
Max Depth = 22  -->  RMSE:  543.226452410092
Max Depth = 23  -->  RMSE:  541.9298874779803
Max Depth = 24  -->  RMSE:  541.6672664794535


In [76]:
from sklearn import linear_model
lasso = linear_model.Lasso()
lasso.fit(X_train,y_train)
y_pred = lasso.predict(X_test)
print("RMSE: ", np.sqrt(mean_squared_error(y_test,y_pred)))

RMSE:  1240.837896917951


lasso with alpha=0.1 is equicalent to LinearRegression

In [77]:
from sklearn.linear_model import ElasticNet
elas = ElasticNet(random_state=0)
elas.fit(X_train,y_train)
y_pred = elas.predict(X_test)
print("RMSE: ", np.sqrt(mean_squared_error(y_test,y_pred)))

RMSE:  3018.5383996312653
