# Cross Validation using Scikit-Learn

#### Python Imports

In [1]:
import pandas as pd
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split, KFold, cross_val_score
from sklearn.linear_model import Ridge
from sklearn.metrics import r2_score

#### Load and display the Diamond Prices data set

Source: [R Data Sets - Diamond Pricing](https://vincentarelbundock.github.io/Rdatasets/csv/Ecdat/Diamond.csv)

Data Dictionary:

    - carat : weight of diamond stones in carat unit
    - colour: a factor with levels (D,E,F,G,H,I). D is the highest quality
    - clarity: a factor with levels (IF,VVS1,VVS2,VS1,VS2). IF is the highest quality
    - certification: certification body, a factor with levels ( GIA, IGI, HRD)
    - price: price in Singapore dollars


In [2]:
url = './data/diamond-prices.csv'
diamond_prices_df = pd.read_csv(url, usecols=['carat', 'colour', 'clarity', 'certification', 'price'])
diamond_prices_df

Unnamed: 0,carat,colour,clarity,certification,price
0,0.30,D,VS2,GIA,1302
1,0.30,E,VS1,GIA,1510
2,0.30,G,VVS1,GIA,1510
3,0.30,G,VS1,GIA,1260
4,0.31,D,VS1,GIA,1641
...,...,...,...,...,...
303,1.01,I,VS1,HRD,8175
304,1.02,F,VVS2,HRD,10796
305,1.06,H,VVS2,HRD,9890
306,1.02,H,VS2,HRD,8959


#### Ensure there are no more missing values

In [3]:
diamond_prices_df.isnull().sum()

carat            0
colour           0
clarity          0
certification    0
price            0
dtype: int64

#### Using the data dictionary, create a Python dict of the ordinal features with a dictionary that maps the labels to their respective numerical values

In [4]:
ord_features_dict = {
    'colour': {'D': 6, 'E': 5, 'F': 4, 'G': 3, 'H': 2 , 'I': 1},
    'clarity': {'IF': 5, 'VVS1': 4, 'VVS2': 3, 'VS1': 2, 'VS2': 1}
}

#### Replace the ordinal feature labels with their numerical equivalent in the data set

In [5]:
for key, val in ord_features_dict.items():
    diamond_prices_df[key] = diamond_prices_df[key].map(val)
diamond_prices_df

Unnamed: 0,carat,colour,clarity,certification,price
0,0.30,6,1,GIA,1302
1,0.30,5,2,GIA,1510
2,0.30,3,4,GIA,1510
3,0.30,3,2,GIA,1260
4,0.31,6,2,GIA,1641
...,...,...,...,...,...
303,1.01,1,2,HRD,8175
304,1.02,4,3,HRD,10796
305,1.06,2,3,HRD,9890
306,1.02,2,1,HRD,8959


#### Create a dummy binary variable for the nominal feature `certification`

In [6]:
cert_encoded_df = pd.get_dummies(diamond_prices_df[['certification']], prefix_sep='.', sparse=False)
cert_encoded_df

Unnamed: 0,certification.GIA,certification.HRD,certification.IGI
0,1,0,0
1,1,0,0
2,1,0,0
3,1,0,0
4,1,0,0
...,...,...,...
303,0,1,0
304,0,1,0
305,0,1,0
306,0,1,0


#### Drop the nominal feature and replace with dummy variables

In [7]:
diamond_prices_df = diamond_prices_df.drop('certification', axis=1)
diamond_prices_df = pd.concat([diamond_prices_df, cert_encoded_df], axis=1)
diamond_prices_df

Unnamed: 0,carat,colour,clarity,price,certification.GIA,certification.HRD,certification.IGI
0,0.30,6,1,1302,1,0,0
1,0.30,5,2,1510,1,0,0
2,0.30,3,4,1510,1,0,0
3,0.30,3,2,1260,1,0,0
4,0.31,6,2,1641,1,0,0
...,...,...,...,...,...,...,...
303,1.01,1,2,8175,0,1,0
304,1.02,4,3,10796,0,1,0
305,1.06,2,3,9890,0,1,0
306,1.02,2,1,8959,0,1,0


#### Display the feature information about the data set

In [8]:
diamond_prices_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 308 entries, 0 to 307
Data columns (total 7 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   carat              308 non-null    float64
 1   colour             308 non-null    int64  
 2   clarity            308 non-null    int64  
 3   price              308 non-null    int64  
 4   certification.GIA  308 non-null    uint8  
 5   certification.HRD  308 non-null    uint8  
 6   certification.IGI  308 non-null    uint8  
dtypes: float64(1), int64(3), uint8(3)
memory usage: 10.7 KB


#### Create the training, validation, and test data sets

In [9]:
X_other, X_test, y_other, y_test = train_test_split(diamond_prices_df[['carat', 'colour', 'clarity', 'certification.GIA', 'certification.HRD', 'certification.IGI']], diamond_prices_df['price'], test_size=0.15, random_state=101)
X_train, X_val, y_train, y_val = train_test_split(X_other, y_other, test_size=0.15, random_state=101)

#### Display the shape (rows, columns) of the training data set

In [10]:
X_train.shape

(221, 6)

#### Display the shape (rows, columns) of the validation data set

In [11]:
X_val.shape

(40, 6)

#### Scale the training, validation, and test data sets

In [12]:
scaler = StandardScaler()
X_other_s = pd.DataFrame(scaler.fit_transform(X_other), columns=X_other.columns, index=X_other.index)
X_train_s = pd.DataFrame(scaler.fit_transform(X_train), columns=X_train.columns, index=X_train.index)
X_val_s = pd.DataFrame(scaler.transform(X_val), columns=X_val.columns, index=X_val.index)
X_test_s = pd.DataFrame(scaler.transform(X_test), columns=X_test.columns, index=X_test.index)
X_train_s

Unnamed: 0,carat,colour,clarity,certification.GIA,certification.HRD,certification.IGI
99,0.320281,1.242712,-1.440025,0.995485,-0.575609,-0.575609
209,-0.586717,0.529363,-0.658095,-1.004535,-0.575609,1.737292
233,-0.441597,-0.897335,0.905765,-1.004535,1.737292,-0.575609
199,-1.167196,-0.897335,1.687695,-1.004535,-0.575609,1.737292
168,-1.602555,-0.183986,-1.440025,-1.004535,-0.575609,1.737292
...,...,...,...,...,...,...
73,0.574241,-1.610683,0.123835,0.995485,-0.575609,-0.575609
108,0.392841,-1.610683,-0.658095,0.995485,-0.575609,-0.575609
22,-1.058356,-0.183986,-0.658095,0.995485,-0.575609,-0.575609
304,1.372399,0.529363,0.123835,-1.004535,1.737292,-0.575609


## Hold Out Train-Validate-Test Cross Validation

#### Iteration 1 with hyperparameter `alpha=100`

In [13]:
model_one = Ridge(alpha=100)
model_one.fit(X_train_s, y_train)
y_val_pred1 = model_one.predict(X_val_s)
r2_score(y_val, y_val_pred1)

0.8393041847346832

#### Iteration 2 with hyperparameter `alpha=5`

In [14]:
model_two = Ridge(alpha=5)
model_two.fit(X_train_s, y_train)
y_val_pred2 = model_two.predict(X_val_s)
r2_score(y_val, y_val_pred2)

0.9528101881304383

#### Final model evaluation using the test set

In [15]:
y_pred = model_two.predict(X_test_s)
r2_score(y_test, y_pred)

0.9302466842403143

## k-Fold Cross Validation

#### Create a 5-fold split

In [16]:
folds_5 = KFold(n_splits=5, random_state=101, shuffle=True)

#### Iteration 1 with hyperparameter `alpha=100`

In [17]:
model_one = Ridge(alpha=100)
scores = cross_val_score(model_one, X_other_s, y_other, scoring='r2', cv=folds_5)
scores.mean()

0.8223482129601845

#### Iteration 2 with hyperparameter `alpha=5`

In [18]:
model_two = Ridge(alpha=5)
scores2 = cross_val_score(model_two, X_other_s, y_other, scoring='r2', cv=folds_5)
scores2.mean()

0.9541076637596422

#### Final model evaluation using the test set

In [19]:
model_two.fit(X_other_s, y_other)
y_pred = model_two.predict(X_test_s)
r2_score(y_test, y_pred)

0.9274938024707198