# Modelling

In [12]:
import pandas as pd
import numpy as np 
import matplotlib.pyplot as plt
import seaborn as sns 
import statistics
from visualisations import compare_histograms
import random
from sklearn.model_selection import cross_val_score, train_test_split, GridSearchCV
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error
import modelling as ml
from sklearn.experimental import enable_iterative_imputer 
from sklearn.impute import IterativeImputer
from sklearn.decomposition import PCA
from sklearn.linear_model import Ridge, RidgeCV, Lasso
from sklearn.preprocessing import StandardScaler

from sklearn.ensemble import RandomForestRegressor

# set random seed for reproducibility
np.random.seed(42)

sns.set_theme(context='notebook', style='darkgrid', palette='Set3')

In [13]:
df = pd.read_csv('../data/modelling_wines.csv')
# set name column as index
df.set_index('name', inplace=True)
df.head()

Unnamed: 0_level_0,region,country,vintage,producer,wine_variety,grape_variety,rating,rating_qty,abv,reviewed_by,from_vivino,log_price
name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1
00 Wines Vgw Chardonnay 2017,Willamette Valley,United States,2017,00 Wines,white,Chardonnay,0.0,0,13.0,"JS, WE, WS",False,4.189655
00 Wines Vgr Pinot Noir 2019,Willamette Valley,United States,2019,00 Wines,red,Pinot Noir,0.0,0,12.9,"WE, JS, RP",False,4.406719
00 Wines Egw Chardonnay 2019,Willamette Valley,United States,2019,00 Wines,white,Chardonnay,0.0,0,,"JS, RP, WE",False,4.553877
100 Nails Ranch Pinot Noir 2019,Central Coast,United States,2019,100 Nails Ranch,red,Pinot Noir,0.0,0,14.2,WW,False,2.639057
100 Nails Ranch Chardonnay 2019,Sonoma County,United States,2019,100 Nails Ranch,white,Chardonnay,0.0,0,14.2,0,False,2.639057


In [14]:
# df_encoded = pd.get_dummies(df, columns=['region', 'country', 'vintage', 'wine_variety', 
#                                         'grape_variety', 'from_vivino', 'producer'],  drop_first=True, dtype=int)

# Baseline Model

Basic Linear Regression with nulls imputed from median 

In [15]:
# df_encoded['abv'].fillna(df_encoded['abv'].median(), inplace=True)

In [16]:
lr = LinearRegression()

# lr.fit(X_train, y_train)
# y_pred = lr.predict(X_test)
# print(f" Cross validated r2: {cross_val_score(lr, X_train, y_train, cv = 10,scoring='r2')}")
# print(f" Mean squared error: {np.sqrt(mean_squared_error(y_test, y_pred))}")

In [17]:
df_encoded = pd.get_dummies(df, columns=['country', 'vintage', 'wine_variety', 
                                        'grape_variety', 'from_vivino'],  drop_first=True, dtype=int)
df_encoded['abv'].fillna(df_encoded['abv'].median(), inplace=True)

X = df_encoded.drop(columns = ['log_price', 'reviewed_by', 'producer', 'region'])
y = df['log_price']

X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42, train_size=0.85)

In [7]:
lr = LinearRegression()

lr.fit(X_train, y_train)
y_pred = lr.predict(X_test)
print(f" Cross validated r2: {cross_val_score(lr, X_train, y_train, cv = 10,scoring='r2')}")
print(f" Root mean squared error: {np.sqrt(mean_squared_error(y_test, y_pred))}")

 Cross validated r2: [0.51899004 0.48791673 0.49072158 0.51696956 0.51497948 0.49164686
 0.48698598 0.51464692 0.50723052 0.51276863]
 Root mean squared error: 0.7055844842893534


In [9]:
results = ml.regression_model_selector(X_train, y_train, X_test, y_test, quick =True)
results

Unnamed: 0,model,rmse,cv,train_score,test_score,varience
0,lr,6220193000.0,-2.464796e+20,0.508941,-3.764926e+19,3.764926e+19
1,ridge,0.7054706,0.5036476,0.508912,0.5157088,-0.006796554
2,knn,0.5886148,0.6353469,0.762586,0.6628593,0.09972674
3,dt,0.4370028,0.7996821,0.94187,0.8141694,0.1277002


In [18]:
results_ss = ml.regression_model_selector(X_train, y_train, X_test, y_test, quick = True, ss= True)
results_ss

Unnamed: 0,model,rmse,cv,train_score,test_score,varience
0,lr,6220193000.0,-2.464796e+20,0.508941,-3.764926e+19,3.764926e+19
1,ridge,0.7054706,0.5036476,0.508912,0.5157088,-0.006796554
2,knn,0.5886148,0.6353469,0.762586,0.6628593,0.09972674
3,dt,0.4370028,0.7996821,0.94187,0.8141694,0.1277002


# PCA With Producers 

In [10]:
X = df_encoded.drop(columns = ['log_price', 'reviewed_by'])
y = df['log_price']

X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42, train_size=0.85)
pca = PCA(n_components=30)

pca.fit(X_train)

Z_train = pca.transform(X_train)
Z_test = pca.transform(X_test)
lr.fit(Z_train, y_train)

print(f'Training Score: {round(lr.score(Z_train, y_train),4)}')
print(f'Testing Score: {round(lr.score(Z_test, y_test),4)}')

y_pred = lr.predict(Z_test)

print(f" Cross validated r2: {cross_val_score(lr, Z_train, y_train, cv = 5)}")
print(f" Root mean squared error: {np.sqrt(mean_squared_error(y_test, y_pred))}")

ValueError: could not convert string to float: 'Bolgheri Superiore'