# Modelling

In [1]:
import pandas as pd
import numpy as np 
import matplotlib.pyplot as plt
import seaborn as sns 
import statistics
from visualisations import compare_histograms
import random
from sklearn.model_selection import cross_val_score, train_test_split, GridSearchCV
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error
import modelling as ml
from sklearn.experimental import enable_iterative_imputer 
from sklearn.impute import IterativeImputer
from sklearn.decomposition import PCA
from sklearn.linear_model import Ridge, RidgeCV, Lasso
from sklearn.preprocessing import StandardScaler
import modelling as ml
from sklearn.ensemble import RandomForestRegressor

sns.set_theme(context='notebook', style='darkgrid', palette='Set3')

In [2]:
df = pd.read_csv('../data/clean_test_train.csv')
fillabv = pd.read_csv('../data/modelling_wines.csv')
fillabv.set_index('name', inplace=True)
df.set_index('name', inplace=True)
df.head()

Unnamed: 0_level_0,region,country,vintage,producer,wine_variety,grape_variety,price,rating,rating_qty,abv,reviewed_by,from_vivino,age,log_price,log_rating_qty
name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1
Starkconde Syrah 2017,Stellenbosch,South Africa,2017,Starkconde,red,Syrah,22.79,3.9,129,,unknown,True,7,3.126322,4.859812
Montedidio Ostuni 2019,Puglia,Italy,2019,Montedidio,rose,Unknown,6.29,4.0,347,,unknown,True,5,1.838961,5.849325
Alto Estate Cabernet Sauvignon 2014,Stellenbosch,South Africa,2014,Alto Estate,sparkling,Cabernet Sauvignon,19.26,4.1,157,,unknown,True,10,2.95803,5.056246
Chateau Belgrave Hautmedoc Grand Cru Classe 2016,Hautmedoc,France,2016,Chateau Belgrave,white,Unknown,40.19,3.9,199,,unknown,True,8,3.693618,5.293305
Tate Spring Street Cabernet Sauvignon 2019,Other,United States,2019,Tate Spring Street,red,Cabernet Sauvignon,80.99,4.7,24,14.3,0,False,5,4.394326,3.178054


In [3]:
# Binary encode select categoricals: country, region, vintage
df_encoded = pd.get_dummies(df, columns=['region', 'country', 'vintage'],  drop_first=True, dtype=int)

# Insert imputed abv from workbook 02
df_encoded['abv'] = fillabv['abv']

# Include some feature engineering
df_encoded['age^2'] = df_encoded['age'] ** 2
df_encoded['rating * log_rating_qty'] = df_encoded['rating'] * df_encoded['log_rating_qty']

# Baseline Model

In [6]:
X = df_encoded.drop(columns = ['log_price', 'reviewed_by', 'producer', 'wine_variety',
                                'grape_variety', 'price', 'from_vivino'])
y = df_encoded['log_price']

ss = StandardScaler()

X = ss.fit_transform(X)

X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42, train_size=0.85)

lr = LinearRegression()

lr_model, y_pred = ml.model_summary(lr,X_train, X_test, y_train, y_test, log_target=True)

Exponent Root Mean Squared Error (RMSE): 1.517
Root Mean Squared Error (RMSE): 0.416
Train R-squared (R2): 0.769
Test R-squared (R2): 0.762
High varience: [ 7.57563563e-01 -1.74409505e+23  7.70702367e-01  7.55736334e-01
  7.60887150e-01  7.73438627e-01  7.61390506e-01  7.54745884e-01
 -8.87778764e+22  7.64313244e-01]
Cross validated r2: -2.631873815306902e+22


In [9]:
coefs = pd.DataFrame(lr_model.coef_, columns=.columns, index=X.index)
coefs

AttributeError: 'numpy.ndarray' object has no attribute 'columns'

**Without SS:**
Exponent Root Mean Squared Error (RMSE): 1.517
Root Mean Squared Error (RMSE): 0.417
Train R-squared (R2): 0.769
Test R-squared (R2): 0.762
High varience: [-3.22956837e+06  7.62924410e-01  7.66956281e-01  7.57706231e-01
 -8.59826455e+13]
Cross validated r2: -17196529736415.965

In [9]:
results = ml.regression_model_selector(X_train, y_train, X_test, y_test, quick =True)
results

Unnamed: 0,model,rmse,cv,train_score,test_score,varience
0,lr,6220193000.0,-2.464796e+20,0.508941,-3.764926e+19,3.764926e+19
1,ridge,0.7054706,0.5036476,0.508912,0.5157088,-0.006796554
2,knn,0.5886148,0.6353469,0.762586,0.6628593,0.09972674
3,dt,0.4370028,0.7996821,0.94187,0.8141694,0.1277002


In [18]:
results_ss = ml.regression_model_selector(X_train, y_train, X_test, y_test, quick = True, ss= True)
results_ss

Unnamed: 0,model,rmse,cv,train_score,test_score,varience
0,lr,6220193000.0,-2.464796e+20,0.508941,-3.764926e+19,3.764926e+19
1,ridge,0.7054706,0.5036476,0.508912,0.5157088,-0.006796554
2,knn,0.5886148,0.6353469,0.762586,0.6628593,0.09972674
3,dt,0.4370028,0.7996821,0.94187,0.8141694,0.1277002


# PCA With Producers 

In [10]:
X = df_encoded.drop(columns = ['log_price', 'reviewed_by'])
y = df['log_price']

X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42, train_size=0.85)
pca = PCA(n_components=30)

pca.fit(X_train)

Z_train = pca.transform(X_train)
Z_test = pca.transform(X_test)
lr.fit(Z_train, y_train)

print(f'Training Score: {round(lr.score(Z_train, y_train),4)}')
print(f'Testing Score: {round(lr.score(Z_test, y_test),4)}')

y_pred = lr.predict(Z_test)

print(f" Cross validated r2: {cross_val_score(lr, Z_train, y_train, cv = 5)}")
print(f" Root mean squared error: {np.sqrt(mean_squared_error(y_test, y_pred))}")

ValueError: could not convert string to float: 'Bolgheri Superiore'