# Manipulate Validaiton Dataframe

In [1]:
import pandas as pd
import numpy as np 
import statistics
import pandas as pd
from sklearn.model_selection import train_test_split
import xgboost as xgb
from sklearn.metrics import mean_squared_error, r2_score
import modelling as ml
import pickle
import gzip, pickle, pickletools
from sklearn.model_selection import cross_val_score, train_test_split
from sklearn.tree import DecisionTreeRegressor


In [2]:
# Import price predictive model from 04 
filepath = '../models/casi_dt_v1.pkl'
with gzip.open(filepath, 'rb') as f:
    p = pickle.Unpickler(f)
    model = p.load()

In [3]:
df = pd.read_csv('../data/modelling_wines.csv')
df.set_index('name', inplace=True)
df = df[df['rating_qty'] > 30].copy()
df.head()

Unnamed: 0_level_0,region,country,vintage,producer,wine_variety,grape_variety,price,rating,rating_qty,abv,from_vivino,log_price,age
name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1
12 Linajes Reserva 2012,215.0,18.0,31.0,3.0,0.0,132.0,32.85,3.9,33.0,14.0,0.0,3.491952,12.0
14 Hands Cabernet Sauvignon 2018,80.0,20.0,37.0,6.0,0.0,18.0,9.85,3.7,31.0,13.5,0.0,2.287471,6.0
19 Crimes Cabernet Sauvignon 2020,305.0,1.0,39.0,8.0,0.0,18.0,9.03,3.9,68.0,13.599021,0.0,2.200552,4.0
19 Crimes The Punishment Pinot Noir 2020,305.0,1.0,39.0,8.0,0.0,95.0,8.21,3.7,31.0,13.5,0.0,2.105353,4.0
19 Crimes Snoop Cali Rose 2020,47.0,20.0,39.0,8.0,1.0,117.0,11.49,3.9,32.0,10.5,0.0,2.441477,4.0


# Trying to predict ratings without price info 

In [4]:
# Drop 'price' and 'log_price' from features, 'rating' is the target
X = df.drop(['rating', 'price', 'log_price', 'rating_qty'], axis=1)
y = df['rating']

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, train_size = 0.85, random_state=42)

In [5]:

# X_train, X_test, y_train, y_test = train_test_split(X, y, train_size = 0.85, random_state=42)

# xgb = xgb.XGBRegressor(objective ='reg:squarederror')

# # Fit the model to the training set
# xgb.fit(X_train, y_train)

In [6]:
# # XGB Model can predict rating of a wine to within 0.17 stars. 
# y_pred = xgb.predict(X_test)
# mse = mean_squared_error(y_test, y_pred)
# print(f"Root mean Squared Error: {np.sqrt(mse)}")
# print(f"Cross validated r2: {cross_val_score(xgb, X_train, y_train, cv = 5, scoring='r2').mean()}")

In [7]:
# results = ml.regression_model_selector(X_train, y_train, X_test, y_test)
# results

**Results of other model types tested:**
|index |model|rmse|cv|train_score|test_score|varience|
|---|---|---|---|---|---|---|
|0|lr|0.228208|0.418706|0.422285|0.447992|-0.025707|
|1|ridge|0.228212|0.418714|0.422285|0.447974|-0.025690|
|2|knn|0.177761|0.601966|0.819220|0.665070|0.154150|
|3|dt|0.248299|0.315301|0.999968|0.346519|0.653449|
|4|rf|0.182799|0.620068|0.949541|0.645814|0.303728|
|5|adab|0.271035|0.229351|0.217793|0.221368|-0.003575|
|6|svr|0.298666|0.046364|0.052351|0.054516|-0.002166|

## Build Model That Predicts Price Without Rating

In [9]:
# Building price predictive model that doesn't use rating_qty
X = df.drop(['price', 'log_price', 'rating', 'rating_qty'], axis=1)
y = df['log_price']

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, train_size = 0.85, random_state=42)

dt_tuned_no_rating = DecisionTreeRegressor(max_depth = None, min_samples_leaf = 1, min_samples_split = 3, random_state=42)

dt_tuned_no_rating .fit(X_train, y_train)

y_pred = dt_tuned_no_rating .predict(X_test)

print(f"Root Mean Squared Error (RMSE): {round(np.exp(np.sqrt(mean_squared_error(y_test, y_pred))),3)}")
print(f"R-squared (R2): {round(r2_score(y_test, y_pred),3)}")

print(f"Cross validated r2: {round(cross_val_score(dt_tuned_no_rating , X_train, y_train, cv = 5).mean(), 3)}")

Root Mean Squared Error (RMSE): 1.554
R-squared (R2): 0.778
Cross validated r2: 0.764


In [10]:
filepath = '../models/casi_dt_no_rating.pkl'
with gzip.open(filepath, "wb") as f:
    pickled = pickle.dumps(dt_tuned_no_rating)
    optimized_pickle = pickletools.optimize(pickled)
    f.write(optimized_pickle)

# Impute the Rating and Then Predict Price on Validation Set 

In [9]:
# do same steps to validaiton set
df_val = pd.read_csv('../data/validation_modelling.csv')
df_val.set_index('name', inplace=True)
df_val.head()

Unnamed: 0_level_0,region,country,vintage,producer,wine_variety,grape_variety,price,rating,rating_qty,abv,from_vivino,log_price,age
name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1
Bread Butter Winemakers Selection Chardonnay 202021 California,47.0,20.0,42.0,789.0,3.0,23.0,15.99,0.0,0.0,13.5,2.0,2.771964,3.0
Oyster Bay Sauvignon Blanc 2022 Marlborough,154.0,13.0,42.0,4960.0,3.0,123.0,12.49,0.0,0.0,13.0,2.0,2.524928,2.0
Louis Latour Maconlugny 202122,42.0,5.0,42.0,4281.0,3.0,23.0,17.99,0.0,0.0,13.5,2.0,2.889816,2.0
Bread Butter Winemakers Selection Pinot Noir 2021 California,47.0,20.0,42.0,789.0,0.0,95.0,15.99,0.0,0.0,13.5,2.0,2.771964,3.0
Louis Roederer Cristal Champagne 2015,305.0,5.0,42.0,4286.0,3.0,23.0,300.0,0.0,0.0,12.0,2.0,5.703782,9.0


In [10]:
# Predict ratings using XGB model
X_impute = df_val.drop(['rating', 'price', 'log_price', 'rating_qty'], axis=1)
rating_predictions = xgb.predict(X_impute)
df_val['rating'] = rating_predictions

In [11]:
# Predict prices with predicted ratings
X = df_val.drop(['price', 'log_price', 'rating_qty'], axis=1)

log_price_predictions = dt_tuned_for_val.predict(X)

In [12]:
df_val['price_predicted'] = np.exp(log_price_predictions)


# Attempting to do it all WITHOUT ratings

In [17]:

X = df.drop(['rating', 'price', 'log_price', 'rating_qty'], axis=1)
y = df['log_price']

X_train, X_test, y_train, y_test = train_test_split(X, y, train_size = 0.85, random_state=42)

xgb_price = xgb.XGBRegressor(objective ='reg:squarederror')

xgb_price.fit(X_train, y_train)

# XGB Model can predict rating of a wine to within 0.17 stars. 
y_pred = xgb_price.predict(X_test)
mse = mean_squared_error(y_test, y_pred)
print(f"Root mean Squared Error: {np.sqrt(mse)}")
print(f"Cross validated r2: {cross_val_score(xgb_price, X_train, y_train, cv = 5, scoring='r2').mean()}")

AttributeError: 'XGBRegressor' object has no attribute 'XGBRegressor'