In [1]:
import pandas as pd
from sklearn.compose import ColumnTransformer
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error, mean_squared_log_error
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from sklearn.preprocessing import OneHotEncoder, LabelEncoder
pd.options.display.max_columns = None

In [2]:
data = pd.read_csv('./winemag_data_130k_v2.csv')

In [3]:
data.head(3)

Unnamed: 0.1,Unnamed: 0,country,description,designation,points,price,province,region_1,region_2,taster_name,taster_twitter_handle,title,variety,winery
0,0,Italy,"Aromas include tropical fruit, broom, brimston...",Vulkà Bianco,87,,Sicily & Sardinia,Etna,,Kerin O’Keefe,@kerinokeefe,Nicosia 2013 Vulkà Bianco (Etna),White Blend,Nicosia
1,1,Portugal,"This is ripe and fruity, a wine that is smooth...",Avidagos,87,15.0,Douro,,,Roger Voss,@vossroger,Quinta dos Avidagos 2011 Avidagos Red (Douro),Portuguese Red,Quinta dos Avidagos
2,2,US,"Tart and snappy, the flavors of lime flesh and...",,87,14.0,Oregon,Willamette Valley,Willamette Valley,Paul Gregutt,@paulgwine,Rainstorm 2013 Pinot Gris (Willamette Valley),Pinot Gris,Rainstorm


In [4]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 129971 entries, 0 to 129970
Data columns (total 14 columns):
 #   Column                 Non-Null Count   Dtype  
---  ------                 --------------   -----  
 0   Unnamed: 0             129971 non-null  int64  
 1   country                129908 non-null  object 
 2   description            129971 non-null  object 
 3   designation            92506 non-null   object 
 4   points                 129971 non-null  int64  
 5   price                  120975 non-null  float64
 6   province               129908 non-null  object 
 7   region_1               108724 non-null  object 
 8   region_2               50511 non-null   object 
 9   taster_name            103727 non-null  object 
 10  taster_twitter_handle  98758 non-null   object 
 11  title                  129971 non-null  object 
 12  variety                129970 non-null  object 
 13  winery                 129971 non-null  object 
dtypes: float64(1), int64(2), object(11)


In [5]:
data.describe(include = 'all')

Unnamed: 0.1,Unnamed: 0,country,description,designation,points,price,province,region_1,region_2,taster_name,taster_twitter_handle,title,variety,winery
count,129971.0,129908,129971,92506,129971.0,120975.0,129908,108724,50511,103727,98758,129971,129970,129971
unique,,43,119955,37979,,,425,1229,17,19,15,118840,707,16757
top,,US,"Seductively tart in lemon pith, cranberry and ...",Reserve,,,California,Napa Valley,Central Coast,Roger Voss,@vossroger,Gloria Ferrer NV Sonoma Brut Sparkling (Sonoma...,Pinot Noir,Wines & Winemakers
freq,,54504,3,2009,,,36247,4480,11065,25514,25514,11,13272,222
mean,64985.0,,,,88.447138,35.363389,,,,,,,,
std,37519.540256,,,,3.03973,41.022218,,,,,,,,
min,0.0,,,,80.0,4.0,,,,,,,,
25%,32492.5,,,,86.0,17.0,,,,,,,,
50%,64985.0,,,,88.0,25.0,,,,,,,,
75%,97477.5,,,,91.0,42.0,,,,,,,,


In [6]:
data.isna().sum()

Unnamed: 0                   0
country                     63
description                  0
designation              37465
points                       0
price                     8996
province                    63
region_1                 21247
region_2                 79460
taster_name              26244
taster_twitter_handle    31213
title                        0
variety                      1
winery                       0
dtype: int64

In [7]:
data.country = data.country.fillna('Unknown')

In [8]:
data = data.drop(['designation'], axis = 1)

In [9]:
data.price = data.price.fillna(data.price.mean())

In [10]:
data.province = data.province.fillna('Unknown')

In [11]:
data = data.drop(['taster_name', 'taster_twitter_handle'], axis = 1)

In [12]:
data.variety = data.variety.fillna('Unknown')

In [13]:
data = data.drop(['title'], axis = 1)

In [14]:
data.region_1 = data.region_1.fillna('Unknown')

In [15]:
data = data.drop(['region_2'], axis = 1)

In [16]:
data = data.drop(['Unnamed: 0'], axis = 1)

In [17]:
data = data.drop(['description'], axis = 1)

In [18]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 129971 entries, 0 to 129970
Data columns (total 7 columns):
 #   Column    Non-Null Count   Dtype  
---  ------    --------------   -----  
 0   country   129971 non-null  object 
 1   points    129971 non-null  int64  
 2   price     129971 non-null  float64
 3   province  129971 non-null  object 
 4   region_1  129971 non-null  object 
 5   variety   129971 non-null  object 
 6   winery    129971 non-null  object 
dtypes: float64(1), int64(1), object(5)
memory usage: 6.9+ MB


In [19]:
data.describe(include = 'all')

Unnamed: 0,country,points,price,province,region_1,variety,winery
count,129971,129971.0,129971.0,129971,129971,129971,129971
unique,44,,,426,1230,708,16757
top,US,,,California,Unknown,Pinot Noir,Wines & Winemakers
freq,54504,,,36247,21247,13272,222
mean,,88.447138,35.363389,,,,
std,,3.03973,39.577066,,,,
min,,80.0,4.0,,,,
25%,,86.0,18.0,,,,
50%,,88.0,28.0,,,,
75%,,91.0,40.0,,,,


In [20]:
y = data['price']

In [21]:
X = data.loc[:, ['country', 'points', 'province', 'region_1', 'variety', 'winery']]

In [22]:
X.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 129971 entries, 0 to 129970
Data columns (total 6 columns):
 #   Column    Non-Null Count   Dtype 
---  ------    --------------   ----- 
 0   country   129971 non-null  object
 1   points    129971 non-null  int64 
 2   province  129971 non-null  object
 3   region_1  129971 non-null  object
 4   variety   129971 non-null  object
 5   winery    129971 non-null  object
dtypes: int64(1), object(5)
memory usage: 5.9+ MB


In [23]:
ct = ColumnTransformer([
    ('ohe', OneHotEncoder(sparse = False, handle_unknown = 'ignore'),
    ['country', 'province', 'region_1', 'variety', 'winery'])
], remainder = 'passthrough').fit(X)

X = ct.transform(X)

In [24]:
X_train, X_test, y_train, y_test = train_test_split(X,y,
                                                     test_size = 0.3,
                                                     random_state = 1)

In [25]:
X_train, X_valid, y_train, y_valid = train_test_split(X_train, y_train,
                                                     test_size = 0.3,
                                                     random_state = 1)

In [26]:
model = RandomForestRegressor(random_state = 1,
                             n_jobs = -6,
                             max_depth = 9,
                             n_estimators = 800)

In [27]:
model.fit(X_train, y_train)

RandomForestRegressor(max_depth=9, n_estimators=800, n_jobs=-6, random_state=1)

In [28]:
mse = mean_squared_error
msle = mean_squared_log_error

In [29]:
pred = model.predict(X_train)
score = model.score(X_train, y_train)
mse_score = mse(y_train, pred)
msle_score = msle(y_train, pred)

print(f'train_score: {score}')
print(f'train_mse : {mse_score}')
print(f'train_msle :  {msle_score}')

train_score: 0.6098648107511387
train_mse : 669.0640640110227
train_msle :  0.2110078216185681


In [30]:
pred = model.predict(X_valid)
score = model.score(X_valid, y_valid)
mse_score = mse(y_valid, pred)
msle_score = msle(y_valid, pred)

print(f'valid_score: {score}')
print(f'valid_mse : {mse_score}')
print(f'valid_msle :  {msle_score}')

valid_score: 0.2966474894945098
valid_mse : 789.5346194022812
valid_msle :  0.21462372928966442
