In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
pd.plotting.register_matplotlib_converters()
import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns
import geopandas as gpd


# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

"""import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))"""

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session



In [None]:
# Read the review data; the CSV's first column is used as the row index.
reviews=pd.read_csv("../input/wine-reviews/winemag-data-130k-v2.csv",index_col=0)

# Print column dtypes and non-null counts for the raw frame.
reviews.info()

# Drop exact duplicate rows, then print the summary again to see the new row count.
reviews=reviews.drop_duplicates()
reviews.info()

# Independent copy of the de-duplicated frame. The missing-values heatmap cell
# reads from `review`, so it still shows the nulls after `reviews` is cleaned.
review=reviews.copy()

In [None]:
# Preview the first rows of the de-duplicated reviews.
reviews.head()

In [None]:
# Heatmap of missing cells, drawn from the `review` snapshot of the data.
missing_mask = review.isnull()
missing_ax = sns.heatmap(missing_mask, yticklabels=False, cbar=False, cmap='inferno')
missing_ax.set(title="Missing Values by Column")

# Number of missing values in each column.
missing_mask.sum()

In [None]:
# Drop region_2 (high share of unknown data). errors="ignore" keeps the cell
# safe to re-run; pop() raised KeyError on the second execution.
reviews = reviews.drop(columns=["region_2"], errors="ignore")

# Replace missing prices with the median price. The result is assigned back
# instead of using a chained `inplace=True` fill, which pandas deprecates and
# which does not update the parent frame under copy-on-write.
# Series.median() skips NaN, so this matches dropna().median().
reviews["price"] = reviews["price"].fillna(reviews["price"].median())

# Every remaining missing value becomes the literal label "Unknown".
reviews = reviews.fillna("Unknown")

# Confirm that no nulls remain.
reviews.info()
reviews.isnull().sum()

In [None]:
# First ten rows of `reviews` at this point in the notebook.
reviews.head(10)

In [None]:
# Ten countries that appear most often in the reviews, as a bar chart.
from matplotlib.ticker import FuncFormatter

top10 = (
    reviews.groupby('country').size()
    .reset_index(name="count")
    .sort_values('count', ascending=False)
    .head(10)
    .set_index('country')
)
graph = sns.barplot(x=top10.index, y=top10['count'], edgecolor="black")

# Derive the "10k"-style labels from the real tick values. The previous fixed
# list of six labels was not tied to the tick positions, so it mislabelled the
# axis (or raised) whenever Matplotlib chose a different number of ticks.
graph.yaxis.set_major_formatter(
    FuncFormatter(lambda value, _pos: '0' if value == 0 else f'{value / 1000:g}k')
)
graph.set_xticklabels(graph.get_xticklabels(), rotation=40, ha="right")
graph.set(title="Top 10 Most Frequently Appearing Countries in the Reviews")
top10

In [None]:
# Price summary (review count, min, max, mean) for the ten most-reviewed countries.
price_stats = reviews.groupby('country')['price'].agg(['count', 'min', 'max', 'mean'])
price = (
    price_stats
    .reset_index()
    .sort_values('count', ascending=False)
    .head(10)
)
price.style.set_caption("Price Distribution of Top 10 Countries")

**Distribution of Wine Reviews Map**

In [None]:
import plotly.express as px

# Review count per country, largest first.
country = (
    reviews.groupby('country')
    .size()
    .reset_index(name='count')
    .sort_values('count', ascending=False)
)

# World map shaded by review count; the colour scale is clamped to 2,000-50,000.
px.choropleth(
    country,
    locations='country',
    locationmode='country names',
    color='count',
    template='simple_white',
    range_color=[2000, 50000],
    color_continuous_scale='peach',
    projection="natural earth",
)

**Countries With Best Point Scores**

In [None]:
# Average points per country, highest first, with the review count alongside.
# The earlier sort by country/points was removed: groupby().agg() does not
# depend on row order, so the sort only cost time.
best_country = (
    reviews.groupby('country')['points']
    .agg(['mean', 'count'])
    .reset_index()
    .sort_values('mean', ascending=False)
)
best_country.columns = ['country', 'average_points', 'count']

plt.figure(figsize=(13, 10))
sns.barplot(x=best_country['average_points'], y=best_country['country']).set(title="Average Points by Country")


This comparison is inconclusive because the number of reviews varies too much from country to country.
For example, the US average is fairly low, but the US has far more reviews in the table than most other countries.

**Find Out The Most Popular Location For A Winery**

In [None]:
# Ten (country, province) pairs with the most reviews, drawn as a pie chart.

# Wedge outline styling.
wp = {'linewidth': 1, 'edgecolor': "black"}

location = (
    reviews.groupby(['country', 'province']).size()
    .reset_index(name="count")
    .sort_values('count', ascending=False)
    .head(10)
    .set_index('province')
)

# Pull every wedge out by the same offset. The list is sized from the data so
# the call still works if fewer than ten rows come back. (The hand-written
# explode tuple that used to sit here was never passed to the plot.)
explode = [0.25] * len(location)

pie = location.plot.pie(y='count', autopct='%.1f%%', shadow=True, explode=explode,
                        wedgeprops=wp, figsize=(6, 15))
pie.legend(loc='upper center', bbox_to_anchor=(0.5, -0.05),
           fancybox=True, shadow=True, ncol=5)

location.style.set_caption("Most Popular Location for a Winery")


**Province With Highest Points Average**

In [None]:
# Average points per (country, province), highest first.
# The earlier sort by province/points was removed: it has no effect on the
# grouped means.
# NOTE(review): there is no minimum review count here, so a province with very
# few reviews can rank at the top - confirm that is intended.
best_prov = (
    reviews.groupby(['country', 'province'])['points']
    .agg(['mean', 'count'])
    .reset_index()
    .sort_values('mean', ascending=False)
)
best_prov.columns = ['country', 'province', 'average_points', 'count']

# Chart and table of the top twenty; `best_prov` (all provinces) is reused by
# the strip-plot cell.
bp = best_prov.head(20)
graph = sns.barplot(x=bp['average_points'], y=bp['province'])
graph.set(title="Top 20 Provinces by Average Points")
bp

**Visual Representation of Average Point Scores of Each Country**

Each dot is the average points of one province, plotted against the country it belongs to.

In [None]:
# One dot per province: its average points on x, its country on y.
fig, ax = plt.subplots(figsize=(12, 8))
strip = sns.stripplot(x=best_prov['average_points'], y=best_prov['country'], size=5, ax=ax)
strip.set(title="Average Points by Country")

**Find Out Which Specific Winery Has the Best Wine**

In [None]:
# Ten wineries with the highest mean points among those with enough reviews.

# A winery must have more than this many reviews to be ranked.
MIN_REVIEWS = 3

# The earlier sort by winery/points was removed: it has no effect on the
# grouped means.
best = (
    reviews.groupby(['country', 'province', 'winery'])['points']
    .agg(['mean', 'count'])
    .reset_index()
    .sort_values('mean', ascending=False)
)
best.columns = ['country', 'province', 'winery', 'mean_points', 'count']
best = best[best['count'] > MIN_REVIEWS].set_index('winery').head(10)
best

**Map of Best Winery Locations**

In [None]:
# World outline used as the base layer of the map.
# NOTE(review): gpd.datasets is reported as deprecated/removed in recent
# geopandas releases - confirm against the installed version.
world = gpd.read_file(gpd.datasets.get_path('naturalearth_lowres'))

# Hand-entered winery names and coordinates.
# NOTE(review): these look like the ten wineries from the `best` table, but they
# are typed in here, not derived from it - verify they still match.
df = pd.DataFrame(
    {'Winery': ['Tenuta dell\'Ornellaia', 'Salon', 'Cardinale', 'Domaine des Lambrays', 'Colgin','Château Climens',
                'Château Léoville Barton','Horsepower','Dolce','Vieux Château Certan'],
     'Country': ['Italy', 'France', 'US', 'France', 'US','France','France','US','US','France'],
     'Latitude': [43.548473, 48.946840, 38.439030, 47.197121, 38.506931,44.606041,45.1573994,46.0647752,38.4235022,44.9269249],
     'Longitude': [10.310567, 4.018230, -122.404701, 4.963180, -122.462143,-0.336540,-0.7391178,-118.3422892,-122.4370167,-0.1999426]})
# Point geometry is built as (x=Longitude, y=Latitude); no CRS is set on the frame.
gdf = gpd.GeoDataFrame(
    df, geometry=gpd.points_from_xy(df.Longitude, df.Latitude))
 
# Draw country boundaries first and keep the axes so the points share them.
ax = world.boundary.plot(edgecolor='black',cmap='rainbow',figsize=(15,25));

# Overlay the winery locations as red markers on the same axes.
gdf.plot(ax=ax, color='red',markersize=10)
plt.show()
gdf

**Relationship b/t Price and Points**

In [None]:
# Price at each points score: a box plot with every review overlaid as a dot.
# whis=np.inf is passed so the whiskers are not cut off at a multiple of the IQR.
# (Removed: a commented-out regplot call and a stray `whis=np.inf` assignment
# that created an unused variable.)
sns.boxplot(x=reviews['points'], y=reviews['price'], whis=np.inf)
sns.stripplot(x=reviews['points'], y=reviews['price']).set(title="Relationship B/T Points and Price")

In [None]:
# Mean, minimum and maximum price at each points score, highest score first.
# The earlier sort by points/price was removed: the per-group aggregates do not
# depend on row order.
top = (
    reviews.groupby('points')['price']
    .agg(['mean', 'min', 'max'])
    .reset_index()
    .sort_values('points', ascending=False)
)
top.columns = ['points', 'mean_price', 'min_price', 'max_price']
top = top.set_index('points')

# One line per price statistic, with points on the x axis.
plt.figure(figsize=(10, 7))
sns.lineplot(data=top, linewidth=3).set(title="Relationship B/T Mean Price and Points")
top.head(10)


As price increases, so do points.

In [None]:
# How many reviews received each points value.
pt = (
    reviews.groupby('points').size()
    .reset_index(name="Review Count")
    .sort_values('points', ascending=False)
    .set_index('points')
)
graph = sns.barplot(x=pt.index, y=pt['Review Count']).set(title="Review Count by Points")


In [None]:
def top_provinces(country_name, n=5):
    """Return the `n` most-reviewed provinces of `country_name`.

    The result is indexed by province and has a single `count` column, sorted
    from most to fewest reviews.
    """
    counts = (
        reviews[reviews.country == country_name]
        .groupby('province').size()
        .reset_index(name="count")
    )
    return counts.sort_values('count', ascending=False).head(n).set_index('province')


def plot_province_pie(counts, title, **pie_kwargs):
    """Draw `counts['count']` as a pie with percentage labels and a legend below.

    Extra keyword arguments (e.g. explode, wedgeprops) are passed to the pie call.
    Returns the Matplotlib axes.
    """
    # The old calls also passed x='province'; that column does not exist after
    # set_index and the pie plot does not use x, so it is dropped.
    ax = counts.plot.pie(y='count', autopct='%.1f%%', shadow=True, **pie_kwargs)
    ax.legend(loc='upper center', bbox_to_anchor=(0.5, -0.05),
              fancybox=True, shadow=True, ncol=5)
    ax.set(title=title)
    return ax


# Per-wedge offsets and outline styling used for the Italy chart.
explode = (0.1, 0.1, 0.2, 0.2, 0.0)
wp = {'linewidth': 1, 'edgecolor': "black"}

# Pie chart, US provinces (plain wedges, as before).
us = top_provinces('US')
pie = plot_province_pie(us, "Distribution of US provinces")

# Pie chart, Italy provinces. The offsets are trimmed to the number of rows so
# the call does not fail if fewer than five provinces come back.
italy = top_provinces('Italy')
pie2 = plot_province_pie(italy, "Distribution of Italy's provinces",
                         explode=explode[:len(italy)], wedgeprops=wp)


In [None]:

# Fifteen most frequently reviewed varieties, indexed by variety name.
variety = (
    reviews.groupby('variety').size()
    .reset_index(name="count")
    .sort_values('count', ascending=False)
    .head(15)
    .set_index('variety')
)

graph = sns.barplot(x=variety['count'], y=variety.index)
graph.set(title="Most Popular Varieties")

# TODO: add the percentage on top of each bar.

**Best Variety**

In [None]:
# Mean points of the fifteen most-reviewed varieties.
variety_stats = reviews.groupby('variety')['points'].agg(['count', 'mean']).reset_index()
variety_stats.columns = ['variety', 'count', 'mean_points']
best_var = variety_stats.sort_values('count', ascending=False).head(15)

graph = sns.swarmplot(x=best_var['mean_points'], y=best_var['variety'])
graph.set(title="Most Popular Varieties Average Points")

**Taster_Info**

In [None]:
# Fifteen tasters with the highest mean points.
taster_means = reviews.groupby('taster_name')['points'].agg(['mean']).reset_index()
taster_means.columns = ['taster_name', 'mean_points']
tasters = taster_means.sort_values('mean_points', ascending=False).head(15)
sns.swarmplot(x=tasters['mean_points'], y=tasters['taster_name'])

**Region_1**

In [None]:
# Fifteen (region_1, province, country) groups with the highest mean points.
region_means = (
    reviews.groupby(['region_1', 'province', 'country'])['points']
    .agg(['mean'])
    .reset_index()
)
region_means.columns = ['region_1', 'province', 'country', 'mean_points']
best_region = region_means.sort_values('mean_points', ascending=False).head(15)
sns.swarmplot(x=best_region['mean_points'], y=best_region['region_1'])

**Just Scratch Work For Model Validation and Machine Learning**

In [None]:
# Scratch model data: predict a wine's points from its price.
wine_file = '../input/wine-reviews/winemag-data-130k-v2.csv'
wine_data = pd.read_csv(wine_file)

# Single predictor.
features = ['price']

# Drop only the rows that lack a value the model needs. The previous
# dropna(axis=0) discarded every row with a null in ANY column, including
# columns this model never reads.
wine_data = wine_data.dropna(subset=features + ['points'])

# Target and feature matrix.
y = wine_data.points
X = wine_data[features]

X.describe()


In [None]:
# First rows of the single-column (price) feature matrix.
X.head()

In [None]:
from sklearn.tree import DecisionTreeRegressor

# Decision tree with a fixed random_state so repeated runs give the same model.
wine_model = DecisionTreeRegressor(random_state=1)
wine_model.fit(X, y)

# Show the first five feature rows next to the model's predictions for them.
sample_rows = X.head()
print("Making Predictions For 5 items:")
print(sample_rows)
print("Predictions are:")
print(wine_model.predict(sample_rows))

In [None]:
from sklearn.metrics import mean_absolute_error

# In-sample mean absolute error: the model is scored on the same rows it was
# fitted on. The predictions are points, since y is wine_data.points.
in_sample_preds = wine_model.predict(X)
mean_absolute_error(y, in_sample_preds)

In [None]:
# Model validation on a held-out split.
from sklearn.model_selection import train_test_split

# Split features and target into training and validation parts.
train_X, val_X, train_y, val_y = train_test_split(X, y, random_state=0)

# Seeded like the first tree in this notebook so the score is reproducible
# between runs (the model was previously created without random_state).
wine_model = DecisionTreeRegressor(random_state=1)
wine_model.fit(train_X, train_y)

# Predicted points on the validation rows, scored with mean absolute error.
val_preds = wine_model.predict(val_X)
print(mean_absolute_error(val_y, val_preds))

In [None]:
from sklearn.metrics import mean_absolute_error
from sklearn.tree import DecisionTreeRegressor

def get_mae(max_leaf_nodes, train_X, val_X, train_y, val_y):
    """Fit a decision tree limited to `max_leaf_nodes` leaves and return its
    mean absolute error on the validation data."""
    tree = DecisionTreeRegressor(max_leaf_nodes=max_leaf_nodes, random_state=0)
    tree.fit(train_X, train_y)
    return mean_absolute_error(val_y, tree.predict(val_X))

In [None]:
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_absolute_error

# Random forest on the same train/validation split; print the validation MAE.
wine_model = RandomForestRegressor(random_state=1)
wine_model.fit(train_X, train_y)

wine_preds = wine_model.predict(val_X)
forest_mae = mean_absolute_error(val_y, wine_preds)
print(forest_mae)

**Experimenting with different approaches**

In [None]:
import eli5
from eli5.sklearn import PermutationImportance
from eli5 import show_weights
from sklearn.model_selection import cross_val_score

wine_file_path = '../input/wine-reviews/winemag-data-130k-v2.csv'
wine = pd.read_csv(wine_file_path)

# Fill missing prices with the mean price. The result is assigned back instead
# of using a chained `inplace=True` fill, which pandas deprecates and which does
# not update the frame under copy-on-write.
price_avg = wine["price"].mean()
wine['price'] = wine['price'].fillna(price_avg)

# Keep the modelling columns, rename region_1 -> region, and drop rows that
# still have a missing value. Done as one non-inplace chain so no column slice
# is mutated in place.
wine = (
    wine[['country', 'province', 'region_1', 'winery', 'price', 'points',
          'variety', 'title', 'taster_name', 'description']]
    .rename(columns={'region_1': 'region'})
    .dropna(axis=0)
)
wine.head()

In [None]:
# Label encoding: cast each text column to the category dtype, then store the
# integer category codes in a companion "<name> codes" column.
code_columns = {
    "country": "country codes",
    "description": "description codes",
    "province": "province codes",
    "region": "region codes",
    "taster_name": "taster codes",
    "title": "title codes",
    "variety": "variety codes",
    "winery": "winery codes",
}

for source_col in code_columns:
    wine[source_col] = wine[source_col].astype('category')

for source_col, code_col in code_columns.items():
    wine[code_col] = wine[source_col].cat.codes

In [None]:
# First rows of `wine`, now including the "... codes" columns.
wine.head()

In [None]:
"""from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_absolute_error
forest_model = RandomForestRegressor(random_state=1)
forest_model.fit(train_x, train_y)
forest_preds = forest_model.predict(val_x)
print("Printing MAE for RandomForest Model:",mean_absolute_error(val_y, forest_preds)) """


In [None]:
# Permutation importance of the encoded wine features.
# This cell used `basic_model` and `val_x`, which no cell defines (the only
# model-fitting code nearby is disabled inside a string), so it raised
# NameError. The split and the model are now built here from `wine`.
perm_features = ['price', 'country codes', 'province codes',
                 'variety codes', 'winery codes', 'region codes']
train_x, val_x, perm_train_y, perm_val_y = train_test_split(
    wine[perm_features], wine.points, random_state=0)

basic_model = RandomForestRegressor(random_state=1).fit(train_x, perm_train_y)

# Measure how much the validation score drops when each feature is shuffled.
perm = PermutationImportance(basic_model, random_state=1).fit(val_x, perm_val_y)
eli5.show_weights(perm, feature_names=val_x.columns.tolist())

Cross validation

Is a way to get a more accurate measure of the model's quality. First, define a pipeline, which will fill in the missing values. A random forest model will make the predictions.

In [None]:
# Prediction target: the review score.
y = wine.points

# Predictors: price plus the integer category codes added by the label-encoding cell.
wine_features = ['price', 'country codes', 'province codes', 'variety codes', 'winery codes', 'region codes']
X = wine[wine_features]

# Preview the feature matrix.
# Alternative check: X.describe()
X.head()

In [None]:
# Imported here because the notebook's own import of these names sits in a
# later cell; on a top-to-bottom run they were undefined at this point.
from sklearn.impute import SimpleImputer
from sklearn.pipeline import Pipeline

# Impute missing values, then fit a 50-tree random forest.
my_pipeline = Pipeline(steps=[('preprocessor', SimpleImputer()),
                              ('model',
                               RandomForestRegressor(n_estimators=50, random_state=0))])

In [None]:
# 5-fold cross-validation. The scorer returns negated MAE, so the sign is
# flipped to report positive errors, one per fold.
neg_mae_per_fold = cross_val_score(my_pipeline, X, y, cv=5,
                                   scoring='neg_mean_absolute_error')
points_CV = -1 * neg_mae_per_fold
print("Using Cross Validation..\nMean Absolute Error points:\n",
      points_CV)

5 splits of the data to compute the MAE

Pipeline

In [None]:
# Imported here because the notebook's own import of these names sits in a
# later cell; on a top-to-bottom run they were undefined at this point.
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder

pipe = pd.read_csv(wine_file_path)

# Keep only complete rows.
pipe = pipe.dropna(axis=0)
y = pipe.points

# Predictors: every column except the target and the CSV's unnamed row-id
# column. `pipe_data` was referenced but never defined; it is built here, and
# `points` is excluded so the target cannot leak into the features.
pipe_data = pipe.drop(columns=['points', 'Unnamed: 0'])

X_train_full, X_valid_full, y_train, y_valid = train_test_split(pipe_data, y,
                                                                train_size=0.8, test_size=0.2, random_state=0)

# Categorical columns: text columns with fewer than 10,000 distinct values.
categorical_cols = [cname for cname in X_train_full.columns if
                    X_train_full[cname].nunique() < 10000 and
                    X_train_full[cname].dtype == "object"]

# Numerical columns.
numerical_cols = [cname for cname in X_train_full.columns if
                  X_train_full[cname].dtype in ['int64', 'float64']]

# Keep selected columns only.
my_cols = categorical_cols + numerical_cols
X_train = X_train_full[my_cols].copy()
X_valid = X_valid_full[my_cols].copy()

# Preprocessing for numerical data.
numerical_transformer = SimpleImputer(strategy='constant')

# Preprocessing for categorical data: fill gaps with the most frequent value,
# then one-hot encode. Categories unseen during fit are ignored at predict time.
categorical_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('onehot', OneHotEncoder(handle_unknown='ignore'))
])

# Bundle preprocessing for numerical and categorical data.
preprocessor = ColumnTransformer(
    transformers=[
        ('num', numerical_transformer, numerical_cols),
        ('cat', categorical_transformer, categorical_cols)
    ])

# Define model.
model = RandomForestRegressor(n_estimators=100, random_state=0)

# Bundle preprocessing and modeling code in a pipeline.
clf = Pipeline(steps=[('preprocessor', preprocessor),
                      ('model', model)
                     ])

# Preprocess the training data and fit the model.
clf.fit(X_train, y_train)

# Preprocess the validation data and get predictions.
preds = clf.predict(X_valid)

print('MAE Using Pipeline:', mean_absolute_error(y_valid, preds))

**ORDINAL ENCODING**

In [None]:
import pandas as pd
from sklearn.model_selection import train_test_split

# Read the data
data = pd.read_csv('../input/wine-reviews/winemag-data-130k-v2.csv')

# Fill missing values by assigning the filled column back. The previous chained
# `inplace=True` fills are deprecated in pandas and do not update `data` under
# copy-on-write.
data['price'] = data['price'].fillna(data['price'].median())
for text_col in ['country', 'taster_name', 'region_1']:
    data[text_col] = data[text_col].fillna("UNKNOWN")

# Drop the CSV's row-id column and region_2.
data = data.drop(columns=["Unnamed: 0", "region_2"])

# Separate target from predictors
y = data.points
X = data.drop(['points'], axis=1)

# Divide data into training and validation subsets
X_train_full, X_valid_full, y_train, y_valid = train_test_split(X, y, train_size=0.8, test_size=0.2,
                                                                random_state=0)

# Hand-picked categorical columns to encode. The name is kept from the tutorial
# template; the list is fixed here and is not filtered by cardinality.
low_cardinality_cols = ['region_1', 'taster_name', 'winery']

# Select numerical columns
numerical_cols = [cname for cname in X_train_full.columns if X_train_full[cname].dtype in ['int64', 'float64']]

# Keep selected columns only
my_cols = low_cardinality_cols + numerical_cols
X_train = X_train_full[my_cols].copy()
X_valid = X_valid_full[my_cols].copy()

# Remaining missing values per column.
data.isna().sum()


In [None]:
# Display the training features selected above.
X_train

In [None]:
# Names of the training columns whose dtype is object (the categorical ones).
object_cols = [col for col in X_train.columns if X_train[col].dtype == 'object']

print("Categorical variables:")
print(object_cols)

In [None]:
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_absolute_error

# Function for comparing different approaches
def score_dataset(X_train, X_valid, y_train, y_valid):
    """Fit a 100-tree random forest on the training data and return the mean
    absolute error of its predictions on the validation data."""
    forest = RandomForestRegressor(n_estimators=100, random_state=0)
    forest.fit(X_train, y_train)
    return mean_absolute_error(y_valid, forest.predict(X_valid))

Ordinal is giving us the lowest MAE so far

In [None]:
from sklearn.preprocessing import OrdinalEncoder

# Make copies to avoid changing the original data.
label_X_train = X_train.copy()
label_X_valid = X_valid.copy()

# Apply the ordinal encoder to each column with categorical data.
# 'ignore' is not a valid handle_unknown for OrdinalEncoder (that option belongs
# to OneHotEncoder) and is rejected. Categories that appear only in the
# validation split are encoded as -1 instead.
ordinal_encoder = OrdinalEncoder(handle_unknown='use_encoded_value', unknown_value=-1)
label_X_train[object_cols] = ordinal_encoder.fit_transform(X_train[object_cols])
label_X_valid[object_cols] = ordinal_encoder.transform(X_valid[object_cols])

print("MAE from Approach (Ordinal Encoding):")
print(score_dataset(label_X_train, label_X_valid, y_train, y_valid))

More Approaches

The One Hot Encoder is not working yet

In [None]:
"""
from sklearn.preprocessing import OneHotEncoder

# Apply one-hot encoder to each column with categorical data
OH_encoder = OneHotEncoder(handle_unknown='ignore', sparse=False)
OH_cols_train = pd.DataFrame(OH_encoder.fit_transform(X_train[object_cols]))
OH_cols_valid = pd.DataFrame(OH_encoder.transform(X_valid[object_cols]))

# One-hot encoding removed index; put it back
OH_cols_train.index = X_train.index
OH_cols_valid.index = X_valid.index

# Remove categorical columns (will replace with one-hot encoding)
num_X_train = X_train.drop(object_cols, axis=1)
num_X_valid = X_valid.drop(object_cols, axis=1)

# Add one-hot encoded columns to numerical features
OH_X_train = pd.concat([num_X_train, OH_cols_train], axis=1)
OH_X_valid = pd.concat([num_X_valid, OH_cols_valid], axis=1)

print("MAE from Approach 3 (One-Hot Encoding):") 
print(score_dataset(OH_X_train, OH_X_valid, y_train, y_valid))
"""


**PIPELINES**

Training and Valid split

In [None]:
import pandas as pd
from sklearn.model_selection import train_test_split

# Read the data and discard the CSV's row-id column.
# NOTE(review): this cell reads winemag-data_first150k.csv while the other
# cells read winemag-data-130k-v2.csv - confirm that is intended.
data = pd.read_csv('../input/wine-reviews/winemag-data_first150k.csv').drop(columns=["Unnamed: 0"])

# Separate target from predictors.
y = data.points
X = data.drop(['points'], axis=1)

# 80/20 train/validation split.
X_train_full, X_valid_full, y_train, y_valid = train_test_split(
    X, y, train_size=0.8, test_size=0.2, random_state=0)

# Categorical columns to encode (fixed list).
categorical_cols = ['country', 'variety']

# Numerical columns: every int64/float64 column.
numerical_cols = X_train_full.select_dtypes(include=['int64', 'float64']).columns.tolist()

# Keep selected columns only.
my_cols = categorical_cols + numerical_cols
X_train = X_train_full[my_cols].copy()
X_valid = X_valid_full[my_cols].copy()

In [None]:
# First rows of the selected training columns.
X_train.head()

Preprocessing Steps

In [None]:
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder

# Numerical columns: impute missing values with a constant.
numerical_transformer = SimpleImputer(strategy='constant')

# Categorical columns: impute with the most frequent value, then one-hot
# encode, ignoring categories not seen during fit.
categorical_steps = [
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('onehot', OneHotEncoder(handle_unknown='ignore')),
]
categorical_transformer = Pipeline(steps=categorical_steps)

# Route each column group to its transformer.
column_transformers = [
    ('num', numerical_transformer, numerical_cols),
    ('cat', categorical_transformer, categorical_cols),
]
preprocessor = ColumnTransformer(transformers=column_transformers)

Define Model

In [None]:
from sklearn.ensemble import RandomForestRegressor

# 100-tree random forest with a fixed random_state.
model = RandomForestRegressor(n_estimators=100, random_state=0)

Create and Evaluate Pipeline

In [None]:
"""from sklearn.metrics import mean_absolute_error

# Bundle preprocessing and modeling code in a pipeline
my_pipeline = Pipeline(steps=[('preprocessor', preprocessor),
                              ('model', model)
                             ])

# Preprocessing of training data, fit model 
my_pipeline.fit(X_train, y_train)

# Preprocessing of validation data, get predictions
preds = my_pipeline.predict(X_valid)

# Evaluate the model
score = mean_absolute_error(y_valid, preds)
print('MAE:', score)"""

**Train Regression Model, SHAP Values**

CatBoost works the best with a high amount of categorical values

Train/Test Split

In [None]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier

data = pd.read_csv('../input/wine-reviews/winemag-data-130k-v2.csv')

# Fill missing values by assigning the filled column back. The previous chained
# `inplace=True` fills are deprecated in pandas and do not update `data` under
# copy-on-write.
data['price'] = data['price'].fillna(data['price'].median())
for text_col in ['country', 'province']:
    data[text_col] = data[text_col].fillna("UNKNOWN")

# Drop the CSV's row-id column and region_2, then label every remaining gap.
data = data.drop(columns=["Unnamed: 0", "region_2"])
data = data.fillna("Unknown")

# Toss out the target variable 'points' from the input features.
# No nulls remain after the fill above, so the former X.fillna(0) was a no-op
# and has been removed.
X = data.drop(columns=['points'])

# Positions of the text (object-dtype) columns, passed to CatBoost as
# cat_features. `np.object` was removed from NumPy; the builtin `object` is the
# supported spelling.
categorical_features_indices = np.where(X.dtypes == object)[0]

y = data['points']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2,
                                                    random_state=101)


In [None]:
# Column positions in X that are passed to CatBoost as categorical features.
categorical_features_indices

Creating/Training the Model

In [None]:
from catboost import Pool, CatBoostRegressor, cv

# CatBoost regressor trained for 350 iterations with RMSE loss and a fixed seed.
model = CatBoostRegressor(random_seed = 350,loss_function = 'RMSE',iterations=350)
# Fit on the training split; cat_features marks the categorical columns by
# position, and verbose=False silences the per-iteration log.
model.fit(X_train, y_train,cat_features = categorical_features_indices,verbose=False)

Predictions/Evaluations

In [None]:
# Predict points for the held-out test rows.
predictions = model.predict(X_test)

# Scatter of actual (x) against predicted (y) points.
fig = plt.figure(figsize=(8, 8))
plt.scatter(y_test, predictions, color='lightblue')

# Plot title
fig.suptitle('y_test vs predictions', fontsize=20)

# Axis labels (the y label previously read 'predcitions').
plt.xlabel('y_test')
plt.ylabel('predictions')

Loss Functions

In [None]:
from sklearn import metrics

# Compute each error once; RMSE is the square root of the MSE.
test_mae = metrics.mean_absolute_error(y_test, predictions)
test_mse = metrics.mean_squared_error(y_test, predictions)

print('Mean Absolute Error     MAE:', test_mae)
print('Mean Squared Error      MSE:', test_mse)
print('Root Mean Squared Error RMSE:', np.sqrt(test_mse))

Accuracy:

In [None]:
# "Accuracy" here means 100 minus the mean absolute percentage error of the
# predictions relative to the true points.
abs_pct_error = 100 * (abs(predictions - y_test) / y_test)
accuracy = 100 - np.mean(abs_pct_error)

print('Accuracy_CatBoost:', round(accuracy, 2))

**SHAP values**

In [None]:
import shap

# SHAP values for the test rows, computed by CatBoost itself.
test_pool = Pool(X_test, label=y_test, cat_features=categorical_features_indices)
shap_with_base = model.get_feature_importance(test_pool, type="ShapValues")

# Drop the last column so only per-feature contributions remain (the force-plot
# cells read that last column as the expected value).
shap_values = shap_with_base[:, :-1]
shap.summary_plot(shap_values, X_test, plot_type="bar")


Summary Plot

In [None]:
# Default (non-bar) SHAP summary plot of the per-feature contributions for the test rows.
shap.summary_plot(shap_values, X_test)

* Each dot in the visualization represents one prediction. 
* The color is related to the real data point. If the actual value in the dataset was high, the color is pink; blue indicates the actual value being low. 
* Grey represents the categorical values which cannot be scaled in high or low.

*What we Learned*
1. price & winery features contribute most to the prediction result
2. description, title and province contribute the least to predicting the target


**Force Plots:**

In [None]:
# Recompute SHAP values, this time keeping the last column as the expected value.
test_pool = Pool(X_test, label=y_test, cat_features=categorical_features_indices)
shap_with_base = model.get_feature_importance(test_pool, type="ShapValues")
expected_value = shap_with_base[0, -1]
shap_values = shap_with_base[:, :-1]

shap.initjs()

# Force plot for the test row at position 5.
row_position = 5
shap.force_plot(expected_value, shap_values[row_position, :], X_test.iloc[row_position, :])

* Representing the row at position 5 of test data
* Features shown in pink push the model output higher, i.e. towards a higher predicted points score
* Features shown in blue push the model output lower, i.e. towards a lower predicted points score
* In this particular situation, the price of the wine being 15 is the most informative feature of the model(biggest block is price)

Visualizing Row using a force plot

In [None]:
# Same computation as the previous force plot, shown for a different test row.
test_pool = Pool(X_test, label=y_test, cat_features=categorical_features_indices)
shap_with_base = model.get_feature_importance(test_pool, type="ShapValues")
expected_value = shap_with_base[0, -1]
shap_values = shap_with_base[:, :-1]

shap.initjs()

# Force plot for the test row at position 130.
row_position = 130
shap.force_plot(expected_value, shap_values[row_position, :], X_test.iloc[row_position, :])

* taster_name is the most informative feature in this situation

**Feature Engineering**

Feature Utility Metric: measuring association between a feature and the target

Load Dataset

In [None]:
# Matplotlib renamed its bundled seaborn styles to "seaborn-v0_8-*" and later
# removed the old names, so "seaborn-whitegrid" fails on recent versions. Use
# whichever name this installation provides, falling back to the default style.
WHITEGRID_STYLES = ("seaborn-v0_8-whitegrid", "seaborn-whitegrid")
plt.style.use(next((name for name in WHITEGRID_STYLES if name in plt.style.available), "default"))

# Work on a copy of the cleaned CatBoost data and confirm it has no nulls.
df = data.copy()
df.isnull().sum()

In [None]:
# Split the target off a copy of the frame.
X = df.copy()
y = X.pop("points")

# Label-encode every object-dtype column as integer codes.
for colname in X.select_dtypes("object").columns:
    codes, _uniques = X[colname].factorize()
    X[colname] = codes

# Boolean mask of integer-typed (discrete) features - double-check this before using MI!
discrete_features = X.dtypes == int

Computes MI(Mutual Information=measures a relationship between two features) scores for our features and returns a dataframe

Target=points(real-valued)

In [None]:
from sklearn.feature_selection import mutual_info_regression

def make_mi_scores(X, y, discrete_features, random_state=0):
    """Return mutual-information scores of each column of X against y.

    Parameters
    ----------
    X : DataFrame of already-encoded (numeric) features.
    y : real-valued target.
    discrete_features : mask marking which columns of X are discrete.
    random_state : seed passed to the estimator. The estimator is randomised,
        so a fixed default keeps the scores the same from run to run; pass
        None for the previous unseeded behaviour.

    Returns
    -------
    Series named "MI Scores", indexed by column name, sorted highest first.
    """
    mi_scores = mutual_info_regression(
        X, y, discrete_features=discrete_features, random_state=random_state)
    mi_scores = pd.Series(mi_scores, name="MI Scores", index=X.columns)
    return mi_scores.sort_values(ascending=False)

# Score every feature against the points target; the result is sorted highest first.
mi_scores = make_mi_scores(X, y, discrete_features)
mi_scores# display the features with their MI scores


Bar plot to visualize the comparisons

In [None]:
def plot_mi_scores(scores):
    """Draw `scores` as a horizontal bar chart on the current figure, one bar
    per feature, sorted so the largest score is the topmost bar."""
    ordered = scores.sort_values(ascending=True)
    positions = np.arange(len(ordered))
    plt.barh(positions, ordered)
    plt.yticks(positions, list(ordered.index))
    plt.title("Mutual Information Scores")


# Create the figure, then draw the MI bar chart on it.
plt.figure(dpi=100, figsize=(8, 5))
plot_mi_scores(mi_scores)