In [73]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.linear_model import LogisticRegression, LinearRegression, RidgeCV
from sklearn.ensemble import GradientBoostingRegressor, RandomForestRegressor
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import confusion_matrix, accuracy_score, roc_auc_score, precision_score, mean_absolute_error

In [74]:
df = pd.read_csv('data/chocolate_bars.csv')

bc = pd.read_csv('data/bean_continent.csv')
df = pd.merge(df, bc, on=["bean_origin", "bean_origin"])

cc = pd.read_csv('data/company_continent.csv')
df = pd.merge(df, cc, on=["company_location", "company_location"])

df = df.loc[~df['ingredients'].isnull()]

df.year_reviewed = df.year_reviewed/max(df.year_reviewed)

df.reset_index(inplace=True)
df.set_index('index', inplace=True)

df.columns.to_list()

['id',
 'manufacturer',
 'company_location',
 'year_reviewed',
 'bean_origin',
 'bar_name',
 'cocoa_percent',
 'num_ingredients',
 'ingredients',
 'review',
 'rating',
 'bean_origin_continent',
 'company_location_continent']

In [75]:
# keep = ['id', 'manufacturer', 'company_location', 'year_reviewed', 'bean_origin', 'bar_name',
# 'cocoa_percent', 'num_ingredients', 'ingredients',  'review',  'rating',  'bean_origin_continent',
# 'company_location_continent']

keep = ['manufacturer', 'year_reviewed', 'bar_name',
'cocoa_percent', 'num_ingredients', 'ingredients',  'review',  'rating',  'bean_origin_continent',
'company_location_continent']

# keep = ['cocoa_percent', 'ingredients', 'rating']

df = df[keep]

ingredients = {
    'B': 'Beans',
    'S': 'Sugar',
    'S*': 'Sweetner',
    'C': 'Cocoa_Butter',
    'V': 'Vanilla',
    'L': 'Lecithin',
    'Sa': 'Salt'
}
for i in ingredients: 
    df[ingredients[i]] = 0

for idx, row in df.iterrows():
    for r in row['ingredients'].split(','):
        df.loc[idx, ingredients[r]] = 1
        
df['cocoa_percent'] = df['cocoa_percent']/100

df.drop('ingredients', axis=1, inplace=True)

df = pd.get_dummies(df)

df.head()

Unnamed: 0_level_0,year_reviewed,cocoa_percent,num_ingredients,rating,Beans,Sugar,Sweetner,Cocoa_Butter,Vanilla,Lecithin,...,bean_origin_continent_South Pacific,bean_origin_continent_Southeast Asia,company_location_continent_Africa,company_location_continent_Central America,company_location_continent_Central Asia,company_location_continent_Europe,company_location_continent_North America,company_location_continent_South America,company_location_continent_South Pacific,company_location_continent_Southeast Asia
index,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
0,0.99901,0.76,3.0,3.25,1,1,0,1,0,0,...,0,0,0,0,0,0,1,0,0,0
1,0.997526,0.7,2.0,3.75,1,1,0,0,0,0,...,0,0,0,0,0,0,1,0,0,0
2,0.995052,0.72,2.0,3.75,1,1,0,0,0,0,...,0,0,0,0,0,0,1,0,0,0
3,0.997526,0.68,3.0,3.75,1,1,0,1,0,0,...,0,0,0,0,0,0,1,0,0,0
4,0.99901,0.72,3.0,3.25,1,1,0,1,0,0,...,0,0,0,0,0,0,1,0,0,0


In [76]:
features = df.columns.tolist()
features.remove('rating')

In [77]:
# Split train data into two parts
test, train = train_test_split(df, test_size=.2, random_state=123)

# Train a Gradient Boosting model on Part 1
gb = GradientBoostingRegressor().fit(train[features], train.rating)

# Train a Random Forest model on Part 1
rf = RandomForestRegressor().fit(train[features], train.rating)

# Make predictions on the test data
test['gb_pred'] = gb.predict(test[features])
test['rf_pred'] = rf.predict(test[features])
test['stacking'] = (test['gb_pred'] + test['rf_pred']) / 2

In [78]:
test['stacking'] = round(test['stacking']*4)/4
test[['rating','stacking']].head(10)

Unnamed: 0_level_0,rating,stacking
index,Unnamed: 1_level_1,Unnamed: 2_level_1
424,3.25,3.5
1064,2.75,3.25
2488,2.75,3.25
2186,3.25,3.5
817,3.5,3.0
1568,4.0,3.5
391,3.25,3.25
1028,2.5,3.0
442,3.0,3.25
200,2.5,3.25


In [79]:
rmse = mean_absolute_error(test['rating'], test['stacking'])
print('Validation RMSE for Baseline I model: {:.3f}'.format(rmse))

Validation RMSE for Baseline I model: 0.324
