In [1]:
import matplotlib.pyplot as plt
import pandas as pd
from path import Path
import numpy as np

In [2]:
from sklearn import preprocessing
from sklearn.preprocessing import LabelEncoder,OneHotEncoder 
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, recall_score, precision_score, f1_score
from sklearn.metrics import confusion_matrix
from imblearn.ensemble import BalancedRandomForestClassifier
from sklearn.metrics import balanced_accuracy_score
from sklearn.metrics import classification_report

from imblearn.metrics import classification_report_imbalanced

In [3]:
# Load data
# The CSV path is relative, so it is resolved against the current working directory.
file_path=Path("chocolate_version2.csv")
chocolate_df = pd.read_csv(file_path)

# Preview the first rows to check that the file was parsed as expected.
chocolate_df.head()

Unnamed: 0,company,company_location,review_date,country_of_bean_origin,cocoa_percent,rating,counts_of_ingredients,cocoa_butter,vanilla,lecithin,salt,sugar,sweetener_without_sugar
0,5150,U.S.A,2019,Madagascar,76.0,4.0,3,have_cocoa_butter,have_not_vanila,have_not_lecithin,have_not_salt,have_sugar,have_not_sweetener_without_sugar
1,5150,U.S.A,2019,Dominican republic,76.0,4.0,3,have_cocoa_butter,have_not_vanila,have_not_lecithin,have_not_salt,have_sugar,have_not_sweetener_without_sugar
2,5150,U.S.A,2019,Tanzania,76.0,3.0,3,have_cocoa_butter,have_not_vanila,have_not_lecithin,have_not_salt,have_sugar,have_not_sweetener_without_sugar
3,A. Morin,France,2012,Peru,63.0,4.0,4,have_cocoa_butter,have_not_vanila,have_lecithin,have_not_salt,have_sugar,have_not_sweetener_without_sugar
4,A. Morin,France,2012,Bolivia,70.0,4.0,4,have_cocoa_butter,have_not_vanila,have_lecithin,have_not_salt,have_sugar,have_not_sweetener_without_sugar


In [4]:
# Keep only the rows whose rating is above 1.
# The explicit .copy() makes the filtered result an independent DataFrame, so the
# column assignments performed on it in later cells do not raise
# SettingWithCopyWarning.
chocolate_df = chocolate_df[chocolate_df.rating > 1].copy()

In [5]:
# Function to count the number of null values in a column
def count_null_values(dataset, column_list):
    """Print the number of null values found in each of the given columns.

    Args:
        dataset: pandas DataFrame to inspect.
        column_list: Iterable of column labels that exist in ``dataset``.

    Returns:
        None. A header line and the null count are printed for every column.
    """
    # Iterate over the labels directly instead of indexing by position, so any
    # iterable of labels (list, pandas Index, generator, ...) is accepted.
    for column in column_list:
        print ("The total number of null values in :",column)
        print (dataset[column].isnull().sum())

# Function to display the unique values in a column
def print_uniques(dataset, column_list):
    """Print the distinct values found in each of the given columns.

    Args:
        dataset: pandas DataFrame to inspect.
        column_list: Iterable of column labels that exist in ``dataset``.

    Returns:
        None. For every column a header line, the array of unique values and a
        blank separator are printed.
    """
    # Iterate over the labels directly instead of indexing by position, so any
    # iterable of labels (list, pandas Index, generator, ...) is accepted.
    for column in column_list:
        print ("Unique values for the column:",column)
        print (dataset[column].unique())
        print ('\n')

# Print, for every column of the dataset, first its unique values and then its
# number of null values.
print_uniques(chocolate_df, chocolate_df.columns)
count_null_values(chocolate_df, chocolate_df.columns)

Unique values for the column: company
['5150' 'A. Morin' 'Acalli' 'Adi aka Fijiana (Easy In Ltd)' 'Aelan'
 'Aequare (Gianduja)' 'Ah Cacao' "Akesson's (Pralus)" 'Alain Ducasse'
 'Alexandre' 'Altus aka Cao Artisan' 'Amano' 'Amatller (Simon Coll)'
 'Amazona' 'Ambrosia' 'Amedei' 'AMMA' 'Anahata' 'Animas' 'Ara' 'Arete'
 'Argencove' 'Artisan du Chocolat' 'Artisan du Chocolat (Casa Luker)'
 'Askinosie' 'Atypic' 'Auro' 'Bahen & Co.' 'Baiani' 'Bakau' 'Bankston'
 'Bar Au Chocolat' "Baravelli's" 'Batch' 'Bean' 'Beau Cacao' 'Beehive'
 'Belcolade' 'Bellflower' 'Belvie' 'Belyzium' 'Benns' 'Benoit Nihant'
 'Bernachon' 'Beschle (Felchlin)' 'Bisou' 'Bitacora' 'Bittersweet Origins'
 'Bixby' 'Black Mountain' 'Black River (A. Morin)' 'Blanxart'
 'Blue Bandana' 'Boho' 'Bonaterra' 'Bonnat' 'Bouga Cacao (Tulicorp)'
 'Box Chocolate' 'Brasstown' "Brasstown aka It's Chocolate" 'Brazen'
 'Breeze Mill' 'Bright' 'Britarev' 'Bronx Grrl Chocolate' 'Bullion'
 'Burnt Fork Bend' 'By Cacao' 'Cacai Cacao' 'Cacao 70' 'Cac

In [6]:
# Inspect the dtype of every column before any type conversion is applied.
chocolate_df.dtypes

company                     object
company_location            object
review_date                  int64
country_of_bean_origin      object
cocoa_percent              float64
rating                     float64
counts_of_ingredients        int64
cocoa_butter                object
vanilla                     object
lecithin                    object
salt                        object
sugar                       object
sweetener_without_sugar     object
dtype: object

In [7]:
# Cast review_date and rating from numeric to string: review_date is one-hot
# encoded as a categorical column later on, and rating is used as the class label.
chocolate_df['review_date'] = chocolate_df['review_date'].astype(str)
chocolate_df['rating'] = chocolate_df['rating'].astype(str)

# DataFrame.info() prints its report itself and returns None, so it is called
# directly; wrapping it in print() only appended a stray "None" line.
chocolate_df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 2223 entries, 0 to 2223
Data columns (total 13 columns):
 #   Column                   Non-Null Count  Dtype  
---  ------                   --------------  -----  
 0   company                  2223 non-null   object 
 1   company_location         2223 non-null   object 
 2   review_date              2223 non-null   object 
 3   country_of_bean_origin   2223 non-null   object 
 4   cocoa_percent            2223 non-null   float64
 5   rating                   2223 non-null   object 
 6   counts_of_ingredients    2223 non-null   int64  
 7   cocoa_butter             2223 non-null   object 
 8   vanilla                  2223 non-null   object 
 9   lecithin                 2223 non-null   object 
 10  salt                     2223 non-null   object 
 11  sugar                    2223 non-null   object 
 12  sweetener_without_sugar  2223 non-null   object 
dtypes: float64(1), int64(1), object(11)
memory usage: 243.1+ KB
None


In [8]:
# Standardizing the numeric (integer and float) columns

# Data Normalizing

sc = StandardScaler()

# Every column whose dtype is not object is scaled; the object columns
# (categoricals and the rating label) are left unchanged.
# NOTE(review): the scaler is fitted on the whole dataset before the train/test
# split, so statistics of the future test rows leak into the features. Consider
# fitting it on the training split only.
norm = chocolate_df.select_dtypes(exclude=['object'])
chocolate_df[norm.columns] = sc.fit(norm).transform(norm)

# Getting information of the dataset after normalization
print (chocolate_df.head(10))
# The column means should be (numerically) zero after standardization.
print (chocolate_df[norm.columns].mean(axis= 0))
# DataFrame.info() prints its report itself and returns None, so it is called
# directly; wrapping it in print() only appended a stray "None" line.
chocolate_df.info()

    company company_location review_date country_of_bean_origin  \
0      5150            U.S.A        2019             Madagascar   
1      5150            U.S.A        2019     Dominican republic   
2      5150            U.S.A        2019               Tanzania   
3  A. Morin           France        2012                   Peru   
4  A. Morin           France        2012                Bolivia   
5  A. Morin           France        2013              Venezuela   
6  A. Morin           France        2013                   Peru   
7  A. Morin           France        2013                Ecuador   
8  A. Morin           France        2013                   Peru   
9  A. Morin           France        2013                 Brazil   

   cocoa_percent rating  counts_of_ingredients       cocoa_butter  \
0       0.853849    4.0              -0.080867  have_cocoa_butter   
1       0.853849    4.0              -0.080867  have_cocoa_butter   
2       0.853849    3.0              -0.080867  have_co

In [9]:
# Columns that hold categorical values and therefore get one-hot encoded.
cat_columns = [
    'company',
    'company_location',
    'review_date',
    'country_of_bean_origin',
    'cocoa_butter',
    'vanilla',
    'lecithin',
    'salt',
    'sugar',
    'sweetener_without_sugar',
]

# Target: the rating label. Features: every other column, with the categorical
# ones expanded into indicator columns.
y = chocolate_df['rating']
X = pd.get_dummies(chocolate_df.drop(columns='rating'), columns=cat_columns)
X.head(10)


Unnamed: 0,cocoa_percent,counts_of_ingredients,company_5150,company_A. Morin,company_AMMA,company_Acalli,company_Adi aka Fijiana (Easy In Ltd),company_Aelan,company_Aequare (Gianduja),company_Ah Cacao,...,vanilla_have_not_vanila,vanilla_have_vanila,lecithin_have_lecithin,lecithin_have_not_lecithin,salt_have_not_salt,salt_have_salt,sugar_have_not_sugar,sugar_have_sugar,sweetener_without_sugar_have_not_sweetener_without_sugar,sweetener_without_sugar_have_sweetener_without_sugar
0,0.853849,-0.080867,1,0,0,0,0,0,0,0,...,1,0,0,1,1,0,0,1,1,0
1,0.853849,-0.080867,1,0,0,0,0,0,0,0,...,1,0,0,1,1,0,0,1,1,0
2,0.853849,-0.080867,1,0,0,0,0,0,0,0,...,1,0,0,1,1,0,0,1,1,0
3,-1.609133,0.995583,0,1,0,0,0,0,0,0,...,1,0,1,0,1,0,0,1,1,0
4,-0.282912,0.995583,0,1,0,0,0,0,0,0,...,1,0,1,0,1,0,0,1,1,0
5,-0.282912,0.995583,0,1,0,0,0,0,0,0,...,1,0,1,0,1,0,0,1,1,0
6,-1.609133,-0.080867,0,1,0,0,0,0,0,0,...,1,0,0,1,1,0,0,1,1,0
7,-0.282912,0.995583,0,1,0,0,0,0,0,0,...,1,0,1,0,1,0,0,1,1,0
8,-0.282912,0.995583,0,1,0,0,0,0,0,0,...,1,0,1,0,1,0,0,1,1,0
9,-0.282912,0.995583,0,1,0,0,0,0,0,0,...,1,0,1,0,1,0,0,1,1,0


In [10]:
# Number of samples per rating class; shows how imbalanced the target is.
y.value_counts()

3.0    1170
4.0     855
2.0     198
Name: rating, dtype: int64

In [11]:
# Split into train and test sets, stratifying on the rating label so that both
# sets keep the same class proportions.
# NOTE(review): these four variables are reassigned by the second
# train_test_split call in the Random Forest section below.
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=1, stratify=y)

# Random Forest Model

In [None]:
from sklearn.ensemble import RandomForestClassifier

In [None]:
# Train/test split used by all models below (it replaces the earlier split).
# stratify=y keeps the proportions of the imbalanced rating classes identical in
# the train and test sets; without it the rare class can end up unevenly
# distributed between the two.
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=78, stratify=y)

In [None]:
# Creating random forest classifier with 150 trees; random_state is fixed so
# that repeated runs build the same forest.
rf_model = RandomForestClassifier(n_estimators=150, random_state=78) 

In [None]:
# Fitting the model on the training split (fit returns the estimator itself,
# which is bound to rf_model again).
rf_model = rf_model.fit(X_train, y_train)

In [None]:
# Making predictions using testing data; the predicted labels are displayed as
# the cell output.
predictions = rf_model.predict(X_test)
predictions

In [None]:
# Test-set confusion matrix for the random forest predictions.
cm = confusion_matrix(y_test, predictions)

# Wrap the matrix in a labelled DataFrame: rows carry the actual rating class,
# columns the predicted one.
rating_classes = ("2", "3", "4")
cm_df = pd.DataFrame(
    cm,
    index=[f"Actual {rating}" for rating in rating_classes],
    columns=[f"Predicted {rating}" for rating in rating_classes],
)

cm_df

In [None]:
# Calculating the accuracy score of the random forest predictions on the test set.
acc_score = accuracy_score(y_test, predictions)

In [None]:
# Displaying results for the random forest: the labelled confusion matrix, the
# accuracy score and the per-class classification report.
print("Confusion Matrix")
display(cm_df)
print(f"Accuracy Score : {acc_score}")
print("Classification Report")
print(classification_report(y_test, predictions))

#  Balanced Random Forest

In [None]:
# Instantiate the balanced random forest with 100 estimators and a fixed seed.
brfc = BalancedRandomForestClassifier(n_estimators=100, random_state=1)

# Fit on the training split
brfc.fit(X_train, y_train)

In [None]:
# Predict on the test split with the balanced random forest and calculate the
# balanced accuracy score.
y_pred = brfc.predict(X_test)
balanced_accuracy_score(y_test, y_pred)

In [None]:
# Test-set confusion matrix for the balanced random forest predictions.
cm = confusion_matrix(y_test, y_pred)

# Labelled view of the matrix: actual rating classes as rows, predicted rating
# classes as columns.
rating_classes = ("2", "3", "4")
actual_labels = [f"Actual {rating}" for rating in rating_classes]
predicted_labels = [f"Predicted {rating}" for rating in rating_classes]
cm_df = pd.DataFrame(cm, index=actual_labels, columns=predicted_labels)
cm_df

In [None]:
# Print the imbalanced classification report for the balanced random forest
# predictions on the test set.
print(classification_report_imbalanced(y_test, y_pred))

In [None]:
# Pair every feature importance of the balanced random forest with its column
# name and order the pairs from most to least important.
features_rank = sorted(zip(brfc.feature_importances_, X.columns), reverse=True)
for importance, feature_name in features_rank:
    print(f"{feature_name}: ({importance})")

# Easy Ensemble AdaBoost Classifier

In [None]:
# Train the EasyEnsembleClassifier
from imblearn.ensemble import EasyEnsembleClassifier
# Instantiate with 100 estimators and a fixed seed
eec = EasyEnsembleClassifier(n_estimators=100, random_state=1)

# Fit on the training split
eec.fit(X_train, y_train)

In [None]:
# Predict on the test split with the easy ensemble classifier and calculate the
# balanced accuracy score.
y_pred = eec.predict(X_test)
balanced_accuracy_score(y_test, y_pred)

In [None]:
# Test-set confusion matrix for the easy ensemble classifier predictions.
cm = confusion_matrix(y_test, y_pred)

# Create a labelled DataFrame from the confusion matrix: rows are the actual
# rating classes, columns the predicted ones. The middle column label used to be
# misspelled "Prediced 3".
cm_df = pd.DataFrame(
    cm, index=["Actual 2", "Actual 3", "Actual 4"], columns=["Predicted 2", "Predicted 3", "Predicted 4"])
cm_df

In [None]:
# Print the imbalanced classification report for the easy ensemble classifier
# predictions on the test set.
print(classification_report_imbalanced(y_test, y_pred))

# KNN Model

In [None]:
from sklearn.neighbors import KNeighborsClassifier


In [None]:
# K-nearest-neighbours classifier that votes among the 8 closest training
# samples, fitted on the training split.
knn = KNeighborsClassifier(n_neighbors=8)
knn.fit(X_train,y_train)

In [None]:
# Predict the rating class of every test sample with the fitted KNN model.
pred = knn.predict(X_test)

In [None]:
# Raw (unlabelled) test-set confusion matrix of the KNN predictions.
print(confusion_matrix(y_test,pred))

In [None]:
# Test-set confusion matrix for the KNN predictions.
cm = confusion_matrix(y_test, pred)

# Labelled DataFrame view: actual rating classes index the rows, predicted
# rating classes name the columns.
rating_classes = ("2", "3", "4")
cm_df = pd.DataFrame(
    cm,
    index=[f"Actual {rating}" for rating in rating_classes],
    columns=[f"Predicted {rating}" for rating in rating_classes],
)

cm_df

In [None]:
# Calculating the accuracy score of the KNN predictions on the test set.
acc_score = accuracy_score(y_test, pred)

In [None]:
# Displaying results for the KNN model: the labelled confusion matrix, the
# accuracy score and the per-class classification report.
print("Confusion Matrix")
display(cm_df)
print(f"Accuracy Score : {acc_score}")
print("Classification Report")
print(classification_report(y_test, pred))

# KNN with SMOTE

In [None]:
from imblearn.over_sampling import SMOTE

# Resample the training split only with SMOTE; the test split is left untouched.
smote = SMOTE(random_state=1, sampling_strategy='auto')
X_resampled, y_resampled = smote.fit_resample(X_train, y_train)


In [None]:
# Number of samples per rating class after SMOTE resampling.
y_resampled.value_counts()

In [None]:
# Re-create the 8-neighbour KNN classifier and fit it on the SMOTE-resampled
# training data (this rebinds the knn name used in the previous section).
knn = KNeighborsClassifier(n_neighbors=8)
knn.fit(X_resampled,y_resampled)

In [None]:
# Predict on the SMOTE-resampled training data itself (not on the test split),
# i.e. these are training-set predictions of the model fitted above.
predictions = knn.predict(X_resampled)
predictions

In [None]:
# Predict on the test split with the SMOTE-trained KNN model and show the raw
# (unlabelled) test-set confusion matrix.
y_pred = knn.predict(X_test)
confusion_matrix(y_test, y_pred)

In [None]:
# Balanced accuracy of the SMOTE-trained KNN model on the test split.
balanced_accuracy_score(y_test, y_pred)

In [None]:
# Print the imbalanced classification report for the SMOTE-trained KNN model.
# NOTE(review): classification_report_imbalanced is already imported in the
# notebook's import cell, so this re-import is redundant (but harmless).
from imblearn.metrics import classification_report_imbalanced
print(classification_report_imbalanced(y_test, y_pred))

In [None]:
# Calculating the test-set confusion matrix of the SMOTE-trained KNN model.
# It is built from y_test / y_pred (the held-out data), like the confusion
# matrices of the other models; scoring the model on the resampled training data
# it was fitted on would overstate its performance.
cm = confusion_matrix(y_test, y_pred)

# Creating a labelled DataFrame from the confusion matrix: rows are the actual
# rating classes, columns the predicted ones.
cm_df = pd.DataFrame(
    cm, index=[ "Actual 2", "Actual 3", "Actual 4"], columns=["Predicted 2","Predicted 3", "Predicted 4" ])

cm_df

# SVM

In [1]:
from sklearn import svm

In [None]:
# Support vector classifier with a linear kernel and regularization parameter C = 1.
classifier = svm.SVC(kernel = "linear", C = 1)

In [None]:
# Fit the SVM on the training split.
classifier.fit(X_train, y_train)

In [None]:
# Predict the rating class of every test sample with the fitted SVM.
y_pred = classifier.predict(X_test)

In [None]:
# Raw (unlabelled) test-set confusion matrix of the SVM predictions.
confusion_matrix(y_test, y_pred)

In [None]:
# Test-set confusion matrix for the SVM predictions.
cm = confusion_matrix(y_test, y_pred)

# Labelled DataFrame view: one row per actual rating class, one column per
# predicted rating class.
rating_classes = ("2", "3", "4")
actual_labels = [f"Actual {rating}" for rating in rating_classes]
predicted_labels = [f"Predicted {rating}" for rating in rating_classes]
cm_df = pd.DataFrame(cm, index=actual_labels, columns=predicted_labels)

cm_df

In [None]:
# Calculating and printing the accuracy score of the SVM predictions on the test set.
acc_score = accuracy_score(y_test, y_pred)
print(acc_score)

In [None]:
# Print the standard scikit-learn classification report for the SVM predictions
# (this call uses classification_report, not classification_report_imbalanced).
print(classification_report(y_test, y_pred))