In [9]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn import metrics
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import train_test_split
from sklearn.model_selection import validation_curve
from sklearn.model_selection import learning_curve
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score
from sklearn import tree
from sklearn.preprocessing import MinMaxScaler

##data loaded
data=pd.read_csv('chocolate_bars.csv',index_col=0)
data.head(-5)
##filing the missing datas
#filling the missing tables with the most common ingredients
data['ingredients'].fillna(data['ingredients'].mode()[0], inplace=True)
data['num_ingredients'].fillna(data['num_ingredients'].mode()[0], inplace=True)
#print(data)

# Importing LabelEncoder
from sklearn.preprocessing import LabelEncoder
# Instantiating LabelEncoder
le=LabelEncoder()
# Iterating over all the values of each column and extract their dtypes
for col in data.columns.to_numpy():
    # Comparing if the dtype is object
    if data[col].dtypes in ('object','category'):
    # Using LabelEncoder to do the numeric transformation
        data[col]=le.fit_transform(data[col].astype(str))
        
#Binning the rating column
cut_labels = ['really bad', 'bad', 'ok', 'good']
cut_bins = [0, 0.99,1.99,2.99,4.0]
data['rating'] = pd.cut(data['rating'], bins=cut_bins, labels=cut_labels)


#creating the testing and training variables
x=data.drop("rating",axis=1)
y=data["rating"]
x_train,x_test,y_train,y_test=train_test_split(x,y,test_size=0.3,random_state=42,stratify=y)
x_train.shape, x_test.shape

# Normalize feature data
scaler = MinMaxScaler()

x_train_scaled = scaler.fit_transform(x_train)
x_test_scaled = scaler.transform(x_test)

# Saving the feature names 
features = x.columns.tolist()

import time
startTime = time.time()
##Decision tree
dt=DecisionTreeClassifier(max_depth=5,random_state=42,criterion="gini",splitter="best",min_samples_split=10,max_leaf_nodes=15)
dt.fit(x_train,y_train)


# Extracting the importances by sklearn 
importances_sk = dt.feature_importances_
feature_importance_sk = {}
for i, feature in enumerate(features):
    feature_importance_sk[feature] = round(importances_sk[i], 3)
    
print(f"Feature importance by sklearn: {feature_importance_sk}")



#Evaluate the accuracy of the model
y_pred = dt.predict(x_test)
predictions = metrics.accuracy_score(y_test, y_pred)
#Calculating the accuracy in percentage
print('The accuracy is: ', predictions * 100, '%')
executionTime = (time.time() - startTime)
print('Execution time in seconds: ' + str(executionTime))

Feature importance by sklearn: {'manufacturer': 0.0, 'company_location': 0.139, 'year_reviewed': 0.086, 'bean_origin': 0.0, 'bar_name': 0.068, 'cocoa_percent': 0.164, 'num_ingredients': 0.0, 'ingredients': 0.152, 'review': 0.39}
The accuracy is:  77.86561264822134 %
Execution time in seconds: 0.046859025955200195


In [11]:
#creating reduced vectors
x=data.drop(columns=['rating', 'manufacturer','bean_origin','num_ingredients'])
y=data["rating"]
print(x)
x_train,x_test,y_train,y_test=train_test_split(x,y,test_size=0.3,random_state=42,stratify=y)
x_train.shape, x_test.shape

      company_location  year_reviewed  bar_name  cocoa_percent  ingredients  \
id                                                                            
2454                62           2019       750           76.0           10   
2458                62           2019      1597           76.0           10   
2454                62           2019       162           76.0           10   
2542                62           2021       935           68.0           10   
2546                62           2021      1418           72.0           10   
...                ...            ...       ...            ...          ...   
1205                 3           2014      1249           80.0            6   
1996                 3           2017        11           75.0           10   
2036                 3           2018       487           75.0           10   
2170                 3           2018      1008           70.0           10   
2170                 3           2018       953     

((1771, 6), (759, 6))

In [12]:
import time
startTime = time.time()
##Decision tree
dt=DecisionTreeClassifier(max_depth=5,random_state=42,criterion="gini",splitter="best",min_samples_split=10,max_leaf_nodes=15)
dt.fit(x_train,y_train)

#Evaluate the accuracy of the model
y_pred = dt.predict(x_test)
predictions = metrics.accuracy_score(y_test, y_pred)
#Calculating the accuracy in percentage
print('The accuracy is: ', predictions * 100, '%')
executionTime = (time.time() - startTime)
print('Execution time in seconds: ' + str(executionTime))

The accuracy is:  77.86561264822134 %
Execution time in seconds: 0.02352142333984375
