In [1]:
#Loading required libraries
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn import model_selection
from sklearn.model_selection import cross_val_score
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score
from sklearn.model_selection import RandomizedSearchCV
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier

# Read the CSV file "chocolate.csv" into a dataframe.
# Malformed rows are skipped with a warning instead of aborting the load.
# `error_bad_lines` / `warn_bad_lines` were deprecated in pandas 1.3 and removed
# in pandas 2.0; `on_bad_lines="warn"` is the equivalent replacement. Older
# pandas rejects the new keyword with a TypeError, so fall back to the legacy
# keyword pair in that case.
try:
    df = pd.read_csv("chocolate.csv", on_bad_lines="warn")
except TypeError:
    df = pd.read_csv("chocolate.csv", error_bad_lines=False, warn_bad_lines=True)

In [2]:
# Summary statistics (count, mean, std, quartiles) for the numeric columns.
df.describe()

Unnamed: 0,REF,Review Date,Rating
count,1500.0,1500.0,1500.0
mean,1036.881333,2012.326,3.183167
std,554.693828,2.933734,0.484319
min,5.0,2006.0,1.0
25%,572.0,2010.0,2.75
50%,1071.0,2013.0,3.25
75%,1503.0,2015.0,3.5
max,1952.0,2017.0,5.0


In [3]:
# Preview the first five rows of the raw data.
df.head()

Unnamed: 0,Company (Maker-if known),Specific Bean Origin or Bar Name,REF,Review Date,Cocoa Percent,Company Location,Rating,Bean Type,Broad Bean Origin
0,Valrhona,Manjari,129,2007,64%,France,4.0,"Criollo, Trinitario",Madagascar
1,Original Beans (Felchlin),"Grand Cru Blend No.1, 5 yr. Anniversary Ed",1442,2014,80%,Switzerland,3.25,Blend,
2,Potomac,Upala w/ nibs,647,2011,70%,U.S.A.,3.5,Matina,Costa Rica
3,Middlebury,"Matagalpa, Cacao Bisiesto",1538,2015,70%,U.S.A.,3.5,,Nicaragua
4,Carlotta Chocolat,Cesar,1888,2016,65%,Colombia,3.5,CCN51,Colombia


In [4]:
# Short, identifier-friendly replacements for the original headers.
# The first column is addressed by position; the others by their original
# labels, which contain embedded newlines.
column_aliases = {
    df.columns[0]: 'Company',
    'Specific Bean Origin\nor Bar Name': 'SBO_BN',
    'Review\nDate': 'Review_Date',
    'Cocoa\nPercent': 'Cocoa_Percent',
    'Company\nLocation': 'Company_Location',
    'Broad Bean\nOrigin': 'BBO',
    'Bean\nType': 'Bean_Type',
}
df.rename(columns=column_aliases, inplace=True)


In [5]:
# Check the column labels after renaming.
df.columns

Index([u'Company', u'SBO_BN', u'REF', u'Review_Date', u'Cocoa_Percent',
       u'Company_Location', u'Rating', u'Bean_Type', u'BBO'],
      dtype='object')

df.head()

In [10]:
# For each categorical column, print its ten most frequent values and their counts.
categorical_columns = ['Company', 'SBO_BN', 'Review_Date', 'Company_Location', 'Bean_Type', 'BBO']
for name in df[categorical_columns]:
    top_values = df[name].value_counts().head(10)
    print(top_values)
    print("====================================")


Soma             39
Bonnat           23
Fresco           21
Pralus           20
A. Morin         19
Domori           19
Guittard         18
Arete            18
Coppeneur        17
Mast Brothers    17
Name: Company, dtype: int64
Madagascar            42
Peru                  37
Ecuador               37
Dominican Republic    33
Venezuela             18
Sambirano             17
Chuao                 16
Ocumare               15
Papua New Guinea      14
Ghana                 12
Name: SBO_BN, dtype: int64
2015    238
2014    212
2016    180
2012    156
2013    151
2011    137
2009    108
2010     95
2008     76
2007     64
Name: Review_Date, dtype: int64
U.S.A.         632
France         124
Canada         108
U.K.            79
Italy           59
Ecuador         47
Australia       46
Belgium         33
Switzerland     29
Germany         27
Name: Company_Location, dtype: int64
                        750
Trinitario              342
Criollo                 128
Forastero                67
Fora

In [9]:
# Summary of every column: unique/top/freq for object columns, numeric stats for the rest.
df.describe(include='all')

Unnamed: 0,Company,SBO_BN,REF,Review_Date,Cocoa_Percent,Company_Location,Rating,Bean_Type,BBO
count,1500,1500,1500.0,1500.0,1500,1500,1500.0,1499.0,1499
unique,391,893,,,45,57,,36.0,93
top,Soma,Madagascar,,,70%,U.S.A.,,,Venezuela
freq,39,42,,,568,632,,750.0,174
mean,,,1036.881333,2012.326,,,3.183167,,
std,,,554.693828,2.933734,,,0.484319,,
min,,,5.0,2006.0,,,1.0,,
25%,,,572.0,2010.0,,,2.75,,
50%,,,1071.0,2013.0,,,3.25,,
75%,,,1503.0,2015.0,,,3.5,,


<h3>Data Cleaning</h3>

1. Removing the percent sign from Cocoa_Percent and converting the values to numbers

In [11]:
def remove_percentage(l):
    """Convert a cocoa-percentage entry such as ``'64%'`` to ``np.float32``.

    Parameters
    ----------
    l : str or number
        Raw cell value. Strings are cut at the first ``'%'`` and the part
        before it is parsed. Non-string values (e.g. numbers that were
        already converted, or NaN) are passed straight to ``np.float32``.

    Returns
    -------
    numpy.float32
        The numeric percentage, e.g. ``64.0`` for ``'64%'``.

    Raises
    ------
    ValueError
        If the text before ``'%'`` cannot be parsed as a number.
    """
    # Non-strings have no .split(); accepting them keeps the conversion cell
    # re-runnable once the column already holds floats.
    if not hasattr(l, 'split'):
        return np.float32(l)
    return np.float32(l.split('%')[0])

# Run every Cocoa_Percent entry through remove_percentage and store the result
# back in the same column.
df['Cocoa_Percent'] = df['Cocoa_Percent'].apply(remove_percentage)

# Quick look at the first three converted values.
df['Cocoa_Percent'].head(3)

0    64.0
1    80.0
2    70.0
Name: Cocoa_Percent, dtype: float64

In [12]:
# Column dtypes and non-null counts.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1500 entries, 0 to 1499
Data columns (total 9 columns):
Company             1500 non-null object
SBO_BN              1500 non-null object
REF                 1500 non-null int64
Review_Date         1500 non-null int64
Cocoa_Percent       1500 non-null float64
Company_Location    1500 non-null object
Rating              1500 non-null float64
Bean_Type           1499 non-null object
BBO                 1499 non-null object
dtypes: float64(2), int64(2), object(5)
memory usage: 105.5+ KB


In [33]:
# Show the rows whose Bean_Type is null.
df[df['Bean_Type'].isnull()]

Unnamed: 0,Company,SBO_BN,REF,Review_Date,Cocoa_Percent,Company_Location,Rating,Bean_Type,BBO
177,Soma,"Three Amigos(Chuao, Wild Bolivia, D.R.)",676,2011,70.0,Canada,4.0,,"Ven, Bolivia, D.R."


In [34]:
# Show the rows whose BBO is null.
df[df['BBO'].isnull()]

Unnamed: 0,Company,SBO_BN,REF,Review_Date,Cocoa_Percent,Company_Location,Rating,Bean_Type,BBO
750,Mast Brothers,Madagascar,999,2012,72.0,U.S.A.,2.5,Trinitario,


In [42]:
# Placeholder that marks an unspecified entry, taken from the Bean_Type value in row 3.
# NOTE(review): this assumes row 3 holds the placeholder rather than a real bean
# type — confirm whenever the input file or its row order changes.
unspecified = df['Bean_Type'].values[3]


def nan_conversion(n):
    """Return NaN when `n` equals the `unspecified` placeholder, otherwise return `n` unchanged."""
    return np.nan if n == unspecified else n


# Replace the placeholder with NaN in every object-dtype column.
object_columns = [name for name in df.columns if df[name].dtype == 'O']
for name in object_columns:
    df[name] = df[name].apply(nan_conversion)

In [43]:
# Count the null entries in each column.
df.isnull().sum()

Company               0
SBO_BN                0
REF                   0
Review_Date           0
Cocoa_Percent         0
Company_Location      0
Rating                0
Bean_Type           751
BBO                  62
dtype: int64

In [47]:
# Number of rows for each distinct Rating value.
df.Rating.value_counts()

1500