In [112]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import plotly.express as px
import seaborn as sns
from sklearn.preprocessing import MinMaxScaler,StandardScaler
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix

import warnings
warnings.filterwarnings('ignore')

# Get Data

In [57]:
init_data = pd.read_csv("./merged_data.csv")

In [58]:
select_columns = ['Total.Cup.Points', 'Species','Country.of.Origin','Processing.Method', 'Aroma', 'Flavor', 'Aftertaste','Acidity','Body', 'Balance', 'Uniformity', 'Moisture', 'altitude_mean_meters']

In [59]:
init_data  = init_data[select_columns]

In [60]:
init_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1339 entries, 0 to 1338
Data columns (total 13 columns):
 #   Column                Non-Null Count  Dtype  
---  ------                --------------  -----  
 0   Total.Cup.Points      1339 non-null   float64
 1   Species               1339 non-null   object 
 2   Country.of.Origin     1338 non-null   object 
 3   Processing.Method     1169 non-null   object 
 4   Aroma                 1339 non-null   float64
 5   Flavor                1339 non-null   float64
 6   Aftertaste            1339 non-null   float64
 7   Acidity               1339 non-null   float64
 8   Body                  1339 non-null   float64
 9   Balance               1339 non-null   float64
 10  Uniformity            1339 non-null   float64
 11  Moisture              1339 non-null   float64
 12  altitude_mean_meters  1109 non-null   float64
dtypes: float64(10), object(3)
memory usage: 136.1+ KB


# Processing Data / Preparation 

In [61]:
init_data['Species'].unique()

array(['Arabica', 'Robusta'], dtype=object)

In [62]:
init_data['Country.of.Origin'].unique()

array(['Ethiopia', 'Guatemala', 'Brazil', 'Peru', 'United States',
       'United States (Hawaii)', 'Indonesia', 'China', 'Costa Rica',
       'Mexico', 'Uganda', 'Honduras', 'Taiwan', 'Nicaragua',
       'Tanzania, United Republic Of', 'Kenya', 'Thailand', 'Colombia',
       'Panama', 'Papua New Guinea', 'El Salvador', 'Japan', 'Ecuador',
       'United States (Puerto Rico)', 'Haiti', 'Burundi', 'Vietnam',
       'Philippines', 'Rwanda', 'Malawi', 'Laos', 'Zambia', 'Myanmar',
       'Mauritius', 'Cote d?Ivoire', nan, 'India'], dtype=object)

In [63]:
init_data['Processing.Method'].unique()

array(['Washed / Wet', nan, 'Natural / Dry', 'Pulped natural / honey',
       'Semi-washed / Semi-pulped', 'Other'], dtype=object)

In [67]:
def find_null(data) :
    null_data = data[data.isnull().any(axis=1)]
    return null_data

In [69]:
find_null(init_data)

Unnamed: 0,Total.Cup.Points,Species,Country.of.Origin,Processing.Method,Aroma,Flavor,Aftertaste,Acidity,Body,Balance,Uniformity,Moisture,altitude_mean_meters
2,89.75,Arabica,Guatemala,,8.42,8.50,8.42,8.42,8.33,8.42,10.00,0.00,1700.0
5,88.83,Arabica,Brazil,Natural / Dry,8.58,8.42,8.42,8.50,8.25,8.33,10.00,0.11,
6,88.75,Arabica,Peru,Washed / Wet,8.42,8.50,8.33,8.50,8.25,8.25,10.00,0.11,
7,88.67,Arabica,Ethiopia,,8.25,8.33,8.50,8.42,8.33,8.50,10.00,0.03,1635.0
8,88.42,Arabica,Ethiopia,,8.67,8.67,8.58,8.42,8.33,8.42,9.33,0.03,1635.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...
1331,80.17,Robusta,India,,7.67,7.67,7.50,7.33,7.58,7.50,10.00,0.00,750.0
1334,78.75,Robusta,Ecuador,,7.75,7.58,7.33,7.58,5.08,7.83,10.00,0.00,
1335,78.08,Robusta,Ecuador,,7.50,7.67,7.75,7.75,5.17,5.25,10.00,0.00,40.0
1337,75.08,Robusta,India,Natural / Dry,7.42,6.83,6.75,7.17,7.25,7.00,9.33,0.10,


In [70]:
init_data = init_data.dropna()

In [73]:
init_data['Country.of.Origin'].unique()

array(['Ethiopia', 'United States', 'China', 'Costa Rica', 'Mexico',
       'Brazil', 'Uganda', 'Taiwan', 'Kenya', 'Thailand', 'Colombia',
       'Panama', 'Guatemala', 'Papua New Guinea', 'El Salvador',
       'Indonesia', 'Tanzania, United Republic Of', 'Honduras', 'Japan',
       'Nicaragua', 'Ecuador', 'United States (Puerto Rico)', 'Haiti',
       'Burundi', 'Vietnam', 'Philippines', 'Rwanda', 'Malawi', 'Laos',
       'Zambia', 'Myanmar', 'Cote d?Ivoire', 'Peru', 'India'],
      dtype=object)

In [74]:
init_data['Processing.Method'].unique()

array(['Washed / Wet', 'Natural / Dry', 'Pulped natural / honey',
       'Semi-washed / Semi-pulped', 'Other'], dtype=object)

In [75]:
init_data.describe()

Unnamed: 0,Total.Cup.Points,Aroma,Flavor,Aftertaste,Acidity,Body,Balance,Uniformity,Moisture,altitude_mean_meters
count,1013.0,1013.0,1013.0,1013.0,1013.0,1013.0,1013.0,1013.0,1013.0,1013.0
mean,82.115577,7.567897,7.514294,7.386772,7.527196,7.505656,7.500622,9.869516,0.094689,1813.965837
std,2.626653,0.300706,0.323467,0.332248,0.307302,0.272702,0.340476,0.444271,0.043684,9067.972349
min,59.83,5.08,6.17,6.17,5.25,6.33,6.08,6.0,0.0,1.0
25%,81.17,7.42,7.33,7.17,7.33,7.33,7.33,10.0,0.1,1100.0
50%,82.42,7.58,7.5,7.42,7.5,7.5,7.5,10.0,0.11,1310.64
75%,83.58,7.75,7.67,7.58,7.67,7.67,7.67,10.0,0.12,1600.0
max,90.58,8.75,8.83,8.67,8.75,8.5,8.58,10.0,0.17,190164.0


# Split X and Y

In [99]:
Y = init_data['Total.Cup.Points']
X = init_data.iloc[:-1,1:]

In [100]:
X.head()

Unnamed: 0,Species,Country.of.Origin,Processing.Method,Aroma,Flavor,Aftertaste,Acidity,Body,Balance,Uniformity,Moisture,altitude_mean_meters
0,Arabica,Ethiopia,Washed / Wet,8.67,8.83,8.67,8.75,8.5,8.42,10.0,0.12,2075.0
1,Arabica,Ethiopia,Washed / Wet,8.75,8.67,8.5,8.58,8.42,8.42,10.0,0.12,2075.0
3,Arabica,Ethiopia,Natural / Dry,8.17,8.58,8.42,8.42,8.5,8.25,10.0,0.11,2000.0
4,Arabica,Ethiopia,Washed / Wet,8.25,8.5,8.25,8.5,8.42,8.33,10.0,0.12,2075.0
9,Arabica,Ethiopia,Natural / Dry,8.08,8.58,8.5,8.5,7.67,8.42,10.0,0.1,1822.5


In [113]:
std = StandardScaler()

In [116]:
numberic_col = ['Aroma' ,'Flavor' ,'Aftertaste' ,'Acidity' ,'Balance' ,'Uniformity' ,'Moisture' ,'altitude_mean_meters']

In [131]:
x_std= std.fit_transform(X[numberic_col])
x_outlier_checking = pd.DataFrame(x_std,columns=numberic_col)
x_outlier_checking.head()

Unnamed: 0,Aroma,Flavor,Aftertaste,Acidity,Balance,Uniformity,Moisture,altitude_mean_meters
0,3.665407,4.067607,3.862429,3.979048,2.700574,0.292717,0.578609,0.028676
1,3.93153,3.572886,3.350655,3.425813,2.700574,0.292717,0.578609,0.028676
2,2.002137,3.294605,3.10982,2.905122,2.20104,0.292717,0.349161,0.020405
3,2.26826,3.047245,2.598047,3.165468,2.436115,0.292717,0.578609,0.028676
4,1.702748,3.294605,3.350655,3.165468,2.700574,0.292717,0.119712,0.00083


In [132]:
x_outlier_checking.describe()

Unnamed: 0,Aroma,Flavor,Aftertaste,Acidity,Balance,Uniformity,Moisture,altitude_mean_meters
count,1012.0,1012.0,1012.0,1012.0,1012.0,1012.0,1012.0,1012.0
mean,1.246039e-15,6.502093e-15,-4.934459e-15,3.847987e-15,2.176728e-15,-5.778864e-15,-7.468773e-16,1.785462e-17
std,1.000494,1.000494,1.000494,1.000494,1.000494,1.000494,1.000494,1.000494
min,-8.276872,-4.157128,-3.663657,-7.411067,-4.175367,-8.717379,-2.174772,-0.200043
25%,-0.4927684,-0.5704017,-0.6532226,-0.6420848,-0.5023216,0.2927168,0.1197122,-0.07884642
50%,0.03947801,-0.04476075,0.09938598,-0.08885061,-0.002787449,0.2927168,0.3491606,-0.05561727
75%,0.6049898,0.4808802,0.5810555,0.4643835,0.4967467,0.2927168,0.5786091,-0.02370696
max,3.93153,4.067607,3.862429,3.979048,3.170724,0.2927168,1.725851,20.77093


In [135]:
fig = px.box(x_outlier_checking)
fig.show()

# Correlation Detect

In [150]:
X_std = x_outlier_checking
X_corr = X_std.corr()

In [156]:
fig = px.imshow(X_corr, text_auto=True ,aspect="auto" )
fig.show()

# Prepare Y 

In [137]:
Bean_Grade = [1,2,3]

In [138]:
rating_pctile = np.percentile( Y, [75, 90])

In [139]:
rating_pctile

array([83.58, 84.58])