In [1]:
%load_ext nb_black

<IPython.core.display.Javascript object>

In [2]:
from sklearn.datasets import fetch_openml
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.decomposition import PCA
import umap
import seaborn as sns
from sklearn.preprocessing import StandardScaler
import plotly.express as px

<IPython.core.display.Javascript object>

Data source: https://www.kaggle.com/volpatto/coffee-quality-database-from-cqi?select=merged_data_cleaned.csv

In [3]:
coffee_data = pd.read_csv("coffee_merged_data_cleaned.csv")
coffee_data.columns

Index(['Unnamed: 0', 'Species', 'Owner', 'Country.of.Origin', 'Farm.Name',
       'Lot.Number', 'Mill', 'ICO.Number', 'Company', 'Altitude', 'Region',
       'Producer', 'Number.of.Bags', 'Bag.Weight', 'In.Country.Partner',
       'Harvest.Year', 'Grading.Date', 'Owner.1', 'Variety',
       'Processing.Method', 'Aroma', 'Flavor', 'Aftertaste', 'Acidity', 'Body',
       'Balance', 'Uniformity', 'Clean.Cup', 'Sweetness', 'Cupper.Points',
       'Total.Cup.Points', 'Moisture', 'Category.One.Defects', 'Quakers',
       'Color', 'Category.Two.Defects', 'Expiration', 'Certification.Body',
       'Certification.Address', 'Certification.Contact', 'unit_of_measurement',
       'altitude_low_meters', 'altitude_high_meters', 'altitude_mean_meters'],
      dtype='object')

<IPython.core.display.Javascript object>

In [4]:
coffee_data.drop(columns=["Unnamed: 0", "Number.of.Bags"], inplace=True)
coffee_data_numeric = coffee_data.select_dtypes(include="number").copy()
coffee_data_numeric.columns

Index(['Aroma', 'Flavor', 'Aftertaste', 'Acidity', 'Body', 'Balance',
       'Uniformity', 'Clean.Cup', 'Sweetness', 'Cupper.Points',
       'Total.Cup.Points', 'Moisture', 'Category.One.Defects', 'Quakers',
       'Category.Two.Defects', 'altitude_low_meters', 'altitude_high_meters',
       'altitude_mean_meters'],
      dtype='object')

<IPython.core.display.Javascript object>

In [5]:
coffee_data_numeric.isnull().sum() / coffee_data_numeric.isnull().count()

Aroma                   0.000000
Flavor                  0.000000
Aftertaste              0.000000
Acidity                 0.000000
Body                    0.000000
Balance                 0.000000
Uniformity              0.000000
Clean.Cup               0.000000
Sweetness               0.000000
Cupper.Points           0.000000
Total.Cup.Points        0.000000
Moisture                0.000000
Category.One.Defects    0.000000
Quakers                 0.000747
Category.Two.Defects    0.000000
altitude_low_meters     0.171770
altitude_high_meters    0.171770
altitude_mean_meters    0.171770
dtype: float64

<IPython.core.display.Javascript object>

In [6]:
coffee_data_numeric.drop(
    columns=["altitude_low_meters", "altitude_high_meters", "altitude_mean_meters"],
    inplace=True,
)

coffee_data_numeric.isnull().sum() / coffee_data_numeric.isnull().count()

Aroma                   0.000000
Flavor                  0.000000
Aftertaste              0.000000
Acidity                 0.000000
Body                    0.000000
Balance                 0.000000
Uniformity              0.000000
Clean.Cup               0.000000
Sweetness               0.000000
Cupper.Points           0.000000
Total.Cup.Points        0.000000
Moisture                0.000000
Category.One.Defects    0.000000
Quakers                 0.000747
Category.Two.Defects    0.000000
dtype: float64

<IPython.core.display.Javascript object>

In [8]:
feature_cols = coffee_data_numeric.columns
feature_cols

Index(['Aroma', 'Flavor', 'Aftertaste', 'Acidity', 'Body', 'Balance',
       'Uniformity', 'Clean.Cup', 'Sweetness', 'Cupper.Points',
       'Total.Cup.Points', 'Moisture', 'Category.One.Defects', 'Quakers',
       'Category.Two.Defects'],
      dtype='object')

<IPython.core.display.Javascript object>

In [12]:
X = coffee_data[feature_cols].copy()

<IPython.core.display.Javascript object>

In [13]:
X.dropna(inplace=True)
X

Unnamed: 0,Aroma,Flavor,Aftertaste,Acidity,Body,Balance,Uniformity,Clean.Cup,Sweetness,Cupper.Points,Total.Cup.Points,Moisture,Category.One.Defects,Quakers,Category.Two.Defects
0,8.67,8.83,8.67,8.75,8.50,8.42,10.00,10.00,10.00,8.75,90.58,0.12,0,0.0,0
1,8.75,8.67,8.50,8.58,8.42,8.42,10.00,10.00,10.00,8.58,89.92,0.12,0,0.0,1
2,8.42,8.50,8.42,8.42,8.33,8.42,10.00,10.00,10.00,9.25,89.75,0.00,0,0.0,0
3,8.17,8.58,8.42,8.42,8.50,8.25,10.00,10.00,10.00,8.67,89.00,0.11,0,0.0,2
4,8.25,8.50,8.25,8.50,8.42,8.33,10.00,10.00,10.00,8.58,88.83,0.12,0,0.0,2
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1334,7.75,7.58,7.33,7.58,5.08,7.83,10.00,10.00,7.75,7.83,78.75,0.00,0,0.0,1
1335,7.50,7.67,7.75,7.75,5.17,5.25,10.00,10.00,8.42,8.58,78.08,0.00,0,0.0,0
1336,7.33,7.33,7.17,7.42,7.50,7.17,9.33,9.33,7.42,7.17,77.17,0.00,0,0.0,6
1337,7.42,6.83,6.75,7.17,7.25,7.00,9.33,9.33,7.08,6.92,75.08,0.10,20,0.0,1


<IPython.core.display.Javascript object>

In [17]:
scale = StandardScaler()
X_scale = scale.fit_transform(X)
umap_model = umap.UMAP(n_components=2, n_neighbors=20, min_dist=0.09)
components = pd.DataFrame(
    umap_model.fit_transform(X_scale), columns=["0", "1"], index=X.index
)
px.scatter(data_frame=components, x="0", y="1")

<IPython.core.display.Javascript object>

In [18]:
from sklearn.cluster import DBSCAN

dbs = DBSCAN()
components["cluster_dbscan"] = dbs.fit_predict(components)
px.scatter(data_frame=components, x="0", y="1", color="cluster_dbscan")

<IPython.core.display.Javascript object>

In [38]:
clustered = pd.concat([components, X], axis=1)
px.scatter(data_frame=clustered, x="0", y="1", color="cluster_dbscan")

<IPython.core.display.Javascript object>

In [43]:
clustered.groupby("cluster_dbscan").mean().style.background_gradient()

Unnamed: 0_level_0,0,1,Aroma,Flavor,Aftertaste,Acidity,Body,Balance,Uniformity,Clean.Cup,Sweetness,Cupper.Points,Total.Cup.Points,Moisture,Category.One.Defects,Quakers,Category.Two.Defects
cluster_dbscan,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1
-1,9.477287,3.973255,7.42,7.67,7.83,7.75,7.92,7.83,10.0,10.0,10.0,7.83,84.25,0.12,3.0,0.0,0.0
0,10.408122,5.045129,7.542434,7.488996,7.361978,7.50783,7.489584,7.478256,9.799351,9.788306,9.897556,7.457586,81.812211,0.110578,0.573022,0.018256,3.825558
1,1.409844,2.928312,7.644086,7.615875,7.526654,7.620934,7.602529,7.658755,9.927743,9.966109,9.938171,7.640311,83.141946,0.002218,0.214008,0.097276,2.544747
2,8.768878,-1.804967,7.564545,7.536136,7.402045,7.513636,7.524545,7.484773,9.939091,9.954318,9.893864,7.533864,82.346364,0.095682,0.272727,3.818182,4.568182
3,9.631056,-0.949022,7.548571,7.539524,7.369524,7.56381,7.484286,7.484286,10.0,10.0,10.0,7.534762,82.52381,0.106667,0.047619,1.0,3.0
4,5.423493,8.061379,7.727241,7.695172,7.618276,7.726552,7.710345,7.678621,9.93069,9.953793,7.575172,7.763793,81.37931,0.071034,0.206897,0.0,2.413793


<IPython.core.display.Javascript object>