In [1]:
import pandas as pd
import numpy as np

print("Radi!")

Radi!


In [2]:
path = "dataset/flavorsofcocoa.csv"
df = pd.read_csv(path, encoding="cp1252")

df.head()

Unnamed: 0,REF,Company (Manufacturer),Company Location,Review Date,Country of Bean Origin,Specific Bean Origin or Bar Name,Cocoa Percent,Ingredients,Most Memorable Characteristics,Rating
0,2454,5150,U.S.A.,2019,Madagascar,"Bejofo Estate, batch 1",76%,"3- B,S,C","cocoa, blackberry, full body",3.75
1,2458,5150,U.S.A.,2019,Dominican Republic,"Zorzal, batch 1",76%,"3- B,S,C","cocoa, vegetal, savory",3.5
2,2454,5150,U.S.A.,2019,Tanzania,"Kokoa Kamili, batch 1",76%,"3- B,S,C","rich cocoa, fatty, bready",3.25
3,2542,5150,U.S.A.,2021,India,"Anamalai, batch 1",68%,"3- B,S,C","milk brownie, macadamia,chewy",3.5
4,2546,5150,U.S.A.,2021,Uganda,"Semuliki Forest, batch 1",80%,"3- B,S,C","mildly bitter, basic cocoa, fatty",3.25


In [None]:
from sklearn.model_selection import train_test_split


df["Cocoa Percent"] = (
    df["Cocoa Percent"].astype(str).str.replace("%", "", regex=False)
)
df["Cocoa Percent"] = pd.to_numeric(df["Cocoa Percent"], errors="coerce")
df["Rating"] = pd.to_numeric(df["Rating"], errors="coerce")

df["Company (Manufacturer)"] = df["Company (Manufacturer)"].fillna("Unknown")
df["Company Location"] = df["Company Location"].fillna("Unknown")
df["Country of Bean Origin"] = df["Country of Bean Origin"].fillna("Unknown")
df["Specific Bean Origin or Bar Name"] = df["Specific Bean Origin or Bar Name"].fillna("")
df["Ingredients"] = df["Ingredients"].fillna("")
df["Most Memorable Characteristics"] = df["Most Memorable Characteristics"].fillna("")

df = df.dropna(subset=["Cocoa Percent", "Rating"]).copy()

X = df[[
    "Company (Manufacturer)",
    "Company Location",
    "Country of Bean Origin",
    "Cocoa Percent",
    "Specific Bean Origin or Bar Name",
    "Ingredients",
    "Most Memorable Characteristics",
]].copy()

X = X.rename(columns={
    "Company (Manufacturer)": "company",
    "Company Location": "company_location",
    "Country of Bean Origin": "bean_origin",
    "Cocoa Percent": "cocoa_percent",
    "Specific Bean Origin or Bar Name": "bar_name",
    "Ingredients": "ingredients",
    "Most Memorable Characteristics": "characteristics",
})

X["text_all"] = (X["bar_name"] + " " + X["ingredients"] + " " + X["characteristics"]).str.strip()

y = df["Rating"].astype(float)

X_train, X_test, y_train, y_test = train_test_split(
    X[["company","company_location","bean_origin","cocoa_percent","text_all"]],
    y,
    test_size=0.2,
    random_state=42
)

In [4]:
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.pipeline import Pipeline

from sklearn.linear_model import Ridge, Lasso, ElasticNet
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from sklearn.model_selection import cross_validate

categorical = ["company", "company_location", "bean_origin"]
numeric = ["cocoa_percent"]
text_col = "text_all"

preprocess = ColumnTransformer(
    transformers=[
        ("cat", OneHotEncoder(handle_unknown="ignore"), categorical),
        ("txt", TfidfVectorizer(ngram_range=(1,2), min_df=2, max_features=30000), text_col),
    ],
    remainder="passthrough"
)

models = {
    "Ridge": Ridge(alpha=5.0, random_state=42),
    "Lasso": Lasso(alpha=0.001, random_state=42, max_iter=20000),
    "ElasticNet": ElasticNet(alpha=0.001, l1_ratio=0.2, random_state=42, max_iter=20000),
    "RandomForest": RandomForestRegressor(n_estimators=300, random_state=42, n_jobs=-1),
    "GradientBoosting": GradientBoostingRegressor(random_state=42),
}

scoring = {
    "mae": "neg_mean_absolute_error",
    "rmse": "neg_root_mean_squared_error",
    "r2": "r2"
}

results = []

for name, mdl in models.items():
    pipe = Pipeline([
        ("prep", preprocess),
        ("model", mdl)
    ])
    cv = cross_validate(pipe, X_train, y_train, cv=5, scoring=scoring, n_jobs=-1)
    results.append({
        "model": name,
        "MAE": -cv["test_mae"].mean(),
        "RMSE": -cv["test_rmse"].mean(),
        "R2": cv["test_r2"].mean()
    })

results_df = pd.DataFrame(results).sort_values("MAE")
results_df

Unnamed: 0,model,MAE,RMSE,R2
2,ElasticNet,0.259855,0.33432,0.406292
0,Ridge,0.264851,0.339911,0.386699
3,RandomForest,0.272245,0.351058,0.345695
4,GradientBoosting,0.278726,0.351974,0.342536
1,Lasso,0.282199,0.358361,0.317478
