In [46]:

import  os,joblib
import pandas as pd
import numpy as np
import warnings
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor, StackingRegressor, RandomForestClassifier, VotingClassifier,StackingClassifier, BaggingClassifier
from sklearn.linear_model import LinearRegression, Ridge, LogisticRegression,Lasso
from sklearn.svm import SVR,SVC 
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import make_pipeline
from sklearn.metrics import  make_scorer, mean_squared_error, accuracy_score,precision_score,get_scorer,classification_report, confusion_matrix
from sklearn.tree import DecisionTreeRegressor,DecisionTreeClassifier
os.environ["PYTHONWARNINGS"] = "ignore"
warnings.filterwarnings('ignore')


## Data preparation

### Use the dataset from Epicurious collected by HugoDarwood.


In [89]:
df = pd.read_csv('data/epi_r.csv')
df.shape

(20052, 680)


### Filter the columns:
the less non-ingredient columns in your dataset the better. You will predict the rating or rating category using only the ingredients and nothing else.

In [90]:
black_list = [
    "calories", "protein", "fat", "sodium", "#cakeweek", "#wasteless",
    "22-minute meals", "3-ingredient recipes", "30 days of groceries", "advance prep required", "breakfast", "brunch", "dinner",
    "lunch", "dessert", "appetizer", "snack", "snack week", "side", "quick & easy", "quick and healthy",
    "advance prep required", "kid-friendly", "vegan", "vegetarian", "pescatarian", "low cal", "low fat",
    "low sugar", "low/no sugar", "low carb", "low cholesterol", "low sodium", "fat free", "pot pie",
    "sugar conscious", "healthy", "high fiber", "dairy free", "peanut free", "tree nut free", "soy free",
    "kosher", "kosher for passover", "wheat/gluten-free", "no sugar added", "non-alcoholic", "alcoholic", "organic",
    "tested & improved", "no-cook", "freeze/chill", "freezer food", "raw", "grill", "grill/barbecue",
    "broil", "roast", "bake", "pan-fry", "deep-fry", "fry", "simmer","fruit","braise",
    "steam", "stir-fry", "slow cooker", "pressure cooker", "smoker", "microwave", "food processor",
    "blender", "potato salad", "double boiler", "ice cream machine", "juicer", "mandoline", "mortar and pestle",
    "candy thermometer", "cookbook critic", "paleo", "cook like a diner", "house cocktail", "epi loves the microwave", "epi + ushg",
    "sandwich theory", "friendsgiving", "family reunion", "anniversary", "birthday", "wedding", "engagement party",
    "party", "picnic", "buffet", "potluck", "tailgating", "cocktail party", "super bowl",
    "thanksgiving", "christmas", "christmas eve", "new year's eve", "new year's day", "meatball", "meatloaf",
    "shower", "ramekin", "valentine's day", "st. patrick's day", "halloween", "hanukkah", "rosh hashanah/yom kippur",
    "purim", "sukkot", "passover", "shavuot", "diwali", "kwanzaa", "ramadan",
    "easter", "fourth of july", "bastille day", "mardi gras", "lunar new year", "oscars", "father's day",
    "winter", "flaming hot summer", "back to school", "backyard bbq", "entertaining", "casserole/gratin", "pasta maker",
    "pressure cooker", "sandwich","sourdough", "washington, d.c.", "lasagna", "hot drink",
    "house & garden", "quiche", "frittata", "fritter", "soufflé/meringue", "salad",
    "salad dressing", "pizza", "cupcake", "cake", "cookie", "cookies", "brownie",
    "muffin", "biscuit", "waffle", "pancake", "taco", "burrito", "omelet","stew","soup/stew",
    "smoothie", "ice cream", "sorbet", "crêpe", "cobbler/crumble", "custard",
    "candy", "marshmallow", "edible gift", "hors d'oeuvre", "condiment", "condiment/spread", "salsa",
    "sauce", "rub", "marinade", "marinate", "pickles", "macaroni and cheese", "parade",
    "suzanne goin", "dorie greenspan", "dip", "drink", "drinks", "aperitif", "digestif",
    "cocktail", "spirit", "connecticut", "dallas", "denver", "stuffing/dressing", "spring",
    "summer", "fall", "graduation", "pasadena", "boston", "beverly hills", "nancy silverton",
    "oktoberfest", "poker/game night", "pie", "providence", "france", "italy", "spain",
    "germany", "switzerland", "australia", "england", "santa monica", "kansas city", "kentucky derby",
    "kidney friendly", "kitchen olympics", "idaho", "hummus", "minneapolis", "mixer", "no meat, no problem",
    "persian new year", "sauté", "self", "ireland", "israel", "japan", "egypt",
    "canada", "haiti", "jamaica", "mexico", "peru", "philippines", "bulgaria",
    "cuba", "dominican republic", "georgia", "guam", "arizona", "alabama", "alaska",
    "california", "colorado", "mother's day", "one-pot meal", "punch", "skewer", "florida",
    "hawaii", "illinois", "indiana", "iowa", "kansas", "kentucky", "louisiana",
    "maine", "maryland", "massachusetts", "labor day", "michigan", "minnesota", "mississippi",
    "missouri", "nebraska", "new hampshire", "new jersey", "new mexico", "new orleans", "cambridge",
    "new york", "bon appétit", "harpercollins", "gourmet", "frankenrecipe", "frozen dessert", "game",
    "bon app��tit", "north carolina", "ohio", "weelicious", "chill", "oklahoma", "oregon",
    "pennsylvania", "rhode island", "south carolina", "tennessee", "texas", "utah", "vermont",
    "virginia", "washington", "west virginia", "wisconsin", "las vegas", "los angeles", "brooklyn",
    "atlanta", "columbus", "anthony bourdain", "houston", "emeril lagasse", "miami", "long beach",
    "san francisco", "seattle", "portland", "pittsburgh", "louisville", "windsor", "westwood",
    "lancaster", "healdsburg", "yonkers", "paris", "aspen", "costa mesa", "pacific palisades",
    "st. louis", "cookbooks", "leftovers"
]
df = df.drop(black_list,axis=1)

In [91]:
x = df.drop(['title', 'rating'], axis=1)
y = df['rating']
X_train, X_test, y_train,  y_test = train_test_split(x,y,random_state=21)

## Regression

Try different algorithms and their hyperparameters for rating
prediction.

Choose the best on cross-validation and find the score (RMSE) on the test
subsample.

In [5]:
param_grid_lr = {
    "linearregression__fit_intercept": [True, False]
}
param_grid_ridge = {
    "ridge__alpha": [0.01, 0.1, 1, 10],
    "ridge__fit_intercept": [True, False],
    "ridge__random_state": [21]
}
param_grid_lasso = {
    "lasso__alpha": [0.01, 0.1, 1, 10],
    "lasso__fit_intercept": [True, False],
    "lasso__random_state": [21]
}
param_grid_tree = {
    "max_depth": [1, 5, 10, 20, 30],
    "min_samples_split": [2, 5, 10],
    "random_state": [21]
}
param_grid_svr = {
    "svr__kernel": ['linear', 'rbf'],
    "svr__C": [0.1, 1, 10],
    "svr__gamma": ['scale', 'auto']
}

regressors_and_params = [
    ("LinearRegression", make_pipeline(StandardScaler(), LinearRegression()), param_grid_lr),
    ("Ridge", make_pipeline(StandardScaler(), Ridge()), param_grid_ridge),
    ("Lasso", make_pipeline(StandardScaler(), Lasso()), param_grid_lasso),
    ("DecisionTree", DecisionTreeRegressor(), param_grid_tree),
    ("SVR", make_pipeline(StandardScaler(), SVR()), param_grid_svr)
]


In [6]:


def compare_regressors(models_and_params, X_train, y_train, X_test, y_test, cv=5,scoring='neg_root_mean_squared_error'):
    results = []

    for name, model, param_grid in models_and_params:
        grid = GridSearchCV(model, param_grid, cv=cv, scoring=scoring, n_jobs=-1)
        grid.fit(X_train, y_train)
        best_model = grid.best_estimator_
        pred = best_model.predict(X_test)
        rmse = np.sqrt(mean_squared_error(y_test,pred))
        results.append((name, grid.best_params_, rmse))
        print(f"{name}: best_params={grid.best_params_}, RMSE={rmse:.4f}")
    return results


In [7]:
result = compare_regressors(regressors_and_params,X_train,y_train,X_test,y_test)

LinearRegression: best_params={'linearregression__fit_intercept': True}, RMSE=1.2875
Ridge: best_params={'ridge__alpha': 10, 'ridge__fit_intercept': True, 'ridge__random_state': 21}, RMSE=1.2875
Lasso: best_params={'lasso__alpha': 0.01, 'lasso__fit_intercept': True, 'lasso__random_state': 21}, RMSE=1.2792
DecisionTree: best_params={'max_depth': 5, 'min_samples_split': 10, 'random_state': 21}, RMSE=1.2983
SVR: best_params={'svr__C': 1, 'svr__gamma': 'scale', 'svr__kernel': 'rbf'}, RMSE=1.3261


In [8]:
result.sort(key=lambda x:x[2])
print(f'Best regressor class {result[0][0]}, hyperparametrs: {result[0][1]}, RMSE = {result[0][2]}')

Best regressor class Lasso, hyperparametrs: {'lasso__alpha': 0.01, 'lasso__fit_intercept': True, 'lasso__random_state': 21}, RMSE = 1.279218937727105


Try different `ensembles` and their `hyperparameters`. Choose the
best on cross-validation and find the score on the test subsample.

In [9]:


param_grid_rf = {'n_estimators':[50,100,200],
                'max_depth':[5,10,20,30]}

param_grid_gb = {"n_estimators": [50, 200],
                "learning_rate": [0.03, 0.05],
                "max_depth": [3, 5, 10],
                "min_samples_split": [2, 10],
                "min_samples_leaf": [1, 5],
                "subsample": [0.4, 1.0],
                "max_features": ["auto", "sqrt", "log2"]}
base_learners_SR = [
    ('ridge', make_pipeline(StandardScaler(), Ridge(alpha=10))),
    ('lasso', make_pipeline(StandardScaler(), Lasso(alpha=0.01))),
    ('tree', DecisionTreeRegressor(max_depth=5, min_samples_split=10))
]

param_grid_SR = {
    "final_estimator": [
        make_pipeline(StandardScaler(), Ridge()),
        make_pipeline(StandardScaler(), Lasso()),
        DecisionTreeRegressor(max_depth=3)
    ],
    "passthrough": [True, False]
}

In [10]:
ensembles_and_params = [('Random Forest', RandomForestRegressor(), param_grid_rf),
                        ('Gradient Boosting', GradientBoostingRegressor(), param_grid_gb), 
                        ('Stacking Regressor',StackingRegressor(estimators=base_learners_SR), param_grid_SR)]

In [11]:
ensembles_result = compare_regressors(ensembles_and_params,X_train,y_train,X_test,y_test)

Random Forest: best_params={'max_depth': 30, 'n_estimators': 200}, RMSE=1.2870
Gradient Boosting: best_params={'learning_rate': 0.05, 'max_depth': 10, 'max_features': 'sqrt', 'min_samples_leaf': 1, 'min_samples_split': 2, 'n_estimators': 200, 'subsample': 1.0}, RMSE=1.2676
Stacking Regressor: best_params={'final_estimator': Pipeline(steps=[('standardscaler', StandardScaler()), ('ridge', Ridge())]), 'passthrough': False}, RMSE=1.2785


In [12]:
ensembles_result.sort(key=lambda x:x[2])
print(f'Best regressor class {ensembles_result[0][0]}, hyperparametrs: {ensembles_result[0][1]}, RMSE = {ensembles_result[0][2]}')

Best regressor class Gradient Boosting, hyperparametrs: {'learning_rate': 0.05, 'max_depth': 10, 'max_features': 'sqrt', 'min_samples_leaf': 1, 'min_samples_split': 2, 'n_estimators': 200, 'subsample': 1.0}, RMSE = 1.2675945313781003


### Calculate the RMSE for a `naive regressor` that predicts the `average rating`. 


In [13]:
average_rating = [y_train.mean()] * len(y_test)
print(f"Средний рейтинг = {y_train.mean()}")

Средний рейтинг = 3.7156310259990692


In [14]:
naive_regressor_rmse = np.sqrt(mean_squared_error(y_test, average_rating))
print(f"Naive regressor RMSE  = {naive_regressor_rmse}")

Naive regressor RMSE  = 1.3281516046618942


## Classification 

### Binarize the target column by rounding the ratings to the closest integer.
This will be your classes. 

In [92]:
y_train_rounded,y_test_rounded = round(y_train,0).astype(int), round(y_test,0).astype(int),
y_train_rounded.value_counts()

rating
4    9861
5    2065
0    1386
3    1129
2     471
1     127
Name: count, dtype: int64

### Try different algorithms and their hyperparameters for class prediction.

### Choose the best on cross-validation and find the score (accuracy) on the test subsample. 

In [66]:
def compare_classificators(models_and_params, X_train, y_train, X_test, y_test, cv=5, scoring='accuracy'):
    results = []
    for name, model, param_grid in models_and_params:
        grid = GridSearchCV(model, param_grid, cv=cv, scoring=scoring, n_jobs=-1)
        grid.fit(X_train, y_train)
        best_model = grid.best_estimator_
        pred = best_model.predict(X_test)
        if isinstance(scoring, str):
            scorer = get_scorer(scoring)
            metric = scorer(best_model, X_test, y_test)
            metric_name = scoring
        else:
            metric = scoring._score_func(y_test, pred, **scoring._kwargs)
            metric_name = scoring._score_func.__name__
        results.append((name, grid.best_params_, metric))
        print(f"{name}: best_params={grid.best_params_}, {metric_name}={metric:.4f}")
    return results


In [None]:

param_grid_SVC = {'kernel':['linear','rbf'],
               'C':[0.1, 1], 
               'gamma':['scale','auto'], 
               'class_weight' : ['balanced', None],
                'random_state':[21],
                'probability':[True]}
base_learners_SC = [
    ('lr', make_pipeline(StandardScaler(),LogisticRegression(max_iter=1000))),
    ('dt', DecisionTreeClassifier(max_depth=5)),
    ('svc', make_pipeline(StandardScaler(),SVC(probability=True, kernel='rbf')))
]

param_grid_SC = {
    "final_estimator": [
        make_pipeline(StandardScaler(),LogisticRegression(max_iter=1000)),
        DecisionTreeClassifier(max_depth=3),
        RandomForestClassifier(n_estimators=20)
    ],
    "passthrough": [True, False]
}
param_grid_bagging = {
    'n_estimators': [10, 50, 100],
    'max_samples': [0.5, 0.7, 1.0],
    'max_features': [0.5, 0.7, 1.0],
    'bootstrap': [True, False],
    'bootstrap_features': [False, True],
    'estimator': [
        DecisionTreeClassifier(max_depth=3),
        DecisionTreeClassifier(max_depth=5)
    ]
}
param_grid_logreg = {
    'C': [0.1, 1, 10],               
    'penalty': ['l2', 'l1', 'elasticnet', 'none'], 
    'solver': ['lbfgs', 'liblinear', 'saga', 'newton-cg'],
    'max_iter': [100, 200]                   
}
base_learners_VC = [
    ('lr', make_pipeline(StandardScaler(), LogisticRegression(max_iter=1000))),
    ('dt', DecisionTreeClassifier(max_depth=5)),
    ('svc', make_pipeline(StandardScaler(), SVC(probability=True)))
]

param_grid_voting = {
    'voting': ['hard', 'soft'],
    'weights': [
        [1, 1, 1],  
        [2, 1, 1],    
        [1, 2, 1],    
        [1, 1, 2]     
    ]
}


In [8]:
classificators_and_params = [('SVC',SVC(),param_grid_SVC),
                             ('Stacking Classifier',StackingClassifier(base_learners_SC),param_grid_SC),
                             ('Bagging Classifier',BaggingClassifier(),param_grid_bagging),
                             ('Logistic Regression',LogisticRegression(),param_grid_logreg),
                             ('VotingClassifier',VotingClassifier(base_learners_VC), param_grid_voting)
                             ]


In [24]:
classificator_result = compare_classificators(classificators_and_params,X_train.iloc[:5000],y_train_rounded.iloc[:5000],X_test,y_test_rounded)

SVC: best_params={'C': 1, 'class_weight': None, 'gamma': 'scale', 'kernel': 'rbf', 'probability': True, 'random_state': 21}, accuracy=0.6738
Stacking Classifier: best_params={'final_estimator': DecisionTreeClassifier(max_depth=3), 'passthrough': False}, accuracy=0.6631
Bagging Classifier: best_params={'bootstrap': False, 'bootstrap_features': False, 'estimator': DecisionTreeClassifier(max_depth=5), 'max_features': 0.7, 'max_samples': 0.7, 'n_estimators': 10}, accuracy=0.6703
Logistic Regression: best_params={'C': 0.1, 'max_iter': 100, 'penalty': 'l2', 'solver': 'saga'}, accuracy=0.6703
VotingClassifier: best_params={'voting': 'hard', 'weights': [1, 1, 1]}, accuracy=0.6699


In [36]:
classificator_result.sort(key=lambda x:-x[2])
print(f'Best classificator class {classificator_result[0][0]}, hyperparametrs: {classificator_result[0][1]}, Accuracy = {classificator_result[0][2]}')

Best classificator class SVC, hyperparametrs: {'C': 1, 'class_weight': None, 'gamma': 'scale', 'kernel': 'rbf', 'probability': True, 'random_state': 21}, Accuracy = 0.6738479952124476


In [37]:
most_common = [y_train_rounded.mode()] * len(y_test)
naive_accuracy = accuracy_score(y_test_rounded,most_common)
print(f'Accuracy of Naive model = {naive_accuracy}')

Accuracy of Naive model = 0.6636744464392579


### Binarize the target column again by converting the integers to classes ‘bad’ (0, 1), ‘so-so’ (2, 3), ‘great’ (4, 5). 

In [93]:
bins = [-np.inf,2,4,np.inf]
labels = ['bad','so-so','great']
y_train_labeled = pd.cut(y_train_rounded,bins=bins,labels=labels,right=False)
y_test_labeled = pd.cut(y_test_rounded,bins=bins,labels=labels,right=False)
y_test_labeled.value_counts()

rating
great    3981
so-so     545
bad       487
Name: count, dtype: int64

In [28]:

classificator_result_labels = compare_classificators(classificators_and_params,X_train.iloc[:5000],y_train_labeled.iloc[:5000],X_test,y_test_labeled,cv=3)

SVC: best_params={'C': 1, 'class_weight': None, 'gamma': 'scale', 'kernel': 'rbf', 'probability': True, 'random_state': 21}, accuracy=0.7989
Stacking Classifier: best_params={'final_estimator': Pipeline(steps=[('standardscaler', StandardScaler()),
                ('logisticregression', LogisticRegression(max_iter=1000))]), 'passthrough': False}, accuracy=0.7977
Bagging Classifier: best_params={'bootstrap': False, 'bootstrap_features': True, 'estimator': DecisionTreeClassifier(max_depth=5), 'max_features': 1.0, 'max_samples': 0.7, 'n_estimators': 10}, accuracy=0.7983
Logistic Regression: best_params={'C': 0.1, 'max_iter': 100, 'penalty': 'l2', 'solver': 'lbfgs'}, accuracy=0.7969
VotingClassifier: best_params={'voting': 'hard', 'weights': [1, 1, 2]}, accuracy=0.7993


In [42]:
classificator_result_labels.sort(key=lambda x:-x[2])
print(f'Best classificator class {classificator_result_labels[0][0]}, hyperparametrs: {classificator_result_labels[0][1]}, Accuracy = {classificator_result_labels[0][2]}')

Best classificator class VotingClassifier, hyperparametrs: {'voting': 'hard', 'weights': [1, 1, 2]}, Accuracy = 0.7993217634151207


In [39]:
most_common_label = [y_train_labeled.mode()] * len(y_test_labeled)
naive_accuracy_labels = accuracy_score(y_test_labeled,most_common_label)
print(f'Accuracy of Naive model = {naive_accuracy_labels}')

Accuracy of Naive model = 0.7941352483542788


## What is worse: 
### to predict a bad rating which is good in real life, or to predict a good rating which is bad in real life? 
### Replace accuracy with the appropriate metric. 

## Answer :
### Хуже предсказать хороший рейтинг , который низкий в реальной жизни (Ложно положительный)
## Необходимо замеить accuracy на precision macro

In [20]:
precision_macro = make_scorer(precision_score, average='macro')


In [21]:
classificator_result_labels_precision = compare_classificators(classificators_and_params,X_train.iloc[:5000],y_train_labeled.iloc[:5000],X_test,y_test_labeled,cv=3,scoring=precision_macro)

SVC: best_params={'C': 0.1, 'class_weight': 'balanced', 'gamma': 'scale', 'kernel': 'linear', 'probability': True, 'random_state': 21}, precision_score=0.3867
Stacking Classifier: best_params={'final_estimator': Pipeline(steps=[('standardscaler', StandardScaler()),
                ('logisticregression', LogisticRegression(max_iter=1000))]), 'passthrough': True}, precision_score=0.4315
Bagging Classifier: best_params={'bootstrap': True, 'bootstrap_features': False, 'estimator': DecisionTreeClassifier(max_depth=3), 'max_features': 0.5, 'max_samples': 0.5, 'n_estimators': 10}, precision_score=0.4988
Logistic Regression: best_params={'C': 0.1, 'max_iter': 100, 'penalty': 'l2', 'solver': 'lbfgs'}, precision_score=0.4714
VotingClassifier: best_params={'voting': 'hard', 'weights': [1, 1, 1]}, precision_score=0.5003


In [40]:
classificator_result_labels_precision.sort(key=lambda x:-x[2])
print(f'Best classificator class {classificator_result_labels_precision[0][0]}, hyperparametrs: {classificator_result_labels_precision[0][1]}, precision_macro = {classificator_result_labels_precision[0][2]}')

Best classificator class VotingClassifier, hyperparametrs: {'voting': 'hard', 'weights': [1, 1, 1]}, precision_macro = 0.5003095766875294


In [41]:
naive_precision_labels = precision_score(y_test_labeled,most_common_label,average='macro')
print(f'Precision of Naive model = {naive_precision_labels}')

Precision of Naive model = 0.2647117494514263


## Decision: что лучше использовать — регрессию или классификацию?
### Сравнение результатов
#### Регрессия:

Лучшая модель (Gradient Boosting): RMSE ≈ 1.27

Наивная регрессия (среднее): RMSE ≈ 1.33

Выигрыш по сравнению с наивной: ≈5%

Ошибка (1.27 балла) — большая относительно шкалы рейтинга (от 0 до 5).

#### Классификация (3 класса: “bad”, “so-so”, “great”):

Лучшая accuracy моделей: 0.80

Accuracy наивной модели: 0.73

Лучшая macro-precision: 0.50

Macro-precision наивной модели: 0.24

Модели выигрывают у наивного по macro-precision примерно на 0.36

### Аргументы в пользу классификации
Accuracy и macro-precision у моделей классификации выше, чем у наивной, и прирост значимее, чем у регрессии.

Регрессия в этой задаче не даёт точного результата (ошибка 1.27 — это очень много по 5-балльной шкале), практической ценности мало.




In [52]:
best_precision_params = classificator_result_labels_precision[0][1]
precision_model = make_pipeline(StandardScaler(),VotingClassifier(base_learners_VC,**best_precision_params))
precision_model.fit(X_train,y_train_labeled)
precision_y_pred = precision_model.predict(X_test)


In [48]:
print("Classification Report:")
print(classification_report(y_test_labeled, precision_y_pred, zero_division=1))

print("Confusion Matrix:")
print(confusion_matrix(y_test_labeled, precision_y_pred))



Classification Report:
              precision    recall  f1-score   support

         bad       0.66      0.09      0.15       487
       great       0.80      1.00      0.89      3981
       so-so       1.00      0.00      0.00       545

    accuracy                           0.80      5013
   macro avg       0.82      0.36      0.35      5013
weighted avg       0.81      0.80      0.72      5013

Confusion Matrix:
[[  42  445    0]
 [  18 3963    0]
 [   4  541    0]]


In [96]:
best_accuracy_params = classificator_result_labels[0][1]
accuracy_model = make_pipeline(StandardScaler(),VotingClassifier(base_learners_VC, **best_accuracy_params))
accuracy_model.fit(X_train,y_train_labeled)
accuracy_y_pred = accuracy_model.predict(X_test)


In [51]:
print("Classification Report:")
print(classification_report(y_test_labeled, accuracy_y_pred, zero_division=1))

print("Confusion Matrix:")
print(confusion_matrix(y_test_labeled, accuracy_y_pred))


Classification Report:
              precision    recall  f1-score   support

         bad       0.64      0.10      0.17       487
       great       0.80      0.99      0.89      3981
       so-so       1.00      0.00      0.00       545

    accuracy                           0.80      5013
   macro avg       0.82      0.36      0.35      5013
weighted avg       0.81      0.80      0.72      5013

Confusion Matrix:
[[  47  440    0]
 [  22 3959    0]
 [   4  540    1]]


Результаты обеих моделей (максимальная accuracy и максимальная macro-precision) практически совпадают. Обе модели хорошо предсказывают основной класс (`great`), плохо — редкие (`so-so`, `bad`). Различия в метриках минимальны.

**Вывод:**  
Выбираем модель с максимальной accuracy.  

In [98]:
accuracy_model_accuracy = accuracy_score(y_test_labeled, accuracy_y_pred)
model_filename = f"data/VotingClassifier_{round(accuracy_model_accuracy,5)}.sav"
joblib.dump(accuracy_model,model_filename)

['data/VotingClassifier_0.79932.sav']

Nutrition

In [69]:
import requests


In [70]:
ingredients = list(x.columns)
daily_values = {
    "Vitamin A, RAE": 900 , "Vitamin E (alpha-tocopherol)": 15 * 1000, 'Manganese, Mn': 2.3 * 1000, 'Folate, total': 400 * 1000,
    "Total lipid (fat)": 78 * 1000000, "Fatty acids, total saturated": 20 * 1000000, "Cholesterol": 300 * 1000, 
    "Vitamin D (D2 + D3)": 20 , "Vitamin K (phylloquinone)": 120 , "Calcium, Ca": 1300 * 1000, 
    "Iron, Fe": 18 * 1000, "Thiamin": 1.2 * 1000, "Riboflavin": 1.3 * 1000, "Niacin": 16 * 1000, "Vitamin B-6": 1.7 * 1000, 
    "Folate, DFE": 400 , "Sodium, Na": 2300 * 1000, "Phosphorus, P": 1250 * 1000, "Magnesium, Mg": 420 * 1000, 
    "Zinc, Zn": 11 * 1000, "Copper, Cu": 0.9 * 1000, "Selenium, Se": 55 * 1000, 
    "Carbohydrate, by difference": 275 * 1000000, "Fiber, total dietary": 28 * 1000000, "Choline, total": 2300 * 1000, 
    "Potassium, K": 4700 * 1000, "Protein": 50 * 1000000, "Vitamin B-12, added": 2.4 , "Total Sugars": 50 * 10000000
}

def data_make(ingredients, daily_values):
    api_key = "tbVKCwicj5dE4uZ2L8QAORdd5AmPXTZwcEyvYqVn"
    nutrients_list_result = []
    for item in ingredients:
        search_response = requests.get('https://api.nal.usda.gov/fdc/v1/foods/search?api_key={}&query={}'.format(api_key, item)).json()
        
        if not search_response.get("foods"):
            continue
        
        fdc_id = search_response["foods"][0]["fdcId"]

        food_url = f"https://api.nal.usda.gov/fdc/v1/food/{fdc_id}?api_key={api_key}"
        food_data = requests.get(food_url).json()

        nutrients_list = []
        for nutrient_data in food_data.get("foodNutrients", []):
            try:
                name = nutrient_data["nutrient"]["name"]
                amount = nutrient_data["amount"]
                unit = nutrient_data["nutrient"]["unitName"] 
                if ( unit == 'mg'):
                    amount = amount * 1000
                if ( unit == 'g'):
                    amount = amount * 1000000
                if (name in daily_values.keys()):
                    nutrients_list.append({
                        "name": name,
                        "amount": (amount/daily_values[name]),
                        "unit": unit
                    })
            
            except KeyError as e:
                continue 
            
            except Exception as e:
                continue

        nutrients_list_result.append({item: nutrients_list})

    return  nutrients_list_result

In [103]:
df1 = data_make(ingredients, daily_values)
df2 = pd.DataFrame()


In [106]:

for product_dict in df1:
    product_name = list(product_dict.keys())[0]
    nutrients = product_dict[product_name]
    
    for nutrient in nutrients:
        col_name = f"{nutrient['name']}"
        df2.loc[product_name, col_name] = nutrient['amount']

df2 = df2.fillna(0)
df2.to_csv('data/nutritions_facts.csv')
df2

Unnamed: 0,Protein,Total lipid (fat),"Carbohydrate, by difference",Total Sugars,"Fiber, total dietary","Calcium, Ca","Iron, Fe","Magnesium, Mg","Phosphorus, P","Potassium, K",...,Niacin,Vitamin B-6,"Folate, total","Choline, total",Vitamin K (phylloquinone),"Folate, DFE","Vitamin B-12, added",Cholesterol,"Fatty acids, total saturated","Manganese, Mn"
almond,0.4140,0.675641,0.076727,0.00880,0.342857,0.202308,0.226667,0.633333,0.4032,0.157660,...,0.243125,0.048824,0.000105,0.022522,0.000000,0.1050,0.0,0.000000,0.2110,0.0
amaretto,0.0000,0.128205,0.121200,0.06666,0.000000,0.000000,0.000000,0.000000,0.0000,0.000000,...,0.000000,0.000000,0.000000,0.000000,0.000000,0.0000,0.0,0.000000,0.1665,0.0
anchovy,0.5780,0.124487,0.000000,0.00000,0.000000,0.178462,0.257222,0.164286,0.2016,0.115745,...,1.243750,0.119412,0.000032,0.036957,0.100833,0.0325,0.0,0.283333,0.1100,0.0
anise,0.3520,0.203846,0.181891,0.00000,0.521429,0.496923,2.053333,0.404762,0.3520,0.306596,...,0.191250,0.382353,0.000025,0.000000,0.000000,0.0250,0.0,0.000000,0.0293,1.0
apple,0.0000,0.008333,0.051964,0.02078,0.114286,0.000000,0.012778,0.000000,0.0000,0.023404,...,0.000000,0.000000,0.000000,0.000000,0.000000,0.0000,0.0,0.000000,0.0000,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
yellow squash,0.0200,0.000000,0.010909,0.00400,0.035714,0.007692,0.020000,0.000000,0.0000,0.000000,...,0.000000,0.000000,0.000000,0.000000,0.000000,0.0000,0.0,0.000000,0.0000,0.0
yogurt,0.0704,0.045128,0.016036,0.00616,0.000000,0.101538,0.000000,0.030952,0.0000,0.032766,...,0.000000,0.000000,0.000000,0.000000,0.000000,0.0000,0.0,0.050000,0.1100,0.0
yuca,0.0236,0.000000,0.106945,0.00236,0.125000,0.009231,0.070556,0.000000,0.0000,0.075106,...,0.000000,0.000000,0.000000,0.000000,0.000000,0.0000,0.0,0.000000,0.0000,0.0
zucchini,0.0210,0.000000,0.015309,0.00632,0.039286,0.016154,0.024444,0.000000,0.0000,0.047234,...,0.000000,0.000000,0.000000,0.000000,0.000000,0.0000,0.0,0.000000,0.0000,0.0


Similar Recipes

In [100]:
recipe_urls = []
for _, row in df.iterrows():
    if pd.notna(row['title']):
        recipe_urls.append({
            "title": row["title"],
            "rating": row["rating"],
            "url": f"https://www.epicurious.com/recipes/food/views/{row['title'].replace(' ', '-').lower()}"
        })

AttributeError: 'int' object has no attribute 'replace'

In [None]:
recipe_urls_df = pd.DataFrame(recipe_urls)
recipe_urls_df.to_csv("data/similar_recipes.csv", index=False)