In [10]:
import pandas as pd
from sklearn import linear_model
from sklearn.pipeline import Pipeline
#from sklearn.model_selection import cross_validate
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import precision_recall_fscore_support, confusion_matrix, classification_report, accuracy_score

In [11]:
# Load the training and test splits from the shared data directory.
train_path = '../data/train.csv'
test_path = '../data/test.csv'
train_data = pd.read_csv(train_path)
test_data = pd.read_csv(test_path)

In [12]:
# Separate the input text from the labels.
# data is the text the model learns from (title and ingredients joined by a space);
# labels is the class the model should predict.
# A missing title or ingredients value would turn the whole concatenation into NaN,
# which TfidfVectorizer rejects as an invalid document, so missing text is
# replaced with an empty string before joining.
data = train_data['titre'].fillna('') + " " + train_data['ingredients'].fillna('')
labels = train_data['type']

In [13]:
# Build the text-classification pipeline:
#   1. 'tfidf' turns the raw text into numerical TF-IDF features.
#   2. 'clf' is a logistic-regression classifier trained on those features;
#      max_iter is raised so the solver can converge without a ConvergenceWarning.
tfidf_step = ('tfidf', TfidfVectorizer())
classifier_step = ('clf', linear_model.LogisticRegression(max_iter=3000))
pipeline = Pipeline(steps=[tfidf_step, classifier_step])

In [14]:
# Combine title and ingredients into a single text feature for the test data.
# A missing title or ingredients value would turn the whole concatenation into NaN,
# which the pipeline's TfidfVectorizer rejects at predict time, so missing text is
# replaced with an empty string before joining.
test_data_combined = test_data['titre'].fillna('') + " " + test_data['ingredients'].fillna('')
test_labels = test_data['type']

In [15]:
# Train the whole pipeline (vectorizer + classifier) on the combined training text.
pipeline.fit(X=data, y=labels)

In [16]:
# Classify every test document with the fitted pipeline.
test_predictions = pipeline.predict(X=test_data_combined)

In [17]:
# Score the predictions against the true test labels.
# Every metric below takes the same (true labels, predicted labels) pair.
eval_pair = (test_labels, test_predictions)
accuracy = accuracy_score(*eval_pair)
# Macro-averaged precision / recall / F1; the fourth returned value is discarded.
precision, recall, f1, _ = precision_recall_fscore_support(*eval_pair, average='macro')
conf_matrix = confusion_matrix(*eval_pair)
class_report = classification_report(*eval_pair)

In [18]:
# Report each evaluation metric, one labelled entry per print call.
metric_lines = (
    ("Accuracy:", accuracy),
    ("Precision:", precision),
    ("Recall:", recall),
    ("F1-Score:", f1),
    ("Confusion Matrix:\n", conf_matrix),
    ("Classification Report:\n", class_report),
)
for metric_label, metric_value in metric_lines:
    print(metric_label, metric_value)

Accuracy: 0.845821325648415
Precision: 0.8418236297312877
Recall: 0.823089683269015
F1-Score: 0.8290653835031199
Confusion Matrix:
 [[405   1   1]
 [  5 198 134]
 [  3  70 571]]
Classification Report:
                 precision    recall  f1-score   support

       Dessert       0.98      1.00      0.99       407
        Entrée       0.74      0.59      0.65       337
Plat principal       0.81      0.89      0.85       644

      accuracy                           0.85      1388
     macro avg       0.84      0.82      0.83      1388
  weighted avg       0.84      0.85      0.84      1388



In [None]:
# The max_iter argument of the pipeline's LogisticRegression could be omitted,
# but that produces a ConvergenceWarning. Therefore we explicitly pass a larger
# value to avoid the warning.
# Loop over a range of fold counts for cross-validation,
# scoring each run with both metrics (accuracy and macro F1).
#scoring = ['accuracy', 'f1_macro']  

# for depth in range(6, 13):
#     cv_scores = cross_validate(pipeline, data, labels, cv=depth, scoring=scoring)
#     print(f"Folds: {depth}")
#     print("CV Accuracy:", cv_scores['test_accuracy'])
#     print("Average CV Accuracy:", cv_scores['test_accuracy'].mean())
#     print("CV Macro F1 Scores:", cv_scores['test_f1_macro'])
#     print("Average CV Macro F1 Score:", cv_scores['test_f1_macro'].mean())
#     print("###########################")
# 10 folds give us back the best accuracy
