In [50]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, SimpleRNN, Dense
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from sklearn.metrics import confusion_matrix, classification_report


In [38]:
# Load training data
train_data = pd.read_csv('../data/train.csv')

# Combine titles and ingredients into a single text feature for each recipe
train_texts = train_data['titre'] + " " + train_data['ingredients']
train_labels = train_data['type']

# Load test data
test_data = pd.read_csv('../data/test.csv')
test_texts = test_data['titre'] + " " + test_data['ingredients']
test_labels = test_data['type']

In [39]:

# Encode labels
label_encoder = LabelEncoder()
train_labels_encoded = label_encoder.fit_transform(train_labels)
test_labels_encoded = label_encoder.transform(test_labels)  # Use the same encoder to ensure consistency

In [40]:
# Tokenize and pad sequences
tokenizer = Tokenizer(num_words=5000)
tokenizer.fit_on_texts(train_texts)

In [41]:
train_sequences = tokenizer.texts_to_sequences(train_texts)
X_train = pad_sequences(train_sequences, maxlen=100)

In [42]:
test_sequences = tokenizer.texts_to_sequences(test_texts)
X_test = pad_sequences(test_sequences, maxlen=100)

In [44]:
# Build the RNN model
model = Sequential([
    Embedding(input_dim=5000, output_dim=64),
    SimpleRNN(64),
    Dense(64, activation='relu'),
    Dense(len(label_encoder.classes_), activation='softmax')
])

In [45]:
model.compile(optimizer='adam', loss='sparse_categorical_crossentropy', metrics=['accuracy'])

In [46]:
model.fit(X_train, train_labels_encoded, epochs=5, batch_size=32)

Epoch 1/5
[1m390/390[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m5s[0m 10ms/step - accuracy: 0.6149 - loss: 0.7955
Epoch 2/5
[1m390/390[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 10ms/step - accuracy: 0.7649 - loss: 0.4719
Epoch 3/5
[1m390/390[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 10ms/step - accuracy: 0.8581 - loss: 0.3375
Epoch 4/5
[1m390/390[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 10ms/step - accuracy: 0.9152 - loss: 0.2292
Epoch 5/5
[1m390/390[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 10ms/step - accuracy: 0.9514 - loss: 0.1399


<keras.src.callbacks.history.History at 0x71fbb01f0e80>

In [54]:
# Make predictions on the test data
test_predictions = model.predict(X_test)
test_predictions_classes = np.argmax(test_predictions, axis=1)

# Generate the confusion matrix
conf_matrix = confusion_matrix(test_labels_encoded, test_predictions_classes)
class_report = classification_report(test_labels_encoded, test_predictions_classes)

print(conf_matrix)
print(class_report)


[1m44/44[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 3ms/step
[[402   5   0]
 [ 13 125 199]
 [ 11  98 535]]
              precision    recall  f1-score   support

           0       0.94      0.99      0.97       407
           1       0.55      0.37      0.44       337
           2       0.73      0.83      0.78       644

    accuracy                           0.77      1388
   macro avg       0.74      0.73      0.73      1388
weighted avg       0.75      0.77      0.75      1388

