<a href="https://colab.research.google.com/github/apoorwa46/FoodRecall-NLP-Research-Project/blob/main/Food_Recall_Prediction.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:

# ==============================
# STEP 0: Setup
# ==============================
!pip install pandas scikit-learn tensorflow openpyxl joblib

import pandas as pd
import numpy as np
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import classification_report, accuracy_score
from sklearn.feature_extraction.text import TfidfVectorizer
import tensorflow as tf
from tensorflow import keras
import joblib

# ==============================
# STEP 1: Mount Google Drive
# ==============================
# The workbooks are read from Drive paths, so Drive is attached first.
from google.colab import drive
drive.mount('/content/drive')

# ==============================
# STEP 2: Load Excel Files
# ==============================
train_file = "/content/drive/MyDrive/FoodRecall/file1.xlsx"
val_file   = "/content/drive/MyDrive/FoodRecall/file2.xlsx"
unseen_file = "/content/drive/MyDrive/FoodRecall/file3.xlsx"

# Row 0 of the two labelled sheets is discarded unconditionally; it is presumed
# to be a repeated header row (TODO confirm against the workbooks). The unseen
# sheet is kept exactly as read.
df_train = pd.read_excel(train_file).drop(index=0).reset_index(drop=True)
df_val = pd.read_excel(val_file).drop(index=0).reset_index(drop=True)
df_unseen = pd.read_excel(unseen_file)

# Quick sanity check of (rows, columns) for each split.
for split_name, split_frame in (
    ("Training Data:", df_train),
    ("Validation Data:", df_val),
    ("Unseen Data:", df_unseen),
):
    print(split_name, split_frame.shape)

# ==============================
# STEP 3: Preprocess Data
# ==============================
target_col = "Task1_Label"
# pandas names a column "Unnamed: <n>" when its header cell is blank; this one
# is used as the document text below (NOTE(review): confirm against the sheet).
text_col = "Unnamed: 1"

# Features and target.
# fillna("") comes first because astype(str) alone turns a missing cell into
# the literal string "nan", which TF-IDF would then count as a real word.
X_train_text = df_train[text_col].fillna("").astype(str)
y_train = df_train[target_col]

X_val_text = df_val[text_col].fillna("").astype(str)
y_val = df_val[target_col]

# Encode labels as integer class ids. The encoder is fitted on the training
# labels only, so transform() raises ValueError if validation contains a label
# that never appears in training.
label_encoder = LabelEncoder()
y_train = label_encoder.fit_transform(y_train)
y_val = label_encoder.transform(y_val)

# Convert text → TF-IDF features. Vocabulary and IDF weights are learned from
# the training text only; validation is projected onto that vocabulary.
vectorizer = TfidfVectorizer(max_features=5000)
X_train = vectorizer.fit_transform(X_train_text)
X_val = vectorizer.transform(X_val_text)

# Save encoder + vectorizer so predictions can be reproduced without refitting.
joblib.dump(label_encoder, "/content/drive/MyDrive/FoodRecall/label_encoder.pkl")
joblib.dump(vectorizer, "/content/drive/MyDrive/FoodRecall/tfidf_vectorizer.pkl")

# ==============================
# STEP 4: Build Model
# ==============================
# Seed Python, NumPy and TensorFlow in one call so weight initialisation and
# batch shuffling are repeatable between runs (GPU kernels may still introduce
# small nondeterminism).
keras.utils.set_random_seed(42)

num_features = X_train.shape[1]            # width of the TF-IDF matrix
num_classes = len(label_encoder.classes_)  # one output unit per encoded label

# keras.Input replaces InputLayer(input_shape=...): the `input_shape` keyword
# is deprecated in Keras 3, while keras.Input is accepted by Sequential in both
# Keras 2 and Keras 3.
model = keras.Sequential([
    keras.Input(shape=(num_features,)),
    keras.layers.Dense(128, activation='relu'),
    keras.layers.Dense(64, activation='relu'),
    keras.layers.Dense(num_classes, activation='softmax')
])

# The sparse_* loss takes integer class ids, which is what LabelEncoder
# produced for y_train / y_val, so no one-hot encoding is needed.
model.compile(optimizer='adam',
              loss='sparse_categorical_crossentropy',
              metrics=['accuracy'])

# ==============================
# STEP 5: Train Model
# ==============================
# The TF-IDF matrices are SciPy sparse; they are converted to dense arrays
# before being handed to Keras. Each split is densified once and reused for
# both training and evaluation.
X_train_dense = X_train.toarray()
X_val_dense = X_val.toarray()

history = model.fit(
    X_train_dense, y_train,
    validation_data=(X_val_dense, y_val),
    epochs=10,
    batch_size=32
)

# Save model (weights as of the final epoch).
model.save("/content/drive/MyDrive/FoodRecall/food_model.h5")

# ==============================
# STEP 6: Evaluate Model
# ==============================
val_pred = model.predict(X_val_dense)
val_pred_labels = np.argmax(val_pred, axis=1)

# LabelEncoder assigns ids 0..n_classes-1 in the order of classes_. Passing
# them explicitly keeps the report aligned with target_names even when a class
# is missing from both y_val and the predictions (otherwise
# classification_report raises ValueError). zero_division=0 reports 0.0 for a
# class that is never predicted without emitting UndefinedMetricWarning.
class_ids = np.arange(len(label_encoder.classes_))

print("Validation Accuracy:", accuracy_score(y_val, val_pred_labels))
print("Classification Report:\n", classification_report(
    y_val,
    val_pred_labels,
    labels=class_ids,
    target_names=label_encoder.classes_,
    zero_division=0,
))

# ==============================
# STEP 7: Use Model on Unseen Data
# ==============================
# Prefer a column literally named "text"; otherwise fall back to the column
# used for the training text.
if "text" in df_unseen.columns:
    X_unseen_text = df_unseen["text"].astype(str)
else:
    X_unseen_text = df_unseen[text_col].astype(str)

# Project onto the vocabulary learned from the training split (no refit).
X_unseen = vectorizer.transform(X_unseen_text)

# Class probabilities -> most likely class id -> original label.
unseen_pred = model.predict(X_unseen.toarray())
unseen_labels = label_encoder.inverse_transform(unseen_pred.argmax(axis=1))

# Store the predictions next to the source rows and preview them.
df_unseen["Predicted_Label"] = unseen_labels
print(df_unseen.head())

# Persist the annotated sheet to Drive (row index omitted).
df_unseen.to_excel(
    "/content/drive/MyDrive/FoodRecall/unseen_predictions.xlsx",
    index=False,
)
print("Predictions saved to Drive ✅")


Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).
Training Data: (3172, 9)
Validation Data: (357, 9)
Unseen Data: (1005, 2)




Epoch 1/10
[1m100/100[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 12ms/step - accuracy: 0.8457 - loss: 0.6443 - val_accuracy: 0.9524 - val_loss: 0.1513
Epoch 2/10
[1m100/100[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 10ms/step - accuracy: 0.9644 - loss: 0.1143 - val_accuracy: 0.9356 - val_loss: 0.1451
Epoch 3/10
[1m100/100[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 10ms/step - accuracy: 0.9825 - loss: 0.0537 - val_accuracy: 0.9440 - val_loss: 0.1434
Epoch 4/10
[1m100/100[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 10ms/step - accuracy: 0.9966 - loss: 0.0238 - val_accuracy: 0.9356 - val_loss: 0.1386
Epoch 5/10
[1m100/100[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 10ms/step - accuracy: 0.9983 - loss: 0.0096 - val_accuracy: 0.9468 - val_loss: 0.1432
Epoch 6/10
[1m100/100[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 10ms/step - accuracy: 0.9997 - loss: 0.0041 - val_accuracy: 0.9468 - val_loss: 0.1566
Epoch 7/10
[1m100/100



[1m12/12[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 7ms/step
Validation Accuracy: 0.9439775910364145
Classification Report:
                             precision    recall  f1-score   support

               Food Recall       0.97      0.95      0.96       176
Foodborne Disease Outbreak       0.93      0.96      0.94       176
                   Neither       0.00      0.00      0.00         5

                  accuracy                           0.94       357
                 macro avg       0.63      0.64      0.63       357
              weighted avg       0.93      0.94      0.94       357

[1m32/32[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 3ms/step
   docid                                               text  \
0     49  The previously reported outbreak of Salmonella...   
1   5147  Two California women are suing Don Antonio's, ...   
2   5002  Oct. 28 update: As of Tuesday night, Oct. 27, ...   
3   1603  Canadian food safety officials are investigati...

In [None]:
# ==============================
# STEP 0: Setup
# ==============================
!pip install pandas scikit-learn openpyxl joblib

import pandas as pd
import numpy as np
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import classification_report, accuracy_score
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
import joblib

# ==============================
# STEP 1: Mount Google Drive
# ==============================
# All input and output paths below are on Drive.
from google.colab import drive
drive.mount('/content/drive')

# ==============================
# STEP 2: Load Excel Files
# ==============================
train_file = "/content/drive/MyDrive/FoodRecall/file1.xlsx"
val_file   = "/content/drive/MyDrive/FoodRecall/file2.xlsx"
unseen_file = "/content/drive/MyDrive/FoodRecall/file3.xlsx"

df_train, df_val, df_unseen = (
    pd.read_excel(workbook) for workbook in (train_file, val_file, unseen_file)
)

# The first row of each labelled sheet is always removed (no check is made);
# it is presumed to be a duplicated header row — TODO confirm. The unseen
# sheet is left untouched.
df_train, df_val = (
    labelled.drop(index=0).reset_index(drop=True)
    for labelled in (df_train, df_val)
)

# Report (rows, columns) per split.
for heading, frame in zip(
    ("Training Data:", "Validation Data:", "Unseen Data:"),
    (df_train, df_val, df_unseen),
):
    print(heading, frame.shape)

# ==============================
# STEP 3: Preprocess Data
# ==============================
target_col = "Task1_Label"
# "Unnamed: <n>" is the name pandas gives a column with a blank header cell;
# it is used as the text column here (NOTE(review): confirm against the sheet).
text_col = "Unnamed: 1"

# Features and target.
# Missing text is replaced by "" before the string cast; otherwise astype(str)
# produces the literal token "nan", which would enter the TF-IDF vocabulary.
X_train_text = df_train[text_col].fillna("").astype(str)
y_train = df_train[target_col]

X_val_text = df_val[text_col].fillna("").astype(str)
y_val = df_val[target_col]

# Encode labels as integer ids. Fitted on training labels only, so a
# validation label unseen in training makes transform() raise ValueError.
label_encoder = LabelEncoder()
y_train = label_encoder.fit_transform(y_train)
y_val = label_encoder.transform(y_val)

# Convert text → TF-IDF features (fit on train, reuse for validation).
vectorizer = TfidfVectorizer(max_features=5000)
X_train = vectorizer.fit_transform(X_train_text)
X_val = vectorizer.transform(X_val_text)

# Save encoder + vectorizer for later reuse.
joblib.dump(label_encoder, "/content/drive/MyDrive/FoodRecall/label_encoder.pkl")
joblib.dump(vectorizer, "/content/drive/MyDrive/FoodRecall/tfidf_vectorizer.pkl")

# ==============================
# METHOD 1: Logistic Regression
# ==============================
print("\n=== Training Logistic Regression Model ===")

# Train model directly on the sparse TF-IDF matrix.
log_reg = LogisticRegression(max_iter=200)
log_reg.fit(X_train, y_train)

# Evaluate on the held-out validation split.
y_pred_val = log_reg.predict(X_val)

# LabelEncoder ids run 0..n_classes-1 in the order of classes_. Supplying them
# keeps the report rows aligned with target_names even if a class is absent
# from both y_val and the predictions (classification_report would otherwise
# raise ValueError). zero_division=0 yields 0.0 for classes that are never
# predicted without the UndefinedMetricWarning the default setting emits.
class_ids = np.arange(len(label_encoder.classes_))

print("Validation Accuracy:", accuracy_score(y_val, y_pred_val))
print("Classification Report:\n", classification_report(
    y_val,
    y_pred_val,
    labels=class_ids,
    target_names=label_encoder.classes_,
    zero_division=0,
))

# Save model
joblib.dump(log_reg, "/content/drive/MyDrive/FoodRecall/logistic_model.pkl")

# ==============================
# STEP 4: Predict on Unseen Data
# ==============================
# Read the unseen text from "text" when that column exists, else from the
# same column the training text came from.
if "text" in df_unseen.columns:
    X_unseen_text = df_unseen["text"].astype(str)
else:
    X_unseen_text = df_unseen[text_col].astype(str)

# Reuse the fitted vectorizer so features line up with the training vocabulary.
X_unseen = vectorizer.transform(X_unseen_text)

# Predicted class ids are mapped back to their original label strings.
unseen_pred = log_reg.predict(X_unseen)
predicted_names = label_encoder.inverse_transform(unseen_pred)
df_unseen["Predicted_Label_LogReg"] = predicted_names

# Write the annotated sheet to Drive without the row index.
df_unseen.to_excel(
    "/content/drive/MyDrive/FoodRecall/unseen_predictions_logreg.xlsx",
    index=False,
)
print("✅ Logistic Regression predictions saved successfully!")

# Preview the first few predictions.
print("\nSample predictions:")
print(df_unseen[["Predicted_Label_LogReg"]].head())


Mounted at /content/drive
Training Data: (3172, 9)
Validation Data: (357, 9)
Unseen Data: (1005, 2)

=== Training Logistic Regression Model ===
Validation Accuracy: 0.9523809523809523
Classification Report:
                             precision    recall  f1-score   support

               Food Recall       0.98      0.95      0.97       176
Foodborne Disease Outbreak       0.93      0.98      0.95       176
                   Neither       0.00      0.00      0.00         5

                  accuracy                           0.95       357
                 macro avg       0.64      0.64      0.64       357
              weighted avg       0.94      0.95      0.95       357



  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


✅ Logistic Regression predictions saved successfully!

Sample predictions:
       Predicted_Label_LogReg
0  Foodborne Disease Outbreak
1  Foodborne Disease Outbreak
2  Foodborne Disease Outbreak
3                 Food Recall
4  Foodborne Disease Outbreak


In [None]:
# ==============================
# STEP 0: Setup
# ==============================
!pip install pandas scikit-learn tensorflow openpyxl joblib

import pandas as pd
import numpy as np
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import classification_report, accuracy_score
import joblib
import tensorflow as tf
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, LSTM, Dense, Dropout

# ==============================
# STEP 1: Mount Google Drive
# ==============================
# Needed because every path below points into Drive.
from google.colab import drive
drive.mount('/content/drive')

# ==============================
# STEP 2: Load Excel Files
# ==============================
train_file = "/content/drive/MyDrive/FoodRecall/file1.xlsx"
val_file   = "/content/drive/MyDrive/FoodRecall/file2.xlsx"
unseen_file = "/content/drive/MyDrive/FoodRecall/file3.xlsx"


def _drop_first_row(frame):
    """Return `frame` without the row labelled 0, re-indexed from 0."""
    return frame.drop(0).reset_index(drop=True)


# The first row of the labelled sheets is removed unconditionally (no check
# is performed); presumably it repeats the header — TODO confirm. The unseen
# sheet is used as read.
df_train = _drop_first_row(pd.read_excel(train_file))
df_val = _drop_first_row(pd.read_excel(val_file))
df_unseen = pd.read_excel(unseen_file)

# Shapes as (rows, columns).
print("Training Data:", df_train.shape)
print("Validation Data:", df_val.shape)
print("Unseen Data:", df_unseen.shape)

# ==============================
# STEP 3: Preprocess Data
# ==============================
target_col = "Task1_Label"
# "Unnamed: <n>" is what pandas calls a column whose header cell is blank;
# used here as the text column (NOTE(review): confirm against the sheet).
text_col = "Unnamed: 1"

# Features and target.
# fillna("") before the cast: astype(str) on its own converts a missing cell
# to the literal string "nan", which the tokenizer would learn as a word.
X_train_text = df_train[text_col].fillna("").astype(str)
y_train = df_train[target_col]

X_val_text = df_val[text_col].fillna("").astype(str)
y_val = df_val[target_col]

# Encode labels as integer ids. The encoder only knows training labels, so
# transform() raises ValueError for a validation label it has not seen.
label_encoder = LabelEncoder()
y_train = label_encoder.fit_transform(y_train)
y_val = label_encoder.transform(y_val)

# Save encoder for reuse
joblib.dump(label_encoder, "/content/drive/MyDrive/FoodRecall/label_encoder.pkl")

# ==============================
# METHOD 2: LSTM Deep Learning
# ==============================
print("\n=== Training LSTM Deep Learning Model ===")

# Word index is learned from the training text only; words outside it are
# mapped to the "<OOV>" token.
tokenizer = Tokenizer(num_words=10000, oov_token="<OOV>")
tokenizer.fit_on_texts(X_train_text)

# Unseen sheet: use its "text" column when present, else the training text column.
unseen_text_col = "text" if "text" in df_unseen.columns else text_col
X_unseen_text = df_unseen[unseen_text_col].astype(str)

# Text -> lists of integer word ids, one list per document.
X_train_seq, X_val_seq, X_unseen_seq = (
    tokenizer.texts_to_sequences(texts)
    for texts in (X_train_text, X_val_text, X_unseen_text)
)

# Bring every sequence to the same length, padding with zeros at the end.
max_len = 100
X_train_pad, X_val_pad, X_unseen_pad = (
    pad_sequences(seqs, maxlen=max_len, padding='post')
    for seqs in (X_train_seq, X_val_seq, X_unseen_seq)
)

# Seed Python, NumPy and TensorFlow together so initial weights and batch
# order are repeatable (GPU kernels may still add small nondeterminism).
tf.keras.utils.set_random_seed(42)

# Build model.
# - The input length is declared with an explicit Input instead of
#   Embedding(input_length=...), which is deprecated in Keras 3.
# - input_dim is read from the tokenizer so the embedding table always matches
#   the vocabulary cap used when the sequences were produced.
lstm_model = Sequential([
    tf.keras.Input(shape=(max_len,), dtype="int32"),
    Embedding(input_dim=tokenizer.num_words, output_dim=64),
    LSTM(64, return_sequences=False),   # only the final hidden state is used
    Dropout(0.5),
    Dense(64, activation='relu'),
    Dense(len(label_encoder.classes_), activation='softmax')
])

# The sparse_* loss takes the integer class ids produced by LabelEncoder.
lstm_model.compile(optimizer='adam', loss='sparse_categorical_crossentropy', metrics=['accuracy'])

# Train
history = lstm_model.fit(
    X_train_pad, y_train,
    validation_data=(X_val_pad, y_val),
    epochs=5,
    batch_size=32
)

# Evaluate
val_pred = lstm_model.predict(X_val_pad)
val_labels = np.argmax(val_pred, axis=1)

# LabelEncoder ids run 0..n_classes-1 in the order of classes_. Supplying them
# keeps the report aligned with target_names even if a class is missing from
# both y_val and the predictions (classification_report would otherwise raise
# ValueError). zero_division=0 reports 0.0 for never-predicted classes without
# emitting UndefinedMetricWarning.
class_ids = np.arange(len(label_encoder.classes_))

print("\nValidation Accuracy:", accuracy_score(y_val, val_labels))
print("Classification Report:\n", classification_report(
    y_val,
    val_labels,
    labels=class_ids,
    target_names=label_encoder.classes_,
    zero_division=0,
))

# Save model and tokenizer (the tokenizer is required to encode new text the
# same way at inference time).
lstm_model.save("/content/drive/MyDrive/FoodRecall/lstm_food_model.h5")
joblib.dump(tokenizer, "/content/drive/MyDrive/FoodRecall/tokenizer.pkl")

# ==============================
# STEP 4: Predict on Unseen Data
# ==============================
# Class probabilities -> index of the most likely class -> original label.
unseen_pred = lstm_model.predict(X_unseen_pad)
unseen_class_ids = unseen_pred.argmax(axis=1)
unseen_labels = label_encoder.inverse_transform(unseen_class_ids)

df_unseen["Predicted_Label_LSTM"] = unseen_labels

# Write the annotated sheet to Drive without the row index.
df_unseen.to_excel(
    "/content/drive/MyDrive/FoodRecall/unseen_predictions_lstm.xlsx",
    index=False,
)
print("✅ LSTM model predictions saved successfully!")

# Preview the first few predictions.
print("\nSample predictions:")
print(df_unseen[["Predicted_Label_LSTM"]].head())


Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).
Training Data: (3172, 9)
Validation Data: (357, 9)
Unseen Data: (1005, 2)

=== Training LSTM Deep Learning Model ===
Epoch 1/5




[1m100/100[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m9s[0m 56ms/step - accuracy: 0.4693 - loss: 0.9013 - val_accuracy: 0.5574 - val_loss: 0.7553
Epoch 2/5
[1m100/100[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m10s[0m 54ms/step - accuracy: 0.5125 - loss: 0.7596 - val_accuracy: 0.7143 - val_loss: 0.6886
Epoch 3/5
[1m100/100[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m6s[0m 56ms/step - accuracy: 0.7828 - loss: 0.5886 - val_accuracy: 0.8711 - val_loss: 0.3835
Epoch 4/5
[1m100/100[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m7s[0m 66ms/step - accuracy: 0.8889 - loss: 0.3530 - val_accuracy: 0.8627 - val_loss: 0.3701
Epoch 5/5
[1m100/100[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m5s[0m 53ms/step - accuracy: 0.8985 - loss: 0.3321 - val_accuracy: 0.8627 - val_loss: 0.3835
[1m12/12[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 29ms/step


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))



Validation Accuracy: 0.8627450980392157
Classification Report:
                             precision    recall  f1-score   support

               Food Recall       0.96      0.78      0.86       176
Foodborne Disease Outbreak       0.80      0.97      0.87       176
                   Neither       0.00      0.00      0.00         5

                  accuracy                           0.86       357
                 macro avg       0.59      0.58      0.58       357
              weighted avg       0.87      0.86      0.86       357

[1m32/32[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 24ms/step
✅ LSTM model predictions saved successfully!

Sample predictions:
         Predicted_Label_LSTM
0  Foodborne Disease Outbreak
1  Foodborne Disease Outbreak
2  Foodborne Disease Outbreak
3                 Food Recall
4  Foodborne Disease Outbreak
