In [1]:
import numpy as np
import pandas as pd
import sklearn
from sklearn.ensemble import RandomForestClassifier as SklearnRFC
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split as sk_train_test_split
from sklearn.preprocessing import MinMaxScaler
from sklearn.tree import DecisionTreeClassifier as SklearnDecisionTree
from sklearn.metrics import accuracy_score, confusion_matrix
from sklearn.datasets import make_classification
from google.colab import drive
import copy
from typing import Tuple, Any, Optional, Union, List
from collections import Counter
import random
import math
import pickle
import re

## Data preprocessing
Note that I did NOT include SelectKBest here.

In [2]:
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.metrics import accuracy_score, classification_report
from sklearn.feature_extraction.text import CountVectorizer

# 1 Load Data
drive.mount('/content/drive')
datapath = "/content/drive/MyDrive/csc311_project/cleaned_data_combined_modified.csv"
og_df = pd.read_csv(datapath)

Mounted at /content/drive


In [3]:
df = copy.deepcopy(og_df)
def extract_numeric(value):
    if pd.isnull(value):
        return None
    value = str(value).strip().lower()
    value = re.sub(r'[^\d\.\-]', ' ', value)
    value = re.sub(r'\s+', ' ', value).strip()

    if '-' in value:
        numbers = [float(num) for num in value.split('-') if num.strip().isdigit()]
        if numbers:
            return sum(numbers) / len(numbers)

    match = re.search(r'\d+(\.\d+)?', value)
    return float(match.group()) if match else None

numerical_columns = [
    "Q1: From a scale 1 to 5, how complex is it to make this food? (Where 1 is the most simple, and 5 is the most complex)",
    "Q2: How many ingredients would you expect this food item to contain?",
    "Q4: How much would you expect to pay for one serving of this food item?"
]

for col in numerical_columns:
    df[col] = df[col].apply(extract_numeric)

df[numerical_columns] = df[numerical_columns].fillna(df[numerical_columns].mean())

text_cols = ["Q3: In what setting would you expect this food to be served? Please check all that apply",
             "Q5: What movie do you think of when thinking of this food item?",
             "Q6: What drink would you pair with this food item?",
             "Q7: When you think about this food item, who does it remind you of?"]

df[text_cols] = df[text_cols].fillna("none").astype(str).apply(lambda x: x.str.lower().str.strip())

vectorizer_q3 = CountVectorizer(binary=True)
vectorizer_q5 = CountVectorizer(binary=True)
vectorizer_q6 = CountVectorizer(binary=True)
vectorizer_q7 = CountVectorizer(binary=True)

Q3_bow = vectorizer_q3.fit_transform(df["Q3: In what setting would you expect this food to be served? Please check all that apply"])
Q5_bow = vectorizer_q5.fit_transform(df["Q5: What movie do you think of when thinking of this food item?"])
Q6_bow = vectorizer_q6.fit_transform(df["Q6: What drink would you pair with this food item?"])
Q7_bow = vectorizer_q7.fit_transform(df["Q7: When you think about this food item, who does it remind you of?"])

df_q3_bow = pd.DataFrame(Q3_bow.toarray(), columns=[f"Q3_{word}" for word in vectorizer_q3.get_feature_names_out()])
df_q5_bow = pd.DataFrame(Q5_bow.toarray(), columns=[f"Q5_{word}" for word in vectorizer_q5.get_feature_names_out()])
df_q6_bow = pd.DataFrame(Q6_bow.toarray(), columns=[f"Q6_{word}" for word in vectorizer_q6.get_feature_names_out()])
df_q7_bow = pd.DataFrame(Q7_bow.toarray(), columns=[f"Q7_{word}" for word in vectorizer_q7.get_feature_names_out()])

df = pd.concat([df, df_q3_bow, df_q5_bow, df_q6_bow, df_q7_bow], axis=1)

df.drop(columns=text_cols, inplace=True)

hot_sauce_map = {
    "A little (mild)": "Mild",
    "A moderate amount (medium)": "Medium",
    "A lot (hot)": "Hot",
    "I will have some of this food item with my hot sauce": "Medium"
}

df["Q8_cleaned"] = df["Q8: How much hot sauce would you add to this food item?"].map(hot_sauce_map)
df["Q8_cleaned"].fillna("None", inplace=True)
df = pd.get_dummies(df, columns=["Q8_cleaned"])
df.drop(columns=["Q8: How much hot sauce would you add to this food item?"], inplace=True)

df.head()

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df["Q8_cleaned"].fillna("None", inplace=True)


Unnamed: 0,id,"Q1: From a scale 1 to 5, how complex is it to make this food? (Where 1 is the most simple, and 5 is the most complex)",Q2: How many ingredients would you expect this food item to contain?,Q4: How much would you expect to pay for one serving of this food item?,Label,Q3_at,Q3_day,Q3_dinner,Q3_late,Q3_lunch,...,Q7_friends,Q7_none,Q7_parents,Q7_siblings,Q7_strangers,Q7_teachers,Q8_cleaned_Hot,Q8_cleaned_Medium,Q8_cleaned_Mild,Q8_cleaned_None
0,716549,3.0,6.0,5.0,Pizza,1,1,0,1,1,...,1,0,0,0,0,0,False,False,True,False
1,715742,4.0,6.033034,5.0,Pizza,1,1,0,1,1,...,1,0,0,0,1,1,False,False,False,True
2,727333,3.0,5.0,10.0,Pizza,0,1,1,0,1,...,1,0,0,0,0,0,False,True,False,False
3,606874,4.0,6.5,3.0,Pizza,1,1,1,1,1,...,1,0,0,1,0,1,False,True,False,False
4,505318,2.0,3.0,5.0,Pizza,1,1,1,0,1,...,1,0,0,1,0,0,False,False,True,False


In [4]:
df = df.dropna()
for col in df.columns: # make all categorical (integers or strings or bools)
    if df[col].dtype == 'float64':
        df[col] = df[col].astype(int)
df.head()

Unnamed: 0,id,"Q1: From a scale 1 to 5, how complex is it to make this food? (Where 1 is the most simple, and 5 is the most complex)",Q2: How many ingredients would you expect this food item to contain?,Q4: How much would you expect to pay for one serving of this food item?,Label,Q3_at,Q3_day,Q3_dinner,Q3_late,Q3_lunch,...,Q7_friends,Q7_none,Q7_parents,Q7_siblings,Q7_strangers,Q7_teachers,Q8_cleaned_Hot,Q8_cleaned_Medium,Q8_cleaned_Mild,Q8_cleaned_None
0,716549,3,6,5,Pizza,1,1,0,1,1,...,1,0,0,0,0,0,False,False,True,False
1,715742,4,6,5,Pizza,1,1,0,1,1,...,1,0,0,0,1,1,False,False,False,True
2,727333,3,5,10,Pizza,0,1,1,0,1,...,1,0,0,0,0,0,False,True,False,False
3,606874,4,6,3,Pizza,1,1,1,1,1,...,1,0,0,1,0,1,False,True,False,False
4,505318,2,3,5,Pizza,1,1,1,0,1,...,1,0,0,1,0,0,False,False,True,False


In [6]:
# 2 Separate Features and Labels
X = df.drop(columns=["Label"])
y = df["Label"]

# 3 Identify Numeric and Text Columns
numeric_columns = [col for col in X.columns if X[col].dtype in ["int64", "float64"]]
bow_columns = [col for col in X.columns if col not in numeric_columns]

# 4 Apply TF-IDF Transformation
tfidf = TfidfTransformer()
X_tfidf = tfidf.fit_transform(X[bow_columns])
df_tfidf = pd.DataFrame(X_tfidf.toarray(), columns=bow_columns, index=X.index)

# 5 Min-Max Scaling for Numeric Features
# if numeric_columns:
#     scaler = MinMaxScaler()
#     df_scaled = pd.DataFrame(scaler.fit_transform(X[numeric_columns]), columns=numeric_columns, index=X.index)
#     X_final = pd.concat([df_tfidf, df_scaled], axis=1)
# else:
#     X_final = df_tfidf
X_final = pd.concat([df_tfidf, X[numeric_columns]], axis=1)
X_final.head()


Unnamed: 0,Q8_cleaned_Hot,Q8_cleaned_Medium,Q8_cleaned_Mild,Q8_cleaned_None,id,"Q1: From a scale 1 to 5, how complex is it to make this food? (Where 1 is the most simple, and 5 is the most complex)",Q2: How many ingredients would you expect this food item to contain?,Q4: How much would you expect to pay for one serving of this food item?,Q3_at,Q3_day,...,Q6_yogurt,Q6_you,Q6_yuzu,Q6_zero,Q7_friends,Q7_none,Q7_parents,Q7_siblings,Q7_strangers,Q7_teachers
0,0.0,0.0,1.0,0.0,716549,3,6,5,1,1,...,0,0,0,0,1,0,0,0,0,0
1,0.0,0.0,0.0,1.0,715742,4,6,5,1,1,...,0,0,0,0,1,0,0,0,1,1
2,0.0,1.0,0.0,0.0,727333,3,5,10,0,1,...,0,0,0,0,1,0,0,0,0,0
3,0.0,1.0,0.0,0.0,606874,4,6,3,1,1,...,0,0,0,0,1,0,0,1,0,1
4,0.0,0.0,1.0,0.0,505318,2,3,5,1,1,...,0,0,0,0,1,0,0,1,0,0


In [7]:
print(X_final['Q8_cleaned_Hot'].dtype)

float64


In [8]:
X_final = X_final.dropna() # need to do this or there will be bools instead of 0s and 1s
for col in X_final.columns: # make all categorical (integers or strings or bools)
    if X_final[col].dtype == 'float64':
        X_final[col] = X_final[col].astype(int)
X_final.head()

Unnamed: 0,Q8_cleaned_Hot,Q8_cleaned_Medium,Q8_cleaned_Mild,Q8_cleaned_None,id,"Q1: From a scale 1 to 5, how complex is it to make this food? (Where 1 is the most simple, and 5 is the most complex)",Q2: How many ingredients would you expect this food item to contain?,Q4: How much would you expect to pay for one serving of this food item?,Q3_at,Q3_day,...,Q6_yogurt,Q6_you,Q6_yuzu,Q6_zero,Q7_friends,Q7_none,Q7_parents,Q7_siblings,Q7_strangers,Q7_teachers
0,0,0,1,0,716549,3,6,5,1,1,...,0,0,0,0,1,0,0,0,0,0
1,0,0,0,1,715742,4,6,5,1,1,...,0,0,0,0,1,0,0,0,1,1
2,0,1,0,0,727333,3,5,10,0,1,...,0,0,0,0,1,0,0,0,0,0
3,0,1,0,0,606874,4,6,3,1,1,...,0,0,0,0,1,0,0,1,0,1
4,0,0,1,0,505318,2,3,5,1,1,...,0,0,0,0,1,0,0,1,0,0


In [9]:
# 7 Train-Test Split (Stratified Split)
X_train, X_test, y_train, y_test = sk_train_test_split(X_final, y, test_size=0.3, random_state=42, stratify=y)

In [10]:
print(type(X_train), type(X_test), type(y_train), type(y_test))

<class 'pandas.core.frame.DataFrame'> <class 'pandas.core.frame.DataFrame'> <class 'pandas.core.series.Series'> <class 'pandas.core.series.Series'>


In [11]:
y_train = np.asarray(y_train)
y_test = np.asarray(y_test)
X_train = np.asarray(X_train)
X_test = np.asarray(X_test)

In [12]:
print(type(X_test))

<class 'numpy.ndarray'>


In [13]:
X_train

array([[0, 1, 0, ..., 0, 0, 0],
       [0, 1, 0, ..., 0, 1, 0],
       [0, 0, 1, ..., 0, 1, 1],
       ...,
       [0, 0, 1, ..., 0, 0, 0],
       [0, 0, 1, ..., 0, 1, 0],
       [0, 0, 0, ..., 0, 0, 0]])

# How to Use the RFC
Import the RandomForestClassifier from RFC.py. Make sure it is in the same directory or you will need to change the import structure.

In [14]:
from RFC import RandomForestClassifier

## Training
To train the RFC, use the following calls and hyperparameters. **Don**'t set random_state, it decreases the accuracy considerably.

In [41]:
# training the imported RFC on the cleaned and preprocessed data matrix above
custom_rf = RandomForestClassifier(n_estimators=500, min_samples_split=10, max_samples=1.0)
custom_rf.fit(X_train, y_train)
custom_rf_preds = custom_rf.predict(X_test)

Compare to sklearn's

In [42]:
# test sklearn's RandomForestClassifier
sklearn_rf = SklearnRFC(n_estimators=500, min_samples_split=10, max_samples=1.0)
sklearn_rf.fit(X_train, y_train)  # Sklearn models can work with numpy arrays directly
sklearn_rf_preds = sklearn_rf.predict(X_test)

# compare the accuracies
custom_rf_accuracy = accuracy_score(y_test, custom_rf_preds)
sklearn_rf_accuracy = accuracy_score(y_test, sklearn_rf_preds)

# print the results
print(f"Custom Random Forest Accuracy: {custom_rf_accuracy:.4f}")
print(f"Sklearn Random Forest Accuracy: {sklearn_rf_accuracy:.4f}")


Custom Random Forest Accuracy: 0.8320
Sklearn Random Forest Accuracy: 0.8887


To *save* a trained RFC, use the following syntax:

In [43]:
custom_rf.save("RFC_pretrained.pkl")

Success! RFC exported to RFC_pretrained.pkl.


To *import* a pretrained RFC model, make the following calls:

In [46]:
# initialize the new RFC
new_rfc = RandomForestClassifier(n_estimators=500, min_samples_split=10, max_samples=1.0)
new_rfc.load_pretrained("RFC_pretrained.pkl")

Success! Pre-trained RFC loaded from RFC_pretrained.pkl.


To run predictions on the pretrained model, use the following call. Note that in our script, the test data matrix should have the same columns as the training one.

In [47]:
new_rfc_preds = new_rfc.predict(X_test)

Comparison with sklearn just as a sanity check

In [48]:
# compare to sklearn
# Compare the accuracies
new_rf_accuracy = accuracy_score(y_test, new_rfc_preds)
sklearn_rf_accuracy = accuracy_score(y_test, sklearn_rf_preds)

# Print the results
print(f"Custom Random Forest Accuracy: {new_rf_accuracy:.4f}")
print(f"Sklearn Random Forest Accuracy: {sklearn_rf_accuracy:.4f}")

Custom Random Forest Accuracy: 0.8320
Sklearn Random Forest Accuracy: 0.8887
