In [1]:
import numpy as np
import pandas as pd
import sklearn
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.preprocessing import LabelEncoder
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics import accuracy_score
from scipy.stats import mode
from sklearn.pipeline import Pipeline
from google.colab import drive
import copy
import re

In [2]:
# load data from Google Drive and into a pandas dataframe
drive.mount('/content/drive')
datapath = "/content/drive/MyDrive/csc311_project/cleaned_data_combined_modified.csv"
df = pd.read_csv(datapath)
df.head()

Mounted at /content/drive


Unnamed: 0,id,"Q1: From a scale 1 to 5, how complex is it to make this food? (Where 1 is the most simple, and 5 is the most complex)",Q2: How many ingredients would you expect this food item to contain?,Q3: In what setting would you expect this food to be served? Please check all that apply,Q4: How much would you expect to pay for one serving of this food item?,Q5: What movie do you think of when thinking of this food item?,Q6: What drink would you pair with this food item?,"Q7: When you think about this food item, who does it remind you of?",Q8: How much hot sauce would you add to this food item?,Label
0,716549,3,6,"Week day lunch,At a party,Late night snack",5,Cloudy with a Chance of Meatballs,Coke,Friends,A little (mild),Pizza
1,715742,4,"bread, meet","Week day lunch,At a party,Late night snack",5$ for a large piece,All sort of american young boy movies,Coke,"Friends,Teachers,Strangers",,Pizza
2,727333,3,5,"Week day lunch,Week day dinner,Weekend lunch,W...",10dollar,action movie,cola,Friends,A moderate amount (medium),Pizza
3,606874,4,6-7,"Week day lunch,Week day dinner,Weekend lunch,W...",$3,Mamma Mia,Soda,"Siblings,Friends,Teachers",I will have some of this food item with my hot...,Pizza
4,505318,2,3 or more,"Week day lunch,Week day dinner,Weekend lunch,W...",$5,Cloudy with a chance of meatballs,Soda,"Siblings,Friends",A little (mild),Pizza


In [3]:
# split into X and y
y = df['Label'].copy()
X = df.drop(columns=['Label'])

## Approach 1: one string per response
This is because RFC typically expects a 2D matrix, not a 3D matrix.

In [4]:
X1 = copy.deepcopy(X)
# join all responses into one string
X1['altogether'] = X1.apply(lambda row: ' '.join(row.astype(str)), axis=1)
X1.head()

Unnamed: 0,id,"Q1: From a scale 1 to 5, how complex is it to make this food? (Where 1 is the most simple, and 5 is the most complex)",Q2: How many ingredients would you expect this food item to contain?,Q3: In what setting would you expect this food to be served? Please check all that apply,Q4: How much would you expect to pay for one serving of this food item?,Q5: What movie do you think of when thinking of this food item?,Q6: What drink would you pair with this food item?,"Q7: When you think about this food item, who does it remind you of?",Q8: How much hot sauce would you add to this food item?,altogether
0,716549,3,6,"Week day lunch,At a party,Late night snack",5,Cloudy with a Chance of Meatballs,Coke,Friends,A little (mild),"716549 3 6 Week day lunch,At a party,Late nigh..."
1,715742,4,"bread, meet","Week day lunch,At a party,Late night snack",5$ for a large piece,All sort of american young boy movies,Coke,"Friends,Teachers,Strangers",,"715742 4 bread, meet Week day lunch,At a party..."
2,727333,3,5,"Week day lunch,Week day dinner,Weekend lunch,W...",10dollar,action movie,cola,Friends,A moderate amount (medium),"727333 3 5 Week day lunch,Week day dinner,Week..."
3,606874,4,6-7,"Week day lunch,Week day dinner,Weekend lunch,W...",$3,Mamma Mia,Soda,"Siblings,Friends,Teachers",I will have some of this food item with my hot...,"606874 4 6-7 Week day lunch,Week day dinner,We..."
4,505318,2,3 or more,"Week day lunch,Week day dinner,Weekend lunch,W...",$5,Cloudy with a chance of meatballs,Soda,"Siblings,Friends",A little (mild),"505318 2 3 or more Week day lunch,Week day din..."


In [5]:
X1 = X1.drop(columns=['id', 'Q1: From a scale 1 to 5, how complex is it to make this food? (Where 1 is the most simple, and 5 is the most complex)', 'Q2: How many ingredients would you expect this food item to contain?',
                      'Q3: In what setting would you expect this food to be served? Please check all that apply', 'Q4: How much would you expect to pay for one serving of this food item?',
                      'Q5: What movie do you think of when thinking of this food item?', 'Q6: What drink would you pair with this food item?', 'Q7: When you think about this food item, who does it remind you of?',
                      'Q8: How much hot sauce would you add to this food item?'])
X1.columns

Index(['altogether'], dtype='object')

In [6]:
def tokenize(text):
    """ Return a list of the words resulting from splitting the string text by spaces,
    commas, semicolons, colons, dashes, and newlines. """
    return re.findall(r'\b\w+\b', text.lower())

In [7]:
X1['tokenized'] = X1['altogether'].apply(tokenize)
X1.head()

Unnamed: 0,altogether,tokenized
0,"716549 3 6 Week day lunch,At a party,Late nigh...","[716549, 3, 6, week, day, lunch, at, a, party,..."
1,"715742 4 bread, meet Week day lunch,At a party...","[715742, 4, bread, meet, week, day, lunch, at,..."
2,"727333 3 5 Week day lunch,Week day dinner,Week...","[727333, 3, 5, week, day, lunch, week, day, di..."
3,"606874 4 6-7 Week day lunch,Week day dinner,We...","[606874, 4, 6, 7, week, day, lunch, week, day,..."
4,"505318 2 3 or more Week day lunch,Week day din...","[505318, 2, 3, or, more, week, day, lunch, wee..."


In [8]:
vectorizer = CountVectorizer(tokenizer=lambda doc: doc, lowercase=False) # already tokenized based on our specifications so
X1_bow = vectorizer.fit_transform(X1['tokenized'])



In [9]:
bow_df = pd.DataFrame(X1_bow.toarray(), columns=vectorizer.get_feature_names_out())
bow_df.head()

Unnamed: 0,0,00,007,02,1,10,100,1001,10dollar,10ish,...,za,zero,zodiac,zohan,zootopia,ナミヤ雑貨店の奇蹟,一休さん,深夜食堂,米饭,紫菜
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,1,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


### 70/30 train-test split

In [10]:
X1_train, X1_test, y1_train, y1_test = train_test_split(X1_bow, y, test_size=0.3)

Test out the model

In [11]:
# initialize tree
rfc = RandomForestClassifier()

Automatically tune hyperparameters with GridSearch

In [17]:
param_grid = {
    'n_estimators': [50, 100, 200],  # no. trees in forest
    'criterion': ['entropy', 'gini'], # how good is a particular split
    'max_depth': [None, 10, 20],  # max depth of each tree
    'min_samples_split': [2, 5, 15],  # min no. samples to split node
    'min_samples_leaf': [1, 2, 4],  # min samples at a leaf node
    'bootstrap': [True, False]  # bootstrap yes/no (probably yes lol)
}

grid_search = GridSearchCV(estimator=rfc,
                           param_grid=param_grid,
                           cv=5,  # 5-fold cross-validation (but we won't implement this, just for exploration)
                           n_jobs=-1,
                           verbose=3,
                           scoring='accuracy')  # use accuracy for evaluation

grid_search.fit(X1_train, y1_train)

best_params = grid_search.best_params_
print(f"Best hyperparameters: {best_params}")

Fitting 5 folds for each of 324 candidates, totalling 1620 fits
Best hyperparameters: {'bootstrap': False, 'criterion': 'entropy', 'max_depth': 20, 'min_samples_leaf': 1, 'min_samples_split': 5, 'n_estimators': 50}


Now train the best performing model and evaluate

In [18]:
best_rfc1 = grid_search.best_estimator_
y1_pred = best_rfc1.predict(X1_test)

# evaluate accuracy
accuracy1 = accuracy_score(y1_test, y1_pred)
print(f"Test accuracy with best hyperparameters: {accuracy1:.4f}")

# print importance of each word
print('Feature importances:', best_rfc1.feature_importances_)

Test accuracy with best hyperparameters: 0.8381
Feature importances: [0.00000000e+00 1.46886670e-03 3.18413860e-04 ... 3.01188828e-05
 1.99858376e-05 0.00000000e+00]


In [28]:
rfc_gini = RandomForestClassifier(bootstrap=False, criterion='gini', max_depth=20, min_samples_leaf=1, min_samples_split=5, n_estimators=50)
rfc_gini.fit(X1_train, y1_train)
gini_pred = rfc_gini.predict(X1_test)
accuracy_gini = accuracy_score(y1_test, gini_pred)
print(f"Test accuracy with best hyperparameters: {accuracy_gini:.4f}")

Test accuracy with best hyperparameters: 0.8583


## Approach 2: one RFC per feature, combine votes from each RFC
Takes longer but considers each feature independently before combining ideas. Again a workaround because RFC typically expects a 2D matrix.

In [19]:
X2 = copy.deepcopy(X)
X2 = X2.astype(str)

In [20]:
def train_ultramega_rfc(X_train, y_train, X_test):
    """ Train an RFC based on each feature. Split text in the same way as the above, but do it within each feature.
    Collect votes across features and use the maximum vote to make the prediction.
    Also tune hyperparameters automatically :)
    """
    models = []
    predictions = []

    for column in X_train.columns:
        vectorizer = CountVectorizer(tokenizer=tokenize)
        rf_classifier = RandomForestClassifier()

        pipeline = Pipeline([
            ('vectorizer', vectorizer),
            ('rf', rf_classifier)
        ])

        # use grid search for hyperparameter tuning
        # I originally had more parameters here but I had to re-run it and it was taking
        # too long so I removed some
        param_grid = {
            'rf__n_estimators': [100],
            'rf__criterion': ['gini'],
            'rf__max_depth': [None],
            'rf__min_samples_split': [2, 10]
        }

        grid_search = GridSearchCV(pipeline, param_grid, cv=5, n_jobs=-1, verbose=2)
        grid_search.fit(X_train[column], y_train)

        # save the best model
        best_rf = grid_search.best_estimator_
        models.append(best_rf)

        # make predictions on the test data
        y_pred = best_rf.predict(X_test[column])
        predictions.append(y_pred)

    return models, predictions


### 70/30 train-test split

In [21]:
X2_train, X2_test, y2_train, y2_test = train_test_split(X2, y, test_size=0.3)
models, predictions = train_ultramega_rfc(X2_train, y2_train, X2_test)

Fitting 5 folds for each of 2 candidates, totalling 10 fits




Fitting 5 folds for each of 2 candidates, totalling 10 fits




Fitting 5 folds for each of 2 candidates, totalling 10 fits




Fitting 5 folds for each of 2 candidates, totalling 10 fits




Fitting 5 folds for each of 2 candidates, totalling 10 fits




Fitting 5 folds for each of 2 candidates, totalling 10 fits




Fitting 5 folds for each of 2 candidates, totalling 10 fits




Fitting 5 folds for each of 2 candidates, totalling 10 fits




Fitting 5 folds for each of 2 candidates, totalling 10 fits




Aggregate predictions

In [22]:
# convert predictions to a numpy array (each row is predictions for one test sample)
predictions = np.array(predictions)

final_predictions = []
for i in range(predictions.shape[1]):  # for each test sample
    # get the most frequent prediction (mode) for each test sample across all classifiers
    most_frequent_prediction = np.unique(predictions[:, i], return_counts=True)
    final_prediction = most_frequent_prediction[0][np.argmax(most_frequent_prediction[1])]  # mode
    final_predictions.append(final_prediction)

final_predictions = np.array(final_predictions)

In [23]:
accuracy = accuracy_score(y2_test, final_predictions)
print(f"Final Prediction Accuracy: {accuracy:.4f}")

Final Prediction Accuracy: 0.7611


try weighting predictions from each column

In [24]:
def trainon_col(X_train, y_train, X_test, y_test):
    """ Train one RFC on each column (Question), and then return all those predictions. """
    models = []
    predictions = []
    column_scores = []

    for column in X_train.columns:
        vectorizer = CountVectorizer(tokenizer=tokenize)
        rf_classifier = RandomForestClassifier()

        pipeline = Pipeline([
            ('vectorizer', vectorizer),
            ('rf', rf_classifier)
        ])

        # I originally had more parameters here but I had to re-run it and it was taking
        # too long so I removed some
        param_grid = {
            'rf__n_estimators': [100],
            'rf__criterion': ['gini'],
            'rf__max_depth': [None],
            'rf__min_samples_split': [2, 10]
        }

        grid_search = GridSearchCV(pipeline, param_grid, cv=5, n_jobs=-1, verbose=2)
        grid_search.fit(X_train[column], y_train)

        best_rf = grid_search.best_estimator_
        models.append(best_rf)

        # predict on test data for curr column
        y_pred = best_rf.predict(X_test[column])
        predictions.append(y_pred)

        # store accuracy of rfc for this column
        accuracy = accuracy_score(y_test, y_pred)
        column_scores.append(accuracy)

    return models, predictions, column_scores


In [25]:
# train rfc for each feature
models, predictions, column_scores = trainon_col(X2_train, y2_train, X2_test, y2_test)

Fitting 5 folds for each of 2 candidates, totalling 10 fits




Fitting 5 folds for each of 2 candidates, totalling 10 fits




Fitting 5 folds for each of 2 candidates, totalling 10 fits




Fitting 5 folds for each of 2 candidates, totalling 10 fits




Fitting 5 folds for each of 2 candidates, totalling 10 fits




Fitting 5 folds for each of 2 candidates, totalling 10 fits




Fitting 5 folds for each of 2 candidates, totalling 10 fits




Fitting 5 folds for each of 2 candidates, totalling 10 fits




Fitting 5 folds for each of 2 candidates, totalling 10 fits




In [26]:
label_encoder = LabelEncoder()
y_train_encoded = label_encoder.fit_transform(y2_train)
y_test_encoded = label_encoder.transform(y2_test)

preds = np.array(predictions)

# use column scores as weights
column_weights = np.array(column_scores) / np.sum(column_scores)

# weighted mode, use accuracy to score
weighted_preds = []
for i in range(preds.shape[1]):
    # str -> int
    enc_preds = label_encoder.transform(preds[:, i])
    weighted_vote = np.bincount(enc_preds, weights=column_weights)
    final_pred_idx = np.argmax(weighted_vote) # get index of mode

    # convert int back to OG label
    final_pred = label_encoder.inverse_transform([final_pred_idx])[0]
    weighted_preds.append(final_pred)

final_preds2 = np.array(weighted_preds)

accuracy = accuracy_score(y2_test, final_preds2)
print(f"Final Prediction Accuracy (with weighted voting): {accuracy:.4f}")

Final Prediction Accuracy (with weighted voting): 0.8603
