In [8]:
import csv
# Location of the manually cleaned survey CSV; the preprocessing cell loads it
# with pd.read_csv(datafile).
# NOTE(review): the "/content/drive/MyDrive" prefix looks like a mounted Google
# Drive in Colab -- confirm the drive is mounted before running this notebook.
datafile = (
    "/content/drive/MyDrive/University/Courses/CSC311/Project/manual_cleaned_data_universal.csv"
)

Preprocessing:

In [35]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import OneHotEncoder, MultiLabelBinarizer, MinMaxScaler

# Build the feature matrix X and label vector t from the cleaned survey CSV.
#
# Pipeline: drop incomplete rows -> min-max scale the numeric answers ->
# multi-hot encode the multiple-choice answers -> one-hot encode the cleaned
# text answers -> concatenate everything column-wise.
#
# NOTE(review): the scaler and every encoder below are fit on ALL retained rows,
# before any train/test split, so held-out rows influence the scaling ranges
# and the category vocabularies. Consider fitting on the training rows only.

# Load the cleaned data
df = pd.read_csv(datafile)

# --- Step 0: Identify rows to keep (no NaN in relevant columns) ---
# Every column the model consumes (numerical, MC, text, and label)
numerical_cols = [
    'Q1: From a scale 1 to 5, how complex is it to make this food? (Where 1 is the most simple, and 5 is the most complex)',
    'Q2 Cleaned',
    'Q4 Cleaned'
]
mc_columns = [
    'Q3: In what setting would you expect this food to be served? Please check all that apply',
    'Q7: When you think about this food item, who does it remind you of?',
    'Q8: How much hot sauce would you add to this food item?'
]
text_columns = ['Q5 Cleaned', 'Q6 Cleaned']
label_col = 'Label'

# A row is kept only if none of these columns is NaN
columns_to_check = numerical_cols + mc_columns + text_columns + [label_col]

# Boolean mask over df; df_clean keeps the original index of the surviving rows
mask = df[columns_to_check].notna().all(axis=1)
df_clean = df[mask].copy()

# Labels as a numpy array, row-aligned with df_clean
t = df_clean[label_col].to_numpy()

# --- Step 1: Normalize Numerical Features ---
# Min-max scale each numerical column independently
scaler = MinMaxScaler()
X_numerical = scaler.fit_transform(df_clean[numerical_cols])

# --- Step 2: Multi-Hot Encode Multiple-Choice (MC) Columns ---
# Keep each fitted binarizer (keyed by column name) so the identical encoding
# can later be applied to unseen rows and the learned classes can be inspected;
# previously only the binarizer of the last column survived the loop.
mc_encoders = {}
encoded_mc = []
for col in mc_columns:
    # Split comma-separated answers into lists of whitespace-stripped options
    split_data = df_clean[col].str.split(',').apply(lambda x: [item.strip() for item in x])

    # One indicator column per distinct option seen in this question
    mlb = MultiLabelBinarizer()
    encoded = mlb.fit_transform(split_data)
    mc_encoders[col] = mlb
    encoded_mc.append(encoded)

# Combine all MC encoded arrays
X_mc = np.hstack(encoded_mc)

# --- Step 3: One-Hot Encode Text-Based Columns ---
# Same bookkeeping as above: retain every fitted encoder. handle_unknown='ignore'
# only has an effect when a fitted encoder is reused on new data.
text_encoders = {}
encoded_text = []
for col in text_columns:
    # One indicator column per distinct cleaned text response
    ohe = OneHotEncoder(sparse_output=False, handle_unknown='ignore')
    encoded = ohe.fit_transform(df_clean[[col]])
    text_encoders[col] = ohe
    encoded_text.append(encoded)

# Combine all text encoded arrays
X_text = np.hstack(encoded_text)

# --- Step 4: Combine All Features into Final Matrix X (as numpy array) ---
X = np.hstack([X_numerical, X_mc, X_text])

# Sanity check: every feature row must have exactly one label
assert X.shape[0] == t.shape[0], "X and t must have the same number of rows"

print(X[1,:])
print(f"Final data matrix X shape: {X.shape}")
print(f"Labels shape: {t.shape}")
print(f"Number of rows removed due to NaN: {len(df) - len(df_clean)}")

[0.5        0.25       0.20454545 0.         0.         1.
 1.         1.         1.         1.         0.         0.
 0.         0.         0.         0.         1.         0.
 0.         0.         0.         1.         0.         0.
 0.         0.         0.         0.         0.         0.
 0.         0.         0.         0.         0.         0.
 0.         0.         0.         0.         0.         0.
 0.         0.         0.         0.         0.         0.
 0.         0.         0.         0.         0.         0.
 0.         0.         0.         0.         0.         0.
 0.         0.         0.         0.         0.         0.
 0.         0.         0.         0.         0.         0.
 0.         0.         0.         0.         0.         0.
 0.         0.         0.         0.         0.         0.
 0.         0.         0.         0.         0.         0.
 0.         0.         0.         0.         0.         0.
 0.         0.         0.         0.         0.         

Train-test split:

In [36]:
from sklearn.model_selection import train_test_split
# Hold out exactly 160 rows for testing; all remaining rows (633 when X has
# 793 rows) form the training/validation pool X_tv / t_tv.
# test_size is given as an absolute row count rather than the fraction 160/793:
# the fraction hard-codes the expected number of rows, so the test set would no
# longer be 160 rows if NaN filtering leaves a different total.
# random_state is fixed so the shuffle is reproducible.
X_tv, X_test, t_tv, t_test = train_test_split(X, t, test_size=160, random_state=1)

Sklearn prediction:

In [37]:
from sklearn.discriminant_analysis import QuadraticDiscriminantAnalysis
# Fit a quadratic discriminant analysis classifier with default settings on the
# training/validation pool; fit() returns the fitted estimator, so construction
# and fitting are chained.
model = QuadraticDiscriminantAnalysis().fit(X_tv, t_tv)

# score() reports mean accuracy on the held-out test rows.
print(f"Accuracy: {model.score(X_test, t_test)}")

Accuracy: 0.5875


