File reading:

In [2]:
DATASET_PATH = "/content/drive/MyDrive/University/Courses/CSC311/Project/manual_cleaned_data_universal.csv"

kNN imputation with Gower distance:

In [6]:
import pandas as pd
import numpy as np

def gower_distance(row1, row2, num_cols, cat_cols, ranges):
    """Calculate Gower distance between two rows"""
    distances = []
    valid_features = 0

    # Numerical features
    for col in num_cols:
        if not (pd.isna(row1[col]) or pd.isna(row2[col])):
            diff = abs(row1[col] - row2[col])
            norm_diff = diff / ranges[col]
            distances.append(norm_diff)
            valid_features += 1

    # Categorical features
    for col in cat_cols:
        if not (pd.isna(row1[col]) or pd.isna(row2[col])):
            distances.append(0 if row1[col] == row2[col] else 1)
            valid_features += 1

    return sum(distances) / valid_features if valid_features > 0 else np.inf

def knn_impute(df, num_cols, cat_cols, k=5):
    """Custom kNN imputation using Gower distance"""
    # Calculate numerical ranges
    ranges = {col: df[col].max() - df[col].min() for col in num_cols}

    # Create copy to impute
    imputed_df = df.copy()

    # Find rows with missing values
    missing_rows = imputed_df[imputed_df.isnull().any(axis=1)].index

    for idx in missing_rows:
        target_row = imputed_df.loc[idx]
        distances = []

        # Calculate distances to all complete rows
        for other_idx, other_row in imputed_df.dropna().iterrows():
            dist = gower_distance(target_row, other_row, num_cols, cat_cols, ranges)
            distances.append((other_idx, dist))

        # Get k-nearest neighbors
        distances.sort(key=lambda x: x[1])
        neighbors = imputed_df.loc[[d[0] for d in distances[:k]]]

        # Impute missing values
        for col in imputed_df.columns:
            if pd.isna(target_row[col]):
                if col in num_cols:
                    # Impute mean for numerical
                    imputed_value = neighbors[col].mean()
                else:
                    # Impute mode for categorical
                    imputed_value = neighbors[col].mode()[0] if not neighbors[col].mode().empty else np.nan

                imputed_df.at[idx, col] = imputed_value

    return imputed_df


Preprocessing:


In [7]:
import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.preprocessing import OneHotEncoder, MinMaxScaler


# def process_intervals(x: str):
#     if x.isdigit():
#         return x
#     interval = x.split('-')
#     return interval[0]


# def min_max_scaling(series: pd.Series):
#     return (series - series.min()) / (series.max() - series.min())


def preprocess(file_path, normalize_and_onehot=False, mode="full", df_in=None):
    # If a DataFrame is provided, use it; otherwise read from file_path.
    if df_in is not None:
        df = df_in.copy()
    else:
        df = pd.read_csv(file_path, dtype=str)

    # Define columns
    num_cols = ['Q1: From a scale 1 to 5, how complex is it to make this food? (Where 1 is the most simple, and 5 is the most complex)',
                'Q2 Cleaned', 'Q4 Cleaned']
    cat_cols = ['Q3: In what setting would you expect this food to be served? Please check all that apply',
                'Q5 Cleaned', 'Q6 Cleaned',
                'Q7: When you think about this food item, who does it remind you of?',
                'Q8: How much hot sauce would you add to this food item?',
                'Label']

    # Convert numerical cols to numeric
    df[num_cols] = df[num_cols].apply(pd.to_numeric, errors='coerce')

    # Perform imputation
    imputed_df = knn_impute(df, num_cols, cat_cols, k=5)

    # Rename "Q5: What movie do you think of when thinking of this food item?","Q6: What drink would you pair with this food item?"
    # to "Q5 Cleaned" and "Q6 Cleaned"
    df.rename(columns={
        "Q2: How many ingredients would you expect this food item to contain?": "Q2 Cleaned",
        "Q4: How much would you expect to pay for one serving of this food item?": "Q4 Cleaned",
        "Q5: What movie do you think of when thinking of this food item?": "Q5 Cleaned",
        "Q6: What drink would you pair with this food item?": "Q6 Cleaned"
    }, inplace=True)
    # Convert all columns to string
    df = df.astype(str)

    # Record initial row count before dropping missing values
    initial_rows = len(df)

    # Drop rows with invalid values (N/A)
    df.replace("N/A", pd.NA, inplace=True)
    df.dropna(inplace=True)
    dropped_rows = initial_rows - len(df)
    print(f"Dropped {dropped_rows} rows out of {initial_rows} due to missing values.")

    # Bag-of-Words for Q5 and Q6
    vectorizer_q5 = CountVectorizer(max_features=100)  # Limit to top 100 features
    vectorizer_q6 = CountVectorizer(max_features=50)   # Limit to top 50 features

    bow_q5 = pd.DataFrame(vectorizer_q5.fit_transform(df["Q5 Cleaned"]).toarray(),
                            columns=vectorizer_q5.get_feature_names_out())
    bow_q6 = pd.DataFrame(vectorizer_q6.fit_transform(df["Q6 Cleaned"]).toarray(),
                            columns=vectorizer_q6.get_feature_names_out())
    print(f"Shape of bow_q5: {bow_q5.shape}")
    print(f"Shape of bow_q6: {bow_q6.shape}")

    if mode == "full":
        # Process numerical features
        numerical_cols = ["Q1: From a scale 1 to 5, how complex is it to make this food? (Where 1 is the most simple, and 5 is the most complex)",
                          "Q2 Cleaned", "Q4 Cleaned"]
        for col in numerical_cols:
            df[col] = pd.to_numeric(df[col], errors='coerce')
        df.dropna(subset=numerical_cols, inplace=True)

        scaler = MinMaxScaler()
        normalized_numerical = pd.DataFrame(scaler.fit_transform(df[numerical_cols]), columns=numerical_cols)
        print(f"Shape of normalized numerical features: {normalized_numerical.shape}")

        # One-hot encode all categorical features (including Label)
        categorical_cols = ["Q3: In what setting would you expect this food to be served? Please check all that apply",
                            "Q7: When you think about this food item, who does it remind you of?",
                            "Q8: How much hot sauce would you add to this food item?", "Label"]
        encoder = OneHotEncoder(sparse_output=False, handle_unknown='ignore')
        encoded_categorical = pd.DataFrame(encoder.fit_transform(df[categorical_cols]),
                                           columns=encoder.get_feature_names_out(categorical_cols))
        print(f"Shape of encoded categorical features: {encoded_categorical.shape}")

        final_df = pd.concat([df["id"], normalized_numerical, bow_q5, bow_q6, encoded_categorical], axis=1)
    elif mode == "softmax":
        # One-hot encode only the Label column
        encoder = OneHotEncoder(sparse_output=False, handle_unknown='ignore')
        encoded_label = pd.DataFrame(encoder.fit_transform(df[["Label"]]),
                                     columns=encoder.get_feature_names_out(["Label"]))
        final_df = pd.concat([df["id"], bow_q5, bow_q6, encoded_label], axis=1)
    else:
        raise ValueError("Unsupported mode. Use mode='full' or mode='softmax'.")

    final_df.dropna(inplace=True)  # Ensure no NaN values remain

    print(f"Preprocessed data shape: {final_df.shape}")
    print(f"Preprocessed data columns: {final_df.columns}")
    return final_df

Main:

In [8]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.discriminant_analysis import QuadraticDiscriminantAnalysis
import sys


DATASET_PATH = "/content/drive/MyDrive/University/Courses/CSC311/Project/manual_cleaned_data_universal.csv"
def main():


    # Use the preprocess function from utils/preprocess (bag-of-words for Q5 & Q6)
    df = preprocess(DATASET_PATH, mode="full")


    # Identify label columns (assumed to start with "Label")
    label_cols = [col for col in df.columns if col.startswith("Label")]
    # Features are all columns except "id" and label columns
    feature_cols = [col for col in df.columns if col not in ["id"] + label_cols]

    X = df[feature_cols].to_numpy()
    y = df[label_cols].to_numpy()

    # Convert one-hot encoded labels to class indices
    y = np.argmax(y, axis=1)

    print(f"Final data matrix X shape: {X.shape}")
    print(f"Labels shape: {y.shape}")

    # Train-test split (80% training, 20% test)
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=1)

    model = QuadraticDiscriminantAnalysis()
    model.fit(X_train, y_train)
    accuracy = model.score(X_test, y_test)
    print(f"Accuracy: {accuracy}")

if __name__ == '__main__':


    main()

Dropped 0 rows out of 1644 due to missing values.
Shape of bow_q5: (1644, 100)
Shape of bow_q6: (1644, 50)
Shape of normalized numerical features: (1637, 3)
Shape of encoded categorical features: (1637, 98)
Preprocessed data shape: (1630, 252)
Preprocessed data columns: Index(['id',
       'Q1: From a scale 1 to 5, how complex is it to make this food? (Where 1 is the most simple, and 5 is the most complex)',
       'Q2 Cleaned', 'Q4 Cleaned', 'about', 'aladdin', 'alone', 'and', 'anime',
       'any',
       ...
       'Q7: When you think about this food item, who does it remind you of?_Teachers,Strangers',
       'Q7: When you think about this food item, who does it remind you of?_nan',
       'Q8: How much hot sauce would you add to this food item?_A little (mild)',
       'Q8: How much hot sauce would you add to this food item?_A lot (hot)',
       'Q8: How much hot sauce would you add to this food item?_A moderate amount (medium)',
       'Q8: How much hot sauce would you add to thi



**Gower Distance for mixed data types:**


\begin{align*}
    \\
    \text{Numerical Features:}\\
    d_{\text{num}}(x, y) &= \frac{|x_i - y_i|}{\text{range}(i)} \quad \text{(Normalized by feature range)} \\
    \text{Categorical Features:}\\
    d_{\text{cat}}(x, y) &= \begin{cases}
        0 & \text{if } x_i = y_i \\
        1 & \text{otherwise}
    \end{cases} \\
    \text{Total Distance:}\\
    D_{\text{Gower}} &= \frac{\sum d_i}{\text{valid features count}}
\end{align*}

