In [56]:
import pandas as pd
import numpy as np

df = pd.read_csv('./cleaned_data_combined.csv')
new_column_names = {
    "Q1: From a scale 1 to 5, how complex is it to make this food? (Where 1 is the most simple, and 5 is the most complex)": "1",
    "Q2: How many ingredients would you expect this food item to contain?": "2",
    "Q3: In what setting would you expect this food to be served? Please check all that apply": "3",
    "Q4: How much would you expect to pay for one serving of this food item?": "4",
    "Q5: What movie do you think of when thinking of this food item?": "5",
    "Q6: What drink would you pair with this food item?": "6",
    "Q7: When you think about this food item, who does it remind you of?": "7",
    "Q8: How much hot sauce would you add to this food item?": "8",
    "Label": "label"
}
df = df.rename(columns=new_column_names)

N = len(df)
unseen_size = int(0.1 * N)
indices = np.arange(N)

np.random.seed(42)
shuffled_idxs = np.random.permutation(indices)
unseen_idxs = shuffled_idxs[:unseen_size]
rest_idxs   = shuffled_idxs[unseen_size:]

# Save the unseen portion to a separate CSV
df_unseen = df.iloc[unseen_idxs].copy()
df_unseen.to_csv("unseen_data.csv", index=False)

df_clean = df.iloc[rest_idxs].copy()

def build_vocab(series):
    series = series.astype(str).str.lower().str.replace(r'[^a-z0-9 ]', '', regex=True)
    vocab = set()
    for entry in series.dropna():
        words = str(entry).split()
        vocab.update(words)
    return list(vocab)

def make_bow(series, vocab):
    X = np.zeros((len(series), len(vocab)), dtype=int)
    vocab_dict = {word: j for j, word in enumerate(vocab)}
    for i, entry in enumerate(series):
        words = set(str(entry).lower().split())
        for word in words:
            if word in vocab_dict:
                X[i, vocab_dict[word]] = 1
    return X

df_clean = df_clean.dropna(subset=['4']).copy()

df_clean['1'] = df_clean['1'].astype(int)

vocab_q2 = build_vocab(df_clean['2'])
vocab_q4 = build_vocab(df_clean['4'])
vocab_q5 = build_vocab(df_clean['5'])
vocab_q6 = build_vocab(df_clean['6'])

X_q2 = make_bow(df_clean['2'], vocab_q2)
X_q4 = make_bow(df_clean['4'], vocab_q4)
X_q5 = make_bow(df_clean['5'], vocab_q5)
X_q6 = make_bow(df_clean['6'], vocab_q6)

X_q3 = df_clean['3'].str.get_dummies(sep=",")
expected_q3_columns = list(X_q3.columns)

X_q7 = df_clean['7'].str.get_dummies(sep=",")
expected_q7_columns = list(X_q7.columns)

df_clean['8'] = df_clean['8'].astype("category").cat.codes

df_clean['label'] = df_clean['label'].astype("category").cat.codes
y = df_clean['label'].values

features = np.hstack([
    df_clean['1'].values.reshape(-1,1),
    X_q2,
    X_q3.values,
    X_q4,
    X_q5,
    X_q6,
    X_q7.values,
    df_clean['8'].values.reshape(-1,1)
])

std_devs = features.std(axis=0)
non_constant_cols = (std_devs != 0)
features = features[:, non_constant_cols]

features = np.nan_to_num(features)

X_mean = features.mean(axis=0)
X_std  = features.std(axis=0)
epsilon_norm = 1e-8
features = (features - X_mean)/(X_std + epsilon_norm)

N_rest = len(features)
idx2 = np.random.permutation(N_rest)
train_end = int(0.7 * N_rest)
val_end   = int(0.85 * N_rest)

X_train = features[idx2[:train_end]]
y_train = y[idx2[:train_end]]

X_val = features[idx2[train_end:val_end]]
y_val = y[idx2[train_end:val_end]]

X_test = features[idx2[val_end:]]
y_test = y[idx2[val_end:]]

print("X_train shape:", X_train.shape)
print("X_val   shape:", X_val.shape)
print("X_test  shape:", X_test.shape)
print("Unseen portion saved as 'unseen_data.csv' with shape:", df_unseen.shape)

np.savez("vocabularies.npz",
    vocab_q2=np.array(vocab_q2, dtype=object),
    vocab_q4=np.array(vocab_q4, dtype=object),
    vocab_q5=np.array(vocab_q5, dtype=object),
    vocab_q6=np.array(vocab_q6, dtype=object),
    expected_q3_columns=np.array(expected_q3_columns, dtype=object),
    expected_q7_columns=np.array(expected_q7_columns, dtype=object)
)

np.savez("mlp_params.npz",
    non_constant_columns=non_constant_cols,
    X_mean=X_mean,
    X_std=X_std
)

print("Done building features and saving 'unseen_data.csv'!")


X_train shape: (1035, 1882)
X_val   shape: (222, 1882)
X_test  shape: (222, 1882)
Unseen portion saved as 'unseen_data.csv' with shape: (164, 10)
Done building features and saving 'unseen_data.csv'!


Logistic Regression

In [57]:

import numpy as np

def softmax(z):
    shifted = z - np.max(z, axis=1, keepdims=True)
    exps = np.exp(shifted)
    return exps / np.sum(exps, axis=1, keepdims=True)

def cross_entropy_loss(y_true, y_pred):
    eps = 1e-15
    n = y_true.shape[0]
    clipped = np.clip(y_pred, eps, 1 - eps)
    y_one_hot = np.zeros_like(y_pred)
    y_one_hot[np.arange(n), y_true] = 1
    return -np.sum(y_one_hot * np.log(clipped)) / n

def train_multiclass_logreg(X_train, y_train, lr=0.01, epochs=1000, reg=0.0):
    n_samples, n_features = X_train.shape
    n_classes = len(np.unique(y_train))

    X_bias = np.hstack((np.ones((n_samples, 1)), X_train))

    W = np.zeros((n_features + 1, n_classes))

    for epoch in range(epochs):
        logits = X_bias @ W  # shape: (n_samples, n_classes)
        probs = softmax(logits)

        loss = cross_entropy_loss(y_train, probs)
        if reg > 0.0:
            loss += 0.5 * reg * np.sum(W[1:]**2)  # exclude bias term

        y_one_hot = np.zeros_like(probs)
        y_one_hot[np.arange(n_samples), y_train] = 1
        grad = X_bias.T @ (probs - y_one_hot) / n_samples  # shape: (n_features+1, n_classes)
        if reg > 0.0:
            grad[1:] += reg * W[1:]  # exclude bias term

        W -= lr * grad

        if epoch % 100 == 0:
            print(f"Epoch {epoch}/{epochs}, Loss={loss:.6f}")

    return W

def predict_multiclass_logreg(X, W):
    n_samples = X.shape[0]
    X_bias = np.hstack((np.ones((n_samples,1)), X))
    logits = X_bias @ W
    probs = softmax(logits)
    return np.argmax(probs, axis=1)


print("Training multi-class logistic regression...")
W_trained = train_multiclass_logreg(X_train, y_train, lr=0.01, epochs=2000, reg=0.0)

train_preds = predict_multiclass_logreg(X_train, W_trained)
train_acc = np.mean(train_preds == y_train)
print("Train Accuracy:", train_acc)

val_preds = predict_multiclass_logreg(X_val, W_trained)
val_acc = np.mean(val_preds == y_val)
print("Validation Accuracy:", val_acc)

test_preds = predict_multiclass_logreg(X_test, W_trained)
test_acc = np.mean(test_preds == y_test)
print("Test Accuracy:", test_acc)



Training multi-class logistic regression...
Epoch 0/2000, Loss=1.098612
Epoch 100/2000, Loss=0.354030
Epoch 200/2000, Loss=0.232913
Epoch 300/2000, Loss=0.182194
Epoch 400/2000, Loss=0.153468
Epoch 500/2000, Loss=0.134570
Epoch 600/2000, Loss=0.120983
Epoch 700/2000, Loss=0.110630
Epoch 800/2000, Loss=0.102408
Epoch 900/2000, Loss=0.095676
Epoch 1000/2000, Loss=0.090033
Epoch 1100/2000, Loss=0.085213
Epoch 1200/2000, Loss=0.081032
Epoch 1300/2000, Loss=0.077359
Epoch 1400/2000, Loss=0.074099
Epoch 1500/2000, Loss=0.071179
Epoch 1600/2000, Loss=0.068542
Epoch 1700/2000, Loss=0.066145
Epoch 1800/2000, Loss=0.063953
Epoch 1900/2000, Loss=0.061939
Train Accuracy: 0.991304347826087
Validation Accuracy: 0.8783783783783784
Test Accuracy: 0.8918918918918919


# Decision Tree

In [58]:
class Node:
    def __init__(self, feature=None, threshold=None, left=None, right=None, value=None):
        self.feature   = feature
        self.threshold = threshold
        self.left      = left
        self.right     = right
        self.value     = value

class DecisionTree:
    def __init__(self, max_depth=None, min_samples_split=2, min_samples_leaf=1, criterion='gini'):
        self.max_depth         = max_depth
        self.min_samples_split = min_samples_split
        self.min_samples_leaf  = min_samples_leaf
        self.criterion         = criterion
        self.root = None

    def fit(self, X, y):
        self.root = self._build_tree(X, y, depth=0)

    def _build_tree(self, X, y, depth):
        if len(set(y))==1:
            return Node(value=y[0])
        if self.max_depth and depth>=self.max_depth:
            return Node(value=max(set(y), key=list(y).count))
        if len(y) < self.min_samples_split:
            return Node(value=max(set(y), key=list(y).count))

        best_feat, best_thresh = self._best_split(X, y)
        if best_feat is None:
            return Node(value=max(set(y), key=list(y).count))

        left_idx = [i for i in range(len(X)) if X[i][best_feat]<best_thresh]
        right_idx= [i for i in range(len(X)) if X[i][best_feat]>=best_thresh]

        if len(left_idx)<self.min_samples_leaf or len(right_idx)<self.min_samples_leaf:
            return Node(value=max(set(y), key=list(y).count))

        left_child  = self._build_tree([X[i] for i in left_idx],
                                       [y[i] for i in left_idx],
                                       depth+1)
        right_child = self._build_tree([X[i] for i in right_idx],
                                       [y[i] for i in right_idx],
                                       depth+1)
        return Node(best_feat, best_thresh, left_child, right_child)

    def _best_split(self, X, y):
        best_gini = float('inf')
        best_feat = None
        best_thr  = None
        for feat in range(len(X[0])):
            all_vals = set(row[feat] for row in X)
            for t in all_vals:
                l_y = [y[i] for i in range(len(X)) if X[i][feat]<t]
                r_y = [y[i] for i in range(len(X)) if X[i][feat]>=t]
                g   = self._gini_impurity(l_y, r_y)
                if g<best_gini:
                    best_gini = g
                    best_feat = feat
                    best_thr  = t
        return best_feat, best_thr

    def _gini_impurity(self, ly, ry):
        def gini(arr):
            cnts = [arr.count(c) for c in set(arr)]
            total= len(arr)
            return 1 - sum((c/total)**2 for c in cnts)
        n = len(ly)+len(ry)
        gl= gini(ly)
        gr= gini(ry)
        return (len(ly)/n)*gl + (len(ry)/n)*gr

    def predict(self, X):
        return [self._predict_sample(x, self.root) for x in X]

    def _predict_sample(self, x, node):
        if node.value is not None:
            return node.value
        if x[node.feature]<node.threshold:
            return self._predict_sample(x, node.left)
        return self._predict_sample(x, node.right)

    def accuracy(self, X, y):
        preds = self.predict(X)
        return np.mean([p==t for p,t in zip(preds,y)])

print("Starting DecisionTree training (multi-class)...")
dt = DecisionTree(max_depth=10, min_samples_split=10, min_samples_leaf=2, criterion='gini')
dt.fit(X_train.tolist(), y_train.tolist())
train_acc_dt = dt.accuracy(X_train.tolist(), y_train.tolist())
test_acc_dt  = dt.accuracy(X_test.tolist(),  y_test.tolist())
print("DecisionTree Train Acc:", train_acc_dt)
print("DecisionTree Test Acc:",  test_acc_dt)
np.savez("decision_tree.npz", tree=dt.root)

Starting DecisionTree training (multi-class)...
DecisionTree Train Acc: 0.8966183574879227
DecisionTree Test Acc: 0.7927927927927928


MLPClassifier

In [59]:
class MLPClassifier:
    def __init__(self, hidden_layer_sizes=(150,), activation='relu', solver='adam',
                 max_iter=200, learning_rate=0.001, batch_size=32, random_state=None,
                 beta1=0.9, beta2=0.999, epsilon=1e-8):
        self.hidden_layer_sizes = hidden_layer_sizes
        self.activation = activation
        self.solver = solver
        self.max_iter = max_iter
        self.learning_rate = learning_rate
        self.batch_size = batch_size
        self.beta1 = beta1
        self.beta2 = beta2
        self.epsilon = epsilon
        if random_state is not None:
            np.random.seed(random_state)

        self.weights = []
        self.biases = []
        self.n_layers = len(hidden_layer_sizes) + 1
        self.n_classes = None
        self.initialized = False

        self.m_weights = []
        self.v_weights = []
        self.m_biases  = []
        self.v_biases  = []
        self.t = 0

    def _initialize_parameters(self, n_features, n_classes):
        self.n_classes = n_classes
        layer_sizes = [n_features] + list(self.hidden_layer_sizes) + [n_classes]
        for i in range(1, len(layer_sizes)):
            scale = np.sqrt(2.0 / layer_sizes[i-1])
            W = np.random.randn(layer_sizes[i-1], layer_sizes[i]) * scale
            b = np.zeros(layer_sizes[i])
            self.weights.append(W)
            self.biases.append(b)
            if self.solver == 'adam':
                self.m_weights.append(np.zeros_like(W))
                self.v_weights.append(np.zeros_like(W))
                self.m_biases.append(np.zeros_like(b))
                self.v_biases.append(np.zeros_like(b))
        self.initialized = True

    def _activation_function(self, z, derivative=False):
        if self.activation == 'relu':
            if not derivative:
                return np.maximum(0, z)
            return (z > 0).astype(float)
        elif self.activation == 'tanh':
            if not derivative:
                return np.tanh(z)
            return 1 - np.tanh(z)**2
        else:
            raise ValueError("Unsupported activation function")

    def _softmax(self, z):
        exps = np.exp(z - np.max(z, axis=1, keepdims=True))
        return exps / np.sum(exps, axis=1, keepdims=True)

    def _forward_pass(self, X):
        activations = [X]
        pre_acts    = []
        for i in range(self.n_layers - 1):
            z = activations[-1] @ self.weights[i] + self.biases[i]
            pre_acts.append(z)
            a = self._activation_function(z)
            activations.append(a)
        z_out = activations[-1] @ self.weights[-1] + self.biases[-1]
        pre_acts.append(z_out)
        out = self._softmax(z_out)
        activations.append(out)
        return activations, pre_acts

    def _compute_cost(self, y_true, y_pred):
        m = y_true.shape[0]
        y_true_1hot = np.zeros((m, self.n_classes))
        y_true_1hot[np.arange(m), y_true] = 1
        eps = 1e-15
        logp = -np.log(np.maximum(y_pred, eps)) * y_true_1hot
        return np.sum(logp)/m

    def _backpropagation(self, X, y, activations, pre_acts):
        m = X.shape[0]
        y_1hot = np.zeros((m, self.n_classes))
        y_1hot[np.arange(m), y] = 1

        dW = [None]*self.n_layers
        db = [None]*self.n_layers

        delta = activations[-1] - y_1hot
        dW[-1] = activations[-2].T @ delta / m
        db[-1] = np.mean(delta, axis=0)

        for i in range(self.n_layers - 2, -1, -1):
            delta = (delta @ self.weights[i+1].T) * self._activation_function(pre_acts[i], derivative=True)
            dW[i] = activations[i].T @ delta / m
            db[i] = np.mean(delta, axis=0)
        return dW, db

    def _update_params(self, dW, db):
        self.t += 1
        if self.solver == 'sgd':
            for i in range(self.n_layers):
                self.weights[i] -= self.learning_rate * dW[i]
                self.biases[i]  -= self.learning_rate * db[i]
        elif self.solver == 'adam':
            for i in range(self.n_layers):
                self.m_weights[i] = self.beta1*self.m_weights[i] + (1-self.beta1)*dW[i]
                self.v_weights[i] = self.beta2*self.v_weights[i] + (1-self.beta2)*(dW[i]**2)

                mw_corr = self.m_weights[i]/(1-self.beta1**self.t)
                vw_corr = self.v_weights[i]/(1-self.beta2**self.t)
                self.weights[i] -= self.learning_rate*mw_corr/(np.sqrt(vw_corr)+self.epsilon)

                self.m_biases[i] = self.beta1*self.m_biases[i] + (1-self.beta1)*db[i]
                self.v_biases[i] = self.beta2*self.v_biases[i] + (1-self.beta2)*(db[i]**2)

                mb_corr = self.m_biases[i]/(1-self.beta1**self.t)
                vb_corr = self.v_biases[i]/(1-self.beta2**self.t)
                self.biases[i] -= self.learning_rate*mb_corr/(np.sqrt(vb_corr)+self.epsilon)

    def _create_mini_batches(self, X, y):
        m = X.shape[0]
        idxs = np.random.permutation(m)
        X_shuf = X[idxs]
        y_shuf = y[idxs]
        n_full = m//self.batch_size
        mini_batches = []
        for i in range(n_full):
            start = i*self.batch_size
            end   = (i+1)*self.batch_size
            mini_batches.append((X_shuf[start:end], y_shuf[start:end]))
        if m%self.batch_size !=0:
            start = n_full*self.batch_size
            mini_batches.append((X_shuf[start:], y_shuf[start:]))
        return mini_batches

    def fit(self, X, y):
        n_samples, n_features = X.shape
        n_classes = len(np.unique(y))
        if not self.initialized:
            self._initialize_parameters(n_features, n_classes)

        for i in range(self.max_iter):
            batches = self._create_mini_batches(X, y)
            for (Xb, yb) in batches:
                acts, pre_acts = self._forward_pass(Xb)
                dW, db = self._backpropagation(Xb, yb, acts, pre_acts)
                self._update_params(dW, db)
            if i%10==0:
                acts_full, _ = self._forward_pass(X)
                loss = self._compute_cost(y, acts_full[-1])
                print(f"Iteration {i}/{self.max_iter}, Loss={loss:.6f}")
        return self

    def predict_proba(self, X):
        acts, _ = self._forward_pass(X)
        return acts[-1]

    def predict(self, X):
        probs = self.predict_proba(X)
        return np.argmax(probs, axis=1)

    def score(self, X, y):
        return np.mean(self.predict(X)==y)

    def save(self, fname="mlp_model.npz"):
        np.savez(fname,
            weights=np.array(self.weights, dtype=object),
            biases=np.array(self.biases,  dtype=object)
        )

print("Starting multi-class MLP training...")

mlp = MLPClassifier(
    hidden_layer_sizes=(150,),
    activation='relu',
    solver='adam',
    max_iter=200,
    learning_rate=0.001,
    batch_size=32,
    random_state=42
)
mlp.fit(X_train, y_train)
train_acc = mlp.score(X_train, y_train)
val_acc   = mlp.score(X_val,   y_val)
test_acc  = mlp.score(X_test,  y_test)
print("Train Accuracy:", train_acc)
print("Val Accuracy:",   val_acc)
print("Test Accuracy:",  test_acc)
mlp.save("mlp_model.npz")

Starting multi-class MLP training...
Iteration 0/200, Loss=0.320512
Iteration 10/200, Loss=0.014655
Iteration 20/200, Loss=0.004611
Iteration 30/200, Loss=0.002205
Iteration 40/200, Loss=0.001250
Iteration 50/200, Loss=0.000796
Iteration 60/200, Loss=0.000534
Iteration 70/200, Loss=0.000381
Iteration 80/200, Loss=0.000279
Iteration 90/200, Loss=0.000209
Iteration 100/200, Loss=0.000160
Iteration 110/200, Loss=0.000124
Iteration 120/200, Loss=0.000098
Iteration 130/200, Loss=0.000078
Iteration 140/200, Loss=0.000062
Iteration 150/200, Loss=0.000050
Iteration 160/200, Loss=0.000040
Iteration 170/200, Loss=0.000032
Iteration 180/200, Loss=0.000026
Iteration 190/200, Loss=0.000022
Train Accuracy: 1.0
Val Accuracy: 0.8063063063063063
Test Accuracy: 0.8243243243243243
