# In [None]:
# Mount Google Drive (Colab-only) so files under /content/drive can be read.
from google.colab import drive
drive.mount('/content/drive')



# In [None]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler, LabelEncoder, OneHotEncoder
from mlxtend.frequent_patterns import apriori, association_rules
from mlxtend.preprocessing import TransactionEncoder
import tensorflow as tf
from tensorflow.keras.models import Model
from tensorflow.keras.layers import Input, Embedding, Flatten, Dense, Concatenate, Dropout, Lambda, BatchNormalization, Reshape, MultiHeadAttention, LayerNormalization
from tensorflow.keras.optimizers import Adam
from sklearn.model_selection import train_test_split, RandomizedSearchCV, KFold
from sklearn.cluster import KMeans
from transformers import BertTokenizer, TFBertModel
from scipy.stats import uniform
import matplotlib.pyplot as plt
import seaborn as sns
from collections import Counter
from tensorflow.keras.callbacks import EarlyStopping, ReduceLROnPlateau
from scipy.spatial.distance import cosine

# Load the data
# Only the first 25,000 rows are used. `nrows` stops the parser there instead
# of reading the whole file and slicing it afterwards, and the resulting frame
# owns its data (it is not a slice of a larger frame). Raise `nrows` to use
# more data if available.
df = pd.read_csv(
    "/content/drive/My Drive/food_classes_edited_twice.csv",
    na_values=["<NA>", "nan", "Nill", "Nil"],
    nrows=25000,
)

# Data Preprocessing
def preprocess_data(df):
    """Impute, encode and scale the raw transaction table.

    The frame is modified in place and the same object is returned.

    Steps, in order:
      1. Forward-fill 'uom_criteria' and 'conversion'; fill missing
         'price_new' and 'price_uom' with the column mean.
      2. Drop 'dob_new', 'age_group' and 'Unnamed: 0'.
      3. Label-encode the categorical columns (one encoder fit per column).
      4. One-hot encode 'nova' into 'nova_0', 'nova_1', ... and drop 'nova'.
      5. Keep the unscaled price in 'original_price', then standardize
         'price_new', 'conversion' and 'price_uom'.

    Args:
        df: DataFrame containing the columns named above.

    Returns:
        The same DataFrame, after the transformations.
    """
    # Assign the filled column back rather than calling
    # `df[col].fillna(..., inplace=True)`: that form acts on a temporary
    # selection (it is deprecated and has no effect under Copy-on-Write), and
    # `fillna(method='ffill')` is deprecated in favour of `ffill()`.
    for col in ('uom_criteria', 'conversion'):
        df[col] = df[col].ffill()
    for col in ('price_new', 'price_uom'):
        df[col] = df[col].fillna(df[col].mean())
    df.drop(['dob_new', 'age_group', 'Unnamed: 0'], axis=1, inplace=True)

    categorical_cols = ['item_type', 'class_name', 'subclass_name', 'customer_type', 'standard_uom', 'class_name_uom']
    for col in categorical_cols:
        df[col] = LabelEncoder().fit_transform(df[col])

    # One-hot encode the 'nova' column.
    # `sparse` was renamed to `sparse_output` in scikit-learn 1.2 and later
    # removed; fall back to the old keyword on older releases.
    try:
        ohe = OneHotEncoder(sparse_output=False)
    except TypeError:
        ohe = OneHotEncoder(sparse=False)
    nova_encoded = ohe.fit_transform(df[['nova']])
    nova_columns = [f'nova_{i}' for i in range(nova_encoded.shape[1])]
    df[nova_columns] = nova_encoded
    df.drop('nova', axis=1, inplace=True)

    # Preserve the real price before scaling; it is needed for budget checks.
    df['original_price'] = df['price_new']
    scaler = StandardScaler()
    numerical_cols = ['price_new', 'conversion', 'price_uom']
    df[numerical_cols] = scaler.fit_transform(df[numerical_cols])
    return df

# Feature Engineering
def engineer_features(df):
    """Add derived feature columns to ``df`` (in place) and return it.

    Columns added:
      * 'price_per_unit': 'price_new' / 'conversion'. Non-finite results
        (division by a zero or missing 'conversion') are set to 0.0 so the
        feature never carries inf/NaN into model training.
      * 'health_score': 3 for nova_0, 2 for nova_1, 1 for nova_2, 0 for nova_3.
      * 'price_category': quintile of 'price_new', labelled 1..5.
      * 'is_brand': 1 if 'description' contains "brand" (case-insensitive),
        else 0; missing descriptions count as 0.

    Args:
        df: DataFrame with 'price_new', 'conversion', 'nova_0'..'nova_3'
            and 'description' columns.

    Returns:
        The same DataFrame with the four columns added.
    """
    ratio = df['price_new'] / df['conversion']
    df['price_per_unit'] = ratio.replace([np.inf, -np.inf], np.nan).fillna(0.0)

    df['health_score'] = df['nova_0'] * 3 + df['nova_1'] * 2 + df['nova_2'] * 1 + df['nova_3'] * 0

    bin_labels = [1, 2, 3, 4, 5]
    try:
        df['price_category'] = pd.qcut(df['price_new'], q=5, labels=bin_labels)
    except ValueError:
        # Heavily repeated prices make the quantile edges non-unique, which
        # qcut rejects. Ranking with ties broken by order of appearance gives
        # distinct values, so five bins can always be formed.
        ranked_prices = df['price_new'].rank(method='first')
        df['price_category'] = pd.qcut(ranked_prices, q=5, labels=bin_labels)

    # na=False: a missing description is "not a brand" instead of a NaN that
    # would make astype(int) raise.
    df['is_brand'] = df['description'].str.contains('brand', case=False, na=False).astype(int)
    return df

# Main process: clean/encode the raw frame, then add the derived features.
df = engineer_features(preprocess_data(df))

# Quick summary of the data that will be used downstream.
n_unique_transactions = df['transaction_id'].nunique()
print(f"Number of unique transactions: {n_unique_transactions}")
n_unique_items = df['description'].nunique()
print(f"Number of unique items: {n_unique_items}")

# Market Basket Analysis
# One basket (list of item descriptions) per transaction.
baskets = df.groupby('transaction_id')['description'].apply(list)
transactions = baskets.values.tolist()

# Encode the baskets as a boolean transaction x item table.
te = TransactionEncoder()
te.fit(transactions)
te_ary = te.transform(transactions)
transaction_df = pd.DataFrame(te_ary, columns=te.columns_)

# Itemsets present in at least 1% of transactions, then rules with lift >= 1,
# strongest lift first.
frequent_itemsets = apriori(transaction_df, min_support=0.01, use_colnames=True)
rules = association_rules(
    frequent_itemsets,
    metric="lift",
    min_threshold=1,
).sort_values('lift', ascending=False)

# BERT setup: load the pretrained tokenizer and encoder.
# NOTE(review): nothing in this section trains BERT; it only loads the
# checkpoint and tokenizes the descriptions.
bert_checkpoint = 'bert-base-uncased'
tokenizer = BertTokenizer.from_pretrained(bert_checkpoint)
bert_base = TFBertModel.from_pretrained(bert_checkpoint)

# Tokenize every description, padded to the longest sequence in the batch and
# truncated to at most 128 tokens.
descriptions = df['description'].tolist()
encoded_inputs = tokenizer(
    descriptions,
    max_length=128,
    truncation=True,
    padding=True,
    return_tensors="tf",
)

# Actual padded sequence length produced by the tokenizer.
seq_length = encoded_inputs['input_ids'].shape[1]

# Define the model using functional API
def build_attention_model(n_items, embedding_size, feature_size, n_classes,
                          num_heads=4, dropout_rate=0.55):
    """Build the (uncompiled) target/context attention classifier.

    A single embedding table is shared by the target and context items. The
    target embedding attends over the context embedding, the result is
    layer-normalized, concatenated with the numeric item features and passed
    through two dense layers to a softmax over ``n_classes``.

    Args:
        n_items: Size of the item vocabulary (rows of the embedding table).
        embedding_size: Dimension of each item embedding.
        feature_size: Number of numeric features per sample.
        n_classes: Number of output classes.
        num_heads: Number of attention heads.
        dropout_rate: Dropout rate applied after each hidden dense layer.

    Returns:
        A Keras ``Model`` taking ``[input_target, input_context,
        input_features]`` and returning class probabilities. The embedding
        layer is named 'embedding'.
    """
    input_target = Input((1,), name='input_target')
    input_context = Input((1,), name='input_context')
    input_features = Input((feature_size,), name='input_features')

    # `input_length` is deprecated in current Keras and not needed here.
    embedding = Embedding(n_items, embedding_size, name='embedding')
    # A (batch, 1) index input embeds to (batch, 1, embedding_size), which is
    # already the (batch, seq, dim) layout MultiHeadAttention expects, so no
    # Flatten/Reshape round trip is required.
    target = embedding(input_target)
    context = embedding(input_context)

    attention = MultiHeadAttention(num_heads=num_heads, key_dim=embedding_size)
    attn_out = attention(query=target, key=context, value=context)
    attn_out = Flatten()(attn_out)
    attn_out = LayerNormalization()(attn_out)

    concat = Concatenate()([attn_out, input_features])
    hidden = Dense(128, activation='relu')(concat)
    hidden = Dropout(dropout_rate)(hidden)
    hidden = Dense(64, activation='relu')(hidden)
    hidden = Dropout(dropout_rate)(hidden)
    output = Dense(n_classes, activation='softmax')(hidden)

    return Model(inputs=[input_target, input_context, input_features], outputs=output)


# Prepare data for the model
feature_cols = ['price_new', 'price_per_unit', 'health_score', 'price_category', 'is_brand']
label_cols = ['nova_0', 'nova_1', 'nova_2', 'nova_3']

# Item vocabulary, indexed in order of first appearance.
item_to_index = {item: idx for idx, item in enumerate(df['description'].unique())}
index_to_item = {idx: item for item, idx in item_to_index.items()}

# Size the embedding table from the index map itself: `nunique()` skips
# missing values while `unique()` keeps them, so the two can disagree.
n_items = len(item_to_index)
embedding_size = 32
feature_size = len(feature_cols)
n_classes = len(label_cols)  # number of nova classes

# Per-row lookups done once, outside the pair loop. Casting to float32 also
# turns the categorical 'price_category' into plain numbers, so the feature
# array is numeric rather than object dtype.
item_codes = np.array([item_to_index[desc] for desc in df['description']])
feature_matrix = df[feature_cols].astype('float32').to_numpy()
label_matrix = df[label_cols].astype('float32').to_numpy()

# Create item pairs: every ordered (target, context) pair of distinct rows
# within a transaction; features and labels are those of the target row.
pair_chunks = []
feature_chunks = []
label_chunks = []

row_positions = pd.Series(np.arange(len(df)))
for _, rows in row_positions.groupby(df['transaction_id'].to_numpy()):
    rows = rows.to_numpy()
    if rows.size < 2:
        continue
    # Off-diagonal (i, j) positions in row-major order, i.e. the same order
    # as a nested `for i ... for j ... if i != j` loop.
    target_pos, context_pos = np.nonzero(~np.eye(rows.size, dtype=bool))
    target_rows = rows[target_pos]
    context_rows = rows[context_pos]
    pair_chunks.append(np.column_stack((item_codes[target_rows], item_codes[context_rows])))
    feature_chunks.append(feature_matrix[target_rows])
    label_chunks.append(label_matrix[target_rows])

if pair_chunks:
    item_pairs = np.concatenate(pair_chunks)
    item_features = np.concatenate(feature_chunks)
    item_labels = np.concatenate(label_chunks)
else:
    # Keep the arrays 2-D so later column indexing does not fail.
    item_pairs = np.empty((0, 2), dtype=int)
    item_features = np.empty((0, feature_size), dtype='float32')
    item_labels = np.empty((0, n_classes), dtype='float32')

# Building and training the model
early_stopping = EarlyStopping(monitor='val_loss', patience=3, restore_best_weights=True)
reduce_lr = ReduceLROnPlateau(monitor='val_loss', factor=0.2, patience=2, min_lr=1e-6)

# Training within cross-validation loop
kf = KFold(n_splits=5, shuffle=True, random_state=42)
cv_scores = []

for train_index, val_index in kf.split(item_pairs):
    X_train, X_val = item_pairs[train_index], item_pairs[val_index]
    f_train, f_val = item_features[train_index], item_features[val_index]
    y_train, y_val = item_labels[train_index], item_labels[val_index]

    # Start every fold from a freshly initialized model and optimizer.
    # Reusing one model across folds would let it train on samples that a
    # later fold uses for validation, inflating the reported accuracy.
    model = build_attention_model(n_items, embedding_size, feature_size, n_classes)
    model.compile(loss='categorical_crossentropy', optimizer=Adam(learning_rate=0.001), metrics=['accuracy'])

    model.fit([X_train[:, 0], X_train[:, 1], f_train], y_train,
              epochs=5, batch_size=32,
              validation_data=([X_val[:, 0], X_val[:, 1], f_val], y_val),
              verbose=1, callbacks=[early_stopping, reduce_lr])

    score = model.evaluate([X_val[:, 0], X_val[:, 1], f_val], y_val, verbose=0)
    cv_scores.append(score[1])  # score is [loss, accuracy]

print(f"Cross-validation accuracy: {np.mean(cv_scores):.4f} (+/- {np.std(cv_scores):.4f})")

# Extract item embeddings
# NOTE: `model` is the model trained in the last fold.
item_embeddings = model.get_layer('embedding').get_weights()[0]

# Function to get recommendations with relevance calculation
def cosine_similarity(a, b):
    """Return the cosine similarity of vectors ``a`` and ``b`` in [-1, 1].

    If either vector has zero norm the similarity is undefined; 0.0 is
    returned in that case so callers that rank by similarity never receive
    NaN.

    Args:
        a: 1-D array-like of numbers.
        b: 1-D array-like of the same length as ``a``.

    Returns:
        The cosine of the angle between ``a`` and ``b`` as a float.
    """
    vec_a = np.asarray(a, dtype=float).ravel()
    vec_b = np.asarray(b, dtype=float).ravel()
    norm_product = np.linalg.norm(vec_a) * np.linalg.norm(vec_b)
    if norm_product == 0:
        return 0.0
    # Clip to absorb rounding error that can push the ratio just past +/-1.
    return float(np.clip(np.dot(vec_a, vec_b) / norm_product, -1.0, 1.0))

def get_recommendations(item_id, budget, top_n=5):
    """Recommend up to ``top_n`` items to go with ``item_id`` within ``budget``.

    Candidates are considered in this order:
      1. consequents of association rules whose antecedents contain
         ``item_id`` (in the order of the module-level ``rules`` frame);
      2. all remaining items, by decreasing embedding cosine similarity to
         ``item_id``.
    A candidate is accepted if adding its price (the first 'original_price'
    recorded for that description in ``df``) keeps the running total within
    ``budget``. Each candidate is examined at most once.

    Relies on the module-level ``rules``, ``df``, ``item_to_index``,
    ``index_to_item`` and ``item_embeddings``.

    Args:
        item_id: Description of the query item; must be a key of
            ``item_to_index``.
        budget: Maximum total price of the recommended items.
        top_n: Maximum number of recommendations. Non-positive values yield
            no recommendations.

    Returns:
        A tuple ``(items, total_cost, avg_relevance)``: the accepted item
        descriptions sorted by decreasing similarity, their summed price, and
        their mean similarity to ``item_id`` (0 when nothing was accepted).

    Raises:
        KeyError: If ``item_id`` is not a known item.
    """
    if top_n <= 0:
        return [], 0, 0

    item_rules = rules[rules['antecedents'].apply(lambda x: item_id in x)]
    query_embedding = item_embeddings[item_to_index[item_id]]
    similarities = np.array([cosine_similarity(query_embedding, emb) for emb in item_embeddings])

    # Build the description -> first recorded price lookup once, instead of
    # filtering the whole DataFrame again for every candidate.
    first_rows = df.drop_duplicates('description')
    price_of = dict(zip(first_rows['description'], first_rows['original_price']))

    def candidates():
        """Yield candidate items: rule consequents first, then by similarity."""
        for _, rule in item_rules.iterrows():
            yield from rule['consequents']
        for idx in similarities.argsort()[::-1]:
            yield index_to_item[idx]

    recommendations = []
    total_cost = 0
    considered_items = {item_id}  # never recommend the query item itself

    for item in candidates():
        if item in considered_items:
            continue
        considered_items.add(item)
        item_price = price_of[item]
        if total_cost + item_price <= budget:
            recommendations.append((item, similarities[item_to_index[item]]))
            total_cost += item_price
            if len(recommendations) == top_n:
                break

    if recommendations:
        avg_relevance = sum(score for _, score in recommendations) / len(recommendations)
    else:
        avg_relevance = 0
    ranked = sorted(recommendations, key=lambda x: x[1], reverse=True)
    return [name for name, _ in ranked], total_cost, avg_relevance

# Example usage: recommend companions for the first item in the data set.
item_id = df['description'].iloc[0]  # Example item ID
budget = 500  # Example budget
recommendations, total_cost, avg_relevance = get_recommendations(item_id, budget)

# Report the result, one line per field.
print(
    f"\nItem: {item_id}",
    f"Recommended items: {recommendations}",
    f"Total cost: ${total_cost:.2f}",
    f"Average relevance: {avg_relevance:.4f}",
    sep="\n",
)

# Analyze errors
def analyze_errors(model, X_test, f_test, y_test, df, item_to_index, index_to_item, max_errors=10):
    """Print details of the first misclassified test samples.

    Predictions are obtained from ``model.predict`` and compared with the
    arg-max of ``y_test``. For each of the first ``max_errors`` mismatches the
    target item, context item, true class, predicted class and feature vector
    are printed.

    Args:
        model: Object whose ``predict([targets, contexts, features])`` returns
            one row of class scores per sample.
        X_test: Array whose column 0 holds target item indices and column 1
            holds context item indices.
        f_test: Feature array, one row per sample.
        y_test: One-hot (or score) label array, one row per sample.
        df: Unused; kept so existing callers keep working.
        item_to_index: Unused; kept so existing callers keep working.
        index_to_item: Mapping from item index to item description.
        max_errors: Maximum number of errors to print; ``None`` prints all.

    Returns:
        None.
    """
    y_pred = model.predict([X_test[:, 0], X_test[:, 1], f_test])
    y_pred_classes = np.argmax(y_pred, axis=1)
    y_true_classes = np.argmax(y_test, axis=1)

    error_indices = np.where(y_pred_classes != y_true_classes)[0]
    for idx in error_indices[:max_errors]:
        target_item = index_to_item[X_test[idx, 0]]
        context_item = index_to_item[X_test[idx, 1]]
        print(f"Error: Target: {target_item}, Context: {context_item}")
        print(f"True class: {y_true_classes[idx]}, Predicted class: {y_pred_classes[idx]}")
        print("Features:", f_test[idx])
        print()

# Prepare test data for error analysis: hold out 20% of the pairs.
holdout_split = train_test_split(
    item_pairs, item_features, item_labels,
    test_size=0.2, random_state=42,
)
X_train, X_test, f_train, f_test, y_train, y_test = holdout_split

# Train a fresh model on the remaining 80% (a fifth of which is used for
# validation by Keras).
model = build_attention_model(n_items, embedding_size, feature_size, n_classes)
model.compile(
    optimizer=Adam(learning_rate=0.001),
    loss='categorical_crossentropy',
    metrics=['accuracy'],
)
model.fit(
    [X_train[:, 0], X_train[:, 1], f_train],
    y_train,
    epochs=10,
    batch_size=32,
    validation_split=0.2,
    verbose=1,
    callbacks=[early_stopping, reduce_lr],
)

# Print the first misclassified hold-out samples.
analyze_errors(model, X_test, f_test, y_test, df, item_to_index, index_to_item)

