# DS 4420 Final Project: Data Preprocessing for MLP
This notebook prepares the data used by the MLP: it cleans the raw tables, builds the features, and generates the train and test sets.

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from ast import literal_eval
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

## 1. Data Cleaning

**a. Cleaning User Data**

In [None]:
# read the per-user table
users = pd.read_csv('PP_users.csv')

# report the raw dimensions
print('Shape:', users.shape)

# number of user ids that occur on more than one row
user_id_repeated = users['u'].value_counts() > 1
print('\nDuplicates:', sum(user_id_repeated))

# missing values per column
print('\nNull Values:')
user_null_flags = users.isna().astype(int)
print(user_null_flags.sum())

# remove the n_items and techniques columns
print('\nDropped Columns: n_items, techniques')
users.drop(columns=['n_items', 'techniques'], inplace=True)

# use the same key/column names as the other tables
user_column_names = {'u': 'user_id', 'items': 'recipes'}
users.rename(columns=user_column_names, inplace=True)

# column types after cleaning
print('\nDatatypes:')
print(users.dtypes)

# dimensions after cleaning
print('\nUsers Shape:', users.shape)
print('\n')

users.head()

**b. Cleaning Recipe Data**

In [None]:
# read the raw recipe table
recipes = pd.read_csv('RAW_recipes.csv')

# report the raw dimensions
print('Shape:', recipes.shape)

# number of recipe ids that occur on more than one row
recipe_id_repeated = recipes['id'].value_counts() > 1
print('\nDuplicates:', sum(recipe_id_repeated))

# missing values per column
print('\nNull Values:')
recipe_null_flags = recipes.isna().astype(int)
print(recipe_null_flags.sum())

# discard recipes that have no name
recipes.dropna(subset='name', inplace=True)

# use the same key name as the interactions table
recipes.rename(columns={'id': 'recipe_id'}, inplace=True)

# the tags column holds list literals stored as text; turn them into real lists
recipes['tags'] = recipes['tags'].map(literal_eval)

# column types after cleaning
print('\nDatatypes:')
print(recipes.dtypes)

# dimensions after cleaning
print('\nRecipes Shape:', recipes.shape)
print('\n')

recipes.head()

**c. Cleaning Interactions Data**

In [None]:
# load data
interactions = pd.read_csv('RAW_interactions.csv')

# check shape
print('Shape:', interactions.shape)

# check for duplicate (user, recipe) pairs: count the pairs that occur on more
# than one row. The group sizes must be inspected directly -- counting the
# already-grouped keys again always yields 1 per pair and would report 0.
pair_sizes = interactions.groupby(['user_id', 'recipe_id']).size()
print('\nDuplicates:', int((pair_sizes > 1).sum()))

# check for nulls
print('\nNull Values:')
print(interactions.isna().astype(int).sum())

# check datatypes
print('\nDatatypes:')
print(interactions.dtypes)

# display final shape
print('\nInteractions Shape:', interactions.shape)
print('\n')

interactions.head()

## 2. Preprocessing

In [None]:
# ratings table: the interactions without the free-text review
# (the date column stays so calendar features can be derived from it)
ratings = interactions.drop(columns=['review'])

# derive calendar features from the rating date
ratings['date'] = pd.to_datetime(ratings['date'])
rating_dates = ratings['date'].dt
ratings['month'] = rating_dates.month
ratings['day_of_week'] = rating_dates.dayofweek
# dayofweek is 0 (Monday) .. 6 (Sunday), so 5 and 6 are the weekend
ratings['is_weekend'] = (ratings['day_of_week'] >= 5).astype(int)
ratings['year'] = rating_dates.year

# attach recipe attributes to every rating (right join keeps all ratings)
recipes_mlp = recipes.merge(ratings, on='recipe_id', how='right')

# keep a single row per (user, recipe) pair
initial_rows = len(recipes_mlp)
recipes_mlp = recipes_mlp.drop_duplicates(subset=['user_id', 'recipe_id'])
rows_removed = initial_rows - len(recipes_mlp)
print(f"removed {rows_removed} duplicate ratings")

# restrict to the three most recent calendar years present in the data
latest_date = recipes_mlp["year"].max()
three_years_ago = latest_date - 3
print("dataset latest date:", latest_date)
print("dataset 3 years prior to latest date:", three_years_ago)
is_recent = recipes_mlp["year"] > three_years_ago
recipes_mlp = recipes_mlp[is_recent]
recipes_mlp.reset_index(drop=True, inplace=True)
print("filtered dataset shape:", recipes_mlp.shape)

# histogram of the remaining ratings per year
rating_years = recipes_mlp['date'].dt.year
plt.figure(figsize=(10, 3))
plt.hist(rating_years, bins=3)
plt.title('Distribution of Ratings by Year')
plt.xlabel('Year')
plt.ylabel('Count')
plt.show()

# remove recipe columns that are not carried forward from this table
unused_recipe_cols = ['name', 'contributor_id', 'nutrition', 'submitted', 'description', 'steps']
recipes_mlp.drop(columns=unused_recipe_cols, inplace=True)
recipes_mlp.head()

**Balancing & Filtering**

In [None]:
# check rating distribution for balancing
plt.figure(figsize=(10, 3))
plt.hist(recipes_mlp['rating'], bins=5)
plt.title('Rating Distribution')
plt.xlabel('Rating')
plt.ylabel('Count')
plt.show()

# create a more balanced dataset with undersampling
rating_counts = recipes_mlp['rating'].value_counts().sort_index()
print("original rating counts:", rating_counts)

# target size per rating class: the smallest class that still has more than
# 1000 rows, capped at 50000; fall back to 10000 when no class is that large
min_counts = [count for count in rating_counts.values if count > 1000]
target_count = min(min(min_counts), 50000) if min_counts else 10000

print(f"target count per rating: {target_count}")

# fixed seed (same value as the train/test split) so the undersampled rows,
# and everything derived from them, are identical on every run
SAMPLE_SEED = 42

# balance ratings
# NOTE(review): only ratings 1-5 are visited, so rows with any other rating
# value are dropped here -- confirm that this is intended
balanced_df = []
for rating in range(1, 6):
    if rating not in rating_counts.index:
        continue
    rating_rows = recipes_mlp[recipes_mlp['rating'] == rating]
    if rating_counts[rating] > target_count:
        # undersample
        balanced_df.append(rating_rows.sample(target_count, random_state=SAMPLE_SEED))
        print(f"sampled {target_count} from rating {rating}")
    else:
        # keep all
        balanced_df.append(rating_rows)
        print(f"kept all {rating_counts[rating]} examples of rating {rating}")

recipes_mlp = pd.concat(balanced_df)

# show balanced distribution
plt.figure(figsize=(10, 3))
plt.hist(recipes_mlp['rating'], bins=5)
plt.title('Balanced Rating Distribution')
plt.xlabel('Rating')
plt.ylabel('Count')
plt.show()

# map raw ids to consecutive integers in order of first appearance
# (loop variable is not called `id`, which would shadow the builtin)
user_map = {raw_id: idx for idx, raw_id in enumerate(recipes_mlp['user_id'].unique())}
recipe_map = {raw_id: idx for idx, raw_id in enumerate(recipes_mlp['recipe_id'].unique())}
recipes_mlp['user_id_mapped'] = recipes_mlp['user_id'].map(user_map)
recipes_mlp['recipe_id_mapped'] = recipes_mlp['recipe_id'].map(recipe_map)

# identify users with enough ratings (at least 2 in the balanced data)
user_counts = recipes_mlp['user_id'].value_counts()
valid_users = user_counts[user_counts >= 2].index

# filter to include only valid users
recipes_mlp_filtered = recipes_mlp[recipes_mlp['user_id'].isin(valid_users)]

print(recipes_mlp_filtered.shape)

**Train Test Split & Scaling**

In [None]:
# train-test split, stratify by rating
train_df, test_df = train_test_split(
    recipes_mlp_filtered,
    test_size=0.2,
    stratify=recipes_mlp_filtered['rating'],
    random_state=42
)
# work on independent copies so the column assignments below never write
# into data shared with recipes_mlp_filtered
train_df = train_df.copy()
test_df = test_df.copy()

# scale numeric variables, fit using the train set
scaler_numeric = StandardScaler()
numeric = ['minutes', 'n_steps', 'n_ingredients', 'month', 'day_of_week', 'year']
train_df[numeric] = scaler_numeric.fit_transform(train_df[numeric])
test_df[numeric] = scaler_numeric.transform(test_df[numeric])

# parse nutrition data: every entry is a bracketed, comma-separated list of
# numbers stored as text
n_lists = [n.strip('[]').split(',') for n in recipes.nutrition]
nutrition_array = np.array([
    [float(item.strip()) for item in sublist]
    for sublist in n_lists
])

# create nutrition dataframe, one row per recipe, indexed by recipe_id
nutrition_df = pd.DataFrame(
    nutrition_array,
    columns=['calories', 'total_fat', 'sugar', 'sodium', 'protein', 'saturated_fat', 'carbs'],
    index=recipes.recipe_id
)
# the lookup below requires a unique index; keep the first row per recipe_id
nutrition_df = nutrition_df[~nutrition_df.index.duplicated(keep='first')]

# look up the nutrition row of every rating, in the row order of
# train_df / test_df (recipes without nutrition data yield NaN rows)
train_nutrition = nutrition_df.reindex(train_df['recipe_id'].to_numpy())
test_nutrition = nutrition_df.reindex(test_df['recipe_id'].to_numpy())

# scale nutrition features, fitting the scaler only on training data
scaler_nutrition = StandardScaler()
train_nutrition_values = scaler_nutrition.fit_transform(train_nutrition)
test_nutrition_values = scaler_nutrition.transform(test_nutrition)

# convert back to dataframes that share the row index of train_df / test_df
train_nutrition_scaled = pd.DataFrame(
    train_nutrition_values,
    columns=nutrition_df.columns,
    index=train_df.index
)
test_nutrition_scaled = pd.DataFrame(
    test_nutrition_values,
    columns=nutrition_df.columns,
    index=test_df.index
)

# attach the scaled nutrition columns row by row.
# Merging on recipe_id instead would be a many-to-many join (the scaled
# frames hold one row per rating), turning a recipe rated k times into
# k*k rows and silently inflating both sets.
train_df = pd.concat([train_df, train_nutrition_scaled], axis=1)
test_df = pd.concat([test_df, test_nutrition_scaled], axis=1)

# drop the date field before training
if 'date' in train_df.columns:
    train_df = train_df.drop('date', axis=1)
    test_df = test_df.drop('date', axis=1)

# final dataset shapes
print("final train set shape:", train_df.shape)
print("final test set shape:", test_df.shape)

**Encoding Tags**

In [None]:
# explode tags into separate rows; recipes_mlp has one row per rating, so a
# recipe's tags appear once for every rating of that recipe
tags_unnested = recipes_mlp.explode('tags')[['recipe_id', 'tags']]

# get common tags based only on recipes that occur in the training data;
# the threshold is applied to the number of exploded rows carrying the tag
train_tags_unnested = tags_unnested[tags_unnested['recipe_id'].isin(train_df['recipe_id'])]
tag_count = train_tags_unnested.groupby('tags').count().sort_values(by='recipe_id', ascending=False)
common_tags = list(tag_count[tag_count.recipe_id > 50000].index)

# filter to the common tags and keep every (recipe, tag) pair once, so the
# per-recipe sum below produces 0/1 indicators instead of the number of
# ratings the recipe received
tags_filtered = tags_unnested[tags_unnested['tags'].isin(common_tags)].drop_duplicates()
tags_encoded = pd.get_dummies(tags_filtered, columns=['tags'], prefix='', prefix_sep='')
tags_encoded = tags_encoded.groupby('recipe_id').sum().reset_index()

# add tags to train and test separately (tags_encoded has one row per recipe)
train_df = pd.merge(train_df, tags_encoded, on='recipe_id', how='left')
test_df = pd.merge(test_df, tags_encoded, on='recipe_id', how='left')



**Handling NaN and Bias**

In [None]:
# fill nan values of every numeric feature column with the training median
# (the same training-set value is used for the test set)
id_and_target_cols = ['user_id', 'recipe_id', 'rating', 'user_id_mapped', 'recipe_id_mapped']
for col in train_df.columns:
    # is_numeric_dtype covers every integer/float width, not only int64/float64
    if col in id_and_target_cols or not pd.api.types.is_numeric_dtype(train_df[col]):
        continue
    median_val = train_df[col].median()
    train_df[col] = train_df[col].fillna(median_val)
    test_df[col] = test_df[col].fillna(median_val)

# calculate biases from the training ratings only
# NOTE(review): each training row's own rating is part of its user/recipe
# mean, so these features carry target information on the train set -- confirm
# this is acceptable for the model
user_avg = train_df.groupby('user_id')['rating'].mean()
recipe_avg = train_df.groupby('recipe_id')['rating'].mean()
global_avg = train_df['rating'].mean()

# add these as features: deviation of the user/recipe mean from the global
# mean. The test set must be centred the same way as the train set; users and
# recipes not seen in training get a bias of 0 (i.e. the global mean).
train_df['user_bias'] = train_df['user_id'].map(user_avg) - global_avg
test_df['user_bias'] = (test_df['user_id'].map(user_avg) - global_avg).fillna(0)

train_df['recipe_bias'] = train_df['recipe_id'].map(recipe_avg) - global_avg
test_df['recipe_bias'] = (test_df['recipe_id'].map(recipe_avg) - global_avg).fillna(0)



**Final Cleaning**

In [None]:
# remove the list/text columns, which are not model inputs
text_cols = ['tags', 'ingredients']
test_df = test_df.drop(columns=text_cols)
train_df = train_df.drop(columns=text_cols)

# independent copy of the training frame
train_df_clean = train_df.copy()

# keep only test rows whose user and recipe both occur in the training data
valid_users = set(train_df_clean['user_id'])
valid_recipes = set(train_df_clean['recipe_id'])
user_in_train = test_df['user_id'].isin(valid_users)
recipe_in_train = test_df['recipe_id'].isin(valid_recipes)
test_df_clean = test_df[user_in_train & recipe_in_train]
print(f"test data after filtering for valid users/recipes: {len(test_df_clean)} rows")

# mapped user ids, mapped recipe ids and the rating target as arrays
X_user_train = train_df_clean['user_id_mapped'].to_numpy()
X_recipe_train = train_df_clean['recipe_id_mapped'].to_numpy()
y_train = train_df_clean['rating'].to_numpy()

X_user_test = test_df_clean['user_id_mapped'].to_numpy()
X_recipe_test = test_df_clean['recipe_id_mapped'].to_numpy()
y_test = test_df_clean['rating'].to_numpy()

# every remaining column that is neither an id nor the target is a feature
non_feature_cols = {'user_id', 'recipe_id', 'rating', 'user_id_mapped', 'recipe_id_mapped'}
feature_cols = [col for col in train_df_clean.columns if col not in non_feature_cols]
X_features_train = train_df_clean[feature_cols].to_numpy()
X_features_test = test_df_clean[feature_cols].to_numpy()

# replace any nan that is still present with 0
X_features_train = np.nan_to_num(X_features_train, nan=0)
X_features_test = np.nan_to_num(X_features_test, nan=0)

# summary of the feature matrices
print("training data shape:", X_features_train.shape)
print("testing data shape:", X_features_test.shape)
print("number of feature columns:", len(feature_cols))
print("feature names:", feature_cols)

## 3. Exporting

In [None]:
# Report the dtypes of the training arrays before casting
print("X_user_train dtype:", X_user_train.dtype)
print("X_recipe_train dtype:", X_recipe_train.dtype)
print("X_features_train dtype:", X_features_train.dtype)
print("y_train dtype:", y_train.dtype)

# Cast id arrays to 32-bit integers and features/targets to 32-bit floats
X_user_train, X_recipe_train = (
    ids.astype(np.int32) for ids in (X_user_train, X_recipe_train)
)
X_features_train, y_train = (
    values.astype(np.float32) for values in (X_features_train, y_train)
)

X_user_test, X_recipe_test = (
    ids.astype(np.int32) for ids in (X_user_test, X_recipe_test)
)
X_features_test, y_test = (
    values.astype(np.float32) for values in (X_features_test, y_test)
)

# Report the shapes of the training arrays
print("X_user_train shape:", X_user_train.shape)
print("X_recipe_train shape:", X_recipe_train.shape)
print("X_features_train shape:", X_features_train.shape)
print("y_train shape:", y_train.shape)

In [None]:
# write every model input/target array to its own comma-delimited text file,
# in the same order as listed here
arrays_to_export = {
    'X_user_train.csv': X_user_train,
    'X_recipe_train.csv': X_recipe_train,
    'X_features_train.csv': X_features_train,
    'y_train.csv': y_train,
    'X_user_test.csv': X_user_test,
    'X_recipe_test.csv': X_recipe_test,
    'X_features_test.csv': X_features_test,
    'y_test.csv': y_test,
}
for csv_name, array in arrays_to_export.items():
    np.savetxt(csv_name, array, delimiter=',')

# also save the recipes_mlp dataframe (with its index) as CSV
recipes_mlp.to_csv('recipes_mlp.csv')