In [None]:
# Install fuzzywuzzy and wandb into the kernel's environment; both are imported further down.
%pip install fuzzywuzzy
%pip install wandb

In [None]:
# Log in to Weights & Biases through its CLI; the training cell logs metrics with wandb.
!wandb login

In [None]:
# Import PyTorch and related packages
import torch
import torch.nn as nn
import torch.nn.functional as F

from torch.optim import Adam
from torch.utils.data import DataLoader, TensorDataset

# Import packages for data manipulation and data splitting
import re
from fuzzywuzzy import process, fuzz

import pandas as pd
import numpy as np

from sklearn.model_selection import train_test_split
from sklearn.impute import KNNImputer
from sklearn.preprocessing import MinMaxScaler
from sklearn.feature_selection import mutual_info_classif

# Logging / metrics
import wandb
from tqdm import tqdm
import matplotlib.pyplot as plt
from sklearn.metrics import roc_auc_score, f1_score

# Downloading datasets, and loading
from sklearn.datasets import fetch_openml

# Other
from collections import deque

In [None]:
# Fetch the 'SpeedDating' dataset (version 1) from OpenML
dataset = fetch_openml(name='SpeedDating', version=1)

# Create a Pandas DataFrame: the feature columns followed by a 'target' column
df = pd.DataFrame(data=np.c_[dataset['data'], dataset['target']],
                  columns=dataset['feature_names'] + ['target'])

# Display the first few rows
print(df.head())

In [None]:
# Fix spelling errors in column names (old name -> corrected name)
# NOTE(review): rename() silently skips keys that are not existing columns,
# so a mistyped key here goes unnoticed — confirm every key matches a real column.
columns_to_rename = {
    'd_sinsere_o': 'd_sincere_o',
    'sinsere_o': 'sincere_o',
    'intellicence_important': 'intelligence_important',
    'd_intellicence_important': 'd_intelligence_important',
    'ambtition_important': 'ambition_important',
    'ambitous_o': 'ambition_o',
    'd_ambitous_o': 'd_ambition_o',
    'd_ambitous_important': 'd_ambition_important',
    'pref_o_ambitious': 'pref_o_ambition'}

# Applied in place, so `df` itself carries the corrected names from here on
df.rename(columns=columns_to_rename, inplace=True)

In [None]:
def print_unique_for_column(df, column):
    """Print the distinct values of one column together with their count.

    Args:
        df: DataFrame to inspect.
        column: Name of the column in ``df`` to report on.

    The printed count treats NaN as a value of its own (``dropna=False``).
    """
    unique_values = df[column].unique()
    # Named `unique_count` instead of `sum` so the built-in sum() is not shadowed.
    unique_count = df[column].nunique(dropna=False)
    print(f"Unique values in column '{column}' with sum of {unique_count} (including NaN): {unique_values}")

def print_unique(df=df):
    """Print the unique-value report for every column of ``df``.

    When no frame is passed, the notebook-level ``df`` that existed when this
    cell ran is used.
    """
    # Hand each column over to the single-column reporter, in column order
    for column_name in df.columns:
        print_unique_for_column(df, column_name)

print_unique(df)

## Data preprocessing

In [None]:
# The 'field' column has a lot of problems: print how many distinct
# non-missing values it has, then list them in sorted order for inspection
print(f"Sum: {df['field'].nunique(dropna=True)}")
sorted_unique_values = sorted(df['field'].dropna().unique())
for value in sorted_unique_values:
    print(value)

In [None]:
# Regular expression to match ;, :, -, /, and everything within []
pattern = r'[;:\-\/]|\[.*?\]'

# Replace matched patterns with a single space (runs of spaces are collapsed below)
df['field'] = df['field'].str.replace(pattern, ' ', regex=True)

# Remove "phd" (any case) and collapse repeated spaces into one
df['field'] = df['field'].str.replace('phd', '', flags=re.IGNORECASE, regex=True).str.replace(' +', ' ', regex=True)

# Replace the abbreviation "engg" / "engg." with "engineering".
# The dot is escaped and optional: the earlier pattern 'engg.' treated it as
# "any character", so it also consumed whatever followed "engg" (e.g. a space)
# and glued the next word onto "engineering".
df['field'] = df['field'].str.replace(r'\bengg\b\.?', 'engineering', flags=re.IGNORECASE, regex=True)

# Change everything to lowercase and remove surrounding white space
df['field'] = df['field'].str.lower().str.strip()
df['race'] = df['race'].str.lower().str.strip()
df['race_o'] = df['race_o'].str.lower().str.strip()

In [None]:
# Define a function to replace close matches
def combine_similar(df, column, correct_value, threshold=90):
    """Replace near-duplicate spellings in a column with one canonical value.

    Every distinct non-missing value of ``df[column]`` is scored against
    ``correct_value`` with ``fuzz.token_sort_ratio``; values scoring at least
    ``threshold`` are overwritten with ``correct_value``. ``df`` is modified
    in place and nothing is returned.

    Args:
        df: DataFrame holding the column to clean.
        column: Name of the column to clean.
        correct_value: Canonical string that close matches are replaced with.
        threshold: Minimum score for a value to count as a match.
    """
    # Missing entries are left out of the candidates: they are not text, can
    # never be a close match, and should not be handed to the string scorer.
    unique_values = df[column].dropna().unique()

    # Find matches above the threshold
    matches = process.extract(correct_value, unique_values, limit=None, scorer=fuzz.token_sort_ratio)
    close_matches = [match[0] for match in matches if match[1] >= threshold]

    # Replace close matches with the correct value (vectorised membership test
    # instead of a per-row Python lookup in the list)
    is_close_match = df[column].isin(close_matches)
    df[column] = df[column].mask(is_close_match, correct_value)

# Mostly used to fix spelling errors in some strings: each call folds the
# values of 'field' that closely match the given text into that exact text
combine_similar(df, 'field', 'finance')
combine_similar(df, 'field', 'nutrition')
combine_similar(df, 'field', 'speech language pathology')
combine_similar(df, 'field', 'international affairs')
combine_similar(df, 'field', 'finance economics')
combine_similar(df, 'field', 'mathematic')
In [None]:
# Re-list the distinct 'field' values after the clean-up above
# (author's note: dropped from 259 values to 203, not including NaN)
print(f"Sum: {df['field'].nunique(dropna=True)}")
sorted_unique_values = sorted(df['field'].dropna().unique())
for value in sorted_unique_values:
    print(value)

In [None]:
def try_convert_float(value):
    """Return ``value`` converted to ``float`` when possible, else unchanged.

    Args:
        value: Any object, typically a string that may hold a number.

    Returns:
        ``float(value)`` if the conversion succeeds, otherwise the original
        ``value`` itself.
    """
    try:
        return float(value)
    except (ValueError, TypeError):
        # ValueError: text that is not a number.
        # TypeError: objects float() does not accept at all, e.g. None.
        return value

# Some columns hold numbers stored as strings, so we use this to convert them to float;
# values that cannot be converted are kept as they are
for column in df.columns:
  df[column] = df[column].apply(try_convert_float)

# Show the unique values of every column again after the conversion
print_unique(df)

## Filling the NaN values

In [None]:
# Share of missing entries per column, expressed as a percentage of all rows
missing_values = df.isna().sum().div(len(df)).mul(100)

# One "column: percentage" line per column, two decimals
for name, value in missing_values.items():
    print(f"{name}: {value:.2f}%")

In [None]:
# Column 'expected_num_interested_in_me' has a lot of missing values so we drop it
# We experimented by leaving it in but it did not change anything
# 'has_null' and 'wave' are dropped as well
columns_to_drop = ['has_null', 'wave', 'expected_num_interested_in_me']

# drop() returns a new frame here, so `df` itself keeps all columns
df_features = df.drop(columns=columns_to_drop)

In [None]:
# This will select columns with data type 'object' or 'string',
# i.e. the columns that still need to be encoded as numbers
string_columns = df_features.select_dtypes(include=['object', 'string'])

# Inspect their values to decide how each one should be encoded
print_unique(string_columns)

In [None]:
# Nominal columns get one-hot encoded; every other string column is treated
# as ordinal (binned ranges) and mapped to its rank
nominal_columns = ['gender', 'race', 'race_o', 'field']
ordinal_columns = string_columns.drop(columns=nominal_columns)

# Encode with one hot encoder
df_encoded = pd.get_dummies(df_features, columns=nominal_columns, drop_first=False)

# Define the order of categories: one row per binning scheme, each label mapped
# to its rank inside that scheme. Every label appears exactly once — the
# duplicated '[0-1]' key was removed, it carried the same rank in both places.
order = {'[0-1]': 0.0, '[2-3]': 1.0, '[4-6]': 2.0, '[7-37]': 3.0,
         '[2-5]': 1.0, '[6-10]': 2.0,  # this scheme also starts with '[0-1]' (listed above)
         '[0-2]': 0.0, '[3-5]': 1.0, '[5-18]': 2.0,
         '[0-3]': 0.0, '[4-9]': 1.0, '[10-20]': 2.0,
         '[0-4]': 0.0, '[5-6]': 1.0, '[7-10]': 2.0,
         '[0-5]': 0.0, '[6-8]': 1.0, '[9-10]': 2.0,
         '[0-15]': 0.0, '[16-20]': 1.0, '[21-100]': 2.0,
         '[-1-0]': 0.0, '[0-0.33]': 1.0, '[0.33-1]': 2.0}

# Encode with ordinal encoder
for column in ordinal_columns:
    mapped_column = df_encoded[column].map(order)

    # map() turns any label missing from `order` into NaN, which would later be
    # filled in as if it had been a missing value. Fail loudly instead.
    unmapped_labels = df_encoded.loc[df_encoded[column].notna() & mapped_column.isna(), column].unique()
    if len(unmapped_labels) > 0:
        raise ValueError(f"Column '{column}' has labels without a defined order: {list(unmapped_labels)}")

    df_encoded[column] = mapped_column

In [None]:
# Check if there are any string-typed columns left after encoding
# (nothing is printed when none remain)
print_unique(df_encoded.select_dtypes(include=['object', 'string']))

In [None]:
# Horizontal box plot of every encoded column, used to eyeball outliers
box_axes = df_encoded.plot.box(figsize=(40, 20), vert=False)
box_axes.set_title('Box plot of all columns')
plt.show()

In [None]:
# Normalization of data: min-max scale every column of df_encoded
# NOTE(review): the scaler is fitted on the whole frame — the 'target' column
# is still part of it and the train/test split only happens later — so test
# rows influence the fitted ranges. Consider fitting on the training split only.
normalizer = MinMaxScaler()

# fit_transform returns a plain array, so the column names are restored afterwards
df_normalized = normalizer.fit_transform(df_encoded)
df_normalized = pd.DataFrame(df_normalized, columns=df_encoded.columns)

In [None]:
# Fill the missing data with KNN (each gap is filled from the 5 nearest rows)
# NOTE(review): like the scaler, the imputer is fitted on all rows before the
# train/test split and with the 'target' column included — confirm this is intended.
imputer = KNNImputer(n_neighbors=5)

# fit_transform returns a plain array, so the column names are restored afterwards
imputed_data = imputer.fit_transform(df_normalized)
clean_data = pd.DataFrame(imputed_data, columns=df_encoded.columns)

In [None]:
# Per trait, a compatibility score is the mean of two "1 - absolute difference"
# terms: pref_o_<trait> vs <trait>_partner, and <trait>_important vs <trait>_o
compatibility_columns = ['attractive', 'sincere', 'intelligence', 'funny', 'ambition', 'shared_interests']

for column in compatibility_columns:
    score_name = f"{column}_compatibility_score"
    partner_gap = (clean_data[f"pref_o_{column}"] - clean_data[f"{column}_partner"]).abs()
    own_gap = (clean_data[f"{column}_important"] - clean_data[f"{column}_o"]).abs()

    clean_data[score_name] = ((1 - partner_gap) + (1 - own_gap)) / 2

    print_unique_for_column(clean_data, score_name)

# Overall compatibility is the row-wise mean of the per-trait scores
score_columns = [trait + '_compatibility_score' for trait in compatibility_columns]
clean_data['overall_compatibility'] = clean_data[score_columns].mean(axis=1)
print_unique_for_column(clean_data, 'overall_compatibility')

In [None]:
# Mutual information between every feature and the target
X = clean_data.drop('target', axis=1)
y = clean_data['target']

# The estimator is randomised, so the seed is fixed (same value as the
# train/test split) to get the same scores and the same selected features
# on every run.
mic = mutual_info_classif(X, y, random_state=50)

mic_series = pd.Series(mic, index=X.columns)
mic_series = mic_series.sort_values(ascending=False)

# Get only columns that are above the value
mic_series = mic_series[mic_series > 0.01]

# Show the features with the highest mutual information
mic_series.plot.bar(figsize=(15, 4))
plt.ylabel('Mutual Information Score')
plt.xlabel('Features')
plt.title('Mutual Information Scores')
# Render explicitly, like the box-plot cell, instead of leaking the Text repr
plt.show()

In [None]:
# Use only selected
# (alternative kept for reference: restrict to the features that passed the
# mutual-information threshold)
# features_data = clean_data[mic_series.index.tolist()]

# Got better results with all of the features
features_data = clean_data.drop('target', axis=1)
target_data = clean_data['target']
print(f"Features data shape is: {features_data.shape}")

# MLP

In [None]:
# Feature set X and target variable y used by the MLP
X, y = features_data, target_data

# Stratified split into training (80%) and test (20%) sets with a fixed seed
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=50,
    stratify=y,
)

# Report the shape of each set
for split_name, split_X, split_y in (("Train", X_train, y_train), ("Test", X_test, y_test)):
    print(f"{split_name} set: {split_X.shape}, {split_y.shape}")

In [None]:
# Features and labels of both splits as float32 PyTorch tensors
X_train_tensor = torch.tensor(X_train.to_numpy(), dtype=torch.float32)
y_train_tensor = torch.tensor(y_train.to_numpy(), dtype=torch.float32)

X_test_tensor = torch.tensor(X_test.to_numpy(), dtype=torch.float32)
y_test_tensor = torch.tensor(y_test.to_numpy(), dtype=torch.float32)

# Pair features with labels, one dataset per split
train_dataset = TensorDataset(X_train_tensor, y_train_tensor)
test_dataset = TensorDataset(X_test_tensor, y_test_tensor)

# Training batches are shuffled; test batches keep their order
train_loader = DataLoader(train_dataset, shuffle=True, batch_size=64)
test_loader = DataLoader(test_dataset, shuffle=False, batch_size=512)

# Sanity check: print the shapes of the first training batch only
for inputs, labels in train_loader:
    print(f"Batch shape: {inputs.shape}, Label shape: {labels.shape}")
    break

In [None]:
# Run on the GPU when CUDA is available, otherwise fall back to the CPU
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

In [None]:
class SimpleMLP(nn.Module):
    """MLP binary classifier that returns one raw logit per sample.

    Layout: an input projection, then two hidden blocks that each apply a
    ``Linear`` layer with a skip connection, ``BatchNorm1d``, leaky ReLU and
    dropout, followed by a single-unit output layer. No sigmoid is applied
    here; the output is meant for a logits-based loss.

    Args:
        input_size: Number of input features. When omitted, the column count
            of the notebook-level feature frame ``X`` is used.
        hidden_size: Width shared by the input projection and both hidden blocks.
        dropout_ratio: Drop probability of both dropout layers.
    """

    def __init__(self, input_size: "int | None" = None, hidden_size: int = 256, dropout_ratio: float = 0.2):
        super().__init__()

        if input_size is None:
            # Looked up when the model is built rather than when the class is
            # defined, so a changed feature set is picked up without having to
            # re-run this cell first.
            input_size = len(X.columns)

        self.input_layer = nn.Linear(input_size, hidden_size)

        # Hidden layer
        self.norm1 = nn.BatchNorm1d(hidden_size)
        self.hidden_layer1 = nn.Linear(hidden_size, hidden_size)
        self.dropout_layer1 = nn.Dropout(dropout_ratio)

        # Additional hidden layer
        self.hidden_layer2 = nn.Linear(hidden_size, hidden_size)
        self.norm2 = nn.BatchNorm1d(hidden_size)
        self.dropout_layer2 = nn.Dropout(dropout_ratio)

        self.output_layer = nn.Linear(hidden_size, 1)

    def forward(self, x):
        """Map a ``(batch, input_size)`` float tensor to ``(batch, 1)`` logits."""
        x = self.input_layer(x)
        x = F.leaky_relu(x)

        # Residual block 1: linear + skip, then norm -> activation -> dropout
        x = self.hidden_layer1(x) + x
        x = self.norm1(x)
        x = F.leaky_relu(x)
        x = self.dropout_layer1(x)

        # Residual block 2: same structure
        x = self.hidden_layer2(x) + x
        x = self.norm2(x)
        x = F.leaky_relu(x)
        x = self.dropout_layer2(x)

        x = self.output_layer(x)

        return x

# Build the network with its default sizes on the selected device and
# optimise all of its parameters with Adam
model = SimpleMLP().to(device)
optim = Adam(model.parameters(), lr=0.001, betas=(0.9, 0.999))

In [None]:
# Calculate the number of matched and not-matched samples.
# Only the training labels are counted: the class weight is part of the
# training setup and should not be derived from the held-out test rows.
total_positive_samples = y_train[y_train == 1].count()
total_negative_samples = y_train[y_train == 0].count()

if total_positive_samples == 0:
    raise ValueError("No positive samples in the training labels; cannot compute pos_weight")

# Ratio negatives / positives: each positive sample counts this many times in the loss
pos_weight = total_negative_samples / total_positive_samples
pos_weight_tensor = torch.tensor(pos_weight, dtype=torch.float32, device=device)

# Give more importance to the minority class
loss_fn = nn.BCEWithLogitsLoss(pos_weight=pos_weight_tensor)

In [None]:
# Train for a fixed number of epochs, evaluate on the test loader after every
# epoch, log the metrics to wandb and keep a snapshot of the weights per epoch.
with wandb.init(
    project="Speed Dating",
    config={
    "Architecture": "MLP",
    "Epochs": 50,
    "Learning rate": 0.001,
    "Betas": [0.9, 0.999],
    "Dataset": "SpeedDating",
    "Optimizer": "Adam",
    "Training batch": 64,
    "Test batch": 512,
    "Hidden size": 256,
    "Dropout ratio": 0.2
    },
) as run:
    # Rolling window of the latest batch losses, shown in the progress bar
    train_losses = deque(maxlen=50)
    epochs = 50
    saved_weights = []
    saved_better_weights = []

    for i in range(epochs):
        epoch_train_loss = []

        model.train()
        with tqdm(total=len(train_loader), position=0, leave=True) as pbar:
            for x, y in train_loader:
                x, y = x.to(device), y.to(device)

                optim.zero_grad()
                out = model(x)
                # Labels are (batch,), logits are (batch, 1): align the shapes
                loss = loss_fn(out, y.unsqueeze(-1))
                loss.backward()
                optim.step()

                train_losses.append(loss.item())
                epoch_train_loss.append(loss.item())

                pbar.set_postfix_str(f"Epoch: {i}, Avg. train loss: {sum(train_losses) / len(train_losses)}")
                pbar.update(1)

        print(f"Epoch Train Loss: {sum(epoch_train_loss) / len(epoch_train_loss)}")
        run.log({"Epoch Train Loss": sum(epoch_train_loss) / len(epoch_train_loss)}, step=i)

        # Save weights.
        # state_dict() hands out tensors that share storage with the live
        # parameters, and dict.copy() is shallow, so a plain copy would keep
        # changing as training continues and every "snapshot" would end up equal
        # to the final weights. Cloning each tensor freezes this epoch's values.
        epoch_weights = {name: tensor.detach().clone() for name, tensor in model.state_dict().items()}
        saved_weights.append(epoch_weights)

        all_probs = []  # Store probabilities for ROC AUC
        all_preds = []  # Store binary predictions for F1
        all_labels = []

        correct = 0
        total = 0
        model.eval()
        with torch.inference_mode():
            for x, y in test_loader:
                x, y = x.to(device), y.to(device)
                out = model(x)
                probs = torch.sigmoid(out)
                preds = (probs > 0.5).float()

                all_probs.extend(probs.cpu().numpy())
                all_preds.extend(preds.cpu().numpy())
                all_labels.extend(y.cpu().numpy())

                correct += (preds == y.unsqueeze(1)).sum().item()
                total += y.size(0)

        accuracy = correct / total
        print(f"Test Accuracy: {accuracy}")
        run.log({'Accuracy': accuracy}, step=i)

        roc_auc = roc_auc_score(all_labels, all_probs)
        print(f"Test ROC AUC Score: {roc_auc}")
        run.log({'ROC AUC Score': roc_auc}, step=i)

        f1 = f1_score(all_labels, np.round(all_preds))
        print(f"Test F1 Score: {f1}")
        run.log({'F1 Score': f1}, step=i)

        # Save weights above the value.
        # The snapshot taken above is reused; no training happened in between.
        # NOTE(review): epochs are selected by their F1 on the test loader, so
        # metrics later reported for the averaged "better" weights on that same
        # data are optimistic — a separate validation split would avoid this.
        if (f1 > 0.52):
            saved_better_weights.append(epoch_weights)
In [None]:
def calculate_average_weights(weights):
    """Average a list of model state dicts entry by entry.

    Args:
        weights: Non-empty sequence of state dicts that share the same keys
            and tensor shapes. The inputs are not modified.

    Returns:
        dict mapping every key of the first state dict to the element-wise
        mean of that entry across all state dicts, as a float32 tensor.

    Raises:
        ValueError: If ``weights`` is empty.
    """
    if len(weights) == 0:
        raise ValueError("calculate_average_weights needs at least one state dict, got none")

    average_weights = {}

    for key, reference in weights[0].items():
        # Sum in float32 so integer buffers and low-precision tensors are
        # accumulated without overflow or rounding in their own dtype.
        total = torch.zeros_like(reference, dtype=torch.float32)
        for state_dict in weights:
            total += state_dict[key].to(torch.float32)

        average_weights[key] = total / len(weights)

    return average_weights

In [None]:
# Initialize a new model using the averaged weights
# load_state_dict copies the tensors of the given state dictionary (here: the
# average over the snapshots of all epochs) into the model.
average_weights_model = SimpleMLP().to(device)
average_weights = calculate_average_weights(saved_weights)
average_weights_model.load_state_dict(average_weights)

all_probs = []  # Probabilities for ROC AUC
all_preds = []  # Binary predictions for F1
all_labels = []

correct = 0
total = 0
# Put the model that is actually evaluated into eval mode. Calling eval() on
# the trained `model` instead left this freshly built one in training mode,
# i.e. with dropout active and batch statistics used by the BatchNorm layers.
average_weights_model.eval()
with torch.inference_mode():
    for x, y in test_loader:
        x, y = x.to(device), y.to(device)
        out = average_weights_model(x)
        probs = torch.sigmoid(out)
        preds = (probs > 0.5).float()

        all_probs.extend(probs.cpu().numpy())
        all_preds.extend(preds.cpu().numpy())
        all_labels.extend(y.cpu().numpy())

        correct += (preds == y.unsqueeze(1)).sum().item()
        total += y.size(0)

accuracy = correct / total
print(f"Test Accuracy: {accuracy}")

roc_auc = roc_auc_score(all_labels, all_probs)
print(f"Test ROC AUC Score: {roc_auc}")

f1 = f1_score(all_labels, np.round(all_preds))
print(f"Test F1 Score: {f1}")

In [None]:
# Initialize a new model using the averaged weights that were above a value
if len(saved_better_weights) == 0:
    # No epoch passed the F1 threshold in the training cell, so there is nothing to average
    raise ValueError("saved_better_weights is empty: no epoch exceeded the F1 threshold")

average_weights_model = SimpleMLP().to(device)
average_weights_model.load_state_dict(calculate_average_weights(saved_better_weights))

all_probs = []  # Probabilities for ROC AUC
all_preds = []  # Binary predictions for F1
all_labels = []

correct = 0
total = 0
# Put the model that is actually evaluated into eval mode. Calling eval() on
# the trained `model` instead left this freshly built one in training mode,
# i.e. with dropout active and batch statistics used by the BatchNorm layers.
average_weights_model.eval()
with torch.inference_mode():
    for x, y in test_loader:
        x, y = x.to(device), y.to(device)
        out = average_weights_model(x)
        probs = torch.sigmoid(out)
        preds = (probs > 0.5).float()

        all_probs.extend(probs.cpu().numpy())
        all_preds.extend(preds.cpu().numpy())
        all_labels.extend(y.cpu().numpy())

        correct += (preds == y.unsqueeze(1)).sum().item()
        total += y.size(0)

accuracy = correct / total
print(f"Test Accuracy: {accuracy}")

roc_auc = roc_auc_score(all_labels, all_probs)
print(f"Test ROC AUC Score: {roc_auc}")

f1 = f1_score(all_labels, np.round(all_preds))
print(f"Test F1 Score: {f1}")