In [None]:
!pip install fastFM

Collecting fastFM
  Downloading fastFM-0.2.10.tar.gz (1.6 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.6/1.6 MB[0m [31m7.3 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
Building wheels for collected packages: fastFM
  Building wheel for fastFM (setup.py) ... [?25l[?25hdone
  Created wheel for fastFM: filename=fastFM-0.2.10-cp310-cp310-linux_x86_64.whl size=591530 sha256=df5423e14174cc32773b5e07138193659221a0521f89080632355449878bdd51
  Stored in directory: /root/.cache/pip/wheels/93/92/52/2da7997fcb7a7ce9042ff3b33836ef0c2fd47aa95382d7a113
Successfully built fastFM
Installing collected packages: fastFM
Successfully installed fastFM-0.2.10


In [None]:
import pandas as pd
import gzip
import json
import matplotlib.pyplot as plt
import seaborn as sns
import os
from collections import defaultdict
import numpy as np
from fastFM import als, sgd, mcmc
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder, StandardScaler,label_binarize
from sklearn.metrics import roc_auc_score,accuracy_score, confusion_matrix
from sklearn.multiclass import OneVsRestClassifier
from sklearn.metrics import accuracy_score,classification_report
from scipy.sparse import csr_matrix
from scipy.sparse import hstack
from imblearn.over_sampling import SMOTE
from imblearn.under_sampling import RandomUnderSampler
from imblearn.pipeline import Pipeline
from google.colab import drive
# Mount Google Drive at /content/drive; the dataset path used by this notebook lives under it.
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
# Location of the cleaned CSV on the mounted Google Drive; read into `df` by the loading cell.
clean_dataset_path = "/content/drive/MyDrive/AIHW/AS2/caius_data_clean.csv"

In [None]:
# Read the cleaned dataset when the CSV is present; otherwise report it and leave `df` as None.
# NOTE: the following cells index `df` directly, so they only work when the file was found.
if os.path.exists(clean_dataset_path):
  df = pd.read_csv(clean_dataset_path)
else:
  df = None
  print("Empty Dataset")

In [None]:
# Mean-impute the missing values of `height` and `rating` (means taken over non-null rows).
column_means = {name: df[name].dropna().mean() for name in ('height', 'rating')}
average_height = column_means['height']
average_rating = column_means['rating']
for name, mean_value in column_means.items():
  df[name] = df[name].fillna(mean_value)

# Derived columns: height divided by 10 and size doubled.
df['height_mul'] = df['height'] / 10
df['size_mul'] = df['size'] * 2

In [None]:
# One-hot encode the user and item identifiers.
encoder = OneHotEncoder()
categorical_data = encoder.fit_transform(df[['user_id', 'item_id']])

# Standardise the numeric features; the resulting columns are [size, rating, height].
scaler = StandardScaler()
numerical_data = scaler.fit_transform(df[['size','rating','height']])

# Up-weight the standardised `size` feature (column 0) by a factor of 4.
# Done as a single vectorised multiply instead of a per-row Python loop.
numerical_data[:, 0] *= 4

In [None]:
# Inspect the scaled numeric feature matrix (columns: size, rating, height).
numerical_data

array([[ 0.6197251 ,  0.63482249,  1.01150603],
       [-0.08658481,  0.63482249,  0.25928317],
       [-2.91182445,  0.63482249, -0.4929397 ],
       ...,
       [-1.49920463, -2.16289696,  1.01150603],
       [ 1.32603501,  0.63482249,  0.25928317],
       [ 1.32603501,  0.63482249,  0.25928317]])

In [None]:
# Build the combined feature matrix and an oversampled training split.
# (Earlier variant, kept for reference: X = hstack([categorical_data, height_data]))
X = hstack([categorical_data,numerical_data])
# Encode the textual fit label as an integer class: small=0, fit=1, large=2.
fit_mapping = {'small': 0, 'fit': 1, 'large': 2}
y = df['fit'].map(fit_mapping).values
# SMOTE targets: classes 0 and 2 are each grown to 50% of len(y), where len(y) is the size
# of the full dataset (measured before the train/test split below).
over = SMOTE(sampling_strategy={0: int(len(y) * 0.5), 2: int(len(y) * 0.5)}, k_neighbors=3)
# NOTE: `under` is constructed but is not part of the active pipeline defined below.
under = RandomUnderSampler(sampling_strategy={1: int(len(y) * 0.4)})
# Hold out 10% for testing. No random_state is passed, so the split differs between runs.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.1)
# Define the pipeline (the over+under variant is kept commented out for reference)
# pipeline = Pipeline(steps=[('o', over), ('u', under)])
pipeline = Pipeline(steps=[('o', over)])
# Indicator form of y with -1 as the negative label; not used further in this cell.
y_binary = label_binarize(y, classes=np.unique(y),neg_label=-1)
# Resample the training split only; X_test / y_test are left untouched.
X_train, y_train = pipeline.fit_resample(X_train, y_train)


In [None]:
# Feature matrix: one-hot user/item identifiers only (the numeric features are left out).
X = categorical_data
# Encode the textual fit label as an integer class: small=0, fit=1, large=2.
fit_mapping = {'small': 0, 'fit': 1, 'large': 2}
y = df['fit'].map(fit_mapping).values
# Indicator form of y with -1 as the negative label, kept available for ad-hoc use.
y_binary = label_binarize(y, classes=np.unique(y),neg_label=-1)

# Split on the 1-D integer labels, not on `y_binary`: the evaluation cells compare
# y_test / y_pred against 0/1/2 and build 3-class reports, and OneVsRestClassifier
# binarises a 1-D multiclass target internally.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.1)

In [None]:
class FMClassifier(als.FMClassification):
    """ALS factorization-machine classifier with small adaptations to its fit/predict_proba API.

    `fit` accepts targets that use 0 for the negative class, and `predict_proba`
    returns a two-column array instead of the parent's one-dimensional output.
    """

    def fit(self, X, y, *args):
        """Fit on a copy of `y` in which every 0 has been replaced by -1."""
        targets = y.copy()  # leave the caller's array untouched
        negative_rows = targets == 0
        targets[negative_rows] = -1
        return super(FMClassifier, self).fit(X, targets, *args)

    def predict_proba(self, X):
        """Return an (n_samples, 2) array; both columns hold the parent's probabilities.

        Assumes the parent returns a 1-D array with one value per sample — TODO confirm.
        """
        positive = super(FMClassifier, self).predict_proba(X)
        return np.vstack((positive, positive)).T

# Base learner: one factorization machine, cloned per class by the one-vs-rest wrapper.
fm_base = FMClassifier(
    n_iter=150,
    init_stdev=0.2,
    rank=4,
    l2_reg_w=0.3,
    l2_reg_V=0.3,
)

# One binary FM per class, trained in parallel on all available cores.
ovr_classifier = OneVsRestClassifier(fm_base, n_jobs=-1)

# Fit on the training split, passed as a CSR matrix.
ovr_classifier.fit(csr_matrix(X_train), y_train)



In [None]:
# Convert the held-out features once so that both prediction calls get the same CSR input
# (previously only predict_proba received the converted matrix).
X_test_csr = csr_matrix(X_test)

# Class probabilities (one column per class) and hard class predictions.
y_pred_prob = ovr_classifier.predict_proba(X_test_csr)
y_pred = ovr_classifier.predict(X_test_csr)

# Macro-averaged one-vs-rest ROC AUC over the classes.
auc_scores = roc_auc_score(y_test, y_pred_prob, average='macro', multi_class='ovr')

print(f'AUC Scores: {auc_scores}')

AUC Scores: 0.7110919100222474


In [None]:
# Count how many test rows have each class as their strictly highest probability.
# Rows where neither "small" nor "fit" is a strict maximum (including ties) count as "large".
p_small = y_pred_prob[:, 0]
p_fit = y_pred_prob[:, 1]
p_large = y_pred_prob[:, 2]

small_wins = (p_small > p_fit) & (p_small > p_large)
fit_wins = ~small_wins & (p_fit > p_small) & (p_fit > p_large)

small_count = int(small_wins.sum())
fit_count = int(fit_wins.sum())
large_count = len(y_pred_prob) - small_count - fit_count

for class_total in (small_count, fit_count, large_count):
  print(class_total)

2265
15011
1979


In [None]:
# Number of test rows predicted correctly, per class (0=small, 1=fit, 2=large).
small_acc, fit_acc, large_acc = (
  int(np.sum((y_pred == class_id) & (y_test == class_id)))
  for class_id in (0, 1, 2)
)

for correct_total in (small_acc, fit_acc, large_acc):
  print(correct_total)


867
11971
716


In [None]:
# Number of test rows whose true label is each class (0=small, 1=fit, 2=large),
# limited to as many rows as there are predictions.
labels_seen = y_test[:len(y_pred)]
true_small, true_fit, true_large = (
  int(np.sum(labels_seen == class_id)) for class_id in (0, 1, 2)
)

In [None]:
# Per-class precision / recall / F1 on the held-out split; names follow class ids 0, 1, 2.
class_names = ['Small', 'Fit', 'Large']
report = classification_report(y_test, y_pred, target_names=class_names)
print(report)

              precision    recall  f1-score   support

       Small       0.38      0.34      0.36      2539
         Fit       0.80      0.84      0.82     14307
       Large       0.36      0.30      0.33      2409

    accuracy                           0.70     19255
   macro avg       0.51      0.49      0.50     19255
weighted avg       0.69      0.70      0.70     19255



In [None]:
# In the top 73% data, how many are with label 1
def getTopNPositive(percentile,label,prob_list,test_data):
    """Return the share of rows truly labelled `label` among the top-ranked rows.

    Rows are ranked by their predicted probability for `label` (highest first),
    the top `percentile` percent are kept, and the fraction of those whose true
    label equals `label` is returned.

    Args:
        percentile: Percentage of rows to keep, e.g. 73 for the top 73%.
        label: Column index into `prob_list`; also the class value looked up in `test_data`.
        prob_list: 2-D array-like with one row per sample and one column per class.
        test_data: 1-D array-like of true labels aligned with `prob_list`.

    Returns:
        The fraction of selected rows whose true label is `label`, or 0.0 when
        the requested slice contains no rows (empty input or a percentile that
        rounds down to zero rows).
    """
    # Convert inputs to numpy arrays if they aren't already
    prob_list = np.array(prob_list)
    test_data = np.array(test_data)

    # Number of rows in the requested slice (truncated towards zero).
    top_n_percent = int(len(prob_list) * (percentile / 100))
    if top_n_percent <= 0:
        # Nothing selected: avoid a 0/0 division (and indexing an empty array).
        return 0.0

    # Rank rows by the probability of `label`, highest first, and keep the top slice.
    sorted_indices = np.argsort(prob_list[:, label])[::-1]
    selected_indices = sorted_indices[:top_n_percent]

    # Count how many of the selected rows are actually labelled `label`.
    correct_predictions = np.sum(test_data[selected_indices] == label)

    # Divide by the rows actually selected, so a percentile above 100 cannot
    # understate the ratio by dividing by more rows than exist.
    return correct_predictions / selected_indices.size

In [None]:
# Share of true "fit" rows (label 1) among the 73% of test rows ranked highest for "fit".
getTopNPositive(73,1,y_pred_prob,y_test)

0.8032157085941947

In [None]:
# Examine the rows *outside* the top 73% when ranked by "fit" probability (column 1):
# each is relabelled "large" (2) or "small" (0), whichever of the two is more probable
# (ties go to "large"), and the result is scored against the true labels.
prob_list = np.array(y_pred_prob)
test_data = np.array(y_test)

sorted_indices = prob_list[:, 1].argsort()[::-1]
top_n_percent = int(len(prob_list) * (73 / 100))
selected_indices = sorted_indices[top_n_percent:]

selected_labels, selected_prob = test_data[selected_indices], prob_list[selected_indices]

prefers_large = selected_prob[:, 2] >= selected_prob[:, 0]
new_label = np.where(prefers_large, 2, 0).tolist()

print(classification_report(selected_labels, new_label, target_names=['Small', 'Fit', 'Large']))

              precision    recall  f1-score   support

       Small       0.36      0.81      0.50      1166
         Fit       0.00      0.00      0.00      3017
       Large       0.34      0.85      0.48      1016

    accuracy                           0.35      5199
   macro avg       0.23      0.55      0.33      5199
weighted avg       0.15      0.35      0.21      5199



  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


In [None]:
# Rows predicted as 'fit' (1) whose true label is something else.
predicted_fit = y_pred == 1
actually_other = y_test != 1
incorrect_fit_indices = np.nonzero(actually_other & predicted_fit)

# Full probability rows and true labels for those wrongly predicted samples.
incorrect_labels = y_test[incorrect_fit_indices]
incorrect_fit_probabilities = y_pred_prob[incorrect_fit_indices]



In [None]:
# True labels of the rows that were wrongly predicted as 'fit'.
incorrect_labels

array([2, 0, 2, ..., 0, 2, 2])

In [None]:
# Predicted class probabilities (one column per class) for the rows wrongly predicted as 'fit'.
incorrect_fit_probabilities

array([[0.20672657, 0.54707572, 0.24619771],
       [0.30585778, 0.35216067, 0.34198155],
       [0.23117969, 0.55653585, 0.21228445],
       ...,
       [0.28701327, 0.47474297, 0.23824376],
       [0.21918853, 0.4986673 , 0.28214417],
       [0.21588637, 0.44564624, 0.3384674 ]])

In [None]:
# Confusion matrix of true labels (y_test) against predicted labels (y_pred).
conf_matrix = confusion_matrix(y_test, y_pred)

# Correct predictions per class sit on the diagonal; dividing by the row totals
# (all rows that truly belong to the class) gives the per-class recall, which
# this cell reports under the name "accuracy".
correct_per_class = np.diag(conf_matrix)
actual_per_class = conf_matrix.sum(axis=1)
class_accuracies = correct_per_class / actual_per_class

for class_id in range(len(class_accuracies)):
    print(f'Accuracy for class {class_id}: {class_accuracies[class_id]:.2f}')

Accuracy for class 0: 0.39
Accuracy for class 1: 0.70
Accuracy for class 2: 0.45


In [None]:
def getAcc(y_t,y_p):
  """Return, per class, the confusion-matrix diagonal divided by its column total.

  The confusion matrix is built with true labels `y_t` and predictions `y_p`;
  with scikit-learn's layout (columns = predicted class) each entry is the share
  of predictions of that class that were correct, i.e. per-class precision.
  """
  matrix = confusion_matrix(y_t, y_p)
  predicted_totals = matrix.sum(axis=0)
  return np.diag(matrix) / predicted_totals

In [None]:
# Column-normalised confusion-matrix diagonal for the model's hard predictions.
getAcc(y_test,y_pred)

array([0.3162446 , 0.79555059, 0.31588032])

In [None]:
def _relabel_with_thresholds(probs, fit_thresh, thresh):
  """Re-derive class labels from probability rows using two thresholds.

  Columns of `probs` are [small, fit, large]. For a row where "fit" is the strict
  maximum, the label stays 1 if the fit probability exceeds `fit_thresh`, or if
  the more probable of small/large plus `thresh` does not exceed it; otherwise
  the row takes that runner-up class. Every other row takes large (2) when
  large > small, else small (0).
  """
  small_prob, fit_prob, large_prob = probs[:, 0], probs[:, 1], probs[:, 2]
  runner_is_large = large_prob > small_prob
  runner_label = np.where(runner_is_large, 2, 0)
  runner_prob = np.where(runner_is_large, large_prob, small_prob)
  fit_is_top = (fit_prob > small_prob) & (fit_prob > large_prob)
  # "fit" survives when it is confidently on top or the runner-up is not close enough.
  keep_fit = fit_is_top & ((fit_prob > fit_thresh) | ~(runner_prob + thresh > fit_prob))
  return np.where(keep_fit, 1, runner_label)


# Additive margins tried for the runner-up class, and confidence cut-offs for "fit".
t = np.arange(0.0, 0.5, 0.005)
fit_thresholds = np.arange(0.3,0.8,0.01)
# Best-so-far trackers; initialised here and read by the threshold sweep that follows.
best_t = 1.0
best_fit_t = 0.0
best_acc = 0

grid_probs = np.asarray(y_pred_prob)
# The grid lives in `fit_thresholds` so the loop variable no longer overwrites it,
# which keeps this cell re-runnable.
for fit_thresh in fit_thresholds:
  for thresh in t:
    new_labels = _relabel_with_thresholds(grid_probs, fit_thresh, thresh)
    acc = getAcc(y_test,new_labels)
    # One line per combination (the score used to be printed twice).
    print(f'Threshold: {thresh}, Accuracy: {acc}', f'Fit Threshold: {fit_thresh}')

In [None]:
# Sweep a multiplicative threshold over the rows wrongly predicted as 'fit': a row is
# flipped to the more probable of small/large when that probability times the threshold
# exceeds its fit probability; otherwise it stays 'fit' (1).
# NOTE(review): the saved output of this cell lists thresholds from 1.05 upwards, so `t`
# presumably held a different grid than np.arange(0.0, 0.5, 0.005) when it was last run;
# confirm the intended range.

# Reset the best-so-far trackers so the cell gives the same output every time it is run.
best_t = 1.0
best_acc = 0

sweep_probs = np.asarray(incorrect_fit_probabilities)
small_col, fit_col, large_col = sweep_probs[:, 0], sweep_probs[:, 1], sweep_probs[:, 2]
runner_is_large = large_col > small_col
runner_label = np.where(runner_is_large, 2, 0)
runner_prob = np.where(runner_is_large, large_col, small_col)

for thresh in t:
  new_labels = np.where(runner_prob * thresh > fit_col, runner_label, 1)
  acc = accuracy_score(incorrect_labels, new_labels)
  # Report only thresholds that improve on the best accuracy seen so far.
  if acc > best_acc:
    best_t = thresh
    best_acc = acc
    print(f'Threshold: {thresh}, Accuracy: {acc}')


Threshold: 1.05, Accuracy: 0.0655226209048362
Threshold: 1.1, Accuracy: 0.1294851794071763
Threshold: 1.1500000000000001, Accuracy: 0.1829173166926677
Threshold: 1.2000000000000002, Accuracy: 0.2421996879875195
Threshold: 1.2500000000000002, Accuracy: 0.2878315132605304
Threshold: 1.3000000000000003, Accuracy: 0.3408736349453978
Threshold: 1.3500000000000003, Accuracy: 0.37753510140405616
Threshold: 1.4000000000000004, Accuracy: 0.41809672386895474
Threshold: 1.4500000000000004, Accuracy: 0.4539781591263651
Threshold: 1.5000000000000004, Accuracy: 0.484009360374415
Threshold: 1.5500000000000005, Accuracy: 0.516380655226209
Threshold: 1.6000000000000005, Accuracy: 0.5444617784711389
Threshold: 1.6500000000000006, Accuracy: 0.5717628705148206
Threshold: 1.7000000000000006, Accuracy: 0.5873634945397815
Threshold: 1.7500000000000007, Accuracy: 0.6045241809672387
Threshold: 1.8000000000000007, Accuracy: 0.6197347893915757
Threshold: 1.8500000000000008, Accuracy: 0.6333853354134166
Threshold