In [2]:
# Version without multiprocessing for easier debugging
import os
import numpy as np
import pandas as pd
import logging
import matplotlib.pyplot as plt
import seaborn as sns
from tqdm import tqdm
from lib import funcs, utils
from lib.dataHandler import DataHandler

In [3]:
# Model class used by the project
from lib.mmsbm import MMSBM

# --- Run configuration -------------------------------------------------
# File names of the training and test sets (joined with the data directory
# when the DataHandler is created)
train_set, test_set = "train.csv", "test.csv"

# Latent dimensions: k = number of groups of users, l = number of groups of items
k, l = 4, 7

# Number of update iterations.
# At least 600 are needed for the coefficients to plateau.
iterations = 10

# Seed fed to np.random.default_rng
seed = 1714

# When True, logging is configured at DEBUG level instead of INFO
notebook = True

In [4]:
# Initiate the random state
rng = np.random.default_rng(seed)
# Create seeds for each process (a single run here, hence size=1)
seeds = list(rng.integers(low=1, high=10000, size=1))

logger = logging.getLogger("MMSBM")
logging.basicConfig(level=logging.DEBUG if notebook else logging.INFO)
logger.info(f"Running {1} runs of {iterations} iterations.")

# Get data
data_dir = os.path.join(os.getcwd(), "data")
data_handler = DataHandler(data_dir, train_set, test_set)
train, test = data_handler.import_data()
# For later
obs_dict, items_dict, ratings_dict = data_handler.return_dicts()


# Create a few dicts with the relationships
# TODO: think whether initialization with 0 is needed
# d0 maps each value of column 0 to the list of column-1 values it appears
# with (in row order); d1 is the reverse mapping. Both are filled in a single
# pass over the rows instead of re-scanning the whole array once per key.
d0 = {}
d1 = {}
for _first, _second in zip(train[:, 0], train[:, 1]):
    d0.setdefault(_first, []).append(_second)
    d1.setdefault(_second, []).append(_first)

ratings = sorted(set(train[:, 2]))
r = max(ratings)
p = int(train[:, 0].max())
m = int(train[:, 1].max())

# If, for some reason, there are missing links, we need to fill them.
# p and m are the column maxima, so they are always present already and
# range(p) / range(m) covers every index that could be missing.
# Plain loops are used (rather than list comprehensions run for their side
# effects) so the cell does not end on an expression and print a stray list.
for _idx in range(p):
    d0.setdefault(_idx, [])
for _idx in range(m):
    d1.setdefault(_idx, [])

INFO:MMSBM:Running 1 runs of 10 iterations.


[]

In [6]:
# Fresh generator so the initial coefficients depend only on `seed`
rng = np.random.default_rng(seed)

# Random starting coefficients, then normalized.
# The draws happen in the order theta -> eta -> pr; keep that order, since all
# three consume the same generator.
theta_raw = funcs.init_random_array((p + 1, k), rng)
theta = funcs.normalize_with_d(theta_raw, d0)

eta_raw = funcs.init_random_array((m + 1, l), rng)
eta = funcs.normalize_with_d(eta_raw, d1)

# NOTE(review): the first axis of pr has k + 1 entries while theta has k
# columns -- confirm this is intended.
pr_raw = funcs.init_random_array((k + 1, l, r + 1), rng)
pr = funcs.normalize_with_self(pr_raw)

# Iterative updates; every normalized pr is kept so convergence can be inspected
prs = []
for i in tqdm(range(iterations)):
    # The actual update step lives in funcs.py
    n_theta, n_eta, npr = funcs.update_coefs(
        data=train,
        ratings=ratings,
        theta=theta,
        eta=eta,
        pr=pr,
    )

    # Re-normalize the raw updates before the next iteration
    theta, eta, pr = (
        funcs.normalize_with_d(n_theta, d0),
        funcs.normalize_with_d(n_eta, d1),
        funcs.normalize_with_self(npr),
    )

    # Debugging aid only; can be dropped once convergence is not being checked
    prs.append(pr)

  0%|                                                                                                                                                                                                                                                                                           | 0/10 [01:23<?, ?it/s]


KeyboardInterrupt: 

In [14]:
# Evaluate the current coefficients: likelihood on the training rows, and
# funcs.compute_prod_dist on the test rows (presumably the predicted rating
# distribution per test row -- confirm in funcs.py).
likelihood = funcs.compute_likelihood(train, ratings, theta, eta, pr)
rat = funcs.compute_prod_dist(test, theta, eta, pr)

In [15]:
# Row sums of the normalized theta (presumably each row should sum to 1 after
# funcs.normalize_with_d -- TODO confirm against funcs.py).
theta.sum(axis=1)

array([0., 1., 1., ..., 1., 1., 8.])

In [16]:
# Row sums of n_theta, the un-normalized theta returned by the last call to
# funcs.update_coefs.
n_theta.sum(axis=1)

array([0., 8., 8., ..., 8., 8., 8.])

In [110]:
# Read the raw train/test CSVs directly with pandas. header=None means the
# first line is treated as data and the columns get integer labels.
# Paths are relative to the current working directory.
aa = pd.read_csv("data/train.csv", header=None)
bb = pd.read_csv("data/test.csv", header=None)

In [100]:
def rename_values(x):
    """Encode the values of ``x`` as consecutive integer codes starting at 0.

    Codes are assigned deterministically: the distinct values are numbered in
    ascending order when they can be sorted, and in order of first appearance
    otherwise (e.g. mixed, non-comparable types). Relying on ``set`` iteration
    order instead would make the mapping change between interpreter runs for
    string values.

    Parameters
    ----------
    x : iterable of hashable values
        Values to encode. It is materialized once, so one-shot iterators
        are supported.

    Returns
    -------
    tuple[list[int], dict]
        The code of every element of ``x`` (same order and length), and the
        ``{original value: code}`` mapping.
    """
    values = list(x)
    # dict.fromkeys de-duplicates while keeping first-appearance order
    distinct = dict.fromkeys(values)
    try:
        ordered = sorted(distinct)
    except TypeError:
        # Values are not mutually comparable; fall back to appearance order
        ordered = list(distinct)
    dict_ = {value: code for code, value in enumerate(ordered)}
    return [dict_[value] for value in values], dict_

In [101]:
def parse_train_data(df):
    """Integer-encode the first three columns of a training frame.

    Each of the first three columns (by position) is passed through
    ``rename_values``. The input frame is left untouched: the encoding is
    written to a copy, so re-running the calling cell does not re-encode
    already-encoded data.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with at least three columns. Column labels are assumed to be
        unique (true for frames read with ``header=None``).

    Returns
    -------
    tuple
        ``(encoded_df, obs_dict, items_dict, ratings_dict)`` where the three
        dicts are the ``{original value: code}`` mappings of columns 0, 1
        and 2 respectively.
    """
    encoded = df.copy()
    mappings = []
    for position in range(3):
        codes, mapping = rename_values(df.iloc[:, position])
        # Whole-column assignment replaces the column, so it takes the dtype
        # of the integer codes rather than that of the original values.
        encoded[encoded.columns[position]] = codes
        mappings.append(mapping)
    obs_dict, items_dict, ratings_dict = mappings

    return encoded, obs_dict, items_dict, ratings_dict

In [102]:
# Encode the raw training frame and keep the value -> code mappings.
# NOTE: this rebinds obs_dict, items_dict and ratings_dict, which the setup
# cell also assigns from data_handler.return_dicts().
aa, obs_dict, items_dict, ratings_dict = parse_train_data(aa)

In [109]:
def parse_test_data(df, obs_dict, items_dict, ratings_dict):
    """Encode the first three columns of a test frame with existing mappings.

    Columns 0, 1 and 2 (by position) are translated through ``obs_dict``,
    ``items_dict`` and ``ratings_dict`` respectively. The input frame is left
    untouched; a new, encoded frame is returned.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with at least three columns. Column labels are assumed to be
        unique (true for frames read with ``header=None``).
    obs_dict, items_dict, ratings_dict : dict
        ``{original value: code}`` mappings for columns 0, 1 and 2.

    Returns
    -------
    pandas.DataFrame
        Copy of ``df`` with the first three columns replaced by their codes.

    Raises
    ------
    KeyError
        If a column contains a value that is missing from its mapping. The
        message names the column position and (up to ten of) the offending
        values.
    """
    encoded = df.copy()
    for position, mapping in enumerate((obs_dict, items_dict, ratings_dict)):
        values = df.iloc[:, position].tolist()
        unknown = sorted({value for value in values if value not in mapping}, key=repr)
        if unknown:
            raise KeyError(
                f"column {position} contains values missing from its mapping: {unknown[:10]}"
            )
        # Whole-column assignment replaces the column, so it takes the dtype
        # of the codes rather than that of the original values.
        encoded[encoded.columns[position]] = [mapping[value] for value in values]

    return encoded

In [111]:
# Encode the test frame with the mappings obtained from the training frame.
bb = parse_test_data(bb, obs_dict, items_dict, ratings_dict)

In [112]:
# Display the encoded test frame.
bb

Unnamed: 0,0,1,2
0,0,7,2
1,1,2,4
2,2,0,1
3,3,7,2
4,4,1,1
...,...,...,...
4698,4698,6,0
4699,4699,2,0
4700,4700,6,1
4701,4701,5,4


In [18]:
# Largest value in column 0 of train (set in the setup cell).
p

4702