<a href="https://colab.research.google.com/github/bsallesp/AnomalyDetection/blob/main/M15_Bruno_Salles_Pereira.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

## Preprocessing:

### Unpacking resources and files:

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from google.colab import drive

In [None]:
drive.mount('/drive')
def getData():
    r_cols = ['user_id', 'movie_id', 'rating', 'unix_timestamp']
    ratings = pd.read_csv('/drive/MyDrive/ColabNotebooks/Mentorama/M15/u.data', sep='\t', names=r_cols,
                          encoding='latin-1')
    m_cols = ['movie_id', 'title', 'release_date', 'video_release_date', 'imdb_url']
    movies = pd.read_csv('/drive/MyDrive/ColabNotebooks/Mentorama/M15/u.item', sep='|', names=m_cols, usecols=range(5),
                         encoding='latin-1')
    movie_ratings = pd.merge(movies, ratings)
    temp = movie_ratings[['movie_id', 'user_id', 'rating']].copy()
    temp = temp.pivot_table(columns='movie_id', index='user_id', values='rating').copy()
    temp.index = ['User_'+str(int(i)) for i in temp.index]
    temp.columns = ['Filme_'+str(int(i)) for i in temp.columns]
    qtd_cols = 80
    R = temp.iloc[:, :qtd_cols]
    l=[]
    for i in range(1, R.shape[0]+1):
        if R.iloc[i-1, ].isnull().sum() >= (qtd_cols - 10):
            l.append(i)
    R = R.drop(["User_"+str(r) for r in l])
    R.index = ['User_'+str(int(i)) for i in range(R.shape[0])]
    return R

Drive already mounted at /drive; to attempt to forcibly remount, call drive.mount("/drive", force_remount=True).


### Class MatrixFactorization:

In [None]:
import time

class MatrixFactorization():
    
    def __init__(self, dataframe, K, steps, alpha, beta):
        self.df = dataframe
        self.K = K
        self.steps = steps
        self.alpha = alpha
        self.beta = beta
        
    def fit(self, print_ = False):
        t0 = time.time()
        
        R = self.df.values
        N, M = R.shape
        
        #inicio aleatorio
        P = np.random.rand(N,self.K)
        Q = np.random.rand(self.K,M)
        
        lista_erro_step = []
        
        #loop
        for step in range(self.steps):
            
            mse_total_step = 0
            #varrendo todas as entradas da matriz R
            for i in range(len(R)):
                for j in range(len(R[i])):
                    #validando se o valor associado está preenchido
                    if R[i][j] > 0:

                        #calculando o erro:
                        eij = R[i][j] - np.dot(P[i,:],Q[:,j])
                        mse_total_step += (eij)**2
                        #alterando os valores
                        for k in range(self.K):
                            P[i][k] = P[i][k] + self.alpha * ( 2 * eij * Q[k][j] - self.beta * P[i][k])
                            Q[k][j] = Q[k][j] + self.alpha * ( 2 * eij * P[i][k] - self.beta * Q[k][j])
                            
            lista_erro_step.append(mse_total_step)
            
        self.P = P
        self.Q = Q
        self.lista_erro_step = lista_erro_step
        t1 = time.time()
        if print_== True:
          print("Fatoração concluída. Tempo aproximado:", int((t1-t0)/60)+1, 'minuto(s).')
        
    def predict(self):
        return self.P.dot(self.Q)
    
    def print_MSE_steps(self):
        plt.figure(figsize=[15,6])
        plt.title("Custo total por Step", fontsize = 16, fontweight = 'bold')
        plt.xlabel("Step", fontsize = 14, fontweight = 'bold')
        plt.ylabel("Erro", fontsize = 14, fontweight = 'bold')
        plt.plot(range(1, 1+self.steps), self.lista_erro_step, c = 'blue', lw = 2)
        plt.grid()
        plt.show()

### grid_search / make_chess:

In [None]:
def grid_search(train, test, param_grid, print_ = False, clean_ = False):
  from itertools import product
  from IPython.display import clear_output 
  results = []
  count = 0

  for param in product(*param_grid):
    
    fat = MatrixFactorization(dataframe = pd.DataFrame(train), K = param[0], steps = param[1], alpha = param[2], beta = param[3])
    fat.fit()
    mse = make_chess(val, fat.predict(), mse_ = True)
    results.append([param, mse])
    if print_ == True:
      print(f"Lap {count + 1} / {total_loops}:")
      print(f"K = {param[0]}")
      print(f"steps = {param[1]}")
      print(f"alpha = {param[2]}")
      print(f"beta = {param[3]}")
      print(f"MSE = {mse}")
      print("_" * 75)
      count = count + 1

    if clean_ == True:
      clear_output()

  
  
  return results

In [None]:
# Select only pred position cells, and return new dataframe with pred position only:
def make_chess(df_true, df_pred, print_ = False, mse_ = True, dfs_ = False):
  # new df instances
  clean_df_true = np.array(df_true.copy())
  clean_df_pred = np.array(df_pred.copy())
  count = 0
  
  for i in range(0, len(df_true[0])):
    for j in range(0, len(df_true[1])):
      if df_true[i][j] == 0:
        clean_df_pred[i][j] = 0
        clean_df_true[i][j] = 0
        count = count + 1

  if print_ == True:
    total_count_pred = (df_true.shape[0] * df_true.shape[1]) - count
    total_count_original = df_true.shape[0] * df_true.shape[1]
    print("Total de instancias no dataset original: ", total_count_original)
    print("Total de instancias zeradas: ", count)
    print("Total de predicoes em pred e val: ", total_count_pred)
    print("% de predicoes em pred e val em comparacao ao dataset original:", round(((total_count_pred / total_count_original * 100)-100)*-1, 2),"%")
  
  if dfs_ == True:
    return clean_df_true, clean_df_pred

  if mse_ == True:
    from sklearn import metrics
    mse = metrics.mean_absolute_error(clean_df_true, clean_df_pred)
    return mse

### train_test_split:

In [None]:
def train_test_split(ratings, qtd):
    test = np.zeros(ratings.shape)
    train = ratings.copy()
    for user in range(ratings.shape[0]):
        test_ratings = np.random.choice(ratings[user, :].nonzero()[0], 
                                        size=qtd, 
                                        replace=False)
        train[user, test_ratings] = 0.
        test[user, test_ratings] = ratings[user, test_ratings]
        
    return train, test

### Data splitting:

In [None]:
R = getData()
print(R.shape)
R.sample(3)

(367, 80)


Unnamed: 0,Filme_1,Filme_2,Filme_3,Filme_4,Filme_5,Filme_6,Filme_7,Filme_8,Filme_9,Filme_10,Filme_11,Filme_12,Filme_13,Filme_14,Filme_15,Filme_16,Filme_17,Filme_18,Filme_19,Filme_20,Filme_21,Filme_22,Filme_23,Filme_24,Filme_25,Filme_26,Filme_27,Filme_28,Filme_29,Filme_30,Filme_31,Filme_32,Filme_33,Filme_34,Filme_35,Filme_36,Filme_37,Filme_38,Filme_39,Filme_40,Filme_41,Filme_42,Filme_43,Filme_44,Filme_45,Filme_46,Filme_47,Filme_48,Filme_49,Filme_50,Filme_51,Filme_52,Filme_53,Filme_54,Filme_55,Filme_56,Filme_57,Filme_58,Filme_59,Filme_60,Filme_61,Filme_62,Filme_63,Filme_64,Filme_65,Filme_66,Filme_67,Filme_68,Filme_69,Filme_70,Filme_71,Filme_72,Filme_73,Filme_74,Filme_75,Filme_76,Filme_77,Filme_78,Filme_79,Filme_80
User_299,4.0,,1.0,,,,,4.0,2.0,,,,,,,,,,,,,3.0,,,,,,,,4.0,,,,,,,,,,,,,,,,,,,,4.0,,,3.0,,5.0,,,,,,,,3.0,,,4.0,,,,,3.0,,,,,,,,4.0,
User_316,4.0,3.0,2.0,,,2.0,,,,,,5.0,,3.0,,,4.0,,,,,,,3.0,,,,3.0,4.0,,,,,,,,,,,,,,,,4.0,,4.0,,,5.0,,,,,,5.0,,,,,,,,,,,,,,2.0,,,,,,3.0,,,3.0,
User_341,5.0,,,4.0,,,4.0,5.0,,,4.0,,,,5.0,,,,,,2.0,,,,2.0,,,5.0,,,,,2.0,,,,,,,,,,,,,,,,,5.0,,,,,,4.0,,,,,,,,,,4.0,,,5.0,3.0,5.0,,,,,,,,5.0,


In [None]:
ratings = R.fillna(0).values
ratings

array([[5., 3., 4., ..., 1., 4., 4.],
       [4., 3., 0., ..., 0., 3., 2.],
       [4., 0., 0., ..., 0., 3., 0.],
       ...,
       [4., 0., 4., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 5., 0., ..., 0., 5., 2.]])

In [None]:
train, test = train_test_split(ratings, qtd = 2)
train, val = train_test_split(train, qtd = 2)

In [None]:
train

array([[5., 3., 4., ..., 1., 4., 4.],
       [4., 0., 0., ..., 0., 3., 2.],
       [4., 0., 0., ..., 0., 3., 0.],
       ...,
       [4., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 5., 0., ..., 0., 0., 2.]])

In [None]:
val

array([[0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       ...,
       [0., 0., 4., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 5., 0.]])

In [None]:
test

array([[0., 0., 0., ..., 0., 0., 0.],
       [0., 3., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       ...,
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.]])

## Model training:

### Final Results / best params:

#### Params:

In [None]:
# Set de parametros:
K = np.arange(1, 5, 1)
steps = np.arange(1, 5, 2)
alpha = np.arange(0.0001, 0.0005, 0.0001)
beta = np.arange(0.0001, 0.0005, 0.0001)
total_loops = len(K) * len(steps) * len(alpha) * len(beta)
param_grid = {}
param_grid = K, steps, alpha, beta
print(len(steps))
print(len(alpha))
print(len(beta))
print(total_loops)
print(param_grid)

2
4
4
128
(array([1, 2, 3, 4]), array([1, 3]), array([0.0001, 0.0002, 0.0003, 0.0004]), array([0.0001, 0.0002, 0.0003, 0.0004]))


#### Grid search run:

In [None]:
results = grid_search(train, val, param_grid)

## Best results:

In [None]:
results1 = results.copy()

In [None]:
results1 = pd.DataFrame(results1, columns=['params','mse'])

In [None]:
results1.sort_values(by='mse').head(10)

Unnamed: 0,params,mse
3,"(1, 1, 0.0001, 0.0004)",0.264825
1,"(1, 1, 0.0001, 0.0002)",0.276323
5,"(1, 1, 0.0002, 0.0002)",0.282097
9,"(1, 1, 0.00030000000000000003, 0.0002)",0.29115
2,"(1, 1, 0.0001, 0.00030000000000000003)",0.294133
10,"(1, 1, 0.00030000000000000003, 0.0003000000000...",0.297029
6,"(1, 1, 0.0002, 0.00030000000000000003)",0.300163
0,"(1, 1, 0.0001, 0.0001)",0.300311
12,"(1, 1, 0.0004, 0.0001)",0.310252
19,"(1, 3, 0.0001, 0.0004)",0.310861


In [None]:
results2 = results1.sort_values(by='mse')

In [None]:
results2.mse[1]

0.27632346207146236

In [None]:
fat = MatrixFactorization(pd.DataFrame(train), 1, 1, 0.0001, 0.0002)
fat.fit()
pred_final = fat.predict()

In [None]:
final_mse = make_chess(test, pred_final, print_=True)

Total de instancias no dataset original:  29360
Total de instancias zeradas:  6240
Total de predicoes em pred e val:  23120
% de predicoes em pred e val em comparacao ao dataset original: 21.25 %


## Final result:

In [None]:
results2.mse[1], final_mse

(0.27632346207146236, 0.285173715730375)