# Matrix Factorization

In [1]:
import pandas as pd
import numpy as np

import matplotlib.pyplot as plt
import seaborn as sns
from tqdm.auto import tqdm
import time
from datetime import datetime
from numpy.linalg import svd
from sklearn.decomposition import TruncatedSVD

import csv
import os
import sys

## Utils

In [2]:
# Directory that the dataset files below are read from and the outputs are written to.
DATA_PATH = '/opt/ml/input/data/train_dataset/'
# File names (joined onto DATA_PATH) of the train/valid/test splits.
# A name ending in 'csv' is loaded with pd.read_csv, anything else with pd.read_pickle.
TRAIN_DATA = 'cv_train_data_FE.pkl'
VALID_DATA = 'cv_valid_data_FE.pkl'
TEST_DATA = 'cv_test_data_FE.pkl'
# Number of TruncatedSVD components; also embedded in the output pickle names.
K=40

In [3]:
def drop_unnamed(df):
    """Return ``df`` without the ``"Unnamed: 0"`` column.

    When the column is absent the very same object is handed back untouched.
    When it is present a new frame without it is returned (the input is not
    modified) and a short notice is printed.
    """
    if "Unnamed: 0" not in df.columns:
        return df

    trimmed = df.drop(columns=['Unnamed: 0'])
    print("drop Unnamed: 0 column")
    return trimmed

In [4]:
def convert_time(s):
    """Turn a ``'YYYY-MM-DD HH:MM:SS'`` value into integer epoch seconds.

    ``s`` is stringified first, so anything whose ``str()`` has that layout is
    accepted. The wall-clock time is interpreted in the machine's local
    timezone (``time.mktime``).
    """
    parsed = datetime.strptime(str(s), '%Y-%m-%d %H:%M:%S')
    return int(time.mktime(parsed.timetuple()))

In [5]:
def csv_to_pkl():
    """Pickle the loaded frames whose configured source file name ends in ``csv``.

    Works on notebook globals: for each of ``TRAIN_DATA`` / ``VALID_DATA`` /
    ``TEST_DATA`` that ends in ``csv``, the matching ``train_df`` / ``valid_df`` /
    ``test_df`` is written under ``DATA_PATH`` using a fixed
    ``cv_*_data_FE.pkl`` file name. Nothing is returned.
    """
    if TRAIN_DATA.endswith('csv'):
        train_df.to_pickle(os.path.join(DATA_PATH, 'cv_train_data_FE.pkl'))
    if VALID_DATA.endswith('csv'):
        valid_df.to_pickle(os.path.join(DATA_PATH, 'cv_valid_data_FE.pkl'))
    if TEST_DATA.endswith('csv'):
        test_df.to_pickle(os.path.join(DATA_PATH, 'cv_test_data_FE.pkl'))
# csv_to_pkl()

In [6]:
def cache_time(df, name):
    """Convert an object-dtype ``Timestamp`` column to int seconds and pickle the frame.

    Only acts when ``df['Timestamp']`` has dtype ``object``; any other dtype
    leaves ``df`` untouched and nothing is written. On conversion the column is
    replaced in place using ``convert_time`` and the whole frame is saved as
    ``<DATA_PATH>/<name>.pkl``.

    Returns the same ``df`` object in both cases.
    """
    stamps = df['Timestamp']
    if stamps.dtype != object:
        return df

    print(stamps.dtype, stamps.head(1))
    print("Processing Timestamp...")
    df['Timestamp'] = stamps.apply(convert_time)
    print("Processing Timestamp done")

    target = os.path.join(DATA_PATH, f'{name}.pkl')
    df.to_pickle(target)
    return df
# train_df = cache_time(train_df, 'cv_train_data_FE')
# valid_df = cache_time(valid_df, 'cv_valid_data_FE')
# test_df = cache_time(test_df, 'cv_test_data_FE')

## Get Data and Concat Datasets

In [7]:
# Load the three splits from disk.
# Reader choice per file: read_pickle unless the configured name ends in 'csv'.
get_train_data = pd.read_pickle if TRAIN_DATA[-3:] != 'csv' else pd.read_csv
get_valid_data = pd.read_pickle if VALID_DATA[-3:] != 'csv' else pd.read_csv
get_test_data = pd.read_pickle if TEST_DATA[-3:] != 'csv' else pd.read_csv

train_path = os.path.join(DATA_PATH, TRAIN_DATA)
valid_path = os.path.join(DATA_PATH, VALID_DATA)
test_path = os.path.join(DATA_PATH, TEST_DATA)

train_df = get_train_data(train_path)
valid_df = get_valid_data(valid_path)
test_df = get_test_data(test_path)

In [8]:
# Raw (pre-split) csv files, loaded so their row counts can be compared with the splits.
original_train, original_test = (
    pd.read_csv(os.path.join(DATA_PATH, csv_name))
    for csv_name in ('train_data.csv', 'test_data.csv')
)

In [9]:
# Row counts of the raw files, and whether the splits add up to them:
# train + valid should equal the raw train file, test should equal the raw test file.
n_original_train = len(original_train)
n_original_test = len(original_test)
(
    n_original_train,
    n_original_test,
    n_original_train == len(train_df) + len(valid_df),
    n_original_test == len(test_df),
)

(2266586, 260114, True, True)

In [10]:
# Show the train columns and an element-wise comparison against the valid columns.
train_columns = train_df.columns
train_columns, train_columns == valid_df.columns

(Index(['userID', 'assessmentItemID', 'testId', 'answerCode', 'Timestamp',
        'KnowledgeTag', 'hour', 'dow', 'elapsed', 'grade', 'mid',
        'problem_number', 'test_mean', 'test_sum', 'tag_mean', 'tag_sum',
        'ass_mean', 'ass_sum', 'prb_mean', 'prb_sum', 'hour_mean', 'hour_sum',
        'dow_mean', 'dow_sum', 'tag_elp', 'tag_elp_o', 'tag_elp_x', 'ass_elp',
        'ass_elp_o', 'ass_elp_x', 'prb_elp', 'prb_elp_o', 'prb_elp_x',
        'user_correct_answer', 'user_total_answer', 'user_acc', 'Grade_o',
        'GradeCount', 'GradeAcc', 'GradeElp', 'GradeMElp', 'problem_count',
        'tag_count', 'RepeatedTime', 'prior_KnowledgeTag_frequency',
        'problem_position', 'solve_order', 'retest', 'solved_disorder',
        'last_problem', 'answer_delta', 'tag_delta', 'test_delta',
        'assess_delta', 'left_asymptote', 'elo_prob', 'cum_correct'],
       dtype='object'),
 array([ True,  True,  True,  True,  True,  True,  True,  True,  True,
         True,  True,  True,  Tr

In [11]:
# Display the test split's column index.
test_df.columns

Index(['userID', 'assessmentItemID', 'testId', 'answerCode', 'Timestamp',
       'KnowledgeTag', 'hour', 'dow', 'elapsed', 'grade', 'mid',
       'problem_number', 'test_mean', 'test_sum', 'tag_mean', 'tag_sum',
       'ass_mean', 'ass_sum', 'prb_mean', 'prb_sum', 'hour_mean', 'hour_sum',
       'dow_mean', 'dow_sum', 'tag_elp', 'tag_elp_o', 'tag_elp_x', 'ass_elp',
       'ass_elp_o', 'ass_elp_x', 'prb_elp', 'prb_elp_o', 'prb_elp_x',
       'user_correct_answer', 'user_total_answer', 'user_acc', 'Grade_o',
       'GradeCount', 'GradeAcc', 'GradeElp', 'GradeMElp', 'problem_count',
       'tag_count', 'RepeatedTime', 'prior_KnowledgeTag_frequency',
       'problem_position', 'solve_order', 'retest', 'solved_disorder',
       'last_problem', 'answer_delta', 'tag_delta', 'test_delta',
       'assess_delta', 'left_asymptote', 'elo_prob'],
      dtype='object')

In [12]:
# Remove the columns listed below from every split.
# errors='ignore' lets a split that lacks one of them pass through unchanged.
unavailable = ['cum_correct', 'answer_delta', 'tag_delta', 'test_delta', 'assess_delta']
train_df, valid_df, test_df = (
    split.drop(columns=unavailable, errors='ignore')
    for split in (train_df, valid_df, test_df)
)

In [13]:
# Check that all three datasets have the same columns in the same order.
# The assertions stop the notebook on a mismatch instead of relying on a visual scan
# of the boolean arrays below; frames with different columns would otherwise be
# concatenated later with NaN-filled gaps.
assert train_df.columns.equals(valid_df.columns), "train and valid columns differ"
assert valid_df.columns.equals(test_df.columns), "valid and test columns differ"
train_df.columns == valid_df.columns, valid_df.columns == test_df.columns, test_df.columns == train_df.columns

(array([ True,  True,  True,  True,  True,  True,  True,  True,  True,
         True,  True,  True,  True,  True,  True,  True,  True,  True,
         True,  True,  True,  True,  True,  True,  True,  True,  True,
         True,  True,  True,  True,  True,  True,  True,  True,  True,
         True,  True,  True,  True,  True,  True,  True,  True,  True,
         True,  True,  True,  True,  True,  True,  True]),
 array([ True,  True,  True,  True,  True,  True,  True,  True,  True,
         True,  True,  True,  True,  True,  True,  True,  True,  True,
         True,  True,  True,  True,  True,  True,  True,  True,  True,
         True,  True,  True,  True,  True,  True,  True,  True,  True,
         True,  True,  True,  True,  True,  True,  True,  True,  True,
         True,  True,  True,  True,  True,  True,  True]),
 array([ True,  True,  True,  True,  True,  True,  True,  True,  True,
         True,  True,  True,  True,  True,  True,  True,  True,  True,
         True,  True,  True,  

In [14]:
# Tag every split with helper columns used to separate them again later:
#   table    -> 0: train, 1: valid, 2: test
#   temp_idx -> the row's index label inside its own split
for table_id, split in enumerate((train_df, valid_df, test_df)):
    split['table'] = table_id
    split['temp_idx'] = split.index

In [15]:
# Separate the test rows whose answerCode is -1 (masked) from all other test rows,
# then show how many masked rows there are.
is_masked = test_df['answerCode'].eq(-1)
masked_test_df = test_df[is_masked]
unmasked_test_df = test_df[~is_masked]
len(masked_test_df)

744

In [16]:
# Stack train, valid and the unmasked test rows into one frame with a fresh 0..n-1 index.
answered_frames = [train_df, valid_df, unmasked_test_df]
whole_df = pd.concat(answered_frames, ignore_index=True)

In [17]:
# Is SVD available? An unknown userID or assessmentItemID in the test dataset will cause problems.
# print("problem? : ")
# masked_userID = masked_test_df['userID'].unique()
# masked_assessmentItemID = masked_test_df['assessmentItemID'].unique()
# if any(user not in whole_df['userID'].unique() for user in tqdm(masked_userID)) or any(user not in whole_df['assessmentItemID'].unique() for user in tqdm(masked_assessmentItemID)) : 
#     raise RuntimeError(f'TSVD Feature Unavailable.')
# print("nope")

In [18]:
# Concat all df as one, with the masked test rows placed last.
# The frame is rebuilt from the four source frames instead of appending to whole_df
# itself, so re-running this cell cannot add the masked rows a second time.
whole_df = pd.concat([train_df, valid_df, unmasked_test_df, masked_test_df], ignore_index=True)

In [19]:
# Total row count, and whether it equals the sum of the three splits.
total_rows = len(whole_df)
total_rows, total_rows == len(train_df) + len(valid_df) + len(test_df)

(2526700, True)

In [20]:
# Number of distinct users and distinct items in the combined frame.
tuple(len(whole_df[key].unique()) for key in ('userID', 'assessmentItemID'))

(7442, 9454)

## Truncated SVD
- 가장 중요한 K개의 잠재요소만 가져와서 Feature로 삼는 방법

In [21]:
def save_TSVD(k, df, random_state=None):
    """Append ``k`` item latent-factor columns from a truncated SVD to ``df``.

    A user x item matrix is built from the answered rows (``answerCode != -1``):
    each cell is the sum of ``answerCode`` over that (userID, assessmentItemID)
    pair, and pairs without an answered row are filled with 0. ``TruncatedSVD``
    is fitted on the matrix (user side, only reported via print) and on its
    transpose (item side). The item-side projection is written to ``df`` as
    ``assessmentItemID_lf1`` .. ``assessmentItemID_lf{k}``.

    Args:
        k: Number of SVD components, i.e. number of columns added.
        df: DataFrame with ``userID``, ``assessmentItemID`` and ``answerCode``
            columns. It is modified in place.
        random_state: Forwarded to ``TruncatedSVD``. The default ``None`` keeps
            the unseeded behaviour; pass an int for repeatable factors.

    Returns:
        The same ``df`` object, with the latent-factor columns added.

    Raises:
        ValueError: If ``df`` contains an ``assessmentItemID`` that has no
            answered row, because no latent factors exist for such an item.
    """
    # Create the user x item pivot table from answered rows only.
    print("Create Pivot Table")
    answered = df[df['answerCode'] != -1]
    ans_df = answered.groupby(['userID', 'assessmentItemID']).answerCode.sum().reset_index()
    pivot_df = ans_df.pivot(index='userID', columns='assessmentItemID', values='answerCode').fillna(0)

    # Fit the SVD. fit() followed by transform() is kept as two calls (rather than
    # fit_transform) so the projection is computed exactly as before.
    print("fit SVD")
    svd2 = TruncatedSVD(n_components=k, random_state=random_state)
    svd2.fit(pivot_df)
    user_hid = svd2.transform(pivot_df)
    # User-side factors are only reported; they are not added to df.
    print("유저 잠재요인 : ", len(user_hid), len(user_hid[0]))

    item_matrix = pivot_df.T
    svd2.fit(item_matrix)
    # Row i of problems_hid belongs to pivot_df.columns[i].
    problems_hid = svd2.transform(item_matrix)
    print("문제 잠재요인 : ", len(problems_hid), len(problems_hid[0]))

    # Look up every row's item position in one vectorised call; -1 marks an item
    # that never appears in an answered row.
    print("assessmentItemID mapping")
    item_pos = pivot_df.columns.get_indexer(df['assessmentItemID'])
    unseen = item_pos < 0
    if unseen.any():
        missing_items = df.loc[unseen, 'assessmentItemID'].unique()
        raise ValueError(
            f"{len(missing_items)} assessmentItemID value(s) have no answered rows, "
            f"so no latent factors exist for them (e.g. {list(missing_items[:5])})"
        )
    problem_lf = problems_hid[item_pos]

    # Add the features (the progress bar only covers building the k column names).
    print("Add feature")
    df[[f'assessmentItemID_lf{i + 1}' for i in tqdm(range(k))]] = problem_lf
    return df



In [22]:
# truncatedSVD
# save_TSVD assigns the latent-factor columns onto the frame it is given and returns
# that same frame, so new_df and whole_df refer to the same augmented DataFrame.
new_df = save_TSVD(K, whole_df)
new_df

Create Pivot Table
fit SVD
유저 잠재요인 :  7442 40
문제 잠재요인 :  9454 40
assessmentItemID mapping
Add feature


  0%|          | 0/40 [00:00<?, ?it/s]

Unnamed: 0,userID,assessmentItemID,testId,answerCode,Timestamp,KnowledgeTag,hour,dow,elapsed,grade,...,assessmentItemID_lf31,assessmentItemID_lf32,assessmentItemID_lf33,assessmentItemID_lf34,assessmentItemID_lf35,assessmentItemID_lf36,assessmentItemID_lf37,assessmentItemID_lf38,assessmentItemID_lf39,assessmentItemID_lf40
0,0,A060001001,A060000001,1,1585009031,7224,0,1,0.0,6,...,0.187618,0.105149,0.089573,0.112876,0.034190,1.387534,-0.477091,0.010622,-0.637065,-1.116152
1,0,A060001002,A060000001,1,1585009034,7225,0,1,3.0,6,...,0.189153,0.069678,0.095991,0.108613,0.036473,1.353113,-0.455818,0.008808,-0.701690,-1.191159
2,0,A060001003,A060000001,1,1585009042,7225,0,1,8.0,6,...,0.218003,0.211369,0.035524,0.105588,0.023566,1.401665,-0.460065,0.097230,-0.542106,-1.003339
3,0,A060001004,A060000001,1,1585009049,7225,0,1,7.0,6,...,0.171232,0.112103,0.011979,0.137871,0.021363,1.368139,-0.464908,0.005286,-0.630683,-1.098972
4,0,A060001005,A060000001,1,1585009056,7225,0,1,7.0,6,...,0.117933,-0.010957,0.008431,0.148195,0.035449,1.288530,-0.435120,-0.023752,-0.634095,-1.050310
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2526695,7395,A040122005,A040000122,-1,1599530720,10615,2,1,2.0,4,...,-0.050888,-0.443437,-0.862972,-2.402245,0.919398,-0.045741,-0.734190,-0.077270,-0.013150,-0.664047
2526696,7404,A030111005,A030000111,-1,1602582558,7636,9,1,107.0,3,...,-0.167530,0.367423,-0.258267,0.378479,-0.139851,0.137944,-0.224567,-0.084782,0.717462,0.150860
2526697,7416,A050193004,A050000193,-1,1601779481,10402,2,6,24.0,5,...,0.567810,0.538026,-0.776257,-0.397721,-1.008641,0.142540,0.355269,6.712777,0.723459,-0.708221
2526698,7417,A050193004,A050000193,-1,1599397755,10402,13,6,21.0,5,...,0.567810,0.538026,-0.776257,-0.397721,-1.008641,0.142540,0.355269,6.712777,0.723459,-0.708221


## Split dataframe and Save it.

In [23]:
def _restore_split(frame, table_id):
    """Return the rows of ``frame`` tagged ``table_id``, ordered by ``temp_idx``, without the helper columns."""
    return (
        frame[frame['table'] == table_id]
        .sort_values(by='temp_idx')
        .drop(columns=['table', 'temp_idx'])
    )


# Split the augmented frame back into the source tables (0: train, 1: valid, 2: test).
# new_df is the frame returned by save_TSVD, i.e. the one carrying the new columns.
train_df = _restore_split(new_df, 0)
valid_df = _restore_split(new_df, 1)
test_df = _restore_split(new_df, 2)

# DataFrame.to_pickle returns None, so saving is done separately from the assignments;
# chaining it onto them would leave train_df / valid_df / test_df bound to None.
train_df.to_pickle(os.path.join(DATA_PATH, f'cv_train_data_FE_MF{K}.pkl'))
valid_df.to_pickle(os.path.join(DATA_PATH, f'cv_valid_data_FE_MF{K}.pkl'))
test_df.to_pickle(os.path.join(DATA_PATH, f'cv_test_data_FE_MF{K}.pkl'))



## SVD
Memory overflow! Unavailable!  

In [24]:
# Untruncated (full) SVD.
# The raise below deliberately stops this cell, so the svd() call after it is never reached.
# NOTE(review): test_pivot_df is not defined in the visible cells — confirm where it
# should come from before re-enabling this section.
raise RuntimeError("Unavailable!")
U, sigma, VT = svd(np.array(test_pivot_df))

RuntimeError: Unavailable!

In [None]:
# Build a (len(U), len(VT)) zero matrix and write diag(sigma) into its top-left block.
# NOTE(review): the slice bounds use len(U) only, which presumably assumes
# len(U) <= len(VT) — confirm against the shape of the matrix passed to svd().
smat= np.zeros((len(U),len(VT)))
smat[:len(U),:len(U)] = np.diag(sigma)


In [None]:
# Row labels (users) and column labels (problems) of the pivot table.
# NOTE(review): test_pivot_df is not defined in the visible cells — confirm its origin.
users = test_pivot_df.index.values
problems = test_pivot_df.columns.values

# User latent factors - U
user_latent_factor = {}

# Scale U by the singular values.
# U is overwritten here, so this cell should not be run twice without re-running svd().
U = U @ np.diag(sigma)

# Map each user label to its row of the scaled U.
for i, user in enumerate(users):
    user_latent_factor[user] = U[i]

# Problem (item) latent factors - V
problems_latent_factor = {}

# Scale VT using the rectangular sigma matrix smat; VT is overwritten as well.
# VT = np.diag(sigma) @ VT
VT = smat @ VT

# Map each problem label to its row of the transposed, scaled VT.
for i, problem in enumerate(problems):
    problems_latent_factor[problem] = VT.T[i]

In [9]:
# Map the problem latent factors onto each row via its assessmentItemID.
# NOTE(review): test_svd_df is not defined in the visible cells — confirm its origin.
nested_problems_lf = test_svd_df['assessmentItemID'].map(problems_latent_factor).values


TypeError: 'TruncatedSVD' object is not callable

In [None]:
# Convert the nested numpy array into one 2-D numpy array (one row per record).
# reshape needs the integer length of a factor vector, not the vector itself.
problem_lf = np.concatenate(nested_problems_lf, 0).reshape(-1, len(nested_problems_lf[0]))


In [None]:

# Add the problem latent factors to the existing pandas DataFrame, creating one
# column per latent dimension present in problem_lf rather than a fixed three.
test_fe_df[[f'assessmentItemID_lf{i + 1}' for i in range(problem_lf.shape[1])]] = problem_lf

In [None]:
# Display the frame that received the latent-factor columns.
test_fe_df