In [1]:
import pandas as pd
import os
import numpy as np
import seaborn as sns; sns.set_theme(color_codes=True)
from sklearn.decomposition import LatentDirichletAllocation, TruncatedSVD
from gensim.models import Word2Vec
import pickle
from tqdm import tqdm

In [2]:
# Root folder of the project data; valid_update() below also reads from it.
data_dir = '/opt/ml/project/data/'
csv_file_path = os.path.join(data_dir, 'total_data.csv')

# Load only the four columns used by the feature builders, then order every
# user's interactions by Timestamp so "last N rows per user" is meaningful.
# NOTE(review): Timestamp is not parsed as a datetime here, so the ordering
# follows the raw column values — confirm the stored format sorts chronologically.
df = (
    pd.read_csv(
        csv_file_path,
        usecols=['userID', 'assessmentItemID', 'answerCode', 'Timestamp'],
        dtype={'answerCode': 'int8'},
    )
    .sort_values(by=['userID', 'Timestamp'])
    .reset_index(drop=True)
)

In [3]:
def get_svd(df, SVD_DIM=5, random_state=0):
    """Compute truncated-SVD latent vectors for users and for items.

    A userID x assessmentItemID matrix is built whose cells hold the sum of
    ``answerCode`` for that pair (0 where the pair never occurs). The matrix
    is factorised twice: once transposed (items as rows) and once as-is
    (users as rows).

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain ``userID``, ``assessmentItemID`` and ``answerCode``.
    SVD_DIM : int, default 5
        Number of SVD components kept for each embedding.
    random_state : int or None, default 0
        Seed forwarded to ``TruncatedSVD`` so repeated runs produce the same
        factors. Pass ``None`` to get the previous unseeded behaviour.

    Returns
    -------
    (svd_u_dict, svd_q_dict) : tuple of dict
        ``svd_u_dict`` maps each userID to its latent vector and
        ``svd_q_dict`` maps each assessmentItemID to its latent vector
        (each value is one row of the ``fit_transform`` output).
    """
    summed = df.groupby(['userID', 'assessmentItemID'])['answerCode'].sum().reset_index()
    user_item = summed.pivot(index='userID', columns='assessmentItemID', values='answerCode').fillna(0)
    # Transpose once and reuse it for both the fit and the index lookup.
    item_user = user_item.T

    # Item embeddings: rows of the transposed matrix are items.
    svd_question = TruncatedSVD(n_components=SVD_DIM, random_state=random_state).fit_transform(item_user)
    svd_q_dict = dict(zip(item_user.index, svd_question))

    # User embeddings: a separate, independently fitted decomposition.
    svd_user = TruncatedSVD(n_components=SVD_DIM, random_state=random_state).fit_transform(user_item)
    svd_u_dict = dict(zip(user_item.index, svd_user))

    return svd_u_dict, svd_q_dict


def get_lda(df, LDA_DIM=5):
    """Fit two LDA models on item x user count matrices.

    One model is fitted on rows where ``answerCode == 1`` and one on rows
    where ``answerCode == 0``. For each subset, the number of rows per
    (assessmentItemID, userID) pair is counted, cast to uint8 and unstacked
    into a matrix with items as rows and users as columns (0 where absent).

    Returns
    -------
    (lda_correct_npy, lda_wrong_npy)
        The ``fit_transform`` outputs of the two models. Only the arrays are
        returned; the assessmentItemID labels of their rows are not.
    """
    def _fit_for(answer_value):
        # Restrict to one answer outcome, then count interactions per pair.
        subset = df[df['answerCode'] == answer_value].copy()
        pair_counts = subset.groupby(['assessmentItemID', 'userID'])['answerCode'].count()
        item_user = pair_counts.astype(np.uint8).unstack(fill_value=0)

        topic_model = LatentDirichletAllocation(LDA_DIM, random_state=0)
        return topic_model.fit_transform(item_user)

    lda_correct_npy = _fit_for(1)
    lda_wrong_npy = _fit_for(0)

    return lda_correct_npy, lda_wrong_npy

def get_word2vec(df, EMB_DIM=10, workers=16):
    """Train Word2Vec item embeddings from correctly and wrongly answered items.

    For each answer outcome (1, then 0) every user's assessmentItemIDs are
    collected, in row order, into one "sentence"; a Word2Vec model is trained
    on those sentences and its vocabulary is exported as a plain dict.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain ``userID``, ``assessmentItemID`` and ``answerCode``.
    EMB_DIM : int, default 10
        Embedding size (``vector_size``).
    workers : int, default 16
        Number of training threads. NOTE(review): the gensim documentation
        states that ``seed`` alone does not give a reproducible run with
        several workers; pass ``workers=1`` when identical vectors across
        runs are required.

    Returns
    -------
    (correct_question2vec, wrong_question2vec) : tuple of dict
        Each maps an assessmentItemID in the model's vocabulary to its vector.
        Word2Vec is created with its default ``min_count``, so items that are
        too rare in a subset may be missing from the corresponding dict.
    """
    def _embed(answer_value):
        # No copy is needed: the filtered frame is only read, never modified.
        subset = df[df['answerCode'] == answer_value]
        sentences = list(subset.groupby('userID')['assessmentItemID'].apply(list).values)
        model = Word2Vec(sentences, vector_size=EMB_DIM, window=100, seed=0, workers=workers)
        return {item: model.wv.get_vector(item) for item in model.wv.key_to_index}

    correct_question2vec = _embed(1)
    wrong_question2vec = _embed(0)

    return correct_question2vec, wrong_question2vec

In [4]:
def valid_update(df, cv_num):
    """Flag the hold-out rows for fold ``cv_num`` and return ``df``.

    The user split is read from ``cv1_users.pickle`` inside the module-level
    ``data_dir``. Users that appear in neither ``train_users`` nor
    ``test_users`` are treated as validation users. Two boolean columns are
    (re)written on ``df`` in place:

    * ``cv_idx``   - True on each validation user's ``cv_num``-th row from the end.
    * ``is_valid`` - True on each validation user's last ``cv_num`` rows.

    NOTE(review): the pickle name is fixed to ``cv1`` regardless of
    ``cv_num`` — confirm that one shared user split for all folds is intended.
    NOTE(review): ``pickle.load`` must only be used on trusted files.
    """
    split_path = os.path.join(data_dir, 'cv1_users.pickle')
    with open(split_path, 'rb') as fh:
        split = pickle.load(fh)

    # Validation users are those listed in neither the train nor the test set.
    in_known_split = df['userID'].isin(split['train_users']) | df['userID'].isin(split['test_users'])
    valid_rows = df[~in_known_split]

    # as_index=False is kept on the nth() call so that .index yields row labels
    # of df (presumably needed on older pandas versions — verify before removing).
    target_labels = valid_rows.groupby('userID', as_index=False).nth(-cv_num).index
    holdout_labels = valid_rows.groupby('userID').tail(cv_num).index

    # Rebuild both flags from scratch so repeated calls do not accumulate marks.
    df['cv_idx'] = df.index.isin(target_labels)
    df['is_valid'] = df.index.isin(holdout_labels)

    return df

In [5]:
# Number of cross-validation folds to export.
cv_len = 5

for cv_num in range(1, 1+cv_len):
    # Refresh the 'is_valid' / 'cv_idx' flags for this fold.
    df = valid_update(df, cv_num=cv_num)

    cond1 = df['is_valid'] == True
    cond2 = df['answerCode'] == -1  # rows whose answerCode is the -1 placeholder
    test_df = df[cond1|cond2].copy()
    train_df = df[~(cond1|cond2)].copy()

    # Sanity line: train users, held-out rows, -1 rows, total rows.
    print(train_df.userID.nunique(), len(df[cond1]), len(df[cond2]), len(train_df) + len(test_df))

    # open(..., 'wb') does not create missing folders, so make sure the
    # per-fold output directory exists before writing into it.
    out_dir = f'./assets3/cv{cv_num}'
    os.makedirs(out_dir, exist_ok=True)

    # Features are fitted on the training rows only.
    svd_u_dict, svd_q_dict = get_svd(train_df)
    for file_name, payload in (
        ('svd_question.pickle', svd_q_dict),
        ('svd_user.pickle', svd_u_dict),
    ):
        with open(f'{out_dir}/{file_name}', 'wb') as f:
            pickle.dump(payload, f)

    correct_question2vec, wrong_question2vec = get_word2vec(train_df)
    for file_name, payload in (
        ('word2vec_correct_question.pickle', correct_question2vec),
        ('word2vec_wrong_question.pickle', wrong_question2vec),
    ):
        with open(f'{out_dir}/{file_name}', 'wb') as f:
            pickle.dump(payload, f)

7442 2007 744 2526700
7442 4014 744 2526700
7442 6021 744 2526700
7442 8028 744 2526700
7442 10035 744 2526700
