In [1]:
import pandas as pd
import os
import numpy as np
import seaborn as sns; sns.set_theme(color_codes=True)
from sklearn.decomposition import LatentDirichletAllocation, TruncatedSVD
from gensim.models import Word2Vec
import pickle

In [2]:
# Folder that holds the project data, and the interaction log inside it.
data_dir = '/opt/ml/project/data/'
csv_file_path = os.path.join(data_dir, 'total_data.csv')

# Read only the user id, the question id and the answer code;
# answerCode is parsed as int8.
df = pd.read_csv(
    csv_file_path,
    usecols=['userID', 'assessmentItemID', 'answerCode'],
    dtype={'answerCode': 'int8'},
)

In [3]:
def load_train_data(df):
    """Return ``df`` without the closing row of every consecutive ``userID`` run.

    A row is removed when the row directly after it carries a different
    ``userID``. The very last row of the frame is removed as well, because
    nothing follows it. Only neighbouring rows are compared, so the result is
    "everything except each user's last row" only when all rows of a user are
    stored next to each other.

    Args:
        df: DataFrame with a ``userID`` column.

    Returns:
        The remaining rows of ``df``, with their original index labels.
    """
    following_user = df['userID'].shift(-1)
    # True on the final row of each run (comparison with the trailing NaN is True too).
    closes_run = df['userID'].ne(following_user)
    return df.loc[~closes_run]


def get_svd(temp_df, SVD_DIM=5, random_state=0):
    """Compute truncated-SVD latent vectors for users and for questions.

    A user x question matrix is built whose cells hold the sum of
    ``answerCode`` over all rows of that (user, question) pair; pairs that do
    not occur are filled with 0. One ``TruncatedSVD`` is fitted on the matrix
    (rows = users) and another on its transpose (rows = questions).

    Args:
        temp_df: DataFrame with ``userID``, ``assessmentItemID`` and
            ``answerCode`` columns.
        SVD_DIM: Number of SVD components, i.e. the length of every vector.
        random_state: Seed forwarded to both ``TruncatedSVD`` fits. The
            estimator's default solver is randomized, so without a seed the
            saved vectors change from run to run; 0 matches the seed used by
            the LDA and Word2Vec features in this file.

    Returns:
        ``(svd_u_dict, svd_q_dict)``: dicts mapping each ``userID`` /
        ``assessmentItemID`` to its 1-D array of ``SVD_DIM`` components.
    """
    # Collapse repeated attempts: one summed answerCode per (user, question).
    answer_totals = (
        temp_df.groupby(['userID', 'assessmentItemID'])['answerCode']
        .sum()
        .reset_index()
    )
    # Note: the 0 fill makes "never attempted" indistinguishable from a sum of 0.
    user_by_item = answer_totals.pivot(
        index='userID', columns='assessmentItemID', values='answerCode'
    ).fillna(0)
    item_by_user = user_by_item.T

    question_latent = TruncatedSVD(
        n_components=SVD_DIM, random_state=random_state
    ).fit_transform(item_by_user)
    svd_q_dict = dict(zip(item_by_user.index, question_latent))

    user_latent = TruncatedSVD(
        n_components=SVD_DIM, random_state=random_state
    ).fit_transform(user_by_item)
    svd_u_dict = dict(zip(user_by_item.index, user_latent))

    return svd_u_dict, svd_q_dict


def _lda_question_topics(temp_df, answer_code, n_topics):
    """Fit LDA on the question x user count matrix of one answer code."""
    selected = temp_df[temp_df['answerCode'] == answer_code]
    counts = selected.groupby(['assessmentItemID', 'userID'])['answerCode'].count()
    # uint8 keeps the dense matrix small, but astype() wraps silently above
    # 255, so fall back to a wider type when a count does not fit.
    count_dtype = np.uint8 if counts.max() <= 255 else np.uint32
    question_by_user = counts.astype(count_dtype).unstack(fill_value=0)

    lda = LatentDirichletAllocation(n_components=n_topics, random_state=0)
    return lda.fit_transform(question_by_user)


def get_lda(temp_df, LDA_DIM=5):
    """Compute LDA topic distributions of questions from answer counts.

    Two independent models are fitted: one on the rows with
    ``answerCode == 1`` and one on the rows with ``answerCode == 0``. In each
    case the input is a question x user matrix whose cells count the matching
    rows of that (question, user) pair.

    Args:
        temp_df: DataFrame with ``userID``, ``assessmentItemID`` and
            ``answerCode`` columns.
        LDA_DIM: Number of LDA topics (columns of each returned array).

    Returns:
        ``(lda_correct_npy, lda_wrong_npy)``: two arrays with one row per
        question and ``LDA_DIM`` columns. Rows follow the sorted
        ``assessmentItemID`` values present in the respective subset, so a
        question with no rows for an answer code has no row in that array and
        the two arrays are not guaranteed to be row-aligned.
    """
    lda_correct_npy = _lda_question_topics(temp_df, 1, LDA_DIM)
    lda_wrong_npy = _lda_question_topics(temp_df, 0, LDA_DIM)
    return lda_correct_npy, lda_wrong_npy

def _question_vectors(temp_df, answer_code, emb_dim, window, seed, workers):
    """Train Word2Vec on per-user question lists of one answer code."""
    selected = temp_df[temp_df['answerCode'] == answer_code]
    # One "sentence" per user: that user's question ids in row order.
    per_user_items = selected.groupby('userID')['assessmentItemID'].apply(list)
    sentences = list(per_user_items.values)
    model = Word2Vec(
        sentences, vector_size=emb_dim, window=window, seed=seed, workers=workers
    )
    return {item: model.wv.get_vector(item) for item in model.wv.key_to_index}


def get_word2vec(temp_df, EMB_DIM=10, window=100, seed=0, workers=16):
    """Learn Word2Vec embeddings of questions from per-user answer sequences.

    Two independent models are trained: one on the rows with
    ``answerCode == 1`` and one on the rows with ``answerCode == 0``. Each
    user's list of ``assessmentItemID`` values (in row order) is one training
    sentence.

    Args:
        temp_df: DataFrame with ``userID``, ``assessmentItemID`` and
            ``answerCode`` columns.
        EMB_DIM: Length of every embedding vector.
        window: Word2Vec context window size.
        seed: Seed passed to Word2Vec.
        workers: Number of training threads. Per the gensim documentation a
            seed alone does not make training deterministic when more than
            one worker is used; pass ``workers=1`` for reproducible vectors.

    Returns:
        ``(correct_question2vec, wrong_question2vec)``: dicts mapping every
        question kept in the respective model's vocabulary to its vector.
        Questions the model leaves out of its vocabulary have no entry.
    """
    correct_question2vec = _question_vectors(temp_df, 1, EMB_DIM, window, seed, workers)
    wrong_question2vec = _question_vectors(temp_df, 0, EMB_DIM, window, seed, workers)
    return correct_question2vec, wrong_question2vec

In [6]:
# All features are fitted on the frame returned by load_train_data().
train_df = load_train_data(df)

# Every artifact below is written under ./assets; create the folder first so
# a missing directory does not abort the run once the fits have finished.
os.makedirs('./assets', exist_ok=True)

# Truncated-SVD latent vectors, one dict for questions and one for users.
svd_u_dict, svd_q_dict = get_svd(train_df)
with open('./assets/svd_question.pickle', 'wb') as f:
    pickle.dump(svd_q_dict, f)
with open('./assets/svd_user.pickle', 'wb') as f:
    pickle.dump(svd_u_dict, f)

# LDA topic arrays fitted on correct and on wrong answers.
lda_correct_npy, lda_wrong_npy = get_lda(train_df)
np.save('./assets/lda_correct_question.npy', lda_correct_npy)
np.save('./assets/lda_wrong_question.npy', lda_wrong_npy)

# Word2Vec question embeddings trained on correct and on wrong answers.
correct_question2vec, wrong_question2vec = get_word2vec(train_df)
with open('./assets/word2vec_correct_question.pickle', 'wb') as f:
    pickle.dump(correct_question2vec, f)
with open('./assets/word2vec_wrong_question.pickle', 'wb') as f:
    pickle.dump(wrong_question2vec, f)