In [1]:
import pandas as pd
import os
import numpy as np
import seaborn as sns; sns.set_theme(color_codes=True)
from sklearn.decomposition import LatentDirichletAllocation, TruncatedSVD
from gensim.models import Word2Vec
from numpy.linalg import svd
import pickle

In [2]:
# Adjust this path to match your own environment.
data_dir = '/opt/ml/project/data/'
# The data file has to be downloaded from the competition homepage.
csv_file_path = os.path.join(data_dir, 'total_data.csv')

# Read the interaction log and order it per user by time, with a fresh
# 0..n-1 index so later positional logic (e.g. shift) follows that order.
df = (
    pd.read_csv(csv_file_path, parse_dates=['Timestamp'])
    .sort_values(by=['userID', 'Timestamp'])
    .reset_index(drop=True)
)

In [3]:
# Load the pickled user split that lives in the same data directory.
# NOTE: pickle.load can execute arbitrary code; only load files you trust.
users_file_path = os.path.join(data_dir, 'cv1_users.pickle')
with open(users_file_path, mode='rb') as f:
    users = pickle.load(f)

# The pickle is indexed by these two keys.
train_users = users['train_users']
test_users = users['test_users']

In [5]:
# Rows with answerCode == -1 are removed first
# (presumably the unlabeled rows to be predicted - TODO confirm).
last_test = df['answerCode'] == -1
temp_df = df.loc[~last_test].copy()

# Users that are not part of train_users.
valid_cond = ~temp_df['userID'].isin(train_users)
# True on the final remaining row of each user: the next row belongs to a
# different user (or there is no next row at all).
last_cond = temp_df['userID'].ne(temp_df['userID'].shift(-1))
last_valid = valid_cond & last_cond

# Drop the last interaction of every non-train user so it is not used when
# fitting the representations below.
temp_df = temp_df.loc[~last_valid]

In [7]:
# Number of components (n_components) used for the TruncatedSVD item vectors.
SVD_DIM = 5

In [None]:
# Build a user x item matrix whose cells hold the summed answerCode for that
# (user, item) pair. Pairs that never occur are filled with 0, so "never
# attempted" and "attempted but never correct" are indistinguishable here.
correct_svd_df = temp_df.groupby(['userID', 'assessmentItemID']).answerCode.sum().reset_index()
pivot_svd_df = correct_svd_df.pivot(index='userID', columns='assessmentItemID', values='answerCode').fillna(0)

# Transpose once (rows = items, columns = users) instead of recomputing the
# transpose for fit, transform and the index lookup.
item_user_mat = pivot_svd_df.T

# random_state pins TruncatedSVD's randomized solver so the saved vectors are
# reproducible across runs, matching the seeded LDA / Word2Vec cells.
# NOTE: this rebinding shadows `svd` imported from numpy.linalg at the top.
svd = TruncatedSVD(n_components=SVD_DIM, random_state=0)
svd.fit(item_user_mat)
transformed = svd.transform(item_user_mat)

# Map each assessmentItemID to its SVD_DIM-dimensional latent vector.
svd_dict = dict(zip(item_user_mat.index, transformed))

# Make sure the output directory exists before writing into it.
os.makedirs('./assets', exist_ok=True)
with open('./assets/svd.pickle', 'wb') as f:
    pickle.dump(svd_dict, f)

In [14]:
# Number of components passed to LatentDirichletAllocation for both the
# correct-answer and the wrong-answer item matrices.
LDA_DIM = 5

In [19]:
# Count, per (item, user) pair, how many rows have answerCode == 1.
lda_correct_df = temp_df[temp_df['answerCode'] == 1].copy()
lda_mat = lda_correct_df.groupby(['assessmentItemID', 'userID'])['answerCode'].count()

# Keep the dense matrix small, but never truncate: pick the smallest unsigned
# dtype that holds the largest count (uint8 while counts stay <= 255) instead
# of casting to uint8 unconditionally, which would wrap around silently.
lda_mat = lda_mat.astype(np.min_scalar_type(int(lda_mat.max())))

# Pivot to an item x user matrix; pairs that never occur become 0.
lda_mat = lda_mat.unstack(fill_value=0)

lda = LatentDirichletAllocation(n_components=LDA_DIM, random_state=0)
transformed = lda.fit_transform(lda_mat)

# Rows of `transformed` follow lda_mat.index; items without any correct answer
# have no row, and the item ids themselves are not stored in the .npy file.
os.makedirs('./assets', exist_ok=True)
np.save('./assets/lda_correct_question.npy', transformed)

In [20]:
# Count, per (item, user) pair, how many rows have answerCode == 0.
lda_wrong_df = temp_df[temp_df['answerCode'] == 0].copy()
lda_wrong_mat = lda_wrong_df.groupby(['assessmentItemID', 'userID'])['answerCode'].count()

# Keep the dense matrix small, but never truncate: pick the smallest unsigned
# dtype that holds the largest count (uint8 while counts stay <= 255) instead
# of casting to uint8 unconditionally, which would wrap around silently.
lda_wrong_mat = lda_wrong_mat.astype(np.min_scalar_type(int(lda_wrong_mat.max())))

# Pivot to an item x user matrix; pairs that never occur become 0.
lda_wrong_mat = lda_wrong_mat.unstack(fill_value=0)

lda = LatentDirichletAllocation(n_components=LDA_DIM, random_state=0)
transformed = lda.fit_transform(lda_wrong_mat)

# Rows of `transformed` follow lda_wrong_mat.index; items without any wrong
# answer have no row, and the item ids themselves are not stored in the file.
os.makedirs('./assets', exist_ok=True)
np.save('./assets/lda_wrong_question.npy', transformed)

In [13]:
# vector_size used for both Word2Vec item-embedding models.
EMB_DIM = 10

In [17]:
# One "sentence" per user: the assessmentItemIDs that user answered correctly,
# in the row order of temp_df.
emb_correct_df = (
    temp_df[temp_df['answerCode'] == 1]
    .groupby('userID')['assessmentItemID']
    .apply(list)
)
sentences = list(emb_correct_df.values)

# gensim only honours `seed` deterministically with a single worker thread;
# with workers=16 thread scheduling made every run produce different vectors.
# (Across interpreter launches PYTHONHASHSEED must be fixed as well.)
# NOTE(review): min_count is left at gensim's default, so rarely seen items
# may be dropped from the vocabulary - confirm consumers handle missing keys.
model = Word2Vec(sentences, vector_size=EMB_DIM, window=100, seed=0, workers=1)

# Map every item in the learned vocabulary to its embedding vector.
question2vec = {content: model.wv.get_vector(content) for content in model.wv.key_to_index}

# Make sure the output directory exists before writing into it.
os.makedirs('./assets', exist_ok=True)
with open('./assets/word2vec_correct_question.pickle', 'wb') as f:
    pickle.dump(question2vec, f)

In [18]:
# One "sentence" per user: the assessmentItemIDs that user answered wrongly,
# in the row order of temp_df.
emb_wrong_df = (
    temp_df[temp_df['answerCode'] == 0]
    .groupby('userID')['assessmentItemID']
    .apply(list)
)
sentences = list(emb_wrong_df.values)

# gensim only honours `seed` deterministically with a single worker thread;
# with workers=16 thread scheduling made every run produce different vectors.
# (Across interpreter launches PYTHONHASHSEED must be fixed as well.)
# NOTE(review): min_count is left at gensim's default, so rarely seen items
# may be dropped from the vocabulary - confirm consumers handle missing keys.
model = Word2Vec(sentences, vector_size=EMB_DIM, window=100, seed=0, workers=1)

# Map every item in the learned vocabulary to its embedding vector.
question2vec = {content: model.wv.get_vector(content) for content in model.wv.key_to_index}

# Make sure the output directory exists before writing into it.
os.makedirs('./assets', exist_ok=True)
with open('./assets/word2vec_wrong_question.pickle', 'wb') as f:
    pickle.dump(question2vec, f)