In [1]:
import pandas as pd
import numpy as np

In [2]:
# Root directory holding the competition CSV files.
path = '/opt/ml/input/data'

# Read the train split first, then the test split, from the same directory.
train, test = (
    pd.read_csv(path + file_name)
    for file_name in ('/train_data.csv', '/test_data.csv')
)

In [3]:
# Merge train and test so that every available interaction is used,
# then order the rows per user by timestamp.
dat = pd.concat([train, test], axis=0).sort_values(by=['userID', 'Timestamp'])

In [4]:
# Keep only the (user, item, answer) triple and rename the columns to the
# userID / itemID / rating layout used in the rest of the notebook.
data = (
    dat[['userID', 'assessmentItemID', 'answerCode']]
    .reset_index(drop=True)
    .rename(columns={'assessmentItemID': 'itemID', 'answerCode': 'rating'})
)

In [5]:
# Rows whose rating is -1 are the ones to be predicted; split them off
# from the rows that carry a real (non-negative) label.
_train = data.loc[data['rating'].ge(0)]
_test = data.loc[data['rating'].lt(0)]

In [6]:
# When a user answered the same item more than once, keep only the last
# occurrence in the current row order.
_train = _train.drop_duplicates(subset=['userID', 'itemID'], keep='last')

In [7]:
from surprise import Dataset, Reader

# Ratings are declared to lie on a 0..1 scale.
reader = Reader(rating_scale=(0, 1))

# Wrap a copy of the de-duplicated labelled frame as a Surprise dataset.
_train_tmp = Dataset.load_from_df(_train.copy(), reader)

In [8]:
# Build a trainset from the whole dataset (no hold-out); this is the
# trainset the final model is fitted on further down.
_trainset = _train_tmp.build_full_trainset()

In [9]:
# Display the trainset object to confirm it was created.
_trainset

<surprise.trainset.Trainset at 0x7fd87130f310>

In [10]:
# Rebuild `_train_tmp` (previously a Surprise dataset) as a plain frame from
# the train split only, this time keeping Timestamp for the hold-out split.
_train_tmp = (
    train[['userID', 'assessmentItemID', 'answerCode', 'Timestamp']]
    .reset_index(drop=True)
    .rename(columns={'assessmentItemID': 'itemID', 'answerCode': 'rating'})
)

In [13]:
# Validation scheme: for every user, hold out the interaction(s) carrying
# that user's latest Timestamp and train on everything earlier.
# Only users from `train` are involved here, because `_train_tmp` was built
# from `train` alone.
# Rows that tie on the latest Timestamp are all moved to validation.
user_final_time = _train_tmp.groupby('userID')['Timestamp'].max()

# Vectorised replacement for a row-wise `apply(axis=1)`: look up each row's
# per-user final timestamp and compare column-wise.
is_final = _train_tmp['Timestamp'] == _train_tmp['userID'].map(user_final_time)

# -1 marks held-out rows; every other row keeps its original rating.
_train_tmp['train_valid'] = np.where(is_final, -1, _train_tmp['rating'])

_valid = _train_tmp[_train_tmp['train_valid'] < 0]
_train = _train_tmp[_train_tmp['train_valid'] >= 0]

In [14]:
# Remove the helper columns so only userID / itemID / rating remain.
_train = _train.drop(columns=['Timestamp', 'train_valid'])

In [15]:
# Inspect the training part of the hold-out split.
_train

Unnamed: 0,userID,itemID,rating
0,0,A060001001,1
1,0,A060001002,1
2,0,A060001003,1
3,0,A060001004,1
4,0,A060001005,1
...,...,...,...
2266580,7441,A030071004,0
2266581,7441,A030071005,0
2266582,7441,A040165001,1
2266583,7441,A040165002,1


In [16]:
# Count distinct users, items and rating values in the training part.
_train.nunique()

userID    6698
itemID    9454
rating       2
dtype: int64

In [17]:
# Remove the helper columns so only userID / itemID / rating remain.
_valid = _valid.drop(columns=['Timestamp', 'train_valid'])

In [18]:
# Inspect the held-out validation rows.
_valid

Unnamed: 0,userID,itemID,rating
744,0,A080129006,0
1677,1,A090074006,1
1953,2,A050139007,0
2785,5,A080138008,0
2786,5,A080138007,1
...,...,...,...
2266529,7436,A030019001,0
2266545,7437,A060003007,0
2266561,7438,A030188005,1
2266576,7440,A030197005,0


In [19]:
# Count distinct users, items and rating values in the validation part.
_valid.nunique()

userID    6698
itemID    1193
rating       2
dtype: int64

In [20]:
# NOTE: `_train` is rebound from a DataFrame to a Surprise dataset here, so
# this cell cannot be re-run without first re-running the split cells above.
_train = Dataset.load_from_df(_train, reader)
# Trainset for validation experiments (excludes the held-out rows).
trainset = _train.build_full_trainset()

In [21]:
# Turn every unlabeled row into a plain [userID, itemID, rating] list, in
# row order, for use with `model.test`.
testset = []
for row_pos in range(len(_test)):
    testset.append(_test.iloc[row_pos].to_list())

In [22]:
# Turn every held-out row into a plain [userID, itemID, rating] list, in
# row order.
validset = []
for row_pos in range(len(_valid)):
    validset.append(_valid.iloc[row_pos].to_list())

In [23]:
from surprise import SVDpp

# SVD++ model with a fixed seed; `model` is reassigned with different
# settings further down before it is fitted.
model = SVDpp(random_state=42)

In [24]:
from sklearn.metrics import accuracy_score, roc_auc_score

In [25]:
# Candidate values for a manual hyper-parameter sweep.
n_epochs = [30, 40, 50]
learning_rate = [0.001, 0.005, 0.01]
n_factors = [100, 150, 200]

In [27]:
# cnt = 0

# for lr in learning_rate:
#     cnt += 1
#     print('processing ', cnt, '...')

#     model = SVDpp(random_state=42, n_factors=150, lr_all=lr)
    
#     model.fit(trainset)
#     valid_pred = model.test(validset)

#     output_list = []
#     for i in range(len(valid_pred)):
#         output_list.append(valid_pred[i].est)

#     print(roc_auc_score(_valid.rating, pd.DataFrame(output_list))) # auc
#     print(accuracy_score(_valid.rating, np.where(pd.DataFrame(output_list) >= 0.5, 1, 0))) # acc, 정확도
#     print('process ', cnt, 'learnig_rate = ', lr)

processing  1 ...


KeyboardInterrupt: 

In [28]:
# Final model: SVD++ with a fixed seed and 15 training epochs.
model = SVDpp(random_state=42, n_epochs=15)

In [29]:
# Fit on `_trainset` (built from all labelled train + test rows), not on the
# validation-split `trainset`.
model.fit(_trainset)

<surprise.prediction_algorithms.matrix_factorization.SVDpp at 0x7fd7c40fe1f0>

In [30]:
# Preview the raw predictions for the unlabeled rows; the same call is
# repeated and stored in `pred` in the next cell.
model.test(testset)

[Prediction(uid=3, iid='A050133008', r_ui=-1, est=0.48303718704826076, details={'was_impossible': False}),
 Prediction(uid=4, iid='A070146008', r_ui=-1, est=0.6628326903779211, details={'was_impossible': False}),
 Prediction(uid=13, iid='A070111008', r_ui=-1, est=0.31137988727269394, details={'was_impossible': False}),
 Prediction(uid=17, iid='A090064006', r_ui=-1, est=0.5044226274131196, details={'was_impossible': False}),
 Prediction(uid=26, iid='A060135007', r_ui=-1, est=0.31760885565005004, details={'was_impossible': False}),
 Prediction(uid=29, iid='A020190005', r_ui=-1, est=0.869014984873854, details={'was_impossible': False}),
 Prediction(uid=45, iid='A040136005', r_ui=-1, est=0.5858088158238821, details={'was_impossible': False}),
 Prediction(uid=53, iid='A040140005', r_ui=-1, est=0.0312860159712667, details={'was_impossible': False}),
 Prediction(uid=58, iid='A070159007', r_ui=-1, est=0.17958618289652972, details={'was_impossible': False}),
 Prediction(uid=64, iid='A070146008'

In [31]:
# Predict the unlabeled rows and sanity-check the container type and size.
pred = model.test(testset)
print('prediction type: ', type(pred))
print('size: ', len(pred))

prediction type:  <class 'list'>
size:  744


In [32]:
# Pull the estimated rating out of every prediction, preserving order.
output_list = [single_pred.est for single_pred in pred]

In [33]:
# Empty submission frame with the two output columns.
submission = pd.DataFrame(columns=['id','prediction'])

In [34]:
# Ids are 0..N-1 in prediction order. N is taken from the predictions rather
# than hard-coded (it was 744), so a different test-set size cannot cause a
# length mismatch between the two columns.
submission['id'] = list(range(len(output_list)))
submission['prediction'] = output_list

In [36]:
# Write the submission file without the DataFrame index column.
submission.to_csv('../output/surprise_svd_hp_tunned.csv', index = False)