In [1]:
import pandas as pd
import numpy as np
from trueskill import Rating, quality_1vs1, rate_1vs1
import math
import trueskill
from tqdm import tqdm
import os
from collections import defaultdict
import pickle

In [2]:
def win_probability(team1, team2, beta=0.05):
    """Estimate how likely ``team1`` is to beat ``team2`` in a 1-vs-1 match.

    Args:
        team1: Rating-like object exposing ``mu`` and ``sigma``.
        team2: Rating-like object exposing ``mu`` and ``sigma``.
        beta: Performance-noise term added (squared, once per player) to the
            combined variance. Defaults to 0.05, the value this notebook has
            always used; the ``beta`` of the TrueSkill environment itself is
            not consulted here.

    Returns:
        The value of the global TrueSkill environment's ``cdf`` evaluated at
        the mean difference divided by the combined standard deviation.
    """
    delta_mu = team1.mu - team2.mu
    sum_sigma = team1.sigma ** 2 + team2.sigma ** 2
    size = 2  # exactly one rating on each side
    denom = math.sqrt(size * (beta * beta) + sum_sigma)
    return trueskill.global_env().cdf(delta_mu / denom)


def add_trueskill(df, user_trueskill_dict, question_trueskill_dict, cv_num):
    """Walk ``df`` row by row, logging TrueSkill features and updating ratings.

    Each row is treated as a 1-vs-1 match between the user and the question.
    The features stored for a row are the ratings held *before* that row's
    outcome is applied, so a row never sees its own answer.

    Args:
        df: Frame with ``userID``, ``assessmentItemID`` and ``answerCode``
            columns. Five feature columns are added to it in place.
        user_trueskill_dict: Mapping of user id to rating; updated in place.
        question_trueskill_dict: Mapping of question id to rating; updated in
            place.
        cv_num: Fold number used as the ``_cv{cv_num}`` column suffix.

    Returns:
        The same ``df`` object with the new columns attached.
    """
    n_rows = len(df)
    win_prob, user_mu, user_sigma, item_mu, item_sigma = (
        np.zeros(n_rows, dtype=np.float32) for _ in range(5)
    )

    interactions = df[['userID', 'assessmentItemID', 'answerCode']].values
    for pos, (raw_user, item_key, raw_answer) in enumerate(tqdm(interactions)):
        user_key = int(raw_user)
        is_correct = int(raw_answer) == 1

        user_rating = user_trueskill_dict[user_key]
        item_rating = question_trueskill_dict[item_key]

        # Snapshot the pre-match state as this row's features.
        win_prob[pos] = win_probability(user_rating, item_rating)
        user_mu[pos] = user_rating.mu
        user_sigma[pos] = user_rating.sigma
        item_mu[pos] = item_rating.mu
        item_sigma[pos] = item_rating.sigma

        # rate_1vs1 takes (winner, loser): a correct answer is a user win,
        # anything else is a question win.
        if is_correct:
            user_rating, item_rating = rate_1vs1(user_rating, item_rating)
        else:
            item_rating, user_rating = rate_1vs1(item_rating, user_rating)

        user_trueskill_dict[user_key] = user_rating
        question_trueskill_dict[item_key] = item_rating

    df[f'trueSkill_win_probability_cv{cv_num}'] = win_prob
    df[f'trueSkill_user_mu_cv{cv_num}'] = user_mu
    df[f'trueSkill_user_sigma_cv{cv_num}'] = user_sigma
    df[f'trueSkill_question_mu_cv{cv_num}'] = item_mu
    df[f'trueSkill_question_sigma_cv{cv_num}'] = item_sigma

    return df


def add_trueskill_without_update(df, user_trueskill_dict, question_trueskill_dict,cv_num):
    """Attach TrueSkill features to ``df`` using ratings that are left as-is.

    For every row the current user and question ratings are looked up and
    written out as features; no match is played, so existing ratings keep
    their values. Lookups use ``[]``, so when the mappings are
    ``defaultdict``s an unseen key gets a default entry inserted.

    Args:
        df: Frame with ``userID`` and ``assessmentItemID`` columns. Five
            feature columns are added to it in place.
        user_trueskill_dict: Mapping of user id to rating.
        question_trueskill_dict: Mapping of question id to rating.
        cv_num: Fold number used as the ``_cv{cv_num}`` column suffix.

    Returns:
        The same ``df`` object with the new columns attached.
    """
    total = len(df)
    probs = np.zeros(total, dtype=np.float32)
    u_mean = np.zeros(total, dtype=np.float32)
    u_std = np.zeros(total, dtype=np.float32)
    q_mean = np.zeros(total, dtype=np.float32)
    q_std = np.zeros(total, dtype=np.float32)

    pairs = df[['userID', 'assessmentItemID']].values
    for i, (raw_user, question_key) in enumerate(tqdm(pairs)):
        learner = user_trueskill_dict[int(raw_user)]
        question = question_trueskill_dict[question_key]

        probs[i] = win_probability(learner, question)
        u_mean[i], u_std[i] = learner.mu, learner.sigma
        q_mean[i], q_std[i] = question.mu, question.sigma

    df[f'trueSkill_win_probability_cv{cv_num}'] = probs
    df[f'trueSkill_user_mu_cv{cv_num}'] = u_mean
    df[f'trueSkill_user_sigma_cv{cv_num}'] = u_std
    df[f'trueSkill_question_mu_cv{cv_num}'] = q_mean
    df[f'trueSkill_question_sigma_cv{cv_num}'] = q_std

    return df

In [3]:
# Input location; the pickle and output paths later in this notebook are also built from data_dir.
data_dir = '/opt/ml/project/data/'
csv_file_path = os.path.join(data_dir, 'total_data.csv')

# Parse Timestamp while reading, then order each user's rows by time and
# renumber the index 0..n-1 so positional and label indexing agree.
df = (
    pd.read_csv(csv_file_path, parse_dates=['Timestamp'])
    .sort_values(by=['userID', 'Timestamp'])
    .reset_index(drop=True)
)

In [4]:
def valid_update(df, cv_num):
    """Mark the validation rows of fold ``cv_num`` on ``df``.

    Users listed under neither ``train_users`` nor ``test_users`` in the
    pickled split are the ones whose rows can be flagged. For each of them:

    * ``is_valid`` is set on their last ``cv_num`` rows;
    * ``cv_idx`` is set on the single row that is ``cv_num``-th from the end.

    Both columns are (re)created on ``df`` in place, starting from all-False.

    Args:
        df: Frame with a ``userID`` column.
        cv_num: Fold number; controls how many trailing rows are flagged.

    Returns:
        The same ``df`` object with ``cv_idx`` and ``is_valid`` columns.
    """
    # NOTE(review): the file name is fixed to cv1 and does not vary with
    # cv_num — confirm that a single user split for all folds is intended.
    split_path = os.path.join(data_dir, 'cv1_users.pickle')
    # pickle.load can execute arbitrary code; only point this at trusted files.
    with open(split_path, 'rb') as handle:
        split = pickle.load(handle)
    train_users = split['train_users']
    test_users = split['test_users']

    user_ids = df['userID']
    in_known_split = user_ids.isin(train_users) | user_ids.isin(test_users)
    candidate_rows = df[~in_known_split]

    # as_index=False is kept so that nth() is asked for rows under the same
    # groupby configuration as before; .index then refers to labels of df.
    nth_from_end = candidate_rows.groupby('userID', as_index=False).nth(-cv_num).index
    trailing = candidate_rows.groupby('userID').tail(cv_num).index

    for column, labels in (('cv_idx', nth_from_end), ('is_valid', trailing)):
        df[column] = False
        df.loc[labels, column] = True

    return df

In [5]:
cv_len = 5

# One pass per fold. Each pass adds five trueSkill_*_cv{cv_num} columns to df;
# cv_idx / is_valid are overwritten every time, so after the loop they
# describe the last fold only.
for cv_num in range(1, cv_len + 1):
    df = valid_update(df, cv_num)

    # Held-out rows: flagged validation rows plus rows whose answerCode is -1.
    cond1 = df['is_valid'].eq(True)
    cond2 = df['answerCode'].eq(-1)
    held_out = cond1 | cond2
    test_df = df[held_out].copy()
    train_df = df[~held_out].copy()

    print(len(train_df),len(test_df), len(train_df) + len(test_df))

    # Ratings start from scratch for every fold, so nothing carries over
    # from the previous pass.
    user_trueskill_dict = defaultdict(Rating)
    question_trueskill_dict = defaultdict(Rating)

    # Ratings are updated on the training rows only; held-out rows just read
    # whatever the ratings are once training has finished.
    train_df = add_trueskill(train_df, user_trueskill_dict, question_trueskill_dict, cv_num)
    test_df = add_trueskill_without_update(test_df, user_trueskill_dict, question_trueskill_dict, cv_num)

    # Stitch the two parts back together in the original per-user time order.
    df = (
        pd.concat([train_df, test_df])
        .sort_values(by=['userID', 'Timestamp'])
        .reset_index(drop=True)
    )

2523949 2751 2526700


100%|██████████| 2523949/2523949 [11:28<00:00, 3666.77it/s]
100%|██████████| 2751/2751 [00:00<00:00, 127954.25it/s]


2521942 4758 2526700


100%|██████████| 2521942/2521942 [10:59<00:00, 3823.96it/s]
100%|██████████| 4758/4758 [00:00<00:00, 122337.19it/s]


2519935 6765 2526700


100%|██████████| 2519935/2519935 [11:00<00:00, 3815.07it/s]
100%|██████████| 6765/6765 [00:00<00:00, 142414.22it/s]


2517928 8772 2526700


100%|██████████| 2517928/2517928 [11:06<00:00, 3780.27it/s]
100%|██████████| 8772/8772 [00:00<00:00, 140395.92it/s]


2515921 10779 2526700


100%|██████████| 2515921/2515921 [11:16<00:00, 3718.02it/s]
100%|██████████| 10779/10779 [00:00<00:00, 143479.54it/s]


In [6]:
# Write the frame produced by the fold loop (df is already the re-sorted
# concatenation of train_df and test_df) into data_dir, without the index.
csv_save_path = os.path.join(data_dir, 'total_data_v2.csv')
df.to_csv(path_or_buf=csv_save_path, index=False)