In [1]:
import pandas as pd
import numpy as np
from trueskill import Rating, quality_1vs1, rate_1vs1
import math
import trueskill
from tqdm import tqdm
import os
from collections import defaultdict

In [2]:
def win_probability(team1, team2, beta=0.05):
    """Estimate how likely ``team1`` is to beat ``team2`` in a 1-vs-1 match.

    Both arguments are single ratings (objects exposing ``mu`` and ``sigma``),
    not lists of ratings, so exactly two players are always assumed.

    Args:
        team1: Rating of the first competitor.
        team2: Rating of the second competitor.
        beta: Performance-noise term; its square is added once per player to
            the sum of the rating variances. Defaults to 0.05, the constant
            this function previously hard-coded.
            NOTE(review): this value is not read from the active trueskill
            environment, so it may differ from the beta that ``rate_1vs1``
            uses when updating ratings -- confirm that this is intended.

    Returns:
        The trueskill environment's ``cdf`` evaluated at
        ``(team1.mu - team2.mu) / sqrt(2 * beta**2 + sigma1**2 + sigma2**2)``.
    """
    delta_mu = team1.mu - team2.mu
    sum_sigma = team1.sigma ** 2 + team2.sigma ** 2
    size = 2  # one player on each side
    denom = math.sqrt(size * (beta * beta) + sum_sigma)
    ts = trueskill.global_env()
    return ts.cdf(delta_mu / denom)


def _collect_trueskill_features(df, user_trueskill_dict, question_trueskill_dict, update):
    """Row loop shared by ``add_trueskill`` and ``add_trueskill_without_update``.

    For every row, in frame order, the current user and question ratings are
    looked up and recorded as features. When ``update`` is true the ratings
    are then re-rated from the row's ``answerCode`` and stored back, so later
    rows see the updated values.

    Args:
        df: DataFrame with ``userID`` and ``assessmentItemID`` columns, plus
            ``answerCode`` when ``update`` is true. Modified in place.
        user_trueskill_dict: Mapping from ``int(userID)`` to a rating.
        question_trueskill_dict: Mapping from ``assessmentItemID`` to a rating.
        update: Whether to re-rate user and question after each row.

    Returns:
        ``df`` itself, with the five ``trueSkill_*`` float32 columns set.
    """
    n_rows = len(df)
    features = {
        'trueSkill_win_probability': np.zeros(n_rows, dtype=np.float32),
        'trueSkill_user_mu': np.zeros(n_rows, dtype=np.float32),
        'trueSkill_user_sigma': np.zeros(n_rows, dtype=np.float32),
        'trueSkill_question_mu': np.zeros(n_rows, dtype=np.float32),
        'trueSkill_question_sigma': np.zeros(n_rows, dtype=np.float32),
    }
    wp = features['trueSkill_win_probability']
    umu = features['trueSkill_user_mu']
    usigma = features['trueSkill_user_sigma']
    qmu = features['trueSkill_question_mu']
    qsigma = features['trueSkill_question_sigma']

    # The answer column is only read (and only required) when updating.
    columns = ['userID', 'assessmentItemID']
    if update:
        columns.append('answerCode')

    for cnt, row in enumerate(tqdm(df[columns].values)):
        user_id = int(row[0])
        content_id = row[1]
        answered_correctly = int(row[2]) if update else None

        old_user_rating = user_trueskill_dict[user_id]
        old_question_rating = question_trueskill_dict[content_id]

        # Features describe the state *before* this row's outcome is applied.
        wp[cnt] = win_probability(old_user_rating, old_question_rating)
        umu[cnt] = old_user_rating.mu
        usigma[cnt] = old_user_rating.sigma
        qmu[cnt] = old_question_rating.mu
        qsigma[cnt] = old_question_rating.sigma

        if not update:
            continue

        # rate_1vs1 takes (winner, loser): the user wins only when the answer
        # code is exactly 1; any other value counts as a win for the question.
        if answered_correctly == 1:
            new_user_rating, new_question_rating = rate_1vs1(old_user_rating, old_question_rating)
        else:
            new_question_rating, new_user_rating = rate_1vs1(old_question_rating, old_user_rating)
        user_trueskill_dict[user_id] = new_user_rating
        question_trueskill_dict[content_id] = new_question_rating

    for column_name, values in features.items():
        df[column_name] = values

    return df


def add_trueskill(df, user_trueskill_dict, question_trueskill_dict):
    """Add pre-interaction TrueSkill features and update ratings row by row.

    Each row is treated as a 1-vs-1 match between the user and the question.
    The ratings held *before* the row are written to the feature columns, and
    then both ratings are updated from ``answerCode`` and stored back into the
    two dictionaries.

    Args:
        df: DataFrame with ``userID``, ``assessmentItemID`` and ``answerCode``
            columns. It is modified in place.
        user_trueskill_dict: Mapping from ``int(userID)`` to a rating; updated
            in place.
        question_trueskill_dict: Mapping from ``assessmentItemID`` to a
            rating; updated in place.

    Returns:
        The same ``df`` object with the columns ``trueSkill_win_probability``,
        ``trueSkill_user_mu``, ``trueSkill_user_sigma``,
        ``trueSkill_question_mu`` and ``trueSkill_question_sigma`` added.
    """
    return _collect_trueskill_features(
        df, user_trueskill_dict, question_trueskill_dict, update=True
    )


def add_trueskill_without_update(df, user_trueskill_dict, question_trueskill_dict):
    """Add TrueSkill features from the current ratings without re-rating.

    Only ``userID`` and ``assessmentItemID`` are read; no outcome column is
    needed and no rating is reassigned. The ratings are looked up with ``[]``,
    so if the mappings are ``defaultdict`` instances, unseen ids still get a
    default entry inserted by the lookup itself.

    Args:
        df: DataFrame with ``userID`` and ``assessmentItemID`` columns. It is
            modified in place.
        user_trueskill_dict: Mapping from ``int(userID)`` to a rating.
        question_trueskill_dict: Mapping from ``assessmentItemID`` to a rating.

    Returns:
        The same ``df`` object with the five ``trueSkill_*`` columns added.
    """
    return _collect_trueskill_features(
        df, user_trueskill_dict, question_trueskill_dict, update=False
    )

def load_train_data(df):
    """Drop the final row of every consecutive run of equal ``userID`` values.

    A row is kept only when the row directly after it carries the same
    ``userID``; the last row of the frame has no successor and is therefore
    always dropped. This equals "all but the last row per user" only when the
    rows of each user are contiguous.

    Args:
        df: DataFrame with a ``userID`` column.

    Returns:
        The selected rows of ``df``, with their original index labels.
    """
    user_ids = df['userID']
    ends_run = user_ids.ne(user_ids.shift(-1))
    return df.loc[~ends_run]

In [3]:
# Load total_data.csv and order it by user, then by time within each user.
# NOTE(review): data_dir is an absolute, machine-specific path.
data_dir = '/opt/ml/project/data/'
csv_file_path = os.path.join(data_dir, 'total_data.csv')
df = (
    pd.read_csv(csv_file_path, parse_dates=['Timestamp'])
    .sort_values(by=['userID', 'Timestamp'])
    .reset_index(drop=True)  # fresh 0..n-1 labels in the sorted order
)

In [4]:
# A row is a "test" row when the following row has a different userID (or does
# not exist), i.e. it is the last row of its user in the sorted frame.
test_cond = df['userID'].ne(df['userID'].shift(-1))
train_df = df.loc[~test_cond].copy()
test_df = df.loc[test_cond].copy()

# Shown as the cell output: the combined row count of the two parts.
len(train_df) + len(test_df)

2526700

In [5]:
# Ids that have not been seen yet start from a default-constructed Rating().
user_trueskill_dict = defaultdict(Rating)
question_trueskill_dict = defaultdict(Rating)

# Train rows: record the ratings held before each row, then update them.
train_df = add_trueskill(train_df, user_trueskill_dict, question_trueskill_dict)
# Test rows: read the ratings left by the pass above; nothing is re-rated.
test_df = add_trueskill_without_update(test_df, user_trueskill_dict, question_trueskill_dict)

100%|██████████| 2519258/2519258 [13:32<00:00, 3099.14it/s]
100%|██████████| 7442/7442 [00:00<00:00, 120964.38it/s]


In [6]:
# Recombine the two parts. train_df and test_df still carry the row labels of
# the sorted frame they were split from, so sorting by index puts every user's
# rows back in chronological order. A plain concat would leave all test rows
# (each user's final interaction) at the bottom of the saved file.
df = pd.concat([train_df, test_df]).sort_index()
csv_save_path = os.path.join(data_dir, 'total_data_v2.csv')
df.to_csv(csv_save_path, index=False)