In [1]:
import json
from collections import Counter, defaultdict, deque
import gc

import numpy as np
import pandas as pd
import glob
import time
from tqdm import tqdm

from sklearn import model_selection, metrics
from sklearn.cluster import KMeans
import matplotlib.pyplot as plt
import seaborn as sn

import lightgbm as lgb
from scipy.stats import skew

import feather
import pickle
import h5py
import math

from concurrent import futures
#import riiideducation
from trueskill import Rating, quality_1vs1, rate_1vs1, TrueSkill, global_env

In [2]:
env = TrueSkill(draw_probability=0.0)

In [3]:
env.create_rating()

trueskill.Rating(mu=25.000, sigma=8.333)

In [4]:
train = pd.read_feather("./final_train.feather")

In [5]:
train = train[train["answered_correctly"] != -1]

In [6]:
train.head(2)

Unnamed: 0,row_id,timestamp,user_id,content_id,content_type_id,task_container_id,user_answer,answered_correctly,prior_question_elapsed_time,prior_question_had_explanation
0,32933156,0,705741139,128,0,0,0,1,,0
1,32933157,20666,705741139,7860,0,1,0,1,16000.0,0


In [7]:
users = np.unique(train["user_id"])
questions = np.unique(train["content_id"])

In [8]:
u_rate = {}
for user in users:
    u_rate[user] = env.create_rating()

q_rate = {}
for question in questions:
    q_rate[question] = env.create_rating()

In [9]:
len(u_rate), len(q_rate)

(393656, 13523)

In [10]:
def win_probability(team1, team2):
    delta_mu = team1.mu - team2.mu
    sum_sigma = sum([team1.sigma ** 2, team2.sigma ** 2])
    size = 2
    denom = math.sqrt(size * (0.05 * 0.05) + sum_sigma)
    #ts = global_env()
    return env.cdf(delta_mu / denom)

In [11]:
temp_train = train.head(10000).copy()

In [None]:
count = 0
output_list = list()
print("Creating Feature")
for row in tqdm(train.values):
    row_id, uid, cid, ac = int(row[0]), int(row[2]), int(row[3]), int(row[7])
    
    output = {}
    output["row_id"] = row_id
    output["uid"] = uid
    output["cid"] = cid
    output["ac"] = ac
    
    count = count + 1
    old_u_rate = u_rate[uid]
    old_q_rate = q_rate[cid]
    output["u_mu"] = old_u_rate.mu
    output["u_sigma"] = old_u_rate.sigma
    output["q_mu"] = old_q_rate.mu
    output["q_sigma"] = old_q_rate.sigma
    output["prob"] = win_probability(old_u_rate, old_q_rate)
    
    if ac == 1:
        new_u_rate, new_q_rate = rate_1vs1(old_u_rate, old_q_rate)
    if ac == 0:
        new_q_rate, new_u_rate = rate_1vs1(old_q_rate, old_u_rate)
        
    u_rate[uid] = new_u_rate
    q_rate[cid] = new_q_rate
    
    output_list.append(output)
    

Creating Feature


  2%|▏         | 2015111/99271300 [12:34<8:33:21, 3157.53it/s] 

In [None]:
df = pd.DataFrame(output_list)

In [None]:
df.head()

In [None]:
df.to_feather("./true_skill_df.feather")

In [None]:
with open("./true_skill_q_rate_0104.pkl", "wb") as handle:
    pickle.dump(q_rate, handle, protocol=pickle.HIGHEST_PROTOCOL)
print(len(q_rate))
with open("./true_skill_u_rate_0104.pkl", "wb") as handle:
    pickle.dump(u_rate, handle, protocol=pickle.HIGHEST_PROTOCOL)
print(len(u_rate))

In [None]:
# with open("./true_skill_q_rate_1231.pkl", "wb") as handle:
#     pickle.dump(q_rate, handle, protocol=pickle.HIGHEST_PROTOCOL)
# print(len(q_rate))

In [None]:
# with open("./true_skill_u_rate_1231.pkl", "wb") as handle:
#     pickle.dump(u_rate, handle, protocol=pickle.HIGHEST_PROTOCOL)
# print(len(u_rate))

In [None]:
# with open("./true_skill_prob_1231.pkl", "wb") as handle:
#     pickle.dump(prob_list, handle, protocol=pickle.HIGHEST_PROTOCOL)
# print(len(prob_list))

In [2]:
alice, bob = Rating(25), Rating(30)

In [3]:
a, b = rate_1vs1(alice, bob) 

In [7]:
alice.mu

25.0

In [6]:
a.mu

30.76806754361951

In [2]:
def get_new_theta(is_good_answer, beta, left_asymptote, theta, nb_previous_answers):
    return theta + learning_rate_theta(nb_previous_answers) * (
        is_good_answer - probability_of_good_answer(theta, beta, left_asymptote)
    )

def get_new_beta(is_good_answer, beta, left_asymptote, theta, nb_previous_answers):
    return beta - learning_rate_beta(nb_previous_answers) * (
        is_good_answer - probability_of_good_answer(theta, beta, left_asymptote)
    )

def learning_rate_theta(nb_answers):
    return max(0.3 / (1 + 0.01 * nb_answers), 0.04)

def learning_rate_beta(nb_answers):
    return 1 / (1 + 0.05 * nb_answers)

def probability_of_good_answer(theta, beta, left_asymptote):
    return left_asymptote + (1 - left_asymptote) * sigmoid(theta - beta)

def sigmoid(x):
    return 1 / (1 + np.exp(-x))

In [3]:
def estimate_parameters(answers_df, granularity_feature_name='content_id'):
    item_parameters = {
        granularity_feature_value: {"beta": 0, "nb_answers": 0}
        for granularity_feature_value in np.unique(answers_df[granularity_feature_name])
    }
    student_parameters = {
        student_id: {"theta": 0, "nb_answers": 0}
        for student_id in np.unique(answers_df.student_id)
    }

    print("Parameter estimation is starting...")

    for student_id, item_id, left_asymptote, answered_correctly in tqdm(
        zip(answers_df.student_id.values, answers_df[granularity_feature_name].values, answers_df.left_asymptote.values, answers_df.answered_correctly.values)
    ):
        theta = student_parameters[student_id]["theta"]
        beta = item_parameters[item_id]["beta"]

        item_parameters[item_id]["beta"] = get_new_beta(
            answered_correctly, beta, left_asymptote, theta, item_parameters[item_id]["nb_answers"],
        )
        student_parameters[student_id]["theta"] = get_new_theta(
            answered_correctly, beta, left_asymptote, theta, student_parameters[student_id]["nb_answers"],
        )
        
        item_parameters[item_id]["nb_answers"] += 1
        student_parameters[student_id]["nb_answers"] += 1

    print(f"Theta & beta estimations on {granularity_feature_name} are completed.")
    return student_parameters, item_parameters



In [4]:
def update_parameters(answers_df, student_parameters, item_parameters, granularity_feature_name='content_id'):
    for student_id, item_id, left_asymptote, answered_correctly in tqdm(zip(
        answers_df.student_id.values, 
        answers_df[granularity_feature_name].values, 
        answers_df.left_asymptote.values, 
        answers_df.answered_correctly.values)
    ):
        if student_id not in student_parameters:
            student_parameters[student_id] = {'theta': 0, 'nb_answers': 0}
        if item_id not in item_parameters:
            item_parameters[item_id] = {'beta': 0, 'nb_answers': 0}
            
        theta = student_parameters[student_id]['theta']
        beta = item_parameters[item_id]['beta']

        student_parameters[student_id]['theta'] = get_new_theta(
            answered_correctly, beta, left_asymptote, theta, student_parameters[student_id]['nb_answers']
        )
        item_parameters[item_id]['beta'] = get_new_beta(
            answered_correctly, beta, left_asymptote, theta, item_parameters[item_id]['nb_answers']
        )
        
        student_parameters[student_id]['nb_answers'] += 1
        item_parameters[item_id]['nb_answers'] += 1
        
    return student_parameters, item_parameters

In [5]:
def estimate_probas(test_df, student_parameters, item_parameters, granularity_feature_name='content_id'):
    probability_of_success_list = []
    
    for student_id, item_id, left_asymptote in tqdm(
        zip(test_df.student_id.values, test_df[granularity_feature_name].values, test_df.left_asymptote.values)
    ):
        theta = student_parameters[student_id]['theta'] if student_id in student_parameters else 0
        beta = item_parameters[item_id]['beta'] if item_id in item_parameters else 0

        probability_of_success_list.append(probability_of_good_answer(theta, beta, left_asymptote))

    return probability_of_success_list

In [6]:
train = pd.read_feather("./train_sorted_full2.feather")

In [7]:


train.rename(columns={'user_id': 'student_id'}, inplace=True)
train = train[train.answered_correctly != -1]
train['left_asymptote'] = 1/4

print(f"Dataset of shape {train.shape}")
print(f"Columns are {list(train.columns)}")

student_parameters, item_parameters = estimate_parameters(train)

Dataset of shape (99271300, 11)
Columns are ['row_id', 'timestamp', 'student_id', 'content_id', 'content_type_id', 'task_container_id', 'user_answer', 'answered_correctly', 'prior_question_elapsed_time', 'prior_question_had_explanation', 'left_asymptote']


5906it [00:00, 59055.27it/s]

Parameter estimation is starting...


99271300it [27:40, 59790.31it/s]

Theta & beta estimations on content_id are completed.





In [9]:
len(student_parameters)

393656

In [10]:
len(item_parameters)

13523

In [13]:
with open("./elo-user.pkl", "wb") as handle:
    pickle.dump(student_parameters, handle, pickle.HIGHEST_PROTOCOL)

In [14]:
with open("./elo-item.pkl", "wb") as handle:
    pickle.dump(item_parameters, handle, pickle.HIGHEST_PROTOCOL)

In [11]:
item_parameters

{0: {'beta': -2.467006024885898, 'nb_answers': 6903},
 1: {'beta': -2.2013691473715693, 'nb_answers': 7398},
 2: {'beta': 0.7132896484106421, 'nb_answers': 44905},
 3: {'beta': -0.7515245314119663, 'nb_answers': 22973},
 4: {'beta': 0.09911937529875181, 'nb_answers': 31736},
 5: {'beta': -1.6552570514689817, 'nb_answers': 9727},
 6: {'beta': 1.1679519464592518, 'nb_answers': 56707},
 7: {'beta': -2.00968734771434, 'nb_answers': 16585},
 8: {'beta': -2.534171747143152, 'nb_answers': 8535},
 9: {'beta': 3.2318669150064028, 'nb_answers': 47346},
 10: {'beta': -0.10929508507949479, 'nb_answers': 31024},
 11: {'beta': -2.2023850284996547, 'nb_answers': 6694},
 12: {'beta': -0.7186547778617037, 'nb_answers': 20294},
 13: {'beta': -2.1163530695388104, 'nb_answers': 8991},
 14: {'beta': -1.1517815909320754, 'nb_answers': 15125},
 15: {'beta': -0.5283099055771918, 'nb_answers': 20295},
 16: {'beta': 1.4952978083826323, 'nb_answers': 5439},
 17: {'beta': -1.8281626715055088, 'nb_answers': 8503},

In [12]:
student_parameters

{115: {'theta': -0.05451966128116671, 'nb_answers': 46},
 124: {'theta': -1.272724977623159, 'nb_answers': 30},
 2746: {'theta': -0.524965122797141, 'nb_answers': 19},
 5382: {'theta': 0.36443315072222354, 'nb_answers': 125},
 8623: {'theta': 0.8041310912962591, 'nb_answers': 109},
 8701: {'theta': -0.14973040362155537, 'nb_answers': 17},
 12741: {'theta': 0.4496873860256579, 'nb_answers': 265},
 13134: {'theta': 0.6835671481928678, 'nb_answers': 1243},
 24418: {'theta': 0.5611953830899238, 'nb_answers': 6283},
 24600: {'theta': -1.4601891858651435, 'nb_answers': 50},
 32421: {'theta': 0.0863284342524998, 'nb_answers': 30},
 40828: {'theta': 0.6470410728399918, 'nb_answers': 92},
 44331: {'theta': -0.5004735483713841, 'nb_answers': 291},
 45001: {'theta': -1.1196729179157618, 'nb_answers': 30},
 46886: {'theta': 0.13523836993677885, 'nb_answers': 44},
 50132: {'theta': -0.4495272013679308, 'nb_answers': 74},
 51285: {'theta': -0.5513571857103031, 'nb_answers': 22},
 53842: {'theta': -1