In [None]:
import pandas as pd
import numpy as np
from datetime import *
from plotnine import *
import statsmodels.api as sm
import statsmodels.formula.api as smf
import requests
from sklearn import *

In [None]:
# Load both rating-history extracts (dates parsed on read) and stack them into one frame
rating_history_files = ["data/lichess_swiss_rating_histories_1.csv",
                        "data/lichess_swiss_rating_histories_2.csv"]
df = pd.concat([pd.read_csv(csv_path,parse_dates=['date']) for csv_path in rating_history_files])
print(df.shape)
df.head()

In [None]:
# Number of distinct users in the combined rating histories
df['user_id'].nunique()

In [None]:
# The latest date we have data on
max_outcome_date = df['date'].max()
# The latest date that can be used for training, so that every training snapshot is followed by
# a full two-year (365*2 days) window of outcome data
max_training_date = max_outcome_date - timedelta(days=365*2)
# Display both cutoffs as a tuple
max_outcome_date,max_training_date

In [None]:
# The earliest date we have data on (shown for reference; the rest of this notebook section does not read it)
minn_training_date = df['date'].min()
minn_training_date

In [None]:
# Split the histories at the training cutoff: rows dated on or before it are training data,
# rows dated after it are outcome data
df_training = df[(df['date']<=max_training_date).to_numpy()]
df_outcomes = df[(df['date']>max_training_date).to_numpy()]
# The latest rating per user/time control that can be used for training
training_newest_first = df_training.sort_values("date",ascending=False)
latest_training_ratings = training_newest_first.drop_duplicates(['user_id','time_control'])
latest_training_ratings.sample(5)

In [None]:
# (rows, columns) of the snapshot table; it holds one row per user/time-control pair
latest_training_ratings.shape

In [None]:
# Most frequent latest-rating values, e.g. to spot a spike at a particular rating
latest_training_ratings['rating'].value_counts().head()

# Feature Engineering

## Features
- Target time control (likely interacted with various other features)
- Target rating gain
- Current rating (likely nonlinear relationship)
- Rating growth in last 30 days / 90 days / 180 days
- Rating volatility measures
- Peak historical rating relative to current rating
- Rating in other time controls + puzzles
- Rating growth in other time controls + puzzles
- Difference between other time control ratings + target time control rating
- How long you've been on lichess
- How many games you've played (ever, and within last 30 days, and within the target time control - if you haven't played many it could mean more uncertainty). Consider that most discord bot users will have played more recent rated games in the target time control than the typical user in the training data. 
- Last time you played a rated game in the target time control (if it's a long time ago, it could mean more uncertainty)
- Have you recently been playing puzzles? What about slow games? That could indicate seriousness about improvement

## Outcomes
- Will you ever achieve a rating that's X rating points higher than your current rating in the next Y months (X is calculated from target rating submitted by user, Y = 24?)
- If so, when will you first reach the target rating? (point estimate + prediction interval of dates) - use number of days as outcome, then transform to date for the bot message
- Try to tweak model to avoid negative predictions, and manually override when needed. Same with predictions greater than 2 years out.


In [None]:
# Snapshot cutoffs 30 / 90 / 180 days before the max training date
max_training_date_minus_30,max_training_date_minus_90,max_training_date_minus_180 = (
    max_training_date-timedelta(days=n_days) for n_days in (30,90,180))

def _latest_ratings_as_of(cutoff_date):
    """Most recent training rating per user/time control dated on or before cutoff_date."""
    eligible = df_training[(df_training['date']<=cutoff_date).to_numpy()]
    return eligible.sort_values("date",ascending=False).drop_duplicates(['user_id','time_control'])

hist_ratings_30 = _latest_ratings_as_of(max_training_date_minus_30)
hist_ratings_90 = _latest_ratings_as_of(max_training_date_minus_90)
hist_ratings_180 = _latest_ratings_as_of(max_training_date_minus_180)
hist_ratings_180.head()

In [None]:
# Peak ratings
# Sorting by rating (descending) and keeping the first row per user/time control leaves each pair's
# highest rating, computed separately for the training period and for the outcome period
hist_ratings_peak = df_training.sort_values("rating",ascending=False).drop_duplicates(['user_id','time_control'])
outcome_ratings_peak = df_outcomes.sort_values("rating",ascending=False).drop_duplicates(['user_id','time_control'])
hist_ratings_peak.head()

In [None]:
# Rating volatility: standard deviation of the ratings recorded inside each look-back window
def _rating_stdev_since(cutoff_date,column_name):
    """Per user/time control std of ratings dated on or after cutoff_date (0 when std is undefined)."""
    recent = df_training[(df_training['date']>=cutoff_date).to_numpy()]
    stdev = recent.groupby(['user_id','time_control'])['rating'].std().fillna(0)
    return stdev.reset_index().rename(columns={"rating":column_name})

rating_stdev_30 = _rating_stdev_since(max_training_date_minus_30,"rating_stdev_30")
rating_stdev_90 = _rating_stdev_since(max_training_date_minus_90,"rating_stdev_90")
rating_stdev_180 = _rating_stdev_since(max_training_date_minus_180,"rating_stdev_180")
rating_stdev_30.head()

In [None]:
# Frequency of rating updates: number of rating rows inside each look-back window
def _rating_updates_since(cutoff_date,column_name):
    """Per user/time control count of training rows dated on or after cutoff_date."""
    recent = df_training[(df_training['date']>=cutoff_date).to_numpy()]
    return recent.groupby(['user_id','time_control']).size().reset_index(name=column_name)

rating_updates_30 = _rating_updates_since(max_training_date_minus_30,"rating_updates_30")
rating_updates_90 = _rating_updates_since(max_training_date_minus_90,"rating_updates_90")
rating_updates_90.head()

In [None]:
# Non-target time control ratings
# One row per user with the 30-day update count of every time control as its own column (0 when absent)
non_target_rating_updates_30 = rating_updates_30.pivot(index='user_id',columns='time_control',values='rating_updates_30').fillna(0)
# Derive the column names from the pivoted time controls instead of hard-coding four names in
# alphabetical order: a hard-coded list mislabels or fails if a time control is missing from the window
non_target_rating_updates_30.columns = [f"{str(time_control).lower()}_updates_30" for time_control in non_target_rating_updates_30.columns]
non_target_rating_updates_30.head()

In [None]:
# Add features to base table
# The chained left merges attach the 30/90/180-day-old ratings, the peak rating, the update counts and
# the volatility measures to every user/time-control row. The rating columns are named by the merge
# suffixes: the first merge yields rating_latest / rating_30, the second brings in a plain 'rating'
# column (no clash, so no suffix), and the third turns that column into rating_90 and adds rating_180.
df_base = latest_training_ratings.merge(hist_ratings_30[['user_id','time_control','rating']],
                how='left',on=['user_id','time_control'],suffixes=['_latest','_30']).merge(
            hist_ratings_90[['user_id','time_control','rating']],
                how='left',on=['user_id','time_control']).merge(
            hist_ratings_180[['user_id','time_control','rating']],
                how='left',on=['user_id','time_control'],suffixes=['_90','_180']).merge(
            hist_ratings_peak[['user_id','time_control','rating']].rename(columns={'rating':'rating_peak'}),
                how='left',on=['user_id','time_control']).merge(
            rating_updates_30,how='left',on=['user_id','time_control']).merge(
            rating_updates_90,how='left',on=['user_id','time_control']).merge(
            rating_stdev_30,how='left',on=['user_id','time_control']).merge(
            rating_stdev_90,how='left',on=['user_id','time_control']).merge(
            rating_stdev_180,how='left',on=['user_id','time_control']).merge(
            non_target_rating_updates_30,how='left',on='user_id'
)
# Rating change over each look-back window; the 90- and 180-day changes fall back to the next
# shorter window (combine_first) when no rating that old exists
df_base['rating_30_diff'] = df_base['rating_latest']-df_base['rating_30']
df_base['rating_90_diff'] = (df_base['rating_latest']-df_base['rating_90']).combine_first(df_base['rating_30_diff'])
df_base['rating_180_diff'] = (df_base['rating_latest']-df_base['rating_180']).combine_first(df_base['rating_90_diff'])
# Distance from the training-period peak (zero or negative, since the peak is the maximum)
df_base['rating_peak_diff'] = df_base['rating_latest']-df_base['rating_peak']
# Copy of the time control that get_dummies consumes below, so the original column survives
df_base['time_control_copy'] = df_base['time_control']
# Bucketed / transformed versions of the latest rating (nearest 100, square, next multiple of 200 / 300)
df_base['rating_latest_rounded'] = df_base['rating_latest'].round(-2)
df_base['rating_latest_squared'] = df_base['rating_latest']**2
df_base['rating_latest_rounded_200'] = 200*np.ceil(df_base['rating_latest_rounded']/200).astype(int)
df_base['rating_latest_rounded_300'] = 300*np.ceil(df_base['rating_latest_rounded']/300).astype(int)
# One-hot encode the time control; stripping the prefix and lower-casing leaves one indicator column
# per time control, named after the lower-cased time-control value
df_base = pd.get_dummies(df_base,columns=['time_control_copy'],prefix_sep="")
df_base.columns = [x.replace("time_control_copy","").lower() for x in df_base.columns]
print(df_base.shape)
df_base.sample(10)

In [None]:
# Filter to people who have played rated games in the time control before 30 days ago...
# ... and have played at least one rated game in the time control within the last 30 days
# (rating_30 is non-null only when a rating exists on or before the 30-day cutoff; 'date' is the date
# of the latest training rating). NOTE: this overwrites df_base, so re-running the cell filters the
# already-filtered table.
df_base = df_base[(df_base['rating_30'].notna())&(df_base['date']>=max_training_date_minus_30)]
print(df_base.shape)
print(df_base['user_id'].nunique())
df_base.head()

In [None]:
# What is the distribution of rating gains over the two year period?
## Use this to come up with reasonable target rating ranges where I'll have a decent sample size to work with when estimating how long it'll take
df_max_rating_gains = df_base.merge(outcome_ratings_peak,on=['user_id','time_control'],how='inner')
# Largest gain = outcome-period peak rating minus the latest training rating
df_max_rating_gains['max_gain'] = df_max_rating_gains['rating']-df_max_rating_gains['rating_latest']
# Three starting-rating buckets: below 1550 -> 1, 1550 up to (not including) 1900 -> 2, otherwise 3
latest_rating = df_max_rating_gains['rating_latest']
df_max_rating_gains['rating_bucket'] = np.select([latest_rating < 1550,latest_rating < 1900],[1,2],default=3)
gain_percentiles = [.25,.5,.75,.9,.95,.99]
df_max_rating_gains.groupby("rating_bucket")['max_gain'].describe(percentiles=gain_percentiles)

In [None]:
# Generate target ratings
# Five stacked copies of the base table, so each user/time control can receive up to five targets
df_targets = pd.concat([df_base]*5)
# Fix the NumPy global seed so the simulated targets are reproducible
np.random.seed(1)
def get_target_rating_gain(x):
    """Draw a random target rating gain for a player whose latest rating is ``x``.

    A three-sided die picks the regime with equal probability:
      1 -> small gain, uniform on 1..99
      2 -> medium gain, uniform on 1..299
      3 -> large gain whose upper bound shrinks as the current rating grows
           (100..699 below 1550, 100..499 below 1900, 100..399 otherwise)

    Uses NumPy's global random state, so results depend on the seed set by the caller.

    Raises:
        RuntimeError: if the die shows a value outside 1-3 (should be impossible).
    """
    # Right side of interval is exclusive, so this goes from 1-3
    die = np.random.randint(1,4)
    if die == 1:
        return np.random.randint(1,100)
    elif die == 2:
        return np.random.randint(1,300)
    elif die == 3:
        if x < 1550:
            return np.random.randint(100,700)
        elif x < 1900:
            return np.random.randint(100,500)
        else:
            return np.random.randint(100,400)
    else:
        # Explicit failure instead of the former `print(1/0)` trick, so the cause is clear if ever hit
        raise RuntimeError(f"Unexpected die value: {die}")

# Draw one random gain per row (row order matters: the draws consume the seeded global random state)
df_targets['target_rating_gain'] = df_targets['rating_latest'].apply(get_target_rating_gain)
# The same user/time control may have drawn the same gain more than once; keep one row per distinct target
df_targets = df_targets.drop_duplicates(subset=['user_id','time_control','target_rating_gain'])
target_gain = df_targets['target_rating_gain']
df_targets['target_rating'] = df_targets['rating_latest'] + target_gain
df_targets['target_rating_gain_rounded'] = target_gain.round(-2)
df_targets['target_rating_gain_squared'] = target_gain**2
print(df_targets.shape)
df_targets.head()

In [None]:
# Sanity check: distribution of the simulated target gains within each 100-point starting-rating bucket
df_targets.groupby("rating_latest_rounded")['target_rating_gain'].describe().round()

In [None]:
# Scatter of target rating against latest rating for a random sample of 2,000 target rows
# (sample() has no random_state here, so the drawn rows depend on NumPy's global random state)
(ggplot(df_targets.sample(2000),aes(x='rating_latest',y='target_rating')) +
 geom_point(size=.1) +
 scale_x_continuous(breaks=list(range(800,2500,200))) +
  scale_y_continuous(breaks=list(range(800,2500,200)))

        
       )

In [None]:
# Pair every target row with every outcome-period rating of the same user/time control.
# how='outer' also keeps rows that have no match on the other side (filled with NaN); such rows
# cannot satisfy the later rating>=target_rating filter.
df_temp = df_targets[['user_id','time_control','target_rating','date']].copy()
df_temp = df_temp.merge(df_outcomes,on=['user_id','time_control'],how='outer',suffixes=['_latest','_future'])
print(df_temp.shape)
df_temp.head()

In [None]:
# Successes - filter to where future rating >= target rating, then take earliest date for each user/time control
# (sorting by date_future ascending before drop_duplicates keeps the first date the target was reached,
# one row per user/time control/target rating)
df_successes = df_temp.query('rating>=target_rating').sort_values("date_future").drop_duplicates(['user_id','time_control','target_rating'])
print(df_successes.shape)
df_successes.sample(5)

In [None]:
# Successes and failures 
# Left merge: targets that were never reached keep a missing date_future
df_bin = df_targets.merge(df_successes[['user_id','time_control','target_rating','date_future']],on=['user_id','time_control','target_rating'],how='left')
# Was the target rating achieved?
df_bin['y_bin'] = df_bin['date_future'].notna().astype(int)
# If so, when?
# (days counted from max_training_date, not from each player's own latest rating date)
df_bin['y_cont'] = (df_bin['date_future']-max_training_date).dt.days
print(df_bin.shape)
df_bin.sample(10)

In [None]:
# Regression sample: only the rows where the target was reached (y_cont is missing for the others)
df_cont = df_bin[df_bin['y_bin']==1].copy()

# EDA

In [None]:
# Success rate and row count per 100-point rating bucket and time control.
# The string alias "mean" replaces np.mean, which pandas has deprecated as an agg argument;
# the resulting column is still labelled 'mean'.
y_bin_by_rating = df_bin.groupby(["rating_latest_rounded","time_control"])['y_bin'].agg(["mean",len])
# Plot only the buckets with at least 25 rows
(ggplot(y_bin_by_rating[y_bin_by_rating['len']>=25].reset_index(),
        aes(x='rating_latest_rounded',y='mean',color='time_control')) +
 geom_point() +
      scale_x_continuous(breaks=list(range(600,2600,200))) +
         ylim([0,1])
       )

In [None]:
# Success rate and row count per 100-point target-gain bucket and time control.
# The string alias "mean" replaces np.mean, which pandas has deprecated as an agg argument;
# the resulting column is still labelled 'mean'.
y_bin_by_gain = df_bin.groupby(["target_rating_gain_rounded","time_control"])['y_bin'].agg(["mean",len])
# Plot only the buckets with at least 25 rows
(ggplot(y_bin_by_gain[y_bin_by_gain['len']>=25].reset_index(),
        aes(x='target_rating_gain_rounded',y='mean',color='time_control')) +
 geom_point() +
         ylim([0,1])
       )

In [None]:
# Success rate for every (target gain bucket, starting rating bucket) combination
bin_by_quant_vars = df_bin.groupby(['target_rating_gain_rounded','rating_latest_rounded'])['y_bin'].mean().reset_index().round(2)
# Gain buckets as rows, rating buckets as columns; iloc trims the last two gain rows and the
# first 6 / last 7 rating columns from the displayed table
bin_by_quant_vars.pivot(index='target_rating_gain_rounded',columns='rating_latest_rounded',values='y_bin').iloc[:-2,6:-7]

In [None]:
# Success rate against target gain, one line per starting-rating bucket (buckets of 1200 and above only)
(ggplot(bin_by_quant_vars.query("rating_latest_rounded>=1200"),aes(x='target_rating_gain_rounded',y='y_bin',group='rating_latest_rounded',
       color='rating_latest_rounded')) +
geom_line())

As the target rating gain increases, the effect of the latest rating on the probability of success should go from roughly zero to increasingly negative.

In [None]:
# Mean days-to-target (successes only) for every (target gain bucket, starting rating bucket) combination
mean_days_by_quant_vars = df_cont.groupby(['target_rating_gain_rounded','rating_latest_rounded'])['y_cont'].mean().reset_index().round()
# Gain buckets as rows, rating buckets as columns; iloc trims the last gain row and the first / last 6 rating columns
mean_days_by_quant_vars.pivot(index='target_rating_gain_rounded',columns='rating_latest_rounded',values='y_cont').iloc[:-1,6:-6]

In [None]:
# Same data in long form (one row per gain bucket / rating bucket)
mean_days_by_quant_vars.head()

In [None]:
# Mean days-to-target and row count per 100-point rating bucket and time control.
# The string alias "mean" replaces np.mean, which pandas has deprecated as an agg argument;
# the resulting column is still labelled 'mean'.
y_cont_by_rating = df_cont.groupby(["rating_latest_rounded",'time_control'])['y_cont'].agg(["mean",len]).reset_index()
y_cont_by_rating.head()

In [None]:
# Mean days-to-target by starting rating, coloured by time control (buckets with at least 25 rows;
# y axis limited to 0-500 days)
(ggplot(y_cont_by_rating.query('len>=25'),aes(x='rating_latest_rounded',y='mean',color='time_control')) +
        geom_point() +
    ylim(0,500) +
     scale_x_continuous(breaks=list(range(600,2600,200)))
        
       )

In [None]:
# Mean days-to-target and row count per gain bucket / time control / rating bucket.
# The string alias "mean" replaces np.mean, which pandas has deprecated as an agg argument;
# the resulting column is still labelled 'mean'.
mean_outcome_by_group = df_cont.groupby(['target_rating_gain_rounded','time_control','rating_latest_rounded'])['y_cont'].agg(["mean",len]).reset_index()
# Plot only the groups with at least 30 rows
(ggplot(mean_outcome_by_group.query('len>=30'),aes(x='target_rating_gain_rounded',y='mean')) +
      geom_point())

In [None]:
# Outlier checks
## High gains
# Distribution of the achieved target gains at every decile from 0% to 90%
df_cont['target_rating_gain'].describe(percentiles=[x/10 for x in range(10)])

In [None]:
## Quick gains
# Days-to-target distribution for targets more than 100 points above the starting rating
df_cont[df_cont['target_rating_gain']>100]['y_cont'].describe(percentiles=[x/10 for x in range(10)])

## Identify Fast-improvers

In [None]:
# Flag successes that reached their target fewer than 34 days after max_training_date
df_cont['fast_improver'] = (df_cont['y_cont']<34).astype(int)
# Compare the 30-day rating change of fast improvers against everyone else
df_cont.groupby("fast_improver")['rating_30_diff'].describe()

In [None]:
# 30-day rating-update counts of fast improvers vs. the rest, split by time control
df_cont.groupby(["fast_improver","time_control"])['rating_updates_30'].describe()

## Non-target ratings

In [None]:
# Correlation of days-to-target with the 30-day update counts in the target and in each time control
df_cont[['y_cont','rating_updates_30','blitz_updates_30','bullet_updates_30','rapid_updates_30','classical_updates_30']].corr()

### Train/Test split

In [None]:
# Random 80/20 row split with a fixed seed; the test set is every row whose index label is not in the training sample.
# NOTE(review): the split is by row, and a user/time control can contribute several target rows,
# so the same player can appear in both train and test - confirm this is acceptable.
df_bin_train = df_bin.sample(frac=.8,random_state=1)
df_bin_test = df_bin.loc[~df_bin.index.isin(df_bin_train.index)].copy()
df_cont_train = df_cont.sample(frac=.8,random_state=1)
# The mask is built from index labels, so select with .loc (as for df_bin_test) rather than the positional .iloc
df_cont_test = df_cont.loc[~df_cont.index.isin(df_cont_train.index)].copy()
len(df_bin_train),len(df_bin_test),len(df_cont_train),len(df_cont_test)

# Modeling

In [None]:
# For reference, a simple logit based only on target rating gain and latest rating
# (used later as the comparison model for the full logit)
logit_simple = smf.logit(formula="y_bin~target_rating_gain+rating_latest",data=df_bin_train).fit()
logit_simple.summary()

In [None]:
# Full classification model: probability that the target rating is reached (y_bin).
# Formula terms written a*b expand to both main effects plus their interaction; rapid has no
# indicator in the formula and therefore acts as the reference time control.
# target_rating_gain_squared*rating_latest
logit = smf.logit(formula="""
y_bin~target_rating_gain*rating_latest*bullet+target_rating_gain*rating_latest*blitz+
target_rating_gain*rating_latest*classical+

target_rating_gain_squared*rating_latest+

rating_peak_diff*target_rating_gain+rating_180_diff*bullet+rating_180_diff*blitz+

rating_updates_30+

rating_updates_90*bullet+rating_updates_90*blitz+

rating_stdev_90
""",data=df_bin_train).fit()
logit.summary()

In [None]:
# Regression
# Days until the target rating is first reached (y_cont), fitted on successful rows only.
# Formula terms written a*b expand to both main effects plus their interaction.
ols = smf.ols(formula="""
y_cont~
target_rating_gain*rating_latest*bullet+target_rating_gain*rating_latest*blitz+target_rating_gain*rating_latest*classical+
target_rating_gain_squared*bullet+target_rating_gain_squared*blitz+
rating_latest_squared+

rating_peak_diff*bullet+rating_peak_diff*blitz+
rating_peak_diff*target_rating_gain+

rating_180_diff*bullet+rating_180_diff*blitz+

rating_90_diff+

rating_30_diff*bullet+rating_30_diff*classical+

rating_updates_30+rating_updates_90*blitz+rating_updates_90*classical+

rating_stdev_30

""",data=df_cont_train).fit()
ols.summary()

In [None]:
# Intervals
# Prediction summary table (point predictions with their interval columns) for the training rows
ols.get_prediction(df_cont_train).summary_frame().head()

# Evaluation

## Classification Evaluation

In [None]:
# Score the held-out rows with the full logit and with the simple reference logit
df_bin_test['prob'] = logit.predict(df_bin_test)
df_bin_test['prob_simple'] = logit_simple.predict(df_bin_test)
df_bin_test['prob'].describe()

In [None]:
# Overall calibration check: mean predicted probability vs. actual success rate on the test set
print(round(df_bin_test['prob'].mean(),3))
print(round(df_bin_test['y_bin'].mean(),3))

In [None]:
# MSE
# Mean squared difference between the predicted probability and the 0/1 outcome
## Baseline
# (the baseline predicts a probability of 1 - np.ones - for every row)
print(round(((np.ones(len(df_bin_test))-df_bin_test['y_bin'])**2).mean(),3))
## Logit model
print(round(((df_bin_test['prob']-df_bin_test['y_bin'])**2).mean(),3))

In [None]:
# Logit MSE broken down by time control (name printed first, then the MSE)
for x in df_bin_test['time_control'].unique():
    data = df_bin_test.query("time_control==@x")
    print(x)
    print(round(((data['prob']-data['y_bin'])**2).mean(),3))

In [None]:
# Calibration by decile: bin the test rows into ten equal-sized groups by predicted probability
df_bin_test['decile'] = pd.qcut(df_bin_test['prob'],q=10)
# Mean predicted probability and actual success rate within each decile
deciles = df_bin_test.groupby('decile')[['prob','y_bin']].mean().reset_index()
deciles['index'] = np.arange(len(deciles))
# Reshape to long form (one row per decile and measure) so the two measures can be drawn side by side
decile_probs = deciles[['prob','index']].rename(columns={"prob":"value"})
decile_probs['variable'] = 'Mean Probability'
decile_actuals = deciles[['y_bin','index']].rename(columns={"y_bin":"value"})
decile_actuals['variable'] = 'Actual Proportion'
deciles = pd.concat([decile_probs,decile_actuals],axis=0)
# Dodged bars; the x axis is relabelled from the 0-9 index to deciles 1-10
decile_plot = (ggplot(deciles,aes(x='index',y='value',fill='variable')) +
 geom_bar(stat='identity',position='dodge') +
               scale_x_continuous(breaks=list(range(0,10)),
                                 labels=list(range(1,11))) +
               labs(x = "Decile",y = "Value",
                   fill = "Variable", title = "Logit Test Set Deciles")
)
# Writes the figure to disk; assumes the plots/ directory already exists - TODO confirm
ggsave(decile_plot,"plots/test_decile_plot.png",verbose=False)
decile_plot

In [None]:
# Distribution of predicted probabilities among test rows that did NOT reach the target
(ggplot(df_bin_test.query('y_bin==0'),aes(x='prob')) +
geom_histogram(bins=20))

In [None]:
# Identify the ones with high-prob that are zeros (where we're overpredicting)
# Ten most common time control / rating bucket / gain bucket combinations among failures predicted at >= 75%
df_bin_test.query("prob>=.75&y_bin==0").groupby(["time_control","rating_latest_rounded_300","target_rating_gain_rounded"]).size().sort_values(ascending=False).head(10)

In [None]:
# Distribution of predicted probabilities among test rows that DID reach the target
(ggplot(df_bin_test.query('y_bin==1'),aes(x='prob')) +
geom_histogram(bins=20))

In [None]:
# Distribution of predicted probabilities over the whole test set
(ggplot(df_bin_test,aes(x='prob')) +
geom_histogram(bins=20))

In [None]:
# Actual success rate vs. mean predicted probability per time control / gain bucket / rating bucket.
# agg yields (y_bin mean, y_bin len, prob mean, prob len); iloc drops the redundant second count.
# The string alias "mean" replaces np.mean, which pandas has deprecated as an agg argument.
xtabs_bin = df_bin_test.groupby(['time_control','target_rating_gain_rounded','rating_latest_rounded_300'])[['y_bin','prob']].agg(["mean",len]).iloc[:,:-1].round(2)
xtabs_bin.columns = ['prop_actual','n','mean_prob']
xtabs_bin.reset_index(inplace=True)
# Positive diff = the model over-predicts success for that group
xtabs_bin['diff'] = xtabs_bin['mean_prob'] - xtabs_bin['prop_actual']
xtabs_bin['abs_diff'] = xtabs_bin['diff'].abs()
# Worst-calibrated groups among those with at least 50 rows
xtabs_bin.query("n>=50").sort_values("abs_diff",ascending=False).head(10)

In [None]:
# Actual vs. predicted success rate per 200-point rating bucket (the trailing duplicate count column is dropped).
# The string alias "mean" replaces np.mean, which pandas has deprecated as an agg argument.
df_bin_test.groupby(['rating_latest_rounded_200'])[['y_bin','prob']].agg(["mean",len]).iloc[:,:-1].round(2)


In [None]:
# Actual vs. predicted success rate (with row counts) per 100-point target-gain bucket.
# The string alias "mean" replaces np.mean, which pandas has deprecated as an agg argument.
df_bin_test.groupby(['target_rating_gain_rounded'])[['y_bin','prob']].agg(["mean",len]).round(2)

In [None]:
# Classical only: actual success rate, gain buckets as rows and 300-point rating buckets as columns
# (iloc drops the first and last rating column)
xtabs_bin.query("time_control=='Classical'").pivot(index='target_rating_gain_rounded',columns='rating_latest_rounded_300',values='prop_actual').iloc[:,1:-1]

In [None]:
# Classical only: mean predicted probability, gain buckets as rows and 300-point rating buckets as columns
# (iloc drops the first and last rating column)
xtabs_bin.query("time_control=='Classical'").pivot(index='target_rating_gain_rounded',columns='rating_latest_rounded_300',values='mean_prob').iloc[:,1:-1]

In [None]:
# Classical only: predicted minus actual success rate, gain buckets as rows and 300-point rating buckets as columns
# (iloc drops the first and last rating column)
xtabs_bin.query("time_control=='Classical'").pivot(index='target_rating_gain_rounded',columns='rating_latest_rounded_300',values='diff').iloc[:,1:-1]

In [None]:
# Classical only: number of test rows behind each cell of the tables above
# (iloc drops the first and last rating column)
xtabs_bin.query("time_control=='Classical'").pivot(index='target_rating_gain_rounded',columns='rating_latest_rounded_300',values='n').iloc[:,1:-1]

### ROC AUC

In [None]:
# Test-set ROC AUC: simple reference logit first, full logit second
print(metrics.roc_auc_score(y_true=df_bin_test['y_bin'],y_score=df_bin_test['prob_simple']).round(3))
print(metrics.roc_auc_score(y_true=df_bin_test['y_bin'],y_score=df_bin_test['prob']).round(3))

In [None]:
# Full-logit ROC AUC per time control, printed in the order Bullet, Blitz, Rapid, Classical
for time_control_name in ['Bullet','Blitz','Rapid','Classical']:
    time_control_rows = df_bin_test[df_bin_test['time_control']==time_control_name]
    time_control_auc = metrics.roc_auc_score(y_true=time_control_rows['y_bin'],y_score=time_control_rows['prob'])
    print(time_control_auc.round(2))

In [None]:
# What about for rating gains that are harder to predict?
# (target gains of at least 50 and below 200 points; simple logit AUC first, full logit second)
df_bin_high_gains = df_bin_test.query("target_rating_gain>=50&target_rating_gain<200")
print(metrics.roc_auc_score(y_true=df_bin_high_gains['y_bin'],y_score=df_bin_high_gains['prob_simple']).round(2))
print(metrics.roc_auc_score(y_true=df_bin_high_gains['y_bin'],y_score=df_bin_high_gains['prob']).round(2))

## Regression Evaluation

In [None]:
# Distribution of the actual days-to-target in the regression test set
df_cont_test['y_cont'].describe()

In [None]:
# Predict days-to-target for the test rows, clamped to the feasible range:
# negative predictions become 0 and predictions beyond two years become 365*2 days
max_prediction_days = 365*2
df_cont_test['pred'] = ols.predict(df_cont_test).clip(lower=0,upper=max_prediction_days)
df_cont_test['pred'].describe().round()

In [None]:
# Signed error in days (positive = predicted later than the target was actually reached) and its magnitude
df_cont_test['error'] = df_cont_test['pred']-df_cont_test['y_cont']
df_cont_test['abs_error'] = df_cont_test['error'].abs()

In [None]:
# Distribution of the signed error (days)
df_cont_test['error'].describe().round()

In [None]:
# Distribution of the absolute error (days)
df_cont_test['abs_error'].describe().round()

In [None]:
# Absolute error (days) broken down by time control
df_cont_test.groupby("time_control")['abs_error'].describe().round().astype(int)

In [None]:
# Mean prediction, actual, absolute error and signed error per gain bucket / rating bucket / time control.
# The string alias "mean" replaces np.mean, which pandas has deprecated as an agg argument.
error_summary = df_cont_test.groupby(['target_rating_gain_rounded','rating_latest_rounded','time_control'])[['pred','y_cont','abs_error','error']].agg(["mean"]).round().astype(int)
# Group sizes, merged in below so that small groups can be filtered out
sizes = df_cont_test.groupby(['target_rating_gain_rounded','rating_latest_rounded','time_control']).size().reset_index()
sizes.rename(columns={0:"n"},inplace=True)
error_summary.columns = ['mean_pred','mean_actual','mean_abs_error','mean_error']
error_summary = error_summary.reset_index().merge(sizes,on=['target_rating_gain_rounded','rating_latest_rounded','time_control'])
# Keep only groups with more than 30 rows
error_summary = error_summary.query('n>30')
# Groups where the model predicts too late by the widest margin
error_summary.sort_values("mean_error",ascending=False).head(10)

In [None]:
# Rapid only: mean actual days-to-target, gain buckets as rows and 100-point rating buckets as columns
error_summary.query("time_control=='Rapid'").pivot(index='target_rating_gain_rounded',columns='rating_latest_rounded',values='mean_actual')

In [None]:
# Rapid only: mean predicted days-to-target, gain buckets as rows and 100-point rating buckets as columns
error_summary.query("time_control=='Rapid'").pivot(index='target_rating_gain_rounded',columns='rating_latest_rounded',values='mean_pred')

In [None]:
# Rapid only: mean signed error (predicted minus actual days), gain buckets as rows and rating buckets as columns
error_summary.query("time_control=='Rapid'").pivot(index='target_rating_gain_rounded',columns='rating_latest_rounded',values='mean_error')

In [None]:
# Classical only: number of test rows per gain bucket / rating bucket (note: Classical, unlike the Rapid tables above)
error_summary.query("time_control=='Classical'").pivot(index='target_rating_gain_rounded',columns='rating_latest_rounded',values='n')

In [None]:
# Blitz, target gains up to the 300 bucket: mean predicted days (red) vs. mean actual days (blue) by starting rating
(ggplot(error_summary.query("time_control=='Blitz'&target_rating_gain_rounded<=300")) +
geom_point(aes(x='rating_latest_rounded',y='mean_pred'),color='red')+
 geom_point(aes(x='rating_latest_rounded',y='mean_actual'),color='blue')
)

### Eval for core use cases

In [None]:
# Most users in discord will be rated between 1200-2200...
#... trying to improve between 0-200 rating points...
# in blitz or rapid
# How does the model do with them?
# (the filter below restricts rating and gain only; all time controls are kept and reported separately)
df_cont_core = df_cont_test.query("target_rating_gain<=200&rating_latest>=1200&rating_latest<=2200")
df_cont_core.groupby("time_control")['abs_error'].describe().round()

In [None]:
# Same rating range as the core use case, restricted to small targets (gain of at most 75 points)
df_cont_small = df_cont_test.query("target_rating_gain<=75&rating_latest>=1200&rating_latest<=2200")

In [None]:
# Absolute error (days) for the small-target subset, by time control
df_cont_small.groupby("time_control")['abs_error'].describe().round()

### Eval for the worst misses

In [None]:
# The 15 rows with the largest positive error (predicted far later than actual), with derived
# and helper columns hidden to keep the table readable
columns_hidden_from_view = [
    'y_bin','fast_improver','target_rating_gain_rounded','target_rating_gain_squared',
    'bullet','classical','rapid','blitz','rating_latest_squared','date',
    'rating_latest_rounded','rating_latest_rounded_200','rating_latest_rounded_300',
    'rating_90_diff','rating_peak_diff','rating_180_diff',
    'classical_updates_30','rapid_updates_30','blitz_updates_30','bullet_updates_30',
    'rating_180','rating_stdev_180',
]
worst_misses = df_cont_test.sort_values("error",ascending=False).head(15)
worst_misses.drop(columns_hidden_from_view,axis=1).round()

## Compare to log model

In [None]:
# First rating date per user/time control. Sort by date before de-duplicating so that keep='first'
# really is the earliest row; the original relied on df_training already being in date order.
start_dates = df_training.sort_values("date").drop_duplicates(subset=['user_id','time_control'],keep='first').rename(columns={"date":"start_date"})
# Attach each pair's start date to all of its rating rows (the start row's own rating is not needed)
df_training_merged = df_training.merge(start_dates.drop("rating",axis=1),on=['user_id','time_control'])
# Days elapsed since the pair's first recorded rating
df_training_merged['days_since_start'] = (df_training_merged['date']-df_training_merged['start_date'])/timedelta(days=1)
# Exponential transform of the rating, 3**(rating/300); the comparison model fits a straight line to this
df_training_merged['exp_rating'] = 3**(df_training_merged['rating']/300)
df_training_merged.head()

In [None]:
# Per user/time control, fit a straight line exp_rating ~ days_since_start (degree-1 polyfit).
# Pairs with 25 or fewer rating rows get None and are removed by dropna.
log_coefs = df_training_merged.groupby(['user_id','time_control','start_date']).apply(lambda x:
    np.polyfit(x['days_since_start'],x['exp_rating'],1) if len(x)>25 else None).reset_index().dropna()
log_coefs.rename(columns={0:"coefs"},inplace=True)
# NOTE(review): np.polyfit returns coefficients highest power first, i.e. [slope, intercept].
# The column named 'intercept' therefore holds the fitted slope and the column named 'slope'
# holds the fitted intercept. Code that reads these columns must account for the swapped names.
log_coefs['intercept'] = log_coefs['coefs'].apply(lambda x: x[0])
log_coefs['slope'] = log_coefs['coefs'].apply(lambda x: x[1])
log_coefs.head()

In [None]:
# Join the OLS test predictions with the per-player line fits (inner join: only pairs that have a fit)
df_cont_log_preds = df_cont_test[['user_id','time_control','target_rating','target_rating_gain',
    'y_cont','pred','error','abs_error']].rename(columns={"error":"ols_error","abs_error":"ols_abs_error"}).merge(
    log_coefs.drop("coefs",axis=1),on=['user_id','time_control'],how='inner')
# Invert the fitted line to find the day on which the transformed target rating is reached.
# NOTE(review): log_coefs fills 'intercept' from coefs[0] and 'slope' from coefs[1], and np.polyfit
# returns [slope, intercept]; the names are swapped, so (y - 'slope') / 'intercept' below is
# numerically (y - fitted intercept) / fitted slope, which is the intended inversion.
df_cont_log_preds['log_model_days_since_start'] = df_cont_log_preds.apply(lambda x: ((3 ** (x['target_rating']/300)) - x['slope'])/x['intercept'],axis=1)
df_cont_log_preds['max_training_date'] = max_training_date
# Convert "days since the player's first rating" into "days after max_training_date"
df_cont_log_preds['days_since_start'] = (df_cont_log_preds['max_training_date'] - df_cont_log_preds['start_date'])/timedelta(days=1)
df_cont_log_preds['log_model_days_pred'] = df_cont_log_preds['log_model_days_since_start']-df_cont_log_preds['days_since_start']
# Negative predictions (target date already in the past) are set to zero; there is no upper cap here
df_cont_log_preds.loc[df_cont_log_preds['log_model_days_pred']<0,"log_model_days_pred"] = 0
df_cont_log_preds.head()

In [None]:
# Signed and absolute error (days) of the log-model predictions, summarised next to the OLS errors
df_cont_log_preds['log_error'] = df_cont_log_preds['log_model_days_pred']-df_cont_log_preds['y_cont']
df_cont_log_preds['log_abs_error'] = df_cont_log_preds['log_error'].abs()
df_cont_log_preds[['ols_error','ols_abs_error','log_error','log_abs_error']].describe().round().astype(int)

In [None]:
# Fewer rows than the full test set: the line fit requires more than 25 rating rows per user/time control
print(df_cont_log_preds.shape) # fewer predictions because of the >25 rating rows requirement
print(df_cont_test.shape)

In [None]:
# Mean and median absolute error of both models by time control.
# The string aliases replace np.mean / np.median, which pandas has deprecated as agg arguments;
# the column labels ('mean', 'median') are unchanged.
df_cont_log_preds.groupby("time_control")[['ols_abs_error','log_abs_error']].agg(["mean","median"]).round().astype(int)

In [None]:
# Possible objection is that by definition the answer must be < 2 years, so OLS has unfair advantage
# To address, filter to where log model predictions are < 1 year
# (the filter keeps predictions of at most 365 days)
under_year_log_preds = df_cont_log_preds.query("log_model_days_pred<=365")
under_year_log_preds.shape

In [None]:
# Error comparison restricted to rows where the log model predicts at most one year
under_year_log_preds[['ols_error','ols_abs_error','log_error','log_abs_error']].describe().round().astype(int)

# Scoring

In [None]:
# First few fitted logit coefficients (indexed by term name)
logit.params.head()

In [None]:
# First few fitted OLS coefficients (indexed by term name)
ols.params.head()

In [None]:
# Side-by-side coefficient table: one row per term, NaN where a term is absent from one of the models
model_params=pd.concat([logit.params,ols.params],axis=1).reset_index()
model_params.columns = ['var_name','logit','ols']
# Export is disabled; the CSV read in the next cell must already exist on disk
#model_params.to_csv("data/model_params_20210825.csv",index=False)
model_params.head()

In [None]:
# Load the saved coefficient table; this replaces the in-memory model_params built from the fitted models
model_params = pd.read_csv("data/model_params_20210825.csv")
model_params.head()

In [None]:
# Convert rating history info from JSON to Dataframe
def process_rating_history(response_json):
    """Turn a rating-history JSON payload into one DataFrame per supported time control.

    Only entries named Bullet, Blitz, Rapid or Classical are kept, and entries without any
    points are skipped. Each returned frame has the columns year, month, day, rating and date.
    The incoming month is shifted by +1 before the date is built (the payload's months are
    treated as zero-based).

    Returns:
        dict mapping the time-control name to its DataFrame.
    """
    supported_time_controls = ('Bullet','Blitz','Rapid','Classical')
    histories = {}
    for entry in response_json:
        if entry['name'] not in supported_time_controls:
            continue
        points = pd.DataFrame(entry['points'],columns=['year','month','day','rating'])
        if len(points) == 0:
            continue
        points['month'] = points['month']+1
        # Encode the date as the integer YYYYMMDD and let pandas parse it
        yyyymmdd = points.year*10000+points.month*100+points.day
        points['date'] = pd.to_datetime(yyyymmdd,format='%Y%m%d')
        histories[entry['name']] = points
    return histories

# Get the values that are inputs to the models
def get_predictor_values(rating_history,target_rating,target_time_control,as_of=None):
    """Build the model inputs for one player, one time control and one target rating.

    The features follow the training definitions:
      * rating_X_diff compares the latest rating with the last rating recorded on or before
        X days ago (previously the first rating *inside* the window was used, which did not
        match how the models were trained). If no rating is that old, the 30-day diff falls
        back to the earliest rating on record, the 90-day diff to the 30-day diff and the
        180-day diff to the 90-day diff.
      * rating_updates_X / rating_stdev_X are computed over ratings dated on or after X days ago.

    Parameters:
        rating_history: dict of time-control name -> DataFrame with 'date' and 'rating' columns.
        target_rating: rating the player wants to reach.
        target_time_control: key of rating_history to score.
        as_of: reference "today" for the look-back windows; defaults to the current date.
            Only the calendar date is used.

    Returns:
        dict of predictor name -> value, keyed like the model term names.

    Raises:
        KeyError: if there is no (non-empty) history for target_time_control.
    """
    if target_time_control not in rating_history or len(rating_history[target_time_control]) == 0:
        raise KeyError(f"No rated {target_time_control} rating history available")
    # Guarantee chronological order; a stable sort keeps the incoming order for same-day points
    target_rating_history = rating_history[target_time_control].sort_values('date',kind='stable')
    # Dates in the history carry no time of day, so compare against midnight of the reference day
    as_of = pd.Timestamp(datetime.today() if as_of is None else as_of).normalize()

    def ratings_since(days):
        # Ratings recorded inside the look-back window
        cutoff = as_of-timedelta(days=days)
        return target_rating_history.loc[target_rating_history['date']>=cutoff,'rating']

    def rating_on_or_before(days):
        # Last rating recorded on or before the start of the look-back window (None if there is none)
        cutoff = as_of-timedelta(days=days)
        earlier = target_rating_history.loc[target_rating_history['date']<=cutoff,'rating']
        return earlier.values[-1] if len(earlier) else None

    ratings_30 = ratings_since(30)
    ratings_90 = ratings_since(90)
    rating_latest = target_rating_history['rating'].values[-1]
    target_rating_gain = target_rating-rating_latest

    rating_30 = rating_on_or_before(30)
    if rating_30 is None:
        # Whole history is younger than 30 days: compare against the earliest rating on record
        rating_30 = target_rating_history['rating'].values[0]
    rating_30_diff = rating_latest-rating_30
    rating_90 = rating_on_or_before(90)
    rating_90_diff = rating_latest-rating_90 if rating_90 is not None else rating_30_diff
    rating_180 = rating_on_or_before(180)
    rating_180_diff = rating_latest-rating_180 if rating_180 is not None else rating_90_diff

    predictor_values = dict(Intercept=1,target_rating_gain=target_rating_gain,
        target_rating_gain_squared=target_rating_gain**2,
        rating_latest=rating_latest,
        rating_latest_squared = rating_latest**2,
        bullet = int(target_time_control == 'Bullet'), blitz = int(target_time_control == 'Blitz'),
        rapid = int(target_time_control == 'Rapid'), classical = int(target_time_control == 'Classical'),
        rating_peak_diff = rating_latest-target_rating_history['rating'].max(),
        rating_30_diff = rating_30_diff,
        rating_90_diff = rating_90_diff,
        rating_180_diff = rating_180_diff,
        rating_updates_30 = len(ratings_30),
        rating_updates_90 = len(ratings_90),
        # A standard deviation needs at least two points; use 0 otherwise
        rating_stdev_30 = ratings_30.std() if len(ratings_30) > 1 else 0,
        rating_stdev_90 = ratings_90.std() if len(ratings_90) > 1 else 0
                                   )
    return(predictor_values)

# Calculate the probability of success given a set of predictor values and a classification model
def get_prob_success(predictor_values,model_params):
    """Return the logit model's success probability as a whole-percent string such as "63%".

    Parameters:
        predictor_values: dict of predictor name -> value.
        model_params: DataFrame with 'var_name' and 'logit' columns; rows with a missing value
            in either column are ignored. A var_name containing ':' is an interaction whose
            value is the product of the named predictors.
    """
    fitted_terms = model_params[['var_name','logit']].dropna()
    log_odds = 0
    for term_name,weight in zip(fitted_terms['var_name'].values,fitted_terms['logit'].values):
        if ':' in term_name:
            # Interaction term: multiply the component predictors together
            term_value = 1
            for component in term_name.split(":"):
                term_value *= predictor_values[component]
        else:
            term_value = predictor_values[term_name]
        log_odds += weight*term_value
    # Logistic transform of the linear predictor, expressed as a rounded percentage
    percent = round(100*1/(1+np.exp(-1*(log_odds))))
    return str(percent)+"%"

# Calculate the predicted days until target rating given a set of predictor values and a regression model
def get_predicted_date(predictor_values,model_params):
    """Return the predicted date of reaching the target rating, formatted like "August 25, 2021".

    The regression's predicted number of days is clamped to the range 0..730 before it is added
    to today's date, the same override that is applied when the model is evaluated. Without it
    the function could report a date in the past or more than two years ahead.

    Parameters:
        predictor_values: dict of predictor name -> value.
        model_params: DataFrame with 'var_name' and 'ols' columns; rows with a missing value
            in either column are ignored. A var_name containing ':' is an interaction whose
            value is the product of the named predictors.
    """
    max_prediction_days = 365*2
    ols_params = model_params[['var_name','ols']].dropna()
    predicted_days = 0
    for var_name,coef in zip(ols_params['var_name'].values,ols_params['ols'].values):
        # A plain term is a one-element product, so both cases share the same loop
        value = 1
        for component in var_name.split(":"):
            value *= predictor_values[component]
        predicted_days += coef*value
    # Negative predictions become 0 days; predictions beyond two years become two years
    predicted_days = min(max(float(predicted_days),0),max_prediction_days)
    predicted_date = (datetime.today()+timedelta(days=predicted_days)).strftime("%B %d, %Y")
    return(predicted_date)

# Return the predictions based on discord inputs
def score(username,target_rating,target_time_control,model_params):
    """Fetch a player's rating history and return (success probability, predicted date).

    Parameters:
        username: player whose rating history is requested.
        target_rating: rating the player wants to reach.
        target_time_control: time control to score ('Bullet', 'Blitz', 'Rapid' or 'Classical').
        model_params: coefficient table with 'var_name', 'logit' and 'ols' columns.

    Returns:
        A (probability string, date string) tuple, or the string "API ERROR: <status>" when the
        request does not succeed.
    """
    url = f'https://lichess.org/api/user/{username}/rating-history'
    # A timeout keeps a stalled connection from hanging the caller indefinitely
    response = requests.get(url,timeout=10)
    # Check the status before decoding: an error response is not guaranteed to carry a JSON body
    if response.status_code != 200:
        return(f"API ERROR: {response.status_code}")
    rating_history = process_rating_history(response.json())
    predictor_values = get_predictor_values(rating_history,target_rating,target_time_control)
    prob_success = get_prob_success(predictor_values,model_params)
    predicted_date = get_predicted_date(predictor_values,model_params)
    return(prob_success,predicted_date)
# Example call; the username is left blank here, so fill in a real one before running
# (running the cell performs a live HTTP request)
score(username = "",target_rating = 2000,
      target_time_control = "Rapid",model_params = model_params)
