In [1]:
from math import exp
from itertools import product

import pandas as pd
import statsmodels.api as sm
import statsmodels.formula.api as smf
from DataLoader import DataLoader

from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import PolynomialFeatures

In [2]:
dl = DataLoader.DataLoader('config.json')

In [3]:
df = dl.get_matches(to_id=False)
# ts = dl.get_timeseries()
# ts.shape

In [4]:
df.columns

Index(['id', 'series_id', 'tournament_id', 'event_id', 'version',
       'minor_version', 'dataset_id', 'dataset_version', 'platform_id',
       'ladder_id', 'rated', 'winning_team_id', 'builtin_map_id',
       'map_size_id', 'event_map_id', 'rms_custom', 'rms_seed',
       'fixed_positions', 'played', 'platform_match_id', 'duration',
       'completed', 'postgame', 'type_id', 'difficulty_id', 'population_limit',
       'map_reveal_choice_id', 'cheats', 'speed_id', 'mirror',
       'diplomacy_type', 'team_size', 'starting_resources_id',
       'starting_age_id', 'victory_condition_id', 'all_technologies',
       'version_id', 'multiqueue', 'treaty_length', 'build', 'version_id',
       'starting_palisades', 'starting_town_centers', 'starting_walls',
       'state_reader_interval', 'state_reader_version', 'platform_metadata',
       'water_percent', 'server'],
      dtype='object')

In [5]:
players = dl.get_player_table()

In [6]:
print(players.shape)
players = players.drop(players[~players.human_1 | ~players.human_2].index)
print(players.shape)
players = players.drop(players[players.rate_before_1.isna() | players.rate_before_2.isna()].index)
print(players.shape)

(178869, 94)
(178770, 94)
(178720, 94)


In [7]:
for age in ['feudal', 'castle', 'imperial']:
    for p in [1,2]:
        players['reached_{0}_{1}'.format(age, p)] = ~players['{0}_time_{1}'.format(age, p)].isna() * 1
players = players.fillna(pd.Timedelta(0))

In [10]:
# FIXME: check for counter colors with 'color_id'
cat_feats = ['civilization_id','reached_feudal','reached_castle','reached_imperial']
cat_feats = [x+y for x,y in product(cat_feats, ['_1', '_2'])]

num_feats = ['rate_before', 'start_x','start_y', 'military_score', 'units_killed', 'hit_points_killed','units_lost',
             'buildings_razed','hit_points_razed','buildings_lost','units_converted','economy_score','food_collected',
             'wood_collected','gold_collected','relic_gold','technology_score','feudal_time','castle_time',
             'imperial_time','explored_percent','research_count','research_percent','society_score','total_wonders',
             'total_castles','total_relics','villager_high']
num_feats = [x+y for x,y in product(num_feats, ['_1', '_2'])]

target = players.winner_1

In [11]:
for var in ['feudal_time_1','feudal_time_2','castle_time_1','castle_time_2','imperial_time_1', 'imperial_time_2']:
    players[var] = players[var].apply(pd.Timedelta.total_seconds)

### Age Up Times
categorical encoding?

In [12]:
players.loc[:,['feudal_time_1','feudal_time_2','castle_time_1','castle_time_2','imperial_time_1', 'imperial_time_2']]

Unnamed: 0_level_0,feudal_time_1,feudal_time_2,castle_time_1,castle_time_2,imperial_time_1,imperial_time_2
match_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
3,612.0,667.0,1527.0,1414.0,2468.0,2698.0
4,651.0,623.0,1405.0,1410.0,0.0,2283.0
15,614.0,666.0,0.0,0.0,0.0,0.0
18,617.0,673.0,1429.0,1392.0,0.0,0.0
20,633.0,631.0,1525.0,0.0,0.0,0.0
...,...,...,...,...,...,...
1337682,611.0,606.0,1710.0,1653.0,0.0,0.0
1337686,612.0,616.0,1292.0,1259.0,0.0,2898.0
1337688,638.0,626.0,1514.0,1528.0,0.0,0.0
1337689,0.0,0.0,0.0,0.0,0.0,0.0


In [13]:
players.isna().sum().sum()

0

In [14]:
cat_mat = pd.get_dummies(players.loc[:,cat_feats], drop_first=True, columns=cat_feats)

In [15]:
X = players.loc[:,num_feats].merge(cat_mat, left_index=True, right_index=True)

In [16]:
X.columns

Index(['rate_before_1', 'rate_before_2', 'start_x_1', 'start_x_2', 'start_y_1',
       'start_y_2', 'military_score_1', 'military_score_2', 'units_killed_1',
       'units_killed_2',
       ...
       'civilization_id_2_28', 'civilization_id_2_29', 'civilization_id_2_30',
       'civilization_id_2_31', 'reached_feudal_1_1', 'reached_feudal_2_1',
       'reached_castle_1_1', 'reached_castle_2_1', 'reached_imperial_1_1',
       'reached_imperial_2_1'],
      dtype='object', length=122)

In [17]:
reg1 = sm.OLS(target, sm.add_constant(X)).fit()
reg1.summary()

0,1,2,3
Dep. Variable:,winner_1,R-squared:,0.601
Model:,OLS,Adj. R-squared:,0.601
Method:,Least Squares,F-statistic:,2207.0
Date:,"Fri, 23 Oct 2020",Prob (F-statistic):,0.0
Time:,16:38:30,Log-Likelihood:,-47564.0
No. Observations:,178720,AIC:,95370.0
Df Residuals:,178597,BIC:,96620.0
Df Model:,122,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
const,0.4697,0.012,39.728,0.000,0.447,0.493
rate_before_1,0.0004,8.75e-06,51.255,0.000,0.000,0.000
rate_before_2,-0.0004,8.9e-06,-48.516,0.000,-0.000,-0.000
start_x_1,-0.0001,3.89e-05,-3.191,0.001,-0.000,-4.79e-05
start_x_2,9.633e-05,3.89e-05,2.478,0.013,2.01e-05,0.000
start_y_1,-0.0001,3.82e-05,-3.107,0.002,-0.000,-4.38e-05
start_y_2,1.301e-06,3.82e-05,0.034,0.973,-7.35e-05,7.61e-05
military_score_1,2.672e-05,3.4e-06,7.856,0.000,2.01e-05,3.34e-05
military_score_2,-2.285e-05,3.37e-06,-6.771,0.000,-2.95e-05,-1.62e-05

0,1,2,3
Omnibus:,256.356,Durbin-Watson:,2.002
Prob(Omnibus):,0.0,Jarque-Bera (JB):,214.901
Skew:,0.009,Prob(JB):,2.16e-47
Kurtosis:,2.831,Cond. No.,6920000.0


In [18]:
reg2 = sm.Logit(target, sm.add_constant(X)).fit()
reg2.summary()

         Current function value: 0.177832
         Iterations: 35


  warn("Maximum Likelihood optimization failed to converge. "


0,1,2,3
Dep. Variable:,winner_1,No. Observations:,178720.0
Model:,Logit,Df Residuals:,178597.0
Method:,MLE,Df Model:,122.0
Date:,"Fri, 23 Oct 2020",Pseudo R-squ.:,0.7434
Time:,16:38:37,Log-Likelihood:,-31782.0
converged:,False,LL-Null:,-123880.0
Covariance Type:,nonrobust,LLR p-value:,0.0

0,1,2,3,4,5,6
,coef,std err,z,P>|z|,[0.025,0.975]
const,-0.2837,0.153,-1.859,0.063,-0.583,0.015
rate_before_1,0.0037,0.000,29.463,0.000,0.003,0.004
rate_before_2,-0.0034,0.000,-26.798,0.000,-0.004,-0.003
start_x_1,-0.0008,0.001,-1.514,0.130,-0.002,0.000
start_x_2,0.0008,0.001,1.571,0.116,-0.000,0.002
start_y_1,-0.0012,0.001,-2.445,0.014,-0.002,-0.000
start_y_2,-5.48e-05,0.001,-0.107,0.914,-0.001,0.001
military_score_1,0.0019,9.19e-05,20.911,0.000,0.002,0.002
military_score_2,-0.0016,8.53e-05,-18.559,0.000,-0.002,-0.001


In [21]:
for x,y in zip(reg2.params.index, reg2.params.apply(lambda x : 1 / (1 + exp(x)))):
    print(x, '\t\t', y, sep='')

const		0.5704551437092937
rate_before_1		0.49907586870895965
rate_before_2		0.5008516016568052
start_x_1		0.5001968684291513
start_x_2		0.49979558744243197
start_y_1		0.5003123823126021
start_y_2		0.5000136989871015
military_score_1		0.4995194587055655
military_score_2		0.5003958261004772
units_killed_1		0.5189165244216515
units_killed_2		0.4885850859763446
hit_points_killed_1		0.49997322252238846
hit_points_killed_2		0.500029297399423
units_lost_1		0.5124517501047522
units_lost_2		0.4809827947218959
buildings_razed_1		0.9257028123230844
buildings_razed_2		0.8996048649050448
hit_points_razed_1		0.5000110263934595
hit_points_razed_2		0.4999898456104734
buildings_lost_1		0.09762176807302536
buildings_lost_2		0.07633966838625848
units_converted_1		0.5080540745832504
units_converted_2		0.5044732526726912
economy_score_1		0.4989779225458641
economy_score_2		0.501014584186688
food_collected_1		0.49999130820089954
food_collected_2		0.5000140048176277
wood_collected_1		0.5000350162088049
wood_

In [29]:
poly = PolynomialFeatures(interaction_only=True)
cat_mat_int = poly.fit_transform(cat_mat)
poly.get_feature_names()

['1',
 'x0',
 'x1',
 'x2',
 'x3',
 'x4',
 'x5',
 'x6',
 'x7',
 'x8',
 'x9',
 'x10',
 'x11',
 'x12',
 'x13',
 'x14',
 'x15',
 'x16',
 'x17',
 'x18',
 'x19',
 'x20',
 'x21',
 'x22',
 'x23',
 'x24',
 'x25',
 'x26',
 'x27',
 'x28',
 'x29',
 'x30',
 'x31',
 'x32',
 'x33',
 'x34',
 'x35',
 'x36',
 'x37',
 'x38',
 'x39',
 'x40',
 'x41',
 'x42',
 'x43',
 'x44',
 'x45',
 'x46',
 'x47',
 'x48',
 'x49',
 'x50',
 'x51',
 'x52',
 'x53',
 'x54',
 'x55',
 'x56',
 'x57',
 'x58',
 'x59',
 'x60',
 'x61',
 'x62',
 'x63',
 'x64',
 'x65',
 'x0 x1',
 'x0 x2',
 'x0 x3',
 'x0 x4',
 'x0 x5',
 'x0 x6',
 'x0 x7',
 'x0 x8',
 'x0 x9',
 'x0 x10',
 'x0 x11',
 'x0 x12',
 'x0 x13',
 'x0 x14',
 'x0 x15',
 'x0 x16',
 'x0 x17',
 'x0 x18',
 'x0 x19',
 'x0 x20',
 'x0 x21',
 'x0 x22',
 'x0 x23',
 'x0 x24',
 'x0 x25',
 'x0 x26',
 'x0 x27',
 'x0 x28',
 'x0 x29',
 'x0 x30',
 'x0 x31',
 'x0 x32',
 'x0 x33',
 'x0 x34',
 'x0 x35',
 'x0 x36',
 'x0 x37',
 'x0 x38',
 'x0 x39',
 'x0 x40',
 'x0 x41',
 'x0 x42',
 'x0 x43',
 'x0 x44',
 

In [30]:
cat_mat.columns

Index(['civilization_id_1_2', 'civilization_id_1_3', 'civilization_id_1_4',
       'civilization_id_1_5', 'civilization_id_1_6', 'civilization_id_1_7',
       'civilization_id_1_8', 'civilization_id_1_9', 'civilization_id_1_10',
       'civilization_id_1_11', 'civilization_id_1_12', 'civilization_id_1_13',
       'civilization_id_1_14', 'civilization_id_1_15', 'civilization_id_1_16',
       'civilization_id_1_17', 'civilization_id_1_18', 'civilization_id_1_19',
       'civilization_id_1_20', 'civilization_id_1_21', 'civilization_id_1_22',
       'civilization_id_1_23', 'civilization_id_1_24', 'civilization_id_1_25',
       'civilization_id_1_26', 'civilization_id_1_27', 'civilization_id_1_28',
       'civilization_id_1_29', 'civilization_id_1_30', 'civilization_id_1_31',
       'civilization_id_2_2', 'civilization_id_2_3', 'civilization_id_2_4',
       'civilization_id_2_5', 'civilization_id_2_6', 'civilization_id_2_7',
       'civilization_id_2_8', 'civilization_id_2_9', 'civilization

In [27]:
reg3 = sm.OLS(target, cat_mat_int).fit()
reg3.summary()

  return self.params / self.bse
  return (a < x) & (x < b)
  return (a < x) & (x < b)
  cond2 = cond0 & (x <= _a)


0,1,2,3
Dep. Variable:,winner_1,R-squared:,0.164
Model:,OLS,Adj. R-squared:,0.157
Method:,Least Squares,F-statistic:,25.97
Date:,"Fri, 23 Oct 2020",Prob (F-statistic):,0.0
Time:,16:44:04,Log-Likelihood:,-113750.0
No. Observations:,178720,AIC:,230200.0
Df Residuals:,177383,BIC:,243700.0
Df Model:,1336,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
const,0.5541,0.070,7.930,0.000,0.417,0.691
x1,-0.0105,0.080,-0.131,0.896,-0.168,0.147
x2,-0.0645,0.080,-0.805,0.421,-0.221,0.092
x3,-0.0636,0.078,-0.817,0.414,-0.216,0.089
x4,-0.1457,0.078,-1.877,0.061,-0.298,0.006
x5,-0.0786,0.079,-1.000,0.318,-0.233,0.076
x6,-0.1046,0.077,-1.364,0.173,-0.255,0.046
x7,-0.0197,0.078,-0.252,0.801,-0.173,0.133
x8,-0.1550,0.077,-2.018,0.044,-0.305,-0.004

0,1,2,3
Omnibus:,949095.948,Durbin-Watson:,1.999
Prob(Omnibus):,0.0,Jarque-Bera (JB):,17993.015
Skew:,0.017,Prob(JB):,0.0
Kurtosis:,1.446,Cond. No.,3.25e+17


In [None]:
dl.civ[:31]

In [None]:
' + '.join(num_feats)

In [26]:
cat_mat_int

array([[1., 0., 0., ..., 1., 1., 1.],
       [1., 0., 0., ..., 0., 1., 0.],
       [1., 0., 0., ..., 0., 0., 0.],
       ...,
       [1., 0., 0., ..., 0., 0., 0.],
       [1., 0., 0., ..., 0., 0., 0.],
       [1., 0., 0., ..., 0., 0., 0.]])

In [None]:
X.merge(target, left_index=True, right_index=True)