In [97]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, GridSearchCV 
from xgboost import XGBRegressor
from sklearn.metrics import mean_squared_error

In [98]:
def get_data(file_name='../fpl_data/data/DataForModel2.csv'):
    df = pd.read_csv(file_name)
    return df


In [99]:
df = get_data()
df.shape

(44895, 68)

In [100]:

def x_y_split(df):
    to_remove = ['total_points_nextweek','total_minutes_nextweek']
    x = df.drop(to_remove,axis=1)
    y = df['total_points_nextweek']
    return x,y
def split_train_test(df):
    df = df.sort_values(by='kickoff_time')
    df['is_home_nextweek'] = df['is_home_nextweek'].map(lambda x: 0 if x=='false' else 1)
    #dropping unnecessary columns
    unwanted = ['kickoff_time', 'kickoff_time_formatted','name','opponent_name','team_name','opponent_nextweek']
    df = df.fillna(0)
    
    df_x,df_y = x_y_split(df)
    train_x,test_x, train_y,test_y = train_test_split(df_x,df_y,test_size=0.2)
    
    df_test_uw = test_x[unwanted]
    train_x = train_x.drop(unwanted,axis=1)
    test_x = test_x.drop(unwanted,axis=1)
    return train_x,test_x,train_y,test_y,df_test_uw
    
    


In [101]:
train_x,test_x,train_y,test_y,test_other = split_train_test(df)

In [102]:
def tree_train(x,y):
    parameters = {"max_depth":[3,4,5,6,7], "min_child_weight":[1,2,3]}
    model = XGBRegressor()
    gs = GridSearchCV(model,param_grid=parameters,cv=5,verbose=True)
    gs.fit(x,y)
    return gs

In [103]:
cf = tree_train(train_x,train_y)

Fitting 5 folds for each of 15 candidates, totalling 75 fits


[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done  75 out of  75 | elapsed:  8.6min finished


In [104]:
types= train_x.dtypes
wrong_cols = []
for key,ele in types.iteritems():
    if (ele=='object'):
        wrong_cols.append(key)

In [105]:
wrong_cols

[]

# combine the test result

In [106]:
test_other['predicted']  = prediction
test_other['actual'] = test_y
test_other.to_csv('weekly_precition.csv')

## columns

In [107]:
list(test_x)

['assists',
 'attempted_passes',
 'big_chances_created',
 'big_chances_missed',
 'bonus',
 'bps',
 'clean_sheets',
 'clearances_blocks_interceptions',
 'completed_passes',
 'creativity',
 'dribbles',
 'ea_index',
 'element',
 'errors_leading_to_goal',
 'errors_leading_to_goal_attempt',
 'fixture',
 'fouls',
 'goals_conceded',
 'goals_scored',
 'ict_index',
 'id',
 'influence',
 'key_passes',
 'loaned_in',
 'loaned_out',
 'minutes',
 'offside',
 'open_play_crosses',
 'opponent_team',
 'own_goals',
 'penalties_conceded',
 'penalties_missed',
 'penalties_saved',
 'recoveries',
 'red_cards',
 'round',
 'saves',
 'selected',
 'tackled',
 'tackles',
 'target_missed',
 'team_a_score',
 'team_h_score',
 'threat',
 'total_points',
 'transfers_balance',
 'transfers_in',
 'transfers_out',
 'value',
 'was_home',
 'winning_goals',
 'yellow_cards',
 'playingfor',
 'opponent_strength',
 'team_strength',
 'season',
 'total_points_prevweek',
 'opponent_strength_nextweek',
 'is_home_nextweek',
 'total_p

## Prediction and Error Calculation

In [108]:
prediction = cf.predict(test_x)
error = np.sqrt(mean_squared_error(prediction,test_y))

##### Total error

In [109]:
print(error)
print('Total elements:{}'.format(test_x.shape[0]))

2.077974800802113
Total elements:8979


In [110]:
print(prediction-test_y)

38044    0.537433
20079    0.057762
25589    1.945394
22199    0.232600
32860    2.677429
29838    0.578838
5810     1.455738
8877    -0.004677
28073    0.590057
28223    0.874575
27662    0.268165
40870    0.186882
42615    0.632031
33158    0.070302
30253    0.281104
1438     0.037096
35660    1.333802
12573    0.572241
1098     1.044055
38075    0.933124
35016   -2.454310
25203    0.290715
9257    -0.488704
22501    0.178434
24791    0.018943
14780    0.288567
18869   -6.519933
40675    2.021937
1992    -6.382109
29484    0.101786
           ...   
19550    0.066634
10037    0.284838
22010    0.007812
16977    0.118521
36299    0.133560
17829    0.299737
20366    0.324788
40872   -1.798840
19684    0.217758
37934    2.594577
33597   -1.722816
6985     0.807753
10000   -1.059226
23105   -0.169771
4627     0.480128
19236    0.059048
12520   -3.468194
32427    0.351915
21172    0.127874
8702     1.439566
9906     0.154836
32771    0.088467
9615     0.080476
13302   -7.209399
18425    0

In [111]:
print(test_x)

       assists  attempted_passes  big_chances_created  big_chances_missed  \
38044        0                 0                    0                   0   
20079        0                 0                    0                   0   
25589        0                 0                    0                   0   
22199        0                 0                    0                   0   
32860        0                38                    0                   0   
29838        0                60                    0                   0   
5810         0                 0                    0                   0   
8877         0                 0                    0                   0   
28073        0                 3                    0                   0   
28223        1                35                    0                   0   
27662        0                 5                    0                   0   
40870        0                 0                    0                   0   

# Feature Importance

In [112]:
cols = list(test_x)
ft_importance = cf.best_estimator_.feature_importances_
imp_index = np.argsort(ft_importance)[::-1]

for i in range(10):
    index = imp_index[i]
    print('feature name={}  \t\t importance={}'.format(cols[index],ft_importance[index]))

feature name=value  		 importance=0.09637681394815445
feature name=opponent_strength_nextweek  		 importance=0.07608695328235626
feature name=selected  		 importance=0.05797101557254791
feature name=fixture  		 importance=0.04855072498321533
feature name=transfers_in  		 importance=0.04565217345952988
feature name=total_points_prevweek  		 importance=0.04202898591756821
feature name=bps  		 importance=0.041304346174001694
feature name=transfers_balance  		 importance=0.03913043439388275
feature name=total_points_twoweeksago  		 importance=0.036231882870197296
feature name=ict_index  		 importance=0.03550724685192108
