In [63]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, GridSearchCV 
from xgboost import XGBRegressor
from sklearn.metrics import mean_squared_error

In [64]:
def get_data(file_name='../fpl_data/data/DataForModel2.csv'):
    """Load the modelling dataset from a CSV file.

    Parameters
    ----------
    file_name : str or file-like
        Anything ``pd.read_csv`` accepts as its first argument. Defaults to
        the relative path of ``DataForModel2.csv``.

    Returns
    -------
    pandas.DataFrame
        The parsed contents of the file, using ``read_csv`` defaults.
    """
    return pd.read_csv(file_name)


In [65]:
# Read the dataset from the default CSV path used by get_data().
df = get_data()
# Last expression of the cell: displays (rows, columns) of the loaded frame.
df.shape

(44895, 68)

In [66]:

def x_y_split(df):
    """Separate a prepared frame into model inputs and the regression target.

    Both next-week columns are removed from the inputs, so the returned
    features contain neither ``total_points_nextweek`` nor
    ``total_minutes_nextweek``.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame that contains both next-week columns.

    Returns
    -------
    tuple
        ``(x, y)`` where ``x`` is ``df`` without the two next-week columns
        and ``y`` is the ``total_points_nextweek`` column.
    """
    label_columns = ['total_points_nextweek', 'total_minutes_nextweek']
    features = df.drop(columns=label_columns)
    target = df['total_points_nextweek']
    return features, target
def split_train_test(df, test_size=0.2, random_state=None, shuffle=True):
    """Clean the raw frame and split it into train and test partitions.

    Steps, in order: sort by ``kickoff_time``, encode ``is_home_nextweek``
    as 0/1, replace missing values with 0, drop the identifier/text columns,
    separate features from the target with ``x_y_split`` and finally call
    ``train_test_split``.

    Parameters
    ----------
    df : pandas.DataFrame
        Raw frame as loaded from the CSV. It is not modified; the function
        works on the sorted copy returned by ``sort_values``.
    test_size : float, default 0.2
        Forwarded to ``train_test_split``.
    random_state : int or None, default None
        Forwarded to ``train_test_split``. Pass an integer to make the split
        reproducible; ``None`` keeps the previous unseeded behaviour.
    shuffle : bool, default True
        Forwarded to ``train_test_split``. With the default the rows are
        shuffled, so the sort by ``kickoff_time`` does NOT produce a
        chronological hold-out. Pass ``False`` to hold out the latest rows.

    Returns
    -------
    tuple
        ``(train_x, test_x, train_y, test_y)``.
    """
    df = df.sort_values(by='kickoff_time')
    # pandas.read_csv may already have parsed this column into booleans (it
    # recognises 'true'/'false' text). Comparing against the string 'false'
    # alone would then never match and every row would be encoded as 1, so
    # normalise through str() to treat False, 'False' and 'false' alike.
    # Anything else (including missing values) is still encoded as 1.
    df['is_home_nextweek'] = df['is_home_nextweek'].map(
        lambda flag: 0 if str(flag).strip().lower() == 'false' else 1
    )
    # dropping unnecessary columns
    unwanted = ['kickoff_time', 'kickoff_time_formatted','name','opponent_name','team_name','opponent_nextweek']
    df = df.fillna(0)
    df = df.drop(unwanted,axis=1)
    df_x,df_y = x_y_split(df)
    train_x,test_x, train_y,test_y = train_test_split(
        df_x,
        df_y,
        test_size=test_size,
        random_state=random_state,
        shuffle=shuffle,
    )
    return train_x,test_x,train_y,test_y
    
    


In [67]:
# Clean the loaded frame and produce the train/test feature and target sets.
train_x,test_x,train_y,test_y = split_train_test(df)

In [70]:
def tree_train(x,y):
    """Grid-search an ``XGBRegressor`` over tree depth and child weight.

    Every combination of ``max_depth`` in 3..7 and ``min_child_weight`` in
    1..3 (15 candidates) is evaluated with ``cv=5``.

    Parameters
    ----------
    x : feature matrix passed to ``GridSearchCV.fit``.
    y : target values passed to ``GridSearchCV.fit``.

    Returns
    -------
    GridSearchCV
        The search object after ``fit`` has been called on it.
    """
    search_space = {
        "max_depth": [3, 4, 5, 6, 7],
        "min_child_weight": [1, 2, 3],
    }
    search = GridSearchCV(XGBRegressor(), param_grid=search_space, cv=5, verbose=True)
    search.fit(x, y)
    return search

In [71]:
# Run the grid search on the training split; the recorded run below reports
# 75 fits and about 8.5 minutes of elapsed time.
cf = tree_train(train_x,train_y)

Fitting 5 folds for each of 15 candidates, totalling 75 fits


[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done  75 out of  75 | elapsed:  8.5min finished


In [None]:
# Collect the training feature columns whose dtype is `object`.
types = train_x.dtypes
# Series.iteritems() was removed in pandas 2.0; .items() yields the same
# (column name, dtype) pairs and also works on older pandas versions.
wrong_cols = [column for column, dtype in types.items() if dtype == 'object']

In [None]:
# Display the object-dtype column names collected in the previous cell.
wrong_cols

## Columns

In [78]:
# Iterating a DataFrame yields its column labels, so this lists the feature names.
list(test_x)

['assists',
 'attempted_passes',
 'big_chances_created',
 'big_chances_missed',
 'bonus',
 'bps',
 'clean_sheets',
 'clearances_blocks_interceptions',
 'completed_passes',
 'creativity',
 'dribbles',
 'ea_index',
 'element',
 'errors_leading_to_goal',
 'errors_leading_to_goal_attempt',
 'fixture',
 'fouls',
 'goals_conceded',
 'goals_scored',
 'ict_index',
 'id',
 'influence',
 'key_passes',
 'loaned_in',
 'loaned_out',
 'minutes',
 'offside',
 'open_play_crosses',
 'opponent_team',
 'own_goals',
 'penalties_conceded',
 'penalties_missed',
 'penalties_saved',
 'recoveries',
 'red_cards',
 'round',
 'saves',
 'selected',
 'tackled',
 'tackles',
 'target_missed',
 'team_a_score',
 'team_h_score',
 'threat',
 'total_points',
 'transfers_balance',
 'transfers_in',
 'transfers_out',
 'value',
 'was_home',
 'winning_goals',
 'yellow_cards',
 'playingfor',
 'opponent_strength',
 'team_strength',
 'season',
 'total_points_prevweek',
 'opponent_strength_nextweek',
 'is_home_nextweek',
 'total_p

## Prediction and Error Calculation

In [80]:
# Predict the target for the held-out rows with the fitted search object.
prediction = cf.predict(test_x)
# Root-mean-squared error on the test split. mean_squared_error is documented
# as (y_true, y_pred); the value is the same either way because the squared
# difference is symmetric, but the arguments now follow the documented order.
error = np.sqrt(mean_squared_error(test_y, prediction))

##### Total error (RMSE on the test set)

In [81]:
# Root-mean-squared error computed in the previous cell.
print(error)
# Number of rows in the test feature set.
print('Total elements:{}'.format(test_x.shape[0]))

2.0599689537533807
Total elements:8979


In [76]:
# Per-row residuals: predicted value minus actual value, labelled by test_y's index.
# NOTE(review): this prints the whole series; a summary such as
# (prediction - test_y).describe() would be easier to read.
print(prediction-test_y)

1570     0.158022
654      0.043565
43485   -4.174461
31755    0.296156
10305    1.243990
17367    0.097414
44480   -0.043133
29880    1.031450
10505    0.012191
12591   -8.259714
30462    0.048998
27744    0.461420
43928    2.952735
14514    0.191382
28402    0.129005
35143   -6.267547
33907    0.173820
28479    0.061291
24676    0.067854
37837   -0.041792
5801     0.695483
17194    0.083822
8889    -0.764929
5162     1.071493
23192    0.719242
42607    2.056534
28188    0.146592
28518    0.097142
10601    0.865299
3236     0.273362
           ...   
38205    0.414293
10433    0.412997
41253   -0.949228
18611    1.349068
8174     0.090907
24396    0.236656
38059    0.184834
18379    0.198383
35710    0.217542
35837    0.242254
24970    0.175749
16394    0.271730
1777    -1.895874
14337    1.714687
29772    0.155961
19069   -1.159035
28496    0.024865
16537   -0.024041
37248    0.108495
12948    1.800916
16305    0.143037
28411    0.075022
1748     1.414240
21535    0.189876
34001    0

In [77]:
# Plain-text dump of the test features.
# NOTE(review): consider test_x.head() for a compact rich display instead of
# printing the whole frame.
print(test_x)

       assists  attempted_passes  big_chances_created  big_chances_missed  \
1570         0                 0                    0                   0   
654          0                 0                    0                   0   
43485        0                 4                    0                   0   
31755        0                 0                    0                   0   
10305        0                58                    0                   0   
17367        0                 0                    0                   0   
44480        0                 0                    0                   0   
29880        0                41                    0                   0   
10505        0                 0                    0                   0   
12591        0                14                    0                   0   
30462        0                 0                    0                   0   
27744        0                 0                    0                   0   