In [44]:

from sklearn.model_selection import GridSearchCV
from xgboost import XGBRegressor
from sklearn.model_selection import train_test_split
import pandas as pd
from sklearn.metrics import mean_squared_error

In [45]:
# run xgboost 
def tree_train(x,y):
    parameters = {"max_depth":[3,4,5,6,7,8,9], "min_child_weight":[1,2,3]}
    model = XGBRegressor()
    gs = GridSearchCV(model,param_grid=parameters,cv=10)
    gs.fit(x,y)
    return gs





In [46]:
def read_data(base_folder='../fpl_data/data',year='2016-17'):
    cleaned_file = base_folder+'/'+year+'/cleaned_players.csv'
    raw_file = base_folder+'/'+year+'/players_raw.csv'
    raw_df = pd.read_csv(raw_file,encoding = "ISO-8859-1")
    #select limited columns only
    raw_df = raw_df[['first_name','second_name','element_type']]
    clean_df = pd.read_csv(cleaned_file,encoding = "ISO-8859-1")
    
    full_df = pd.merge(clean_df,raw_df,on=['first_name','second_name'])
    return full_df

In [47]:
df_2016 = read_data(year='2016-17')
df_2017 = read_data(year='2017-18')
df_2018 = read_data(year='2018-19')

In [48]:
list(df_2016)

['first_name',
 'second_name',
 'goals_scored',
 'assists',
 'total_points',
 'minutes',
 'goals_conceded',
 'creativity',
 'influence',
 'threat',
 'bonus',
 'bps',
 'ict_index',
 'clean_sheets',
 'red_cards',
 'yellow_cards',
 'selected_by_percent',
 'element_type']

## Merge two year to get the y

In [51]:
def merge_df_year(df_1,df_2):
    df_2 = df_2[['first_name','second_name','total_points']]
    # add column name as total_points_new. This is the y value.
    df_2['total_points_next'] = df_2['total_points']
    df_2.drop('total_points',axis=1,inplace=True)
    
    df = pd.merge(df_1,df_2, on =['first_name','second_name'])
    return df

In [52]:
df_2016_17_combined = merge_df_year(df_2016,df_2017)
df_2017_18_combined = merge_df_year(df_2017,df_2018)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  after removing the cwd from sys.path.
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  errors=errors)


# Create train and test data. Data sample is less

In [53]:
def clean_data(x,cols):
    unwanted_cols = ['first_name','second_name','total_points_next']
    for c in list(x):
        if c not in cols:
            unwanted_cols.append(c)
    return x.drop(unwanted_cols,axis=1)

In [54]:
cols = list(df_2016_17_combined)

train_y = df_2016_17_combined['total_points_next']
train_x = clean_data(df_2016_17_combined,cols)
test_y = df_2017_18_combined['total_points_next']
test_x = clean_data(df_2017_18_combined,cols)

#print(list(train_x))

In [55]:
print(cols)

['first_name', 'second_name', 'goals_scored', 'assists', 'total_points', 'minutes', 'goals_conceded', 'creativity', 'influence', 'threat', 'bonus', 'bps', 'ict_index', 'clean_sheets', 'red_cards', 'yellow_cards', 'selected_by_percent', 'element_type', 'total_points_next']


# Run training

In [56]:
clf = tree_train(train_x,train_y)



In [57]:
prediction = clf.predict(test_x)

In [58]:
error = mean_squared_error(prediction,test_y)

In [59]:
print(error)

1787.353617568452


In [60]:
def write_result(test_x,test_y,predicted,file='next_year.csv'):
    first_name =test_x['first_name']
    second_name = test_x['second_name']
    df = pd.DataFrame()
    df['first_name'] = first_name
    df['second_name'] = second_name
    df['actual'] = test_y
    df['predicted'] = predicted
    df['element_type'] = test_x['element_type']
    df['diff'] = (df['actual']-df['predicted']).abs()
    df.to_csv(file,index=False)
    return df
    

In [61]:
final_combined = write_result(df_2017_18_combined,test_y,prediction)

# Get players

In [62]:
def select_players(df):
    df = df.sort_values(by=['predicted'],ascending=False)
    result = [[] for x in range(4)]
    required = [2,5,5,3]
    for index,row in df.iterrows():
        f_name = row['first_name']
        s_name = row['second_name']
        pred = row['predicted']
        actual = row['actual']
        e_index = int(row['element_type'])-1
        if(len(result[e_index])<required[e_index]):
            item = {}
            item['first_name'] =f_name
            item['second_name'] =  s_name
            item['prediction'] = pred
            item['actual'] = actual
            result[e_index].append(item)
            
    return result
def print_result(result):
    types = ['Goali','Defender','Mid','Attacker']
    for i in range(len(result)):
        print('==================SELECTED {}==================='.format(types[i]))
        for player in result[i]:
            print('{},{}\t\t\t,{}\n'.format(player['first_name'],player['second_name'],player['prediction']))

In [63]:
s_list = select_players(final_combined)

In [64]:
print_result(s_list)

Hugo,Lloris			,108.55298614501953

Asmir,Begovic			,101.83666229248047

Kyle,Walker			,137.86509704589844

Marcos,Alonso			,135.66845703125

Antonio,Valencia			,122.68118286132812

Chris,Smalling			,100.00498962402344

Victor,Moses			,96.46646118164062

Raheem,Sterling			,208.72373962402344

Kevin,De Bruyne			,186.9269256591797

Leroy,SanÃ©			,167.38226318359375

David,Silva			,159.832763671875

Heung-Min,Son			,157.26902770996094

Sergio,AgÃ¼ero			,156.41250610351562

Roberto,Firmino			,153.01254272460938

Romelu,Lukaku			,151.39073181152344



In [65]:
print(s_list)

[[{'first_name': 'Hugo', 'second_name': 'Lloris', 'prediction': 108.55298614501953, 'actual': 47}, {'first_name': 'Asmir', 'second_name': 'Begovic', 'prediction': 101.83666229248047, 'actual': 48}], [{'first_name': 'Kyle', 'second_name': 'Walker', 'prediction': 137.86509704589844, 'actual': 62}, {'first_name': 'Marcos', 'second_name': 'Alonso', 'prediction': 135.66845703125, 'actual': 93}, {'first_name': 'Antonio', 'second_name': 'Valencia', 'prediction': 122.68118286132812, 'actual': 10}, {'first_name': 'Chris', 'second_name': 'Smalling', 'prediction': 100.00498962402344, 'actual': 34}, {'first_name': 'Victor', 'second_name': 'Moses', 'prediction': 96.46646118164062, 'actual': 2}], [{'first_name': 'Raheem', 'second_name': 'Sterling', 'prediction': 208.72373962402344, 'actual': 104}, {'first_name': 'Kevin', 'second_name': 'De Bruyne', 'prediction': 186.9269256591797, 'actual': 2}, {'first_name': 'Leroy', 'second_name': 'SanÃ©', 'prediction': 167.38226318359375, 'actual': 81}, {'first_n