In [None]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as mat

In [None]:
# Load the two raw player datasets from the working directory:
# 'data.csv' is bound to fifa19 and 'fifa20_data.csv' to fifa20.
fifa19 = pd.read_csv('data.csv')
fifa20 = pd.read_csv('fifa20_data.csv')

In [None]:
# List the FIFA 20 column labels.
fifa20.columns


In [None]:
# List the FIFA 19 column labels.
fifa19.columns

In [None]:
# Preview the first rows of the FIFA 20 data.
fifa20.head()

In [None]:
# Preview the first rows of the FIFA 19 data.
fifa19.head()

# Dropping unwanted columns from FIFA19 dataset

In [None]:
# Columns of the FIFA 19 data that are not used further down.
unwanted_cols19 = [
    'ID', 'Unnamed: 0', 'Value', 'Height', 'Weight', 'Wage', 'Weak Foot', 'Special',
    'Preferred Foot', 'Skill Moves', 'Work Rate', 'Body Type', 'Photo', 'Nationality', 'Flag',
    'Club Logo', 'Real Face', 'Jersey Number', 'Joined', 'Loaned From', 'Contract Valid Until',
    'Release Clause',
]
# Pass the labels themselves to drop(). The previous code first selected the
# columns (fifa19[[...]]) and handed the resulting DataFrame to drop(), where
# a list of labels is expected.
fifa19 = fifa19.drop(columns=unwanted_cols19)

# Finding the columns that exist in only one of the two datasets (FIFA 20 but not FIFA 19, and vice versa)

In [None]:
# Column labels that appear in only one of the two datasets.
cols19, cols20 = fifa19.columns, fifa20.columns
difcol20 = cols20.difference(cols19)  # in FIFA 20 but not in FIFA 19
difcol19 = cols19.difference(cols20)  # in FIFA 19 but not in FIFA 20

In [None]:
# Columns found only in FIFA 20.
difcol20

In [None]:
# Columns found only in FIFA 19.
difcol19

# Renaming columns in FIFA20 to match with the FIFA 19 columns and removing the unmatched columns

In [None]:
# FIFA 20 writes these attribute names with a space; FIFA 19 writes them
# without one. Map the FIFA 20 spelling onto the FIFA 19 spelling.
fifa20_to_fifa19_names = {
    'Ball Control': 'BallControl',
    'FK Accuracy': 'FKAccuracy',
    'GK Diving': 'GKDiving',
    'GK Positioning': 'GKPositioning',
    'GK Handling': 'GKHandling',
    'GK Reflexes': 'GKReflexes',
    'Heading Accuracy': 'HeadingAccuracy',
    'Short Passing': 'ShortPassing',
    'Shot Power': 'ShotPower',
    'Sliding Tackle': 'SlidingTackle',
    'Sprint Speed': 'SprintSpeed',
    'Standing Tackle': 'StandingTackle',
    'Long Passing': 'LongPassing',
    'Long Shots': 'LongShots',
}
fifa20 = fifa20.rename(columns=fifa20_to_fifa19_names)

# Keep only the columns both datasets share. drop() receives the labels
# directly instead of a DataFrame built by indexing with them.
difcol20 = fifa20.columns.difference(fifa19.columns)
fifa20 = fifa20.drop(columns=difcol20)
difcol19 = fifa19.columns.difference(fifa20.columns)
fifa19 = fifa19.drop(columns=difcol19)

# Both frames now hold the same set of columns; give them the same order too.
# Later cells relabel FIFA 20 columns by position against FIFA 19, which is
# only correct when the two orders agree.
fifa20 = fifa20[fifa19.columns]

In [None]:
# FIFA 20 columns remaining after the alignment above.
fifa20.columns

In [None]:
# FIFA 19 columns remaining after the alignment above.
fifa19.columns

# Creating a new column for filtering goal keepers from non-GK players

In [None]:
# Add a 'New Position' flag to both seasons: 'GK' for goalkeepers, 0 for
# everyone else. The column is built in a single assignment; the previous
# chained form (df[col][mask] = value) raises SettingWithCopyWarning and does
# not modify the frame at all when pandas Copy-on-Write is active.
for players in (fifa19, fifa20):
    is_keeper = players['Position'] == 'GK'
    players['New Position'] = is_keeper.map({True: 'GK', False: 0})

# Two new dataframes containing GK statistics from FIFA 19 and 20

In [None]:
# Goalkeeper rows of each season, picked out via the 'New Position' flag.
gk19 = fifa19.loc[fifa19['New Position'].eq('GK')]
gk20 = fifa20.loc[fifa20['New Position'].eq('GK')]

In [None]:
# Preview the FIFA 20 goalkeeper rows.
gk20.head()

# Removing attributes that are not related to Goal keepers

In [None]:
# Player name plus the attributes treated as irrelevant for goalkeepers.
# The same labels are removed from both seasons.
non_gk_attributes = [
    'Name', 'Crossing', 'Finishing', 'HeadingAccuracy', 'ShortPassing', 'Volleys',
    'Dribbling', 'Curve', 'FKAccuracy', 'LongPassing', 'BallControl', 'Acceleration',
    'SprintSpeed', 'Agility', 'Balance', 'ShotPower', 'LongShots', 'Interceptions',
    'Positioning', 'Vision', 'Penalties', 'Marking', 'StandingTackle', 'SlidingTackle',
    'Aggression', 'Stamina',
]

gk19 = gk19.drop(columns=non_gk_attributes)
gk20 = gk20.drop(columns=non_gk_attributes)

# Appending "GK" to column names to identify easily

In [None]:
# Tag every goalkeeper column with a " GK" suffix, then move the club back to
# a plain 'Club' column (appended as the last column) so it keeps its
# original name for grouping.
gk19 = gk19.add_suffix(" GK")
gk19['Club'] = gk19.pop('Club GK')
gk19.head()


In [None]:
# Tag every goalkeeper column with a " GK" suffix, then move the club back to
# a plain 'Club' column (appended as the last column) so it keeps its
# original name for grouping.
gk20 = gk20.add_suffix(" GK")
gk20['Club'] = gk20.pop('Club GK')

In [None]:
# Preview the FIFA 20 goalkeeper frame after the column renaming.
gk20.head()

# Group all GKs under each club by the mean values of each attribute and sort them using 'Overall GK' attribute

In [None]:
# Average the FIFA 19 goalkeeper attributes per club, best 'Overall GK' first.
# numeric_only=True restricts the mean to numeric columns: the frame still
# carries text columns ('Position GK', 'New Position GK'), which make
# mean() raise on pandas >= 2.0 instead of being skipped.
gk19teams = (
    gk19.groupby('Club')
    .mean(numeric_only=True)
    .sort_values('Overall GK', ascending=False)
)

In [None]:
# Turn the 'Club' index back into a regular column.
gk19teams = gk19teams.reset_index()
gk19teams.head()

In [None]:
# Average the FIFA 20 goalkeeper attributes per club, best 'Overall GK' first.
# numeric_only=True restricts the mean to numeric columns: the frame still
# carries text columns ('Position GK', 'New Position GK'), which make
# mean() raise on pandas >= 2.0 instead of being skipped.
gk20teams = (
    gk20.groupby('Club')
    .mean(numeric_only=True)
    .sort_values('Overall GK', ascending=False)
)
gk20teams

In [None]:
# Turn the 'Club' index back into a regular column.
gk20teams = gk20teams.reset_index()
gk20teams.head()

# Creating dataframes for non-GK positions

In [None]:
# Every player whose 'New Position' flag is not 'GK', per season.
notgk19 = fifa19.loc[fifa19['New Position'].ne('GK')]
notgk20 = fifa20.loc[fifa20['New Position'].ne('GK')]

In [None]:
# Column labels of the FIFA 19 non-goalkeeper frame.
notgk19.columns

# Dropping GK attributes

In [None]:
# Player name plus the goalkeeping attributes, removed from both
# non-goalkeeper frames.
gk_only_cols = ['Name', 'GKDiving', 'GKHandling', 'GKPositioning', 'GKReflexes']

notgk19 = notgk19.drop(columns=gk_only_cols)
notgk20 = notgk20.drop(columns=gk_only_cols)

In [None]:
# Preview the FIFA 19 non-goalkeeper frame.
notgk19.head()

# Group all players under each club by the mean values of each attribute and sort them using 'Overall' attribute

In [None]:
# Average the non-goalkeeper attributes per club, best 'Overall' first.
# numeric_only=True restricts the mean to numeric columns: the frames still
# carry non-numeric columns ('Position', 'New Position'), which make mean()
# raise on pandas >= 2.0 instead of being skipped.
notgk19teams = (
    notgk19.groupby('Club')
    .mean(numeric_only=True)
    .sort_values('Overall', ascending=False)
)
notgk20teams = (
    notgk20.groupby('Club')
    .mean(numeric_only=True)
    .sort_values('Overall', ascending=False)
)
notgk19teams

In [None]:
# Turn the 'Club' index back into a regular column.
notgk19teams = notgk19teams.reset_index()
notgk19teams.head()

In [None]:
# Turn the 'Club' index back into a regular column.
notgk20teams = notgk20teams.reset_index()
notgk20teams.head()

# Merging the GK dataframe and non-GK dataframe based on club

In [None]:
# Join the per-club outfield averages with the per-club goalkeeper averages.
# A right join keeps one row for every club present in the goalkeeper frame.
teams19 = pd.merge(notgk19teams, gk19teams, how='right', on='Club')
teams20 = pd.merge(notgk20teams, gk20teams, how='right', on='Club')

# Dropping a few more attributes that do not affect the overall performance

In [None]:
# Attributes removed from both seasons' club tables.
low_impact_cols = [
    "Potential GK", "Jumping GK", "GKHandling GK", "GKPositioning GK", "Reactions GK",
    "Composure GK", "GKDiving GK", "Volleys", "Curve", "FKAccuracy", "Jumping",
    "LongShots", "Penalties",
]

teams19 = teams19.drop(columns=low_impact_cols)
teams20 = teams20.drop(columns=low_impact_cols)

# Selecting the set of clubs from English Premier League 

In [None]:
# Clubs to keep; the same list is applied to both seasons' tables.
epl_clubs = [
    'Manchester City', 'Liverpool', 'Chelsea', 'Tottenham Hotspur',
    'Arsenal', 'Manchester United', 'Wolverhampton Wanderers',
    'Everton', 'Leicester City', 'West Ham United', 'Watford', 'Crystal Palace',
    'Newcastle United', 'Bournemouth', 'Burnley', 'Southampton', 'Brighton & Hove Albion',
    'Cardiff City', 'Fulham', 'Huddersfield Town', 'Sheffield United', 'Norwich City',
    'Aston Villa',
]

teams_19 = teams19[teams19['Club'].isin(epl_clubs)]
teams_20 = teams20[teams20['Club'].isin(epl_clubs)]

In [None]:
# Display the selected clubs' FIFA 20 table.
teams_20

In [None]:
# Display the selected clubs' FIFA 19 table.
teams_19

In [None]:
# Display the selected clubs' FIFA 20 table (same output as the earlier cell).
teams_20

In [None]:
# Renumber the rows 0..n-1. reset_index(drop=True) works for however many
# clubs matched; assigning range(23) raised a length-mismatch error unless
# exactly 23 rows were selected in each season.
teams_19 = teams_19.reset_index(drop=True)
teams_20 = teams_20.reset_index(drop=True)


# Merging both FIFA 19 and 20 tables based on club

In [None]:
# One row per club with the FIFA 19 columns (suffix _x) followed by the
# FIFA 20 columns (suffix _y).
both = pd.merge(teams_19, teams_20, on='Club')
both

# Keeping the club column in a separate dataframe

In [None]:
# First column of the merged table, kept as a one-column DataFrame.
d1 = both.iloc[:, :1]

print(d1)
print(type(d1))

# Keeping FIFA 19 and 20 stats in a separate dataframe

In [None]:
# `both` is laid out as: the club column, then one season's stats, then the
# other season's stats. Derive the width of one season's block from the frame
# itself instead of hard-coding the split at column 31.
n_stats = (both.shape[1] - 1) // 2
d2 = both.iloc[:, 1:1 + n_stats]
d2

# Keeping the column names same for both dataframes

In [None]:
# Second season's block of `both` (everything after the club column and the
# first season's block), relabelled with the first block's column names so the
# two can be stacked. The split point is derived from the frame width rather
# than the hard-coded 31, and the slice is copied before it is relabelled.
n_stats = (both.shape[1] - 1) // 2
d3 = both.iloc[:, 1 + n_stats:].copy()
d3.columns = list(both.columns[1:1 + n_stats])
d3

# Combining 2 dataframes and calculating the mean of each attribute

In [None]:
# Stack the two seasons; rows that share an index label belong to the same
# club, so grouping on the index and averaging gives one mean row per club.
d4 = pd.concat([d2, d3])
d5 = d4.groupby(d4.index)
d6 = d5.mean()

# Put the club name in front (aligned on the row index), then give the frame
# the column labels of the FIFA 20 team table.
club = both['Club'].reindex(d6.index)
d6.insert(0, 'Club', club)
d6.columns = list(teams_20.columns)
d6

# Importing previous season(2018-19) results of EPL 

In [None]:
# Read the match results file: semicolon-separated, Latin-1 encoded.
uk = pd.read_csv('UK.csv',sep=';',encoding='latin-1')

In [None]:
# Preview the raw results.
uk.head()

# Retaining only relevant columns

In [None]:
# Keep the two team names, the two goal-count columns and the result column.
result_cols = ['HomeTeam', 'AwayTeam', 'FTHG', 'FTAG', 'FTR']
uk = uk.loc[:, result_cols]
uk.head()

In [None]:
# Summary of the home-team column (count, unique values, most frequent).
uk['HomeTeam'].describe()

# Creating Homestats and Awaystats dataframes from stats dataframe

In [None]:
# Two copies of the club stats whose column labels are prefixed with
# 'Home ' and 'Away ' respectively.
HomeStats = d6.add_prefix('Home ')
AwayStats = d6.add_prefix('Away ')


In [None]:
# The prefixing also renamed the club column; restore the plain 'Club' name,
# which the merges below use as the right-hand key.
HomeStats = HomeStats.rename(columns={'Home Club':'Club'})
HomeStats.head()

In [None]:
# The prefixing also renamed the club column; restore the plain 'Club' name,
# which the merges below use as the right-hand key.
AwayStats = AwayStats.rename(columns={'Away Club':'Club'})
AwayStats.head()

# Merging the results and Homestats dataframe

In [None]:
# Attach the home side's stats to every match. The left join keeps all
# matches, including those whose home team has no stats row.
res1 = pd.merge(uk, HomeStats, how='left', left_on='HomeTeam', right_on='Club')

In [None]:
# Preview the results with home stats attached.
res1.head()

# Merging the AwayTeam stats to the res1 dataframe

In [None]:
# Attach the away side's stats as well. Both inputs carry a 'Club' column,
# so the result holds them as 'Club_x' (home) and 'Club_y' (away).
alltable = pd.merge(res1, AwayStats, how='left', left_on='AwayTeam', right_on='Club')
alltable.head()

# Checking for NULL values

In [None]:
# Matches whose home team found no stats row in the left merge ('Club_x' is
# missing), and the distinct home-team names involved.
# NOTE(review): only the home side ('Club_x') is checked here; 'Club_y' is not.
nan = alltable[alltable['Club_x'].isna()]
nan['HomeTeam'].unique()

In [None]:
# Column dtypes and non-null counts of the merged match table.
alltable.info()

# Removing club names to have pure data

In [None]:
# Drop the four team-name columns (match teams and the two merge keys).
table = alltable.drop(columns=['HomeTeam','AwayTeam','Club_x','Club_y'])
table.head()

# Replacing values in FTR columns to 1,2,0 for home win,away win and a draw respectively

In [None]:
# Encode the full-time result: home win -> 1, away win -> 2, draw -> 0.
# map() builds the integer column directly. The previous replace() call relied
# on pandas silently downcasting the object column to integers afterwards,
# which is deprecated (pandas 2.2) and otherwise leaves an object-dtype label
# column. Values that are not 'H'/'A'/'D' are passed through unchanged, so
# re-running the cell on already-encoded data is harmless.
ftr_codes = {'H': 1, 'A': 2, 'D': 0}
table['FTR'] = table['FTR'].map(lambda code: ftr_codes.get(code, code))
table.head()

# Dropping the total goals scored columns as we need only the results

In [None]:
# Skip the first two columns of `table` by position.
# NOTE(review): presumably these are FTHG and FTAG, given the column order
# selected from the results file earlier; confirm if that selection changes.
tablek=table.iloc[:,2:]
tablek.info()

# Standardizing all the columns 

In [None]:
from sklearn.preprocessing import StandardScaler

# Everything after the first column of `tablek` is treated as a feature;
# 'FTR' is the target.
match_features = tablek.iloc[:, 1:]

scaler = StandardScaler()
scaled_feat = scaler.fit_transform(match_features)

# Rebuild a DataFrame around the scaled array, keeping the original row index
# and column labels. The previous call passed the feature frame as the second
# positional argument of pd.DataFrame, which is `index`, not `columns`.
tablek_feat = pd.DataFrame(
    scaled_feat,
    index=match_features.index,
    columns=match_features.columns,
)
X = tablek_feat
y = tablek['FTR']

# Splitting the data into train and test set with 60% and 40% respectively

In [None]:
from sklearn.metrics import classification_report, confusion_matrix
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsClassifier

# Hold out 40% of the matches for testing; the seed is fixed so the split is
# reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.4, random_state=8
)

# Baseline model with 3 neighbours.
knn = KNeighborsClassifier(n_neighbors=3).fit(X_train, y_train)
pred = knn.predict(X_test)

print(confusion_matrix(y_test, pred))
print(classification_report(y_test, pred))

# Plotting Error rate graph to find out the suitable value for KNeighbors

In [None]:
# Test-set error rate (share of wrong predictions) for k = 1..49.
error_rate = []
for i in range(1, 50):
    knn = KNeighborsClassifier(n_neighbors=i).fit(X_train, y_train)
    pred_i = knn.predict(X_test)
    misclassified = pred_i != y_test
    error_rate.append(np.mean(misclassified))

mat.figure(figsize=(10, 6))
mat.plot(
    range(1, 50),
    error_rate,
    color='blue',
    linestyle='dashed',
    marker='o',
    markerfacecolor='red',
    markersize=10,
)
mat.title('Error Rate vs. K Value')
mat.xlabel('K')
mat.ylabel('Error Rate')

# Plotting the accuracy scores to pick the KNeighbor value

In [None]:
from sklearn import metrics

# Test-set accuracy for k = 1..49.
k_range = range(1, 50)
scores = []
for k in k_range:
    knn = KNeighborsClassifier(n_neighbors=k).fit(X_train, y_train)
    y_pred = knn.predict(X_test)
    accuracy = metrics.accuracy_score(y_test, y_pred)
    scores.append(accuracy)

print(scores)

mat.plot(k_range, scores)
mat.xlabel('Value of K for KNN')
mat.ylabel('Testing Accuracy')

# Running the algorithm with KNeighbor 9 as it gives 58% accuracy

In [None]:
from sklearn.metrics import classification_report, confusion_matrix
from sklearn.neighbors import KNeighborsClassifier

# Refit with 9 neighbours and report on the held-out matches.
knn = KNeighborsClassifier(n_neighbors=9).fit(X_train, y_train)
pred = knn.predict(X_test)

print(confusion_matrix(y_test, pred))
print(classification_report(y_test, pred))

# Importing the 2019-20 EPL season schedule

In [None]:
# Read the fixture list from the Excel file in the working directory.
epl = pd.read_excel('epl.xls')

# Merging the schedule with stats dataframe

In [None]:
# Attach the home side's stats, then the away side's stats, to every fixture.
# Left joins keep every fixture even when a team has no stats row.
table1 = pd.merge(epl, HomeStats, how='left', left_on='HOME_TEAM', right_on='Club')
eplmatches = pd.merge(table1, AwayStats, how='left', left_on='AWAY_TEAM', right_on='Club')
eplmatches

In [None]:
# Column dtypes and non-null counts of the fixture table.
eplmatches.info()

# Dropping the duplicate columns

In [None]:
# The two merge-key columns repeat HOME_TEAM / AWAY_TEAM; remove them.
eplmatches = eplmatches.drop(columns=['Club_x', 'Club_y'])
eplmatches

# Normalizing the data

In [None]:
# Feature columns of the fixture table: everything after the first two columns.
# NOTE(review): assumes the first two columns are HOME_TEAM and AWAY_TEAM; confirm.
# A 'Results' column left over from an earlier run of the prediction cell is
# excluded so this cell can be re-run.
fixture_features = eplmatches.iloc[:, 2:].drop(columns=['Results'], errors='ignore')

# Apply the scaler as it was fitted on the training features. Re-fitting it
# here would scale the fixtures with different means/variances from the ones
# the classifier was trained on.
scaled_feat = scaler.transform(fixture_features)

# Keep row index and column labels (the second positional argument of
# pd.DataFrame is `index`, so the labels must be passed by keyword).
tablecl_feat = pd.DataFrame(
    scaled_feat,
    index=fixture_features.index,
    columns=fixture_features.columns,
)
Xcl = tablecl_feat
predcl = knn.predict(Xcl)

# Predicting the results

In [None]:
# Store the predicted outcome per fixture (1 = home win, 2 = away win,
# 0 = draw) and convert it to league points: 3 for a win, 1 each for a draw,
# 0 for a loss.
eplmatches['Results'] = predcl

# Work on an explicit copy. The previous chained assignments
# (df[col][mask] = value) on a slice of eplmatches raise SettingWithCopyWarning
# and change nothing when pandas Copy-on-Write is active.
eplresults = eplmatches[['HOME_TEAM', 'AWAY_TEAM', 'Results']].copy()
outcome = eplresults['Results']
eplresults['Homepts'] = np.select([outcome == 1, outcome == 0], [3, 1], default=0)
eplresults['Awaypts'] = np.select([outcome == 2, outcome == 0], [3, 1], default=0)
eplresults

# Grouping home points and away points

In [None]:
# Total home points per home team and total away points per away team.
# Only the points column is summed; summing the whole frame also aggregates
# the opponent-name string column on pandas >= 2.0, leaving a stray text
# column behind.
hpts = eplresults.groupby('HOME_TEAM')[['Homepts']].sum()
apts = eplresults.groupby('AWAY_TEAM')[['Awaypts']].sum()
apts

# Generating the final table

In [None]:
# Build the final table: total points per club = home points + away points.
# The two series are added by team name (their index), not by row position,
# so the result stays correct even if the two groupings list teams in a
# different order or a team is missing on one side (counted as 0 there).
# hpts and apts are left untouched, so the cell can be re-run safely.
home_points = hpts['Homepts']
away_points = apts['Awaypts']
total_points = home_points.add(away_points, fill_value=0).astype(int)

clpred = (
    total_points
    .rename('Total Points')
    .rename_axis('HOME_TEAM')
    .to_frame()
)
clpred.sort_values(['Total Points'], ascending=False)
