In [2]:
import pandas as pd 
import numpy as np 
import scipy as sp 
import matplotlib.pyplot as plt 

Reading and Analysis of whole data

In [3]:
d1 = pd.read_csv('data.csv')

In [4]:
d1.shape

(30697, 28)

In [5]:
d1.columns

Index(['Unnamed: 0', 'match_event_id', 'location_x', 'location_y',
       'remaining_min', 'power_of_shot', 'knockout_match', 'game_season',
       'remaining_sec', 'distance_of_shot', 'is_goal', 'area_of_shot',
       'shot_basics', 'range_of_shot', 'team_name', 'date_of_game',
       'home/away', 'shot_id_number', 'lat/lng', 'type_of_shot',
       'type_of_combined_shot', 'match_id', 'team_id', 'remaining_min.1',
       'power_of_shot.1', 'knockout_match.1', 'remaining_sec.1',
       'distance_of_shot.1'],
      dtype='object')

In [6]:
#dropping column team name, team id because it's same throughout and seconds don't hold much importance in goal detection
d1 = d1.drop(['team_name','team_id','shot_id_number','remaining_sec','remaining_sec.1'],axis=1)

In [7]:
#filling in the missing values in shot_id_number column 
d1['Unnamed: 0'] += 1
d1 = d1.rename(columns={'Unnamed: 0': 'shot_id_number'})

Distance_of_shot Analysis

In [8]:
#to see whether the values are same or not 
d2 = d1[['distance_of_shot','distance_of_shot.1']]
d2 = d2.rename(columns={'distance_of_shot.1':'distance_of_shot_1'})
d2.sample(7)

Unnamed: 0,distance_of_shot,distance_of_shot_1
3007,39.0,39.0
21159,34.0,34.0
11513,36.0,12.4
13488,20.0,20.0
7021,20.0,20.0
4658,34.0,34.0
12625,32.0,32.0


In [9]:
#filling na values and finding average of the 2 cols
d2.distance_of_shot.fillna(d2.distance_of_shot_1, inplace=True)
d2.distance_of_shot_1.fillna(d2.distance_of_shot, inplace=True)
d2['distance_of_shot_avg']=d2.mean(axis=1)
d2.head()

Unnamed: 0,distance_of_shot,distance_of_shot_1,distance_of_shot_avg
0,38.0,38.0,38.0
1,35.0,35.0,35.0
2,36.0,54.4,45.2
3,42.0,42.0,42.0
4,20.0,20.0,20.0


In [10]:
#concatinating d1 and d2 
d1 = d1.drop(['distance_of_shot','distance_of_shot.1'],axis=1)
d3 = pd.concat([d1, d2], axis=1)
d3.sample(2)

Unnamed: 0,shot_id_number,match_event_id,location_x,location_y,remaining_min,power_of_shot,knockout_match,game_season,is_goal,area_of_shot,...,lat/lng,type_of_shot,type_of_combined_shot,match_id,remaining_min.1,power_of_shot.1,knockout_match.1,distance_of_shot,distance_of_shot_1,distance_of_shot_avg
7838,7839,384.0,98.0,171.0,,4.0,0.0,,,Right Side Center(RC),...,"42.982923, -71.446094",,shot - 3,20500168,10.0,100.36,108.608,39.0,39.0,39.0
22542,22543,48.0,4.0,144.0,7.0,1.0,0.0,2015-16,,Center(C),...,"42.982923, -71.446094",shot - 17,,21500818,7.0,1.0,0.0,34.0,34.0,34.0


In [11]:
#filling in the na values in distance of shot avg with the help of range of shot 
n1 = d3['distance_of_shot_avg'].isnull()
n1[n1==True].count()

97

In [12]:
r = d3.groupby('range_of_shot')['distance_of_shot_avg'].agg('mean')
r

range_of_shot
16-24 ft.          40.742291
24+ ft.            46.707029
8-16 ft.           34.827985
Back Court Shot    72.619210
Less Than 8 ft.    25.586323
Name: distance_of_shot_avg, dtype: float64

In [13]:
d3.distance_of_shot_avg.fillna(d3.range_of_shot, inplace=True)

In [14]:
for i in d3.index:
  val = d3.get_value(i,'distance_of_shot_avg')
  if(val=='Less Than 8 ft.'):
    d3.set_value(i,'distance_of_shot_avg',25.5863)
  elif(val=='16-24 ft.'):
    d3.set_value(i,'distance_of_shot_avg',40.74)
  elif(val=='24+ ft.'):
    d3.set_value(i,'distance_of_shot_avg',46.707029)
  elif(val=='8-16 ft.'):
    d3.set_value(i,'distance_of_shot_avg',34.8279)
  elif(val=='Back Court Shot'):
    d3.set_value(i,'distance_of_shot_avg',72.619)

In [15]:
d3.columns

Index(['shot_id_number', 'match_event_id', 'location_x', 'location_y',
       'remaining_min', 'power_of_shot', 'knockout_match', 'game_season',
       'is_goal', 'area_of_shot', 'shot_basics', 'range_of_shot',
       'date_of_game', 'home/away', 'lat/lng', 'type_of_shot',
       'type_of_combined_shot', 'match_id', 'remaining_min.1',
       'power_of_shot.1', 'knockout_match.1', 'distance_of_shot',
       'distance_of_shot_1', 'distance_of_shot_avg'],
      dtype='object')

In [16]:
d3['distance_of_shot_avg'].fillna(36, inplace=True)

Remaining_min Analysis

In [17]:
#to see whether the values are same or not 
d4 = d1[['remaining_min','remaining_min.1']]
d4 = d4.rename(columns={'remaining_min.1':'remaining_min_1'})
d4.head()

Unnamed: 0,remaining_min,remaining_min_1
0,10.0,10.0
1,10.0,10.0
2,7.0,92.64
3,6.0,
4,,42.64


In [18]:
d4.describe()

Unnamed: 0,remaining_min,remaining_min_1
count,29135.0,29162.0
mean,4.883233,18.204615
std,3.452533,29.416973
min,0.0,0.0
25%,2.0,3.0
50%,5.0,6.0
75%,8.0,11.0
max,11.0,128.7616


In [19]:
#filling na values in remaining_min_1 with remaining_min 
d4.remaining_min_1.fillna(d4.remaining_min, inplace=True)
d4= d4.drop('remaining_min',axis=1)

In [20]:
#concatinating these the average now d5 is latest 
d5 = pd.concat([d3, d4], axis=1)
d5 = d5.drop(['remaining_min','remaining_min.1'],axis=1)
d5.columns

Index(['shot_id_number', 'match_event_id', 'location_x', 'location_y',
       'power_of_shot', 'knockout_match', 'game_season', 'is_goal',
       'area_of_shot', 'shot_basics', 'range_of_shot', 'date_of_game',
       'home/away', 'lat/lng', 'type_of_shot', 'type_of_combined_shot',
       'match_id', 'power_of_shot.1', 'knockout_match.1', 'distance_of_shot',
       'distance_of_shot_1', 'distance_of_shot_avg', 'remaining_min_1'],
      dtype='object')

In [21]:
d5['remaining_min_1'].describe()

count    30616.000000
mean        17.575091
std         28.857798
min          0.000000
25%          3.000000
50%          6.000000
75%         10.000000
max        128.761600
Name: remaining_min_1, dtype: float64

In [22]:
d5['remaining_min_1'].fillna(17,inplace=True)

knockout match Analysis

In [23]:
#selecting the appropriate column for knockout match 
d5 = d5.rename(columns={'knockout_match.1':'knockout_match_1'})
d5.knockout_match.fillna(d5.knockout_match_1, inplace=True)
d5 = d5.drop('knockout_match_1',axis=1)

In [24]:
d5['knockout_match'].fillna("U", inplace = True)

In [25]:
d5['knockout_match'] = d5['knockout_match'].replace(to_replace ='U', value = '0.0', regex = True) 
d5['knockout_match'] = pd.to_numeric(d5['knockout_match'])

for i in d5.index:
  val = d5.get_value(i,'knockout_match')
  if(val>1.0):
    d5.set_value(i,'knockout_match',1.0)

In [26]:
d5['knockout_match'].unique()

array([0., 1.])

power of shot Analysis

In [27]:
#power of shot has 1 to 7 makes more sense
d5 = d5.rename(columns={'power_of_shot.1':'power_of_shot_1'})
d5.power_of_shot.fillna(d5.power_of_shot_1, inplace=True)
d5 = d5.drop('power_of_shot_1',axis=1)

In [28]:
d5['power_of_shot'].describe()

count    30627.000000
mean         3.108937
std          6.761758
min          1.000000
25%          1.000000
50%          3.000000
75%          4.000000
max        118.360000
Name: power_of_shot, dtype: float64

In [29]:
d5.loc[d5['power_of_shot'] > 7, 'power_of_shot'] = np.nan

In [30]:
d5['power_of_shot'].unique()

array([ 1.,  2.,  3.,  4., nan,  5.,  6.,  7.])

In [31]:
d5['power_of_shot'].mode()

0    3.0
dtype: float64

In [32]:
d5['power_of_shot'].fillna(3, inplace = True)
d5['power_of_shot'].unique()

array([1., 2., 3., 4., 5., 6., 7.])

shot basics and area of shot Analysis

In [33]:
n2=d5['area_of_shot'].isnull()
n2[n2==True].count()

1502

In [34]:
#because the value of na is large shouldn't be replaced by mode
d5['area_of_shot'].fillna("U", inplace = True)
d5['shot_basics'].fillna("U", inplace = True)

In [35]:
d5 = pd.get_dummies(d5,prefix_sep="_",columns=["area_of_shot","shot_basics"])
d5.shape

(30697, 34)

In [36]:
d5.dtypes

shot_id_number                          int64
match_event_id                        float64
location_x                            float64
location_y                            float64
power_of_shot                         float64
knockout_match                        float64
game_season                            object
is_goal                               float64
range_of_shot                          object
date_of_game                           object
home/away                              object
lat/lng                                object
type_of_shot                           object
type_of_combined_shot                  object
match_id                                int64
distance_of_shot                      float64
distance_of_shot_1                    float64
distance_of_shot_avg                  float64
remaining_min_1                       float64
area_of_shot_Center(C)                  uint8
area_of_shot_Left Side Center(LC)       uint8
area_of_shot_Left Side(L)         

game_season Analysis

In [37]:
d5['game_season'].unique()
d5['game_season'].fillna(method='ffill', inplace=True)

type of shot Analysis

In [38]:
#filling 0 in place of na 
d5['type_of_combined_shot'].fillna(0, inplace=True)
d5['type_of_shot'].fillna(0, inplace = True)

In [39]:
d5.to_csv('first.csv',index=False)

location x and y Analysis

In [40]:
d5 = pd.read_csv('first.csv')
d6 = d5.drop('location_x',axis=1)
d6 = d6.drop('location_y',axis=1)

home/away and lat/lng analysis

In [41]:
d6['home/away'] = d6.groupby(['match_id'])['home/away'].ffill()
d6['home/away'] = d6.groupby(['match_id'])['home/away'].bfill()

In [42]:
d6['lat/lng'].fillna(d6['home/away'],inplace=True)
d6['lat/lng'] = d6['lat/lng'].replace(to_replace ='@', value = 0 , regex = True) 
d6['lat/lng'] = d6['lat/lng'].replace(to_replace ='vs', value = 1 , regex = True) 

In [43]:
d6.loc[(d6['lat/lng'] != '42.982923, -71.446094') & (d6['lat/lng'] != 1) &  (d6['lat/lng'].notnull()), 'lat/lng'] = 0
d6.loc[d6['lat/lng'] == '42.982923, -71.446094','lat/lng'] = 1
d6['lat/lng'].unique()

array([0, 1], dtype=object)

In [44]:
#to get name of opposition teams
d6['home/away']=d6['home/away'].astype("str").apply(lambda x: x.lstrip("MANU"))
d6['home/away']=d6['home/away'].astype("str").apply(lambda x: x.lstrip("vs. "))
d6['home/away']=d6['home/away'].astype("str").apply(lambda x: x.lstrip("@ "))

In [45]:
d6 = d6.drop(['range_of_shot','distance_of_shot','distance_of_shot_1'], axis=1)
d6 = d6.drop(['match_event_id','date_of_game'],axis=1)

In [46]:
#d7 = d6['is_goal']

In [47]:
#to check if we got rid of all na values
c= d6.isnull()
c[c==True].count()

shot_id_number                           0
power_of_shot                            0
knockout_match                           0
game_season                              0
is_goal                               6268
home/away                                0
lat/lng                                  0
type_of_shot                             0
type_of_combined_shot                    0
match_id                                 0
distance_of_shot_avg                     0
remaining_min_1                          0
area_of_shot_Center(C)                   0
area_of_shot_Left Side Center(LC)        0
area_of_shot_Left Side(L)                0
area_of_shot_Mid Ground(MG)              0
area_of_shot_Right Side Center(RC)       0
area_of_shot_Right Side(R)               0
area_of_shot_U                           0
shot_basics_Goal Area                    0
shot_basics_Goal Line                    0
shot_basics_Left Corner                  0
shot_basics_Mid Ground Line              0
shot_basics

In [48]:
d6['is_goal'].fillna(5, inplace=True)
d6['is_goal'] = pd.to_numeric(d6['is_goal'])

In [49]:
d6.to_csv('second.csv',index=False)

handling categorical data

In [50]:
f1 = pd.read_csv('second.csv')

In [51]:
f1.shape

(30697, 27)

In [52]:
f2 = f1.groupby(['home/away'])['is_goal'].agg('sum').reset_index().sort_values(by='is_goal',ascending=False)
f2 = pd.DataFrame(f2)
#f2.head()

In [53]:
f3 = f1.groupby(['match_id'])['is_goal'].agg('sum').reset_index().sort_values(by='is_goal',ascending=False)
f3 = pd.DataFrame(f3)
#f3.head()

In [54]:
#f4 = f1.groupby(['area_of_shot'])['is_goal'].agg('sum').reset_index().sort_values(by='is_goal',ascending=False)
#f4 = pd.DataFrame(f4)
#f4

In [55]:
#f5 = f1.groupby(['shot_basics'])['is_goal'].agg('sum').reset_index().sort_values(by='is_goal',ascending=False)
#f5 = pd.DataFrame(f5)
#f5

In [56]:
f6 = f1.groupby(['game_season'])['is_goal'].agg('sum').reset_index().sort_values(by='is_goal',ascending=False)
f6 = pd.DataFrame(f6)
#f6.head()

In [57]:
f7 = f1.groupby(['type_of_shot'])['is_goal'].agg('sum').reset_index().sort_values(by='is_goal',ascending=False)
f7 = pd.DataFrame(f7)
f7.head()

Unnamed: 0,type_of_shot,is_goal
0,0,21133.0
34,shot - 39,1932.0
35,shot - 4,1874.0
31,shot - 36,1576.0
33,shot - 38,1043.0


In [58]:
f8 = f1.groupby(['type_of_combined_shot'])['is_goal'].agg('sum').reset_index().sort_values(by='is_goal',ascending=False)
f8 = pd.DataFrame(f8)
f8.head()

Unnamed: 0,type_of_combined_shot,is_goal
0,0,21086.0
4,shot - 3,15399.0
5,shot - 4,4241.0
2,shot - 1,1131.0
1,shot - 0,130.0


replacing the string values with numeric values using merge

In [59]:
m1 = pd.merge(f1, f2, on ='home/away')
m1 = m1.rename(columns={'is_goal_y':'home/away_n'})
#m1.columns

In [60]:
m2 = pd.merge(m1, f3, on ='match_id')
m2 = m2.rename(columns={'is_goal':'match_id_n'})
#m2.columns

In [61]:
#m3 = pd.merge(m2, f4, on ='area_of_shot')
#m3 = m3.rename(columns={'is_goal':'area_of_shot_n'})
#m3.columns

In [62]:
#m4 = pd.merge(m3, f5, on ='shot_basics')
#m4 = m4.rename(columns={'is_goal':'shot_basics_n'})
#m4.columns

In [63]:
m5 = pd.merge(m2, f6, on ='game_season')
m5 = m5.rename(columns={'is_goal':'game_season_n'})
#m5.columns

In [64]:
m6 = pd.merge(m5, f7, on ='type_of_shot')
m6 = m6.rename(columns={'is_goal':'type_of_shot_n'})
#m6.columns

In [65]:
m7 = pd.merge(m6, f8, on ='type_of_combined_shot')
m7 = m7.rename(columns={'is_goal':'type_of_combined_shot_n'})
m7.columns

Index(['shot_id_number', 'power_of_shot', 'knockout_match', 'game_season',
       'is_goal_x', 'home/away', 'lat/lng', 'type_of_shot',
       'type_of_combined_shot', 'match_id', 'distance_of_shot_avg',
       'remaining_min_1', 'area_of_shot_Center(C)',
       'area_of_shot_Left Side Center(LC)', 'area_of_shot_Left Side(L)',
       'area_of_shot_Mid Ground(MG)', 'area_of_shot_Right Side Center(RC)',
       'area_of_shot_Right Side(R)', 'area_of_shot_U', 'shot_basics_Goal Area',
       'shot_basics_Goal Line', 'shot_basics_Left Corner',
       'shot_basics_Mid Ground Line', 'shot_basics_Mid Range',
       'shot_basics_Penalty Spot', 'shot_basics_Right Corner', 'shot_basics_U',
       'home/away_n', 'match_id_n', 'game_season_n', 'type_of_shot_n',
       'type_of_combined_shot_n'],
      dtype='object')

In [66]:
m8 = m7.drop(['home/away','match_id', 'game_season'],axis=1)
m8 = m8.drop(['type_of_shot','type_of_combined_shot'],axis=1)
m8.columns

Index(['shot_id_number', 'power_of_shot', 'knockout_match', 'is_goal_x',
       'lat/lng', 'distance_of_shot_avg', 'remaining_min_1',
       'area_of_shot_Center(C)', 'area_of_shot_Left Side Center(LC)',
       'area_of_shot_Left Side(L)', 'area_of_shot_Mid Ground(MG)',
       'area_of_shot_Right Side Center(RC)', 'area_of_shot_Right Side(R)',
       'area_of_shot_U', 'shot_basics_Goal Area', 'shot_basics_Goal Line',
       'shot_basics_Left Corner', 'shot_basics_Mid Ground Line',
       'shot_basics_Mid Range', 'shot_basics_Penalty Spot',
       'shot_basics_Right Corner', 'shot_basics_U', 'home/away_n',
       'match_id_n', 'game_season_n', 'type_of_shot_n',
       'type_of_combined_shot_n'],
      dtype='object')

In [67]:
m8 = m8.rename(columns={'lat/lng':'home/away', 'home/away_n':'opposing_team'})

In [68]:
m8['is_goal_x'].unique()

array([5., 0., 1.])

In [69]:
g=m8['is_goal_x']==5
g[g==True].count()

6268

In [84]:
m9= m8.loc[m8['is_goal_x'] == 5]
m9.to_csv('test_data.csv',index=False)

In [70]:
#m8.to_csv('data(2).csv',index=False)
#test.to_csv('test_data.csv',index=False)

In [85]:
m10= m8.loc[m8['is_goal_x'] != 5]
m10.to_csv('train_data.csv',index=False)

machine learning model

In [71]:
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn.metrics import precision_score
from sklearn.metrics import recall_score

from sklearn.linear_model import LogisticRegression
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn.discriminant_analysis import QuadraticDiscriminantAnalysis
from sklearn.linear_model import SGDClassifier
from sklearn.svm import LinearSVC
from sklearn.neighbors import RadiusNeighborsClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.tree import DecisionTreeClassifier

In [72]:
result_dict = {}

In [73]:
def summarize_classification(y_test, y_pred):
    
    acc = accuracy_score(y_test, y_pred, normalize=True)
    num_acc = accuracy_score(y_test, y_pred, normalize=False)

    prec = precision_score(y_test, y_pred)
    recall = recall_score(y_test, y_pred)
    
    return {'accuracy': acc, 
            'precision': prec,
            'recall':recall, 
            'accuracy_count':num_acc}

In [74]:
def build_model(classifier_fn,                
                name_of_y_col, 
                names_of_x_cols, 
                dataset, 
                test_frac=0.2):
    
    X = dataset[names_of_x_cols]
    Y = dataset[name_of_y_col]

    x_train, x_test, y_train, y_test = train_test_split(X, Y, test_size=test_frac)
       
    model = classifier_fn(x_train, y_train)
    
    y_pred = model.predict(x_test)

    y_pred_train = model.predict(x_train)
    
    train_summary = summarize_classification(y_train, y_pred_train)
    test_summary = summarize_classification(y_test, y_pred)
    
    pred_results = pd.DataFrame({'y_test': y_test,
                                 'y_pred': y_pred})
    
    print(pred_results.sample(10))
    
    model_crosstab = pd.crosstab(pred_results.y_pred, pred_results.y_test)
    
    return {'training': train_summary, 
            'test': test_summary,
            'confusion_matrix': model_crosstab}

In [75]:
def compare_results():
    for key in result_dict:
        print('Classification: ', key)

        print()
        print('Training data')
        for score in result_dict[key]['training']:
            print(score, result_dict[key]['training'][score])

        print()
        print('Test data')
        for score in result_dict[key]['test']:
            print(score, result_dict[key]['test'][score])
       
        print()

In [76]:
def logistic_fn(x_train, y_train):
    
    model = LogisticRegression(solver='liblinear')
    model.fit(x_train, y_train)
    
    return model

In [77]:
def linear_discriminant_fn(x_train, y_train, solver='svd'):
    
    model = LinearDiscriminantAnalysis(solver=solver)
    model.fit(x_train, y_train)
    
    return model

In [78]:
def quadratic_discriminant_fn(x_train, y_train):
    
    model = QuadraticDiscriminantAnalysis()
    model.fit(x_train, y_train)
    
    return model

In [79]:
def sgd_fn(x_train, y_train, max_iter=1000, tol=1e-3):
    
    model = SGDClassifier(max_iter=max_iter, tol=tol)
    model.fit(x_train, y_train)
     
    return model

In [80]:
def linear_svc_fn(x_train, y_train, C=1.0, max_iter=1000, tol=1e-3):
    
    model = LinearSVC(C=C, max_iter=max_iter, tol=tol, dual=False)
    model.fit(x_train, y_train) 
    
    return model

In [81]:
def decision_tree_fn(x_train, y_train, max_depth=None, max_features=None): 
    
    model = DecisionTreeClassifier(max_depth=max_depth, max_features=max_features)
    model.fit(x_train, y_train)
    
    return model

In [82]:
def naive_bayes_fn(x_train,y_train, priors=None):
    
    model = GaussianNB(priors=priors)
    model.fit(x_train, y_train)
    
    return model

In [86]:
a1 = pd.read_csv("train_data.csv")
a2 = pd.read_csv("test_data.csv")
a2.columns
col = ['shot_id_number', 'power_of_shot', 'knockout_match', 'is_goal_x',
       'home/away', 'distance_of_shot_avg', 'remaining_min_1',
       'area_of_shot_Center(C)', 'area_of_shot_Left Side Center(LC)',
       'area_of_shot_Left Side(L)', 'area_of_shot_Mid Ground(MG)',
       'area_of_shot_Right Side Center(RC)', 'area_of_shot_Right Side(R)',
       'area_of_shot_U', 'shot_basics_Goal Area', 'shot_basics_Goal Line',
       'shot_basics_Left Corner', 'shot_basics_Mid Ground Line',
       'shot_basics_Mid Range', 'shot_basics_Penalty Spot',
       'shot_basics_Right Corner', 'shot_basics_U', 'opposing_team',
       'match_id_n', 'game_season_n', 'type_of_shot_n',
       'type_of_combined_shot_n']
for i in col:
    a2[i] = pd.to_numeric(a2[i])

new_x = a2.drop('is_goal_x',axis=1)

In [87]:
new_x.head()

Unnamed: 0,shot_id_number,power_of_shot,knockout_match,home/away,distance_of_shot_avg,remaining_min_1,area_of_shot_Center(C),area_of_shot_Left Side Center(LC),area_of_shot_Left Side(L),area_of_shot_Mid Ground(MG),...,shot_basics_Mid Ground Line,shot_basics_Mid Range,shot_basics_Penalty Spot,shot_basics_Right Corner,shot_basics_U,opposing_team,match_id_n,game_season_n,type_of_shot_n,type_of_combined_shot_n
0,1,1.0,0.0,0,38.0,10.0,0,0,0,0,...,0,1,0,0,0,2133.0,14.0,2543.0,223.0,21086.0
1,26367,3.0,1.0,0,26.0,25.64,1,0,0,0,...,0,0,0,0,0,2777.0,45.0,2543.0,223.0,21086.0
2,26313,4.0,1.0,0,36.2,11.0,0,0,0,0,...,0,0,0,0,1,2182.0,29.0,2543.0,223.0,21086.0
3,417,4.0,0.0,1,39.0,5.0,1,0,0,0,...,0,1,0,0,0,969.0,32.0,2543.0,223.0,21086.0
4,26977,2.0,1.0,0,36.0,11.0,0,0,0,0,...,0,1,0,0,0,770.0,16.0,2711.0,223.0,21086.0


In [88]:
a1.columns

Index(['shot_id_number', 'power_of_shot', 'knockout_match', 'is_goal_x',
       'home/away', 'distance_of_shot_avg', 'remaining_min_1',
       'area_of_shot_Center(C)', 'area_of_shot_Left Side Center(LC)',
       'area_of_shot_Left Side(L)', 'area_of_shot_Mid Ground(MG)',
       'area_of_shot_Right Side Center(RC)', 'area_of_shot_Right Side(R)',
       'area_of_shot_U', 'shot_basics_Goal Area', 'shot_basics_Goal Line',
       'shot_basics_Left Corner', 'shot_basics_Mid Ground Line',
       'shot_basics_Mid Range', 'shot_basics_Penalty Spot',
       'shot_basics_Right Corner', 'shot_basics_U', 'opposing_team',
       'match_id_n', 'game_season_n', 'type_of_shot_n',
       'type_of_combined_shot_n'],
      dtype='object')

In [89]:
result_dict['survived ~ dt'] = build_model(decision_tree_fn,
                                              'is_goal_x',
                                               [ 'power_of_shot', 'knockout_match',
       'home/away', 'distance_of_shot_avg', 'remaining_min_1',
       'area_of_shot_Center(C)', 'area_of_shot_Left Side Center(LC)',
       'area_of_shot_Left Side(L)', 'area_of_shot_Mid Ground(MG)',
       'area_of_shot_Right Side Center(RC)', 'area_of_shot_Right Side(R)',
       'area_of_shot_U', 'shot_basics_Goal Area', 'shot_basics_Goal Line',
       'shot_basics_Left Corner', 'shot_basics_Mid Ground Line',
       'shot_basics_Mid Range', 'shot_basics_Penalty Spot',
       'shot_basics_Right Corner', 'shot_basics_U', 'opposing_team',
       'match_id_n', 'game_season_n', 'type_of_shot_n',
       'type_of_combined_shot_n'],
                                               a1)

compare_results()

       y_pred  y_test
15783     0.0     0.0
14371     1.0     1.0
11645     1.0     1.0
23311     1.0     1.0
8827      1.0     1.0
23765     1.0     0.0
18189     0.0     1.0
10904     0.0     0.0
11927     0.0     1.0
21051     1.0     0.0
Classification:  survived ~ dt

Training data
accuracy 0.9994371386174078
precision 1.0
recall 0.998735632183908
accuracy_count 19532

Test data
accuracy 0.5556692591076545
precision 0.5018001800180018
recall 0.5117026158788435
accuracy_count 2715



In [90]:
from sklearn.model_selection import train_test_split

X = a1.drop('is_goal_x', axis=1)
Y = a1['is_goal_x']



x_train, x_test, y_train, y_test = train_test_split(X, Y, test_size=0.2565802939)
#x_test = a2.drop('is_goal_x',axis=1)

In [91]:
a1.shape

(24429, 27)

In [92]:
from sklearn.ensemble import RandomForestClassifier

#Create a Gaussian Classifier
clf=RandomForestClassifier(n_estimators=100)

#Train the model using the training sets y_pred=clf.predict(X_test)
clf.fit(x_train,y_train)

y_pred=clf.predict(new_x)

  from numpy.core.umath_tests import inner1d


In [93]:
from sklearn import metrics
# Model Accuracy, how often is the classifier correct?
print("Accuracy:",metrics.accuracy_score(y_test, y_pred))

Accuracy: 0.5197830248883216


In [94]:
! pip install --user xgboost

Collecting xgboost
  Downloading https://files.pythonhosted.org/packages/5e/49/b95c037b717b4ceadc76b6e164603471225c27052d1611d5a2e832757945/xgboost-0.90-py2.py3-none-win_amd64.whl (18.3MB)
Installing collected packages: xgboost
Successfully installed xgboost-0.90


You are using pip version 9.0.1, however version 19.1.1 is available.
You should consider upgrading via the 'python -m pip install --upgrade pip' command.


In [97]:
# First XGBoost model for Pima Indians dataset
import xgboost
from numpy import loadtxt
from xgboost import XGBClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score

# fit model no training data
model = XGBClassifier()
model.fit(x_train, y_train)
# make predictions for test data
y_pred = model.predict(new_x)
predictions = [round(value) for value in y_pred]
# evaluate predictions
accuracy = accuracy_score(y_test, predictions)
print("Accuracy: %.2f%%" % (accuracy * 100.0))

ModuleNotFoundError: No module named 'xgboost'

In [None]:
pred_results = pd.DataFrame({'y_pred': y_pred})
#pred_results.to_csv('sub.csv',index=False)
pred_results