# xG Model

The idea is to be able to predict what will happen to particular chances and events, whether they will be converted or not, using particular attributes of data. This gives us an idea as to whether an event, if it occurs is likely to be a goal or not.

Data:
Data provision courtesy of Stratabet. Here, I've used English Championship, English Premiership, Bundesliga, France, Spain, Italy, division 1 data. It dates from the beginning of season 16-17 to the current 17-18.

Attributes:
For now, I've used attributes such as 'icon' (type of event), 'shotQuality' (used values defined by Stratabet), 'defPressure', 'numDefPlayers', 'numAttPlayers', 'chanceRating' (used values as defined by Stratabet), 'type' (defines passage of play). All attributes are encoded to particular values. The 'outcome' variable is binary encoded, ofcourse.

Although I've used the parameter chanceRating & shotQuality which covers the idea of a shot going in or not, I would also like to incorporate Shot location later on.

In [224]:
#####################################################################################################
# STEP 1: Loading in Data 

# Use data through Pandas and Numpy manipulation


In [356]:
engch16 = pd.read_csv('latestdatamarch/EngCh/2016-17/2017-06-27_chances_2016-07-01_2017-06-15.csv')
engch17 = pd.read_csv('latestdatamarch/EngCh/chances_from_2017-07-01.csv')
engpr16 = pd.read_csv('latestdatamarch/EngPr/2016-17/2017-06-27_chances_2016-07-01_2017-06-15.csv')
engpr17 = pd.read_csv('latestdatamarch/EngPr/chances_from_2017-07-01.csv')
bl16 = pd.read_csv('latestdatamarch/GerBL1/2016-17/2017-06-27_chances_2016-07-01_2017-06-15.csv')
bl17 = pd.read_csv('latestdatamarch/GerBL1/chances_from_2017-07-01.csv')
ita16 = pd.read_csv('latestdatamarch/ItaSA/2016-17/2017-06-27_chances_2016-07-01_2017-06-15.csv')
ita17 = pd.read_csv('latestdatamarch/ItaSA/chances_from_2017-07-01.csv')
fra16 = pd.read_csv('latestdatamarch/FraL1/2016-17/2017-06-27_chances_2016-07-01_2017-06-15.csv')
fra17 = pd.read_csv('latestdatamarch/FraL1/chances_from_2017-07-01.csv')
spa16 = pd.read_csv('latestdatamarch/SpaPr/2016-17/2017-06-27_chances_2016-07-01_2017-06-15.csv')
spa17 = pd.read_csv('latestdatamarch/SpaPr/chances_from_2017-07-01.csv')

In [357]:
engch16.shape

(12730, 27)

In [453]:
df = engch16.append(engch17).append(engpr16).append(bl16).append(bl17).append(ita16).append(ita17).append(fra16).append(fra17).append(spa16).append(spa17)
test = engpr17

In [454]:
df.shape

(84334, 27)

In [455]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 84334 entries, 0 to 5845
Data columns (total 27 columns):
Unnamed: 0           84334 non-null int64
competition          84334 non-null object
gsm_id               84334 non-null int64
kickoffDate          84334 non-null object
kickoffTime          84334 non-null object
hometeam_team1       84334 non-null object
awayteam_team2       84334 non-null object
icon                 84334 non-null object
chanceRating         84334 non-null object
team                 84334 non-null object
type                 84334 non-null object
time                 84334 non-null object
player               84334 non-null object
location_x           84334 non-null object
location_y           84334 non-null object
bodyPart             84334 non-null object
shotQuality          83153 non-null object
defPressure          84334 non-null object
numDefPlayers        84334 non-null object
numAttPlayers        84334 non-null object
outcome              84334 non-nul

In [361]:
#####################################################################################################
# STEP 2: Clean Data 

# Remove missing values, treating noisy data


In [456]:
# import from numpy
import numpy as np
import pandas as pd

df.head()

Unnamed: 0.1,Unnamed: 0,competition,gsm_id,kickoffDate,kickoffTime,hometeam_team1,awayteam_team2,icon,chanceRating,team,...,defPressure,numDefPlayers,numAttPlayers,outcome,primaryPlayer,primaryType,primaryLocation_x,primaryLocation_y,secondaryPlayer,secondaryType
0,302,EngCh,2237445,2017-05-29,14:00:00,Huddersfield Town,Reading,goal,Penalty,Huddersfield Town,...,0,1,0,-,-,-,-,-,-,-
1,301,EngCh,2237445,2017-05-29,14:00:00,Huddersfield Town,Reading,goal,Penalty,Huddersfield Town,...,0,1,0,-,-,-,-,-,-,-
2,300,EngCh,2237445,2017-05-29,14:00:00,Huddersfield Town,Reading,goal,Penalty,Reading,...,0,1,0,-,-,-,-,-,-,-
3,299,EngCh,2237445,2017-05-29,14:00:00,Huddersfield Town,Reading,goal,Penalty,Reading,...,0,1,0,-,-,-,-,-,-,-
4,298,EngCh,2237445,2017-05-29,14:00:00,Huddersfield Town,Reading,goal,Penalty,Huddersfield Town,...,0,1,0,-,-,-,-,-,-,-


In [459]:
# Picking out best features to work on

In [460]:
df.shape

(84334, 15)

In [465]:
df = df[['icon', "bodyPart","location_x","location_y","shotQuality","defPressure","numDefPlayers","numAttPlayers","outcome",'primaryType', 'primaryLocation_x', 'primaryLocation_y', 'secondaryType', 'chanceRating', 'type']] 
test = test[['icon', "bodyPart","location_x","location_y","shotQuality","defPressure","numDefPlayers","numAttPlayers","outcome",'primaryType', 'primaryLocation_x', 'primaryLocation_y', 'secondaryType', 'chanceRating', 'type']] 
df.head()

Unnamed: 0,icon,bodyPart,location_x,location_y,shotQuality,defPressure,numDefPlayers,numAttPlayers,outcome,primaryType,primaryLocation_x,primaryLocation_y,secondaryType,chanceRating,type
0,goal,Right,0.0,44.0,3,0,1,0,-,-,-,-,-,Penalty,Penalty
1,goal,Right,0.0,44.0,3,0,1,0,-,-,-,-,-,Penalty,Penalty
2,goal,Right,0.0,44.0,4,0,1,0,-,-,-,-,-,Penalty,Penalty
3,goal,Right,0.0,44.0,3,0,1,0,-,-,-,-,-,Penalty,Penalty
4,goal,Left,0.0,44.0,3,0,1,0,-,-,-,-,-,Penalty,Penalty


In [466]:
df.shape

(84334, 15)

In [467]:
null_values = df.isnull().sum()
null_values

icon                    0
bodyPart                0
location_x              0
location_y              0
shotQuality          1181
defPressure             0
numDefPlayers           0
numAttPlayers           0
outcome                 0
primaryType             0
primaryLocation_x       0
primaryLocation_y       0
secondaryType           0
chanceRating            0
type                    0
dtype: int64

In [468]:
# df.loc[df['defPressure']  == '-'] + df.loc[['defPressure']  == 'NaN']
df['shotQuality'] = df['shotQuality'].replace('-', 0)
test['shotQuality'] = test['shotQuality'].replace('-',0)
#empty.head()
df.head()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  This is separate from the ipykernel package so we can avoid doing imports until


Unnamed: 0,icon,bodyPart,location_x,location_y,shotQuality,defPressure,numDefPlayers,numAttPlayers,outcome,primaryType,primaryLocation_x,primaryLocation_y,secondaryType,chanceRating,type
0,goal,Right,0.0,44.0,3,0,1,0,-,-,-,-,-,Penalty,Penalty
1,goal,Right,0.0,44.0,3,0,1,0,-,-,-,-,-,Penalty,Penalty
2,goal,Right,0.0,44.0,4,0,1,0,-,-,-,-,-,Penalty,Penalty
3,goal,Right,0.0,44.0,3,0,1,0,-,-,-,-,-,Penalty,Penalty
4,goal,Left,0.0,44.0,3,0,1,0,-,-,-,-,-,Penalty,Penalty


In [469]:
df.shotQuality = df.shotQuality.astype(int)
test.shotQuality = test.shotQuality.astype(int)

ValueError: cannot convert float NaN to integer

In [470]:
df.shotQuality.unique()

array(['3', '4', '2', '0', '1', 0, nan, '5'], dtype=object)

In [471]:
df.head()

Unnamed: 0,icon,bodyPart,location_x,location_y,shotQuality,defPressure,numDefPlayers,numAttPlayers,outcome,primaryType,primaryLocation_x,primaryLocation_y,secondaryType,chanceRating,type
0,goal,Right,0.0,44.0,3,0,1,0,-,-,-,-,-,Penalty,Penalty
1,goal,Right,0.0,44.0,3,0,1,0,-,-,-,-,-,Penalty,Penalty
2,goal,Right,0.0,44.0,4,0,1,0,-,-,-,-,-,Penalty,Penalty
3,goal,Right,0.0,44.0,3,0,1,0,-,-,-,-,-,Penalty,Penalty
4,goal,Left,0.0,44.0,3,0,1,0,-,-,-,-,-,Penalty,Penalty


In [473]:
df = df.dropna(subset = ['shotQuality']) # remove where shotQuality is NaN
test = test.dropna(subset = ['shotQuality'])
test.shotQuality.unique()

array(['3', '2', '0', '1', '4', '5', 0], dtype=object)

In [474]:
df.shotQuality.unique()

array(['3', '4', '2', '0', '1', 0, '5'], dtype=object)

In [475]:
df = df[df.icon != 'owngoal'] # removing own goals
test = test[test.icon != 'owngoal'] # removing own goals

In [476]:
df.shotQuality.unique()

array(['3', '4', '2', '0', '1', '5'], dtype=object)

In [477]:
df.primaryType.unique()

array(['-', 'Cross High', 'Free Kick', 'Cross Low', 'Open Play Pass',
       'Free Kick Won', 'Corner', 'Shot (Deflection)',
       'Shot (Opposition Rebound)', 'Turnover', 'Penalty Earned',
       'Throw in', 'Shot (Woodwork Rebound)', 'Dangerous Moment',
       'Corner Won'], dtype=object)

In [478]:
# further reductio of attributes

In [479]:
df = df[['icon', "location_x","location_y","shotQuality","defPressure","numDefPlayers","numAttPlayers","outcome", 'chanceRating', 'type']] 
test = test[['icon', "location_x","location_y","shotQuality","defPressure","numDefPlayers","numAttPlayers","outcome", 'chanceRating', 'type']]
df.head()

Unnamed: 0,icon,location_x,location_y,shotQuality,defPressure,numDefPlayers,numAttPlayers,outcome,chanceRating,type
0,goal,0.0,44.0,3,0,1,0,-,Penalty,Penalty
1,goal,0.0,44.0,3,0,1,0,-,Penalty,Penalty
2,goal,0.0,44.0,4,0,1,0,-,Penalty,Penalty
3,goal,0.0,44.0,3,0,1,0,-,Penalty,Penalty
4,goal,0.0,44.0,3,0,1,0,-,Penalty,Penalty


In [480]:
df[df.outcome != '-']
test[test.outcome != '-']

Unnamed: 0,icon,location_x,location_y,shotQuality,defPressure,numDefPlayers,numAttPlayers,outcome,chanceRating,type
2,goodchance,37.0,50.0,2,3,4,0,Missed,goodchance,Open Play
3,fairlygoodchance,67.0,36.0,2,1,4,1,Missed,fairlygoodchance,Open Play
4,greatchance,2.0,38.0,0,3,1,0,Defended,greatchance,Dangerous Moment
5,fairlygoodchance,35.0,58.0,3,3,2,0,Saved,fairlygoodchance,Open Play
6,verygoodchance,-45.0,35.0,1,1,1,0,Missed,verygoodchance,Open Play
7,poorchance,63.0,51.0,2,4,4,0,Defended,poorchance,Open Play
8,greatchance,-11.0,18.0,2,3,3,0,Missed,greatchance,Open Play
9,fairlygoodchance,-34.0,56.0,2,1,3,1,Defended,fairlygoodchance,Open Play
10,fairlygoodchance,-31.0,73.0,2,1,3,1,Defended,fairlygoodchance,Open Play
11,fairlygoodchance,42.0,10.0,2,2,3,0,Missed,fairlygoodchance,Open Play


In [481]:
df.type.value_counts()

Open Play                    68992
Open play                     5723
Direct Free-Kick              3842
Dangerous Moment              2906
Penalty                        885
Penalty Earned                 240
Direct free kick               176
-                               51
Direct Corner                   29
Open Play Pass                   4
Cross Low                        3
Cross High                       3
Shot (Opposition Rebound)        2
Shot (Deflection)                2
Direct corner                    2
Turnover                         2
Corner                           1
Free Kick Won                    1
Name: type, dtype: int64

In [483]:
df = df[df.type != '-']
test = test[test.type != '-']

In [484]:
df.icon.value_counts()

poorchance          27093
fairlygoodchance    18961
goodchance          13031
goal                10314
verygoodchance       7992
greatchance          5017
penmissed             262
superbchance          143
Name: icon, dtype: int64

In [485]:
df.chanceRating.value_counts()

poorchance          27093
fairlygoodchance    18961
goodchance          13031
verygoodchance       7992
greatchance          5017
Great                3768
Very Good            1961
Good                 1064
Fairly Good           948
Superb                892
Penalty               886
Poor                  795
-                     262
superbchance          143
Name: chanceRating, dtype: int64

In [487]:
df[df.chanceRating == '-']

Unnamed: 0,icon,location_x,location_y,shotQuality,defPressure,numDefPlayers,numAttPlayers,outcome,chanceRating,type
128,penmissed,-,-,3,-,-,-,Save,-,Penalty Earned
538,penmissed,-,-,2,-,-,-,Save,-,Penalty Earned
539,penmissed,-,-,3,-,-,-,Save,-,Penalty Earned
966,penmissed,-,-,3,-,-,-,Save,-,Penalty Earned
1271,penmissed,-,-,3,-,-,-,Miss,-,Penalty Earned
1403,penmissed,-,-,3,-,-,-,Save,-,Penalty Earned
1962,penmissed,-,-,3,-,-,-,Save,-,Penalty Earned
2788,penmissed,-,-,2,-,-,-,Save,-,Turnover
2789,penmissed,-,-,1,-,-,-,Miss,-,Penalty Earned
3311,penmissed,-,-,1,-,-,-,Miss,-,Penalty Earned


In [488]:
df.numAttPlayers.value_counts()

0    60377
1    16031
2     4505
3     1287
4      278
-      262
5       54
6       12
7        7
Name: numAttPlayers, dtype: int64

In [489]:
df.shape

(82813, 10)

In [582]:
#####################################################################################################
# STEP 3: Vectorize Data

# Encode data to particular values to eventually understand their importance


In [491]:
df.head()

Unnamed: 0,icon,location_x,location_y,shotQuality,defPressure,numDefPlayers,numAttPlayers,outcome,chanceRating,type
0,goal,0.0,44.0,3,0,1,0,-,Penalty,Penalty
1,goal,0.0,44.0,3,0,1,0,-,Penalty,Penalty
2,goal,0.0,44.0,4,0,1,0,-,Penalty,Penalty
3,goal,0.0,44.0,3,0,1,0,-,Penalty,Penalty
4,goal,0.0,44.0,3,0,1,0,-,Penalty,Penalty


In [492]:
df.columns

Index(['icon', 'location_x', 'location_y', 'shotQuality', 'defPressure',
       'numDefPlayers', 'numAttPlayers', 'outcome', 'chanceRating', 'type'],
      dtype='object')

In [493]:
df.defPressure.value_counts()

2    18241
1    18029
3    17853
0    14859
4    10447
5     3122
-      262
Name: defPressure, dtype: int64

In [494]:
df.head(20)

Unnamed: 0,icon,location_x,location_y,shotQuality,defPressure,numDefPlayers,numAttPlayers,outcome,chanceRating,type
0,goal,0.0,44.0,3,0,1,0,-,Penalty,Penalty
1,goal,0.0,44.0,3,0,1,0,-,Penalty,Penalty
2,goal,0.0,44.0,4,0,1,0,-,Penalty,Penalty
3,goal,0.0,44.0,3,0,1,0,-,Penalty,Penalty
4,goal,0.0,44.0,3,0,1,0,-,Penalty,Penalty
5,goal,0.0,44.0,3,0,1,0,-,Penalty,Penalty
6,goal,0.0,44.0,3,0,1,0,-,Penalty,Penalty
7,poorchance,54.0,44.0,2,5,3,0,Defended,poorchance,Open Play
8,poorchance,14.0,50.0,0,5,1,0,Defended,poorchance,Dangerous Moment
9,fairlygoodchance,28.0,11.0,0,3,1,0,Defended,fairlygoodchance,Dangerous Moment


In [499]:
df.chanceRating.unique()

array(['Penalty', 'poorchance', 'fairlygoodchance', 'verygoodchance',
       'goodchance', 'greatchance', 'superbchance', 'Good', 'Great',
       'Fairly Good', '-', 'Poor', 'Superb', 'Very Good'], dtype=object)

In [500]:
df.columns

Index(['icon', 'location_x', 'location_y', 'shotQuality', 'defPressure',
       'numDefPlayers', 'numAttPlayers', 'outcome', 'chanceRating', 'type',
       'cr'],
      dtype='object')

In [501]:
df.icon.unique()

array(['goal', 'poorchance', 'fairlygoodchance', 'verygoodchance',
       'goodchance', 'greatchance', 'superbchance', 'penmissed'],
      dtype=object)

In [502]:
cleanup_icon = {"icon": {"goal": 1, "superbchance": 0.83, "greatchance": 0.43, "verygoodchance": 0.22, "goodchance": 0.08, "fairlygoodchance": 0.05,  "poorchance": 0.02,  "penmissed": 0}}

In [503]:
df.replace(cleanup_icon, inplace=True)
test.replace(cleanup_icon, inplace=True)

In [504]:
df.icon.unique()

array([1.  , 0.02, 0.05, 0.22, 0.08, 0.43, 0.83, 0.  ])

In [505]:
df.icon.head(20)

0     1.00
1     1.00
2     1.00
3     1.00
4     1.00
5     1.00
6     1.00
7     0.02
8     0.02
9     0.05
10    0.22
11    0.05
12    0.08
13    0.05
14    0.43
15    0.83
16    0.02
17    0.02
18    0.02
19    0.02
Name: icon, dtype: float64

In [506]:
df.shotQuality.value_counts()

2    27231
1    25932
3    22175
4     3621
0     3350
5      504
Name: shotQuality, dtype: int64

In [507]:
df.defPressure.value_counts()

2    18241
1    18029
3    17853
0    14859
4    10447
5     3122
-      262
Name: defPressure, dtype: int64

In [508]:
df.numDefPlayers.value_counts()

2     28542
3     20655
1     15294
4      9206
5      4164
6      2171
7      1072
0       906
8       402
-       262
9       108
10       26
11        5
Name: numDefPlayers, dtype: int64

In [509]:
df.numAttPlayers.unique()

array(['0', '1', '2', '3', '-', '4', '5', '6', '7'], dtype=object)

In [510]:
df.outcome.unique()

array(['-', 'Defended', 'Missed', 'Saved', 'Woodwork', 'Save', 'Miss'],
      dtype=object)

In [511]:
df.outcome.value_counts()

Missed      32839
Saved       21767
Defended    15870
-           10314
Woodwork     1791
Save          195
Miss           37
Name: outcome, dtype: int64

In [512]:
df.shape

(82813, 11)

In [513]:
df = df[['icon', "shotQuality","defPressure","numDefPlayers","numAttPlayers","outcome", 'chanceRating', 'type']] 
test = test[['icon', "shotQuality","defPressure","numDefPlayers","numAttPlayers","outcome", 'chanceRating', 'type']]

In [514]:
df.shape

(82813, 8)

In [515]:
df.outcome.value_counts()

Missed      32839
Saved       21767
Defended    15870
-           10314
Woodwork     1791
Save          195
Miss           37
Name: outcome, dtype: int64

In [516]:
cleanup_outcome = {"outcome" : { "-" : 1, "Missed":0, "Miss":0, "Save":0, "Woodwork":0, "Defended":0, "Saved":0}}


In [517]:
df.replace( cleanup_outcome , inplace = True )
test.replace(cleanup_outcome, inplace= True)

In [518]:
df.columns

Index(['icon', 'shotQuality', 'defPressure', 'numDefPlayers', 'numAttPlayers',
       'outcome', 'chanceRating', 'type'],
      dtype='object')

In [519]:
df.chanceRating.value_counts()

poorchance          27093
fairlygoodchance    18961
goodchance          13031
verygoodchance       7992
greatchance          5017
Great                3768
Very Good            1961
Good                 1064
Fairly Good           948
Superb                892
Penalty               886
Poor                  795
-                     262
superbchance          143
Name: chanceRating, dtype: int64

In [520]:
cleanup_chance = {"chanceRating": {"Penalty": 1, "Superb":0.83, "superbchance": 0.83, 
                                   "greatchance": 0.43, "Great":0.43,
                                   "verygoodchance": 0.22, "Very Good":0.22, 
                                   "Good":0.08 , "goodchance": 0.08, 
                                   "fairlygoodchance": 0.05,  "Fairly Good":0.05,
                                   "Poor": 0.02, "poorchance": 0.02, "-": 0
                                  }}

In [521]:
df.replace(cleanup_chance, inplace = True)
test.replace(cleanup_chance, inplace= True)

In [522]:
df.chanceRating.unique()

array([1.  , 0.02, 0.05, 0.22, 0.08, 0.43, 0.83, 0.  ])

In [523]:
df.chanceRating.value_counts()

0.02    27888
0.05    19909
0.08    14095
0.22     9953
0.43     8785
0.83     1035
1.00      886
0.00      262
Name: chanceRating, dtype: int64

In [527]:
df.type.unique()

array(['Penalty', 'Open Play', 'Dangerous Moment', 'Direct Free-Kick',
       'Open play', 'Penalty Earned', 'Direct free kick', 'Turnover',
       'Direct Corner', 'Shot (Opposition Rebound)', 'Cross High',
       'Direct corner', 'Open Play Pass', 'Cross Low',
       'Shot (Deflection)', 'Corner', 'Free Kick Won'], dtype=object)

In [528]:
df.head()

Unnamed: 0,icon,shotQuality,defPressure,numDefPlayers,numAttPlayers,outcome,chanceRating,type
0,1.0,3,0,1,0,1,1.0,Penalty
1,1.0,3,0,1,0,1,1.0,Penalty
2,1.0,4,0,1,0,1,1.0,Penalty
3,1.0,3,0,1,0,1,1.0,Penalty
4,1.0,3,0,1,0,1,1.0,Penalty


In [529]:
df.type.unique()

array(['Penalty', 'Open Play', 'Dangerous Moment', 'Direct Free-Kick',
       'Open play', 'Penalty Earned', 'Direct free kick', 'Turnover',
       'Direct Corner', 'Shot (Opposition Rebound)', 'Cross High',
       'Direct corner', 'Open Play Pass', 'Cross Low',
       'Shot (Deflection)', 'Corner', 'Free Kick Won'], dtype=object)

In [530]:
df['type'] = df['type'].astype('category')
test['type'] = test['type'].astype('category')
df.dtypes

icon              float64
shotQuality        object
defPressure        object
numDefPlayers      object
numAttPlayers      object
outcome             int64
chanceRating      float64
type             category
dtype: object

In [531]:
df['type'] = df['type'].cat.codes
test['type'] = test['type'].cat.codes
df.type.unique()

array([12,  9,  3,  5, 11, 13,  7, 16,  4, 15,  1,  6, 10,  2, 14,  0,  8])

In [532]:
df.head()

Unnamed: 0,icon,shotQuality,defPressure,numDefPlayers,numAttPlayers,outcome,chanceRating,type
0,1.0,3,0,1,0,1,1.0,12
1,1.0,3,0,1,0,1,1.0,12
2,1.0,4,0,1,0,1,1.0,12
3,1.0,3,0,1,0,1,1.0,12
4,1.0,3,0,1,0,1,1.0,12


In [533]:
df.to_csv('file_name.csv', sep=',')
test.to_csv('test.csv', sep=',')

In [534]:
from numpy import genfromtxt
my_data = genfromtxt('file_name.csv', delimiter=',')
testdata = genfromtxt('test.csv', delimiter=',')

In [536]:
my_data

array([[      nan,       nan,       nan, ...,       nan,       nan,
              nan],
       [0.000e+00, 1.000e+00, 3.000e+00, ..., 1.000e+00, 1.000e+00,
        1.200e+01],
       [1.000e+00, 1.000e+00, 3.000e+00, ..., 1.000e+00, 1.000e+00,
        1.200e+01],
       ...,
       [5.843e+03, 1.000e+00, 3.000e+00, ..., 1.000e+00, 8.300e-01,
        9.000e+00],
       [5.844e+03, 1.000e+00, 4.000e+00, ..., 1.000e+00, 5.000e-02,
        9.000e+00],
       [5.845e+03, 0.000e+00, 3.000e+00, ..., 0.000e+00, 0.000e+00,
        1.300e+01]])

In [537]:
testdata

array([[      nan,       nan,       nan, ...,       nan,       nan,
              nan],
       [0.000e+00, 1.000e+00, 3.000e+00, ..., 1.000e+00, 4.300e-01,
        4.000e+00],
       [1.000e+00, 1.000e+00, 3.000e+00, ..., 1.000e+00, 4.300e-01,
        4.000e+00],
       ...,
       [6.724e+03, 1.000e+00, 3.000e+00, ..., 1.000e+00, 2.200e-01,
        4.000e+00],
       [6.725e+03, 1.000e+00, 4.000e+00, ..., 1.000e+00, 8.000e-02,
        4.000e+00],
       [6.726e+03, 1.000e+00, 4.000e+00, ..., 1.000e+00, 8.000e-02,
        4.000e+00]])

In [542]:
df2 = df.outcome
testdata = test.outcome

In [543]:
target = df2.values
testtarget = testdata
target

array([1, 1, 1, ..., 1, 1, 0])

In [544]:
df.columns

Index(['icon', 'shotQuality', 'defPressure', 'numDefPlayers', 'numAttPlayers',
       'outcome', 'chanceRating', 'type'],
      dtype='object')

In [545]:
df1 = df[['icon', 'shotQuality', 'defPressure', 'numDefPlayers', 'numAttPlayers',
       'chanceRating', 'type']]
test1 = test[['icon', 'shotQuality', 'defPressure', 'numDefPlayers', 'numAttPlayers',
       'chanceRating', 'type']]

In [546]:
df1.chanceRating.unique()

array([1.  , 0.02, 0.05, 0.22, 0.08, 0.43, 0.83, 0.  ])

In [547]:
df1['numDefPlayers'] = df1['numDefPlayers'].replace('-', '0')
test1['numDefPlayers'] = test1['numDefPlayers'].replace('-', '0')

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  """Entry point for launching an IPython kernel.
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  


In [548]:
df1['numAttPlayers'] = df1['numAttPlayers'].replace('-', '0')
test1['numAttPlayers'] = test1['numAttPlayers'].replace('-', '0')

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  """Entry point for launching an IPython kernel.
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  


In [549]:
df1['defPressure'] = df1['defPressure'].replace('-', '0')
test1['defPressure'] = test1['defPressure'].replace('-', '0')

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  """Entry point for launching an IPython kernel.
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  


In [550]:
df1.columns

Index(['icon', 'shotQuality', 'defPressure', 'numDefPlayers', 'numAttPlayers',
       'chanceRating', 'type'],
      dtype='object')

In [551]:
df1.numAttPlayers.value_counts()

0    60639
1    16031
2     4505
3     1287
4      278
5       54
6       12
7        7
Name: numAttPlayers, dtype: int64

In [583]:
#####################################################################################################
# STEP 4: Create target, data, feature names as numpy array

# Getting data ready to apply machine learning algorithms


In [554]:
test = test1.values
test

array([[1.0, '3', '2', ..., '0', 0.43, 4],
       [1.0, '3', '3', ..., '0', 0.43, 4],
       [0.08, '2', '3', ..., '0', 0.08, 4],
       ...,
       [1.0, '3', '4', ..., '0', 0.22, 4],
       [1.0, '4', '4', ..., '1', 0.08, 4],
       [1.0, '4', '0', ..., '0', 0.08, 4]], dtype=object)

In [555]:
data = df1.values
data

array([[1.0, '3', '0', ..., '0', 1.0, 12],
       [1.0, '3', '0', ..., '0', 1.0, 12],
       [1.0, '4', '0', ..., '0', 1.0, 12],
       ...,
       [1.0, '3', '0', ..., '0', 0.83, 9],
       [1.0, '4', '2', ..., '1', 0.05, 9],
       [0.0, '3', '0', ..., '0', 0.0, 13]], dtype=object)

In [556]:
features = df1.columns.values
features

array(['icon', 'shotQuality', 'defPressure', 'numDefPlayers',
       'numAttPlayers', 'chanceRating', 'type'], dtype=object)

In [585]:
#####################################################################################################
# STEP 5: Machine Learning for xG

# Using lasso and RF for now, validate using AUC curve score


In [579]:
from sklearn.linear_model import Lasso
from sklearn.preprocessing import StandardScaler
from sklearn.cross_validation import cross_val_score, ShuffleSplit

  
scaler = StandardScaler()
X = scaler.fit_transform(data)
Y = target
names = features
  
lasso = Lasso(alpha=.3)
lasso.fit(X, Y)
 
    
rf = RandomForestRegressor(n_estimators=20, max_depth=4)
rf.fit(X,Y)
scores = []
for i in range(X.shape[1]):
     score = cross_val_score(rf, X[:, i:i+1], Y, scoring="r2",
                              cv=ShuffleSplit(len(X), 3, .3))
     scores.append((round(np.mean(score), 3), names[i]))
print(sorted(scores, reverse=True))

#A helper method for pretty-printing linear models
def pretty_print_linear(coefs, names = None, sort = False):
    if names.any == None:
        names = ["X%s" % x for x in range(len(coefs))]
    lst = zip(coefs, names)
    if sort:
        lst = sorted(lst,  key = lambda x:-np.abs(x[0]))
    return " + ".join("%s * %s" % (round(coef, 3), name)
                                   for coef, name in lst)

#print(scores)
xg = lasso.predict(test)
#print("Lasso model: ") 
#pretty_print_linear(lasso.coef_, names, sort = True)



[(1.0, 'icon'), (0.637, 'type'), (0.304, 'shotQuality'), (0.28, 'chanceRating'), (0.096, 'numDefPlayers'), (0.013, 'numAttPlayers'), (0.013, 'defPressure')]


In [580]:
from sklearn.metrics import roc_auc_score
roc_auc_score(testtarget, xg)

1.0

In [581]:
##################################################################### END OF XG MODEL, FOR NOW :P ##############################################################

In [451]:
X_train, X_test, y_train, y_test = train_test_split(X, Y, test_size = 0.2)


print('Fitting')
# Logistic Regression
log = LogisticRegression()
#inv_log = LogisticRegression()

log.fit(X_train, y_train)
#inv_log.fit(inv_X_train, y_train)

print('Predicting')
# Prediction
log_prediction = log.predict_proba(X_test)[:, 1]
exp_prediction = X_test.apply(lambda row: exp_xG(row), axis = 1)
#inv_log_prediction = inv_log.predict_proba(inv_X_test)[:, 1]

# RMSE
log_RMSE = m.sqrt(mean_squared_error(y_true = y_test, y_pred = log_prediction.clip(min = 0, max = 1)))
exp_RMSE = m.sqrt(mean_squared_error(y_true = y_test, y_pred = exp_prediction))
#inv_log_RMSE = m.sqrt(mean_squared_error(y_true = y_test, y_pred = inv_log_prediction.clip(min = 0, max = 1)))


print('RMSE Values:')
print('Logistic: ' + str(log_RMSE))
print('Exponential: ' + str(exp_RMSE))
#print('inv_Logistic: ' + str(inv_log_RMSE))

# ROC
log_fpr, log_tpr, log_thresh = roc_curve(y_test, log_prediction)
exp_fpr, exp_tpr, exp_thresh = roc_curve(y_test, exp_prediction)
#inv_log_fpr, inv_log_tpr, inv_log_thresh = roc_curve(y_test, inv_log_prediction)

log_auc = roc_auc_score(y_test, log_prediction)
exp_auc = roc_auc_score(y_test, exp_prediction)
#inv_log_auc = roc_auc_score(y_test, inv_log_prediction)


# Plot ROC curves
fig = plt.figure()
plt.scatter(log_fpr, log_tpr, c = 'b', s = 10, label = 'Logistic')
plt.scatter(exp_fpr, exp_tpr, c = 'r', s = 10, label = 'Exponential')
#plt.scatter(inv_log_fpr, inv_log_tpr, c = 'g', s = 10, label = 'inv_Logistic')

plt.text(0.5, 0.2, 'Log AUC = %.3f\nExp AUC = %.3f'
         % (log_auc, exp_auc) )
plt.legend()



ValueError: Number of labels=62612 does not match number of samples=1

In [None]:
from sklearn.datasets import load_boston
from sklearn.linear_model import (LinearRegression, Ridge, 
                                  Lasso, RandomizedLasso)
from sklearn.feature_selection import RFE, f_regression
from sklearn.preprocessing import MinMaxScaler
from sklearn.ensemble import RandomForestRegressor
import numpy as np
from minepy import MINE
 
np.random.seed(0)
 
size = 750
X = np.random.uniform(0, 1, (size, 14))
 
#"Friedamn #1” regression problem
Y = (10 * np.sin(np.pi*X[:,0]*X[:,1]) + 20*(X[:,2] - .5)**2 +
     10*X[:,3] + 5*X[:,4] + np.random.normal(0,1))
#Add 3 additional correlated variables (correlated with X1-X3)
X[:,10:] = X[:,:4] + np.random.normal(0, .025, (size,4))
 
names = ["x%s" % i for i in range(1,15)]
 
ranks = {}
 
def rank_to_dict(ranks, names, order=1):
    minmax = MinMaxScaler()
    ranks = minmax.fit_transform(order*np.array([ranks]).T).T[0]
    ranks = map(lambda x: round(x, 2), ranks)
    return dict(zip(names, ranks ))
 
lr = LinearRegression(normalize=True)
lr.fit(X, Y)
ranks["Linear reg"] = rank_to_dict(np.abs(lr.coef_), names)
 
ridge = Ridge(alpha=7)
ridge.fit(X, Y)
ranks["Ridge"] = rank_to_dict(np.abs(ridge.coef_), names)
 
 
lasso = Lasso(alpha=.05)
lasso.fit(X, Y)
ranks["Lasso"] = rank_to_dict(np.abs(lasso.coef_), names)
 
 
rlasso = RandomizedLasso(alpha=0.04)
rlasso.fit(X, Y)
ranks["Stability"] = rank_to_dict(np.abs(rlasso.scores_), names)
 
#stop the search when 5 features are left (they will get equal scores)
#rfe = RFE(lr, n_features_to_select=5)
#rfe.fit(X,Y)
#ranks["RFE"] = rank_to_dict(map(float, rfe.ranking_), names, order=-1)
 
rf = RandomForestRegressor()
rf.fit(X,Y)
ranks["RF"] = rank_to_dict(rf.feature_importances_, names)
 
 
f, pval  = f_regression(X, Y, center=True)
ranks["Corr."] = rank_to_dict(f, names)
 
mine = MINE()
mic_scores = []
for i in range(X.shape[1]):
    mine.compute_score(X[:,i], Y)
    m = mine.mic()
    mic_scores.append(m)
 
ranks["MIC"] = rank_to_dict(mic_scores, names) 
 
 
r = {}
for name in names:
    r[name] = round(np.mean([ranks[method][name] 
                             for method in ranks.keys()]), 2)
 
methods = sorted(ranks.keys())
ranks["Mean"] = r
methods.append("Mean")
 
print ("\t%s" % "\t".join(methods))
for name in names:
    print ("%s\t%s" % (name, "\t".join(map(str, 
                         [ranks[method][name] for method in methods]))))

In [None]:
from sklearn.linear_model import Lasso
from sklearn.preprocessing import StandardScaler
from sklearn.datasets import load_boston
from sklearn.cross_validation import cross_val_score, ShuffleSplit

  
boston = load_boston()
scaler = StandardScaler()
X = scaler.fit_transform(boston["data"])
Y = boston["target"]
names = boston["feature_names"]
  
lasso = Lasso(alpha=.3)
lasso.fit(X, Y)
 
    
rf = RandomForestRegressor(n_estimators=20, max_depth=4)
scores = []
for i in range(X.shape[1]):
     score = cross_val_score(rf, X[:, i:i+1], Y, scoring="r2",
                              cv=ShuffleSplit(len(X), 3, .3))
     scores.append((round(np.mean(score), 3), names[i]))
print(sorted(scores, reverse=True))

#A helper method for pretty-printing linear models
def pretty_print_linear(coefs, names = None, sort = False):
    if names.any == None:
        names = ["X%s" % x for x in range(len(coefs))]
    lst = zip(coefs, names)
    if sort:
        lst = sorted(lst,  key = lambda x:-np.abs(x[0]))
    return " + ".join("%s * %s" % (round(coef, 3), name)
                                   for coef, name in lst)
print("Lasso model: ") 
pretty_print_linear(lasso.coef_, names, sort = True)

In [95]:
engch16 = pd.read_csv('latestdatamarch/EngCh/2016-17/2017-06-27_chances_2016-07-01_2017-06-15.csv')
engch17 = pd.read_csv('latestdatamarch/EngCh/chances_from_2017-07-01.csv')
engpr16 = pd.read_csv('latestdatamarch/EngPr/2016-17/2017-06-27_chances_2016-07-01_2017-06-15.csv')
engpr17 = pd.read_csv('latestdatamarch/EngPr/chances_from_2017-07-01.csv')
bl16 = pd.read_csv('latestdatamarch/GerBL1/2016-17/2017-06-27_chances_2016-07-01_2017-06-15.csv')
bl17 = pd.read_csv('latestdatamarch/GerBL1/chances_from_2017-07-01.csv')
ita16 = pd.read_csv('latestdatamarch/ItaSA/2016-17/2017-06-27_chances_2016-07-01_2017-06-15.csv')
ita17 = pd.read_csv('latestdatamarch/ItaSA/chances_from_2017-07-01.csv')
fra16 = pd.read_csv('latestdatamarch/FraL1/2016-17/2017-06-27_chances_2016-07-01_2017-06-15.csv')
fra17 = pd.read_csv('latestdatamarch/FraL1/chances_from_2017-07-01.csv')
spa16 = pd.read_csv('latestdatamarch/SpaPr/2016-17/2017-06-27_chances_2016-07-01_2017-06-15.csv')
spa17 = pd.read_csv('latestdatamarch/SpaPr/chances_from_2017-07-01.csv')

In [129]:
engch16.type.value_counts()

Open Play                    10192
Open play                     1263
Direct Free-Kick               529
Dangerous Moment               335
Penalty                        121
Tackle                          95
-                               57
Penalty Earned                  40
Direct free kick                32
Holding                         23
Handball                        21
Push                            10
Blocking                         5
Dangerous Play                   2
Serious Foul Play                1
Shot (Opposition Rebound)        1
Cross High                       1
Direct Corner                    1
Turnover                         1
Name: type, dtype: int64

In [133]:
l = engch16.loc[engch16['icon'] == 'goal']
l.type.value_counts()

0          Penalty
1          Penalty
2          Penalty
3          Penalty
4          Penalty
5          Penalty
6          Penalty
57         Penalty
58         Penalty
59         Penalty
60         Penalty
61         Penalty
62         Penalty
63         Penalty
64       Open play
70         Penalty
126      Open play
127      Open play
135        Penalty
136      Open play
137      Open play
138      Open play
139        Penalty
140      Open play
141      Open play
142      Open play
143      Open play
144      Open play
145      Open play
146      Open play
           ...    
12527    Open play
12528    Open play
12529    Open play
12530    Open play
12531    Open play
12532    Open play
12533    Open play
12534    Open play
12535    Open play
12536    Open play
12537    Open play
12538      Penalty
12539    Open play
12540    Open play
12541    Open play
12542    Open play
12543    Open play
12544    Open play
12545    Open play
12546    Open play
12547    Open play
12548    Ope

In [127]:
engch17.type.value_counts()

Open Play           8493
Direct Free-Kick     376
Dangerous Moment     358
Penalty               68
[Tackle]              44
-                     27
[Holding]             18
Penalty Earned        16
[Push]                10
[Handball]             6
[Blocking]             5
Direct Corner          4
[Dangerous Play]       1
Name: type, dtype: int64

In [126]:
l = engch17.loc[engch17['icon'] == 'goal']
l.type.value_counts()

Open Play           974
Penalty              68
Direct Free-Kick     29
Name: type, dtype: int64

In [100]:
engpr16.type.value_counts()

Open Play           6455
Open play            920
Direct Free-Kick     360
Dangerous Moment     207
Penalty               81
Tackle                70
-                     41
Direct free kick      27
Penalty Earned        18
Handball              18
Holding               12
Push                   4
Dangerous Play         2
Direct corner          1
Direct Corner          1
Open Play Pass         1
Name: type, dtype: int64

In [101]:
l = engpr16.loc[engpr16['icon'] == 'goal']
l.type.value_counts()

Open play           920
Penalty              81
Direct free kick     27
Direct corner         1
Name: type, dtype: int64

In [102]:
engpr17.type.value_counts()

Open Play           6046
Dangerous Moment     277
Direct Free-Kick     240
Penalty               47
[Tackle]              46
-                     27
Penalty Earned        17
[Push]                 8
[Holding]              6
[Handball]             4
Direct Corner          4
[Blocking]             2
Cross High             1
[Dangerous Play]       1
Open Play Pass         1
Name: type, dtype: int64

In [103]:
l = engpr17.loc[engpr17['icon'] == 'goal']
l.type.value_counts()

Open Play           725
Penalty              47
Direct Free-Kick     11
Direct Corner         1
Name: type, dtype: int64

In [104]:
bl16.type.value_counts()

Open Play            4540
Open play             763
Direct Free-Kick      298
Dangerous Moment      245
Penalty                72
Tackle                 63
Direct free kick       24
-                      20
Penalty Earned         18
Holding                14
Push                    9
Handball                8
Dangerous Play          3
Direct Corner           2
Open Play Pass          2
Shot (Deflection)       1
Cross Low               1
Blocking                1
Turnover                1
Direct corner           1
Name: type, dtype: int64

In [105]:
l = bl16.loc[bl16['icon'] == 'goal']
l.type.value_counts()

Open play           763
Penalty              72
Direct free kick     24
Direct corner         1
Name: type, dtype: int64

In [106]:
bl17.type.value_counts()

Open Play           4653
Dangerous Moment     224
Direct Free-Kick     210
Penalty               58
[Tackle]              39
-                     17
Penalty Earned        16
[Holding]             11
[Handball]            10
[Push]                10
[Blocking]             4
[Dangerous Play]       2
Direct Corner          1
Name: type, dtype: int64

In [107]:
l = bl17.loc[bl17['icon'] == 'goal']
l.type.value_counts()

Open Play           540
Penalty              58
Direct Free-Kick     14
Name: type, dtype: int64

In [108]:
ita16.type.value_counts()

Open Play                    6482
Open play                     966
Direct Free-Kick              348
Dangerous Moment              322
Penalty                        98
Tackle                         78
Penalty Earned                 36
Direct free kick               31
-                              30
Handball                       22
Push                           18
Holding                        14
Direct Corner                   5
Dangerous Play                  2
Blocking                        2
Shot (Deflection)               1
Shot (Opposition Rebound)       1
Name: type, dtype: int64

In [109]:
l = ita16.loc[ita16['icon'] == 'goal']
l.type.value_counts()

Open play           966
Penalty              98
Direct free kick     31
Name: type, dtype: int64

In [110]:
ita17.type.value_counts()

Open Play              5659
Direct Free-Kick        260
Dangerous Moment        255
Penalty                  66
[Tackle]                 54
-                        30
[Handball]               21
Penalty Earned           20
[Holding]                 9
[Push]                    7
Direct Corner             5
Cross Low                 2
[Blocking]                1
[Serious Foul Play]       1
Cross High                1
Name: type, dtype: int64

In [111]:
l = ita17.loc[ita17['icon'] == 'goal']
l.type.value_counts()

Open Play           610
Penalty              66
Direct Free-Kick     25
Direct Corner         2
Name: type, dtype: int64

In [112]:
spa16.type.value_counts()

Open Play           5859
Open play            971
Direct Free-Kick     399
Dangerous Moment     357
Penalty               88
Tackle                58
Direct free kick      34
-                     34
Holding               23
Penalty Earned        20
Handball              17
Push                  14
Dangerous Play         7
Direct Corner          3
Cross High             1
Open Play Pass         1
Blocking               1
Name: type, dtype: int64

In [113]:
l = spa16.loc[spa16['icon'] == 'goal']
l.type.value_counts()

Open play           971
Penalty              88
Direct free kick     34
Name: type, dtype: int64

In [114]:
spa17.type.value_counts()

Open Play              5118
Direct Free-Kick        305
Dangerous Moment        232
Penalty                  60
[Tackle]                 43
-                        26
Penalty Earned           19
[Holding]                15
[Handball]               13
[Push]                   10
[Dangerous Play]          2
[Blocking]                1
[Serious Foul Play]       1
Direct Corner             1
Name: type, dtype: int64

In [115]:
l = spa17.loc[spa17['icon'] == 'goal']
l.type.value_counts()

Open Play           652
Penalty              60
Direct Free-Kick     17
Name: type, dtype: int64

In [116]:
fra16.type.value_counts()

Open Play            6313
Open play             840
Direct Free-Kick      378
Dangerous Moment      227
Penalty                97
Tackle                 82
-                      31
Direct free kick       28
Penalty Earned         21
Holding                16
Handball               14
Push                    7
Blocking                3
Dangerous Play          2
Serious Foul Play       1
Corner                  1
Name: type, dtype: int64

In [118]:
l = fra16.loc[fra16['icon'] == 'goal']
l.type.value_counts()

Open play           840
Penalty              97
Direct free kick     28
Name: type, dtype: int64

In [119]:
fra17.type.value_counts()

Open Play           5228
Direct Free-Kick     379
Dangerous Moment     144
Penalty               76
[Tackle]              56
-                     28
Penalty Earned        16
[Holding]             15
[Handball]            14
[Push]                11
Direct Corner          6
[Blocking]             2
[Dangerous Play]       2
Free Kick Won          1
Name: type, dtype: int64

In [120]:
l = fra17.loc[fra17['icon'] == 'goal']
l.type.value_counts()

Open Play           644
Penalty              76
Direct Free-Kick     20
Direct Corner         1
Name: type, dtype: int64

In [131]:
open_play_goal = 644 + 840 + 652 + 971 + 610 + 966+ 540 + 763 +  725 + 920 + 974 + 1263
open_play = 5228 + 6313 + 5118 + 5659 + 5859 + 6482 + 4653 + 4540 +  6046 + 6455 + 8493 + 10192
open_play_goal / open_play

0.13150670327034303

In [None]:
pen_goal = 121 + 68 + 81 + 47 + 72 + 58 + 98 + 66 + 88 + 60 + 97 + 76
pen = 121 + 68 + 81 + 47 + 72 + 58 + 98 + 66 + 88 + 60 + 97 + 76

In [None]:
import math
import random
import sys

A1 = [0, 0, 1, 1, 0, 0, 0, 
      0, 0, 0, 1, 0, 0, 0, 
      0, 0, 0, 1, 0, 0, 0, 
      0, 0, 1, 0, 1, 0, 0, 
      0, 0, 1, 0, 1, 0, 0, 
      0, 1, 1, 1, 1, 1, 0, 
      0, 1, 0, 0, 0, 1, 0, 
      0, 1, 0, 0, 0, 1, 0, 
      1, 1, 1, 0, 1, 1, 1]

B1 = [1, 1, 1, 1, 1, 1, 0, 
      0, 1, 0, 0, 0, 0, 1, 
      0, 1, 0, 0, 0, 0, 1, 
      0, 1, 0, 0, 0, 0, 1, 
      0, 1, 1, 1, 1, 1, 0, 
      0, 1, 0, 0, 0, 0, 1, 
      0, 1, 0, 0, 0, 0, 1, 
      0, 1, 0, 0, 0, 0, 1, 
      1, 1, 1, 1, 1, 1, 0]

C1 = [0, 0, 1, 1, 1, 1, 1, 
      0, 1, 0, 0, 0, 0, 1, 
      1, 0, 0, 0, 0, 0, 0, 
      1, 0, 0, 0, 0, 0, 0, 
      1, 0, 0, 0, 0, 0, 0, 
      1, 0, 0, 0, 0, 0, 0, 
      1, 0, 0, 0, 0, 0, 0, 
      0, 1, 0, 0, 0, 0, 1, 
      0, 0, 1, 1, 1, 1, 0]

D1 = [1, 1, 1, 1, 1, 0, 0, 
      0, 1, 0, 0, 0, 1, 0, 
      0, 1, 0, 0, 0, 0, 1, 
      0, 1, 0, 0, 0, 0, 1, 
      0, 1, 0, 0, 0, 0, 1, 
      0, 1, 0, 0, 0, 0, 1, 
      0, 1, 0, 0, 0, 0, 1, 
      0, 1, 0, 0, 0, 1, 0, 
      1, 1, 1, 1, 1, 0, 0]

E1 = [1, 1, 1, 1, 1, 1, 1, 
      0, 1, 0, 0, 0, 0, 1, 
      0, 1, 0, 0, 0, 0, 0, 
      0, 1, 0, 1, 0, 0, 0, 
      0, 1, 1, 1, 0, 0, 0, 
      0, 1, 0, 1, 0, 0, 0, 
      0, 1, 0, 0, 0, 0, 0, 
      0, 1, 0, 0, 0, 0, 1, 
      1, 1, 1, 1, 1, 1, 1]

J1 = [0, 0, 0, 1, 1, 1, 1, 
      0, 0, 0, 0, 0, 1, 0, 
      0, 0, 0, 0, 0, 1, 0, 
      0, 0, 0, 0, 0, 1, 0, 
      0, 0, 0, 0, 0, 1, 0, 
      0, 0, 0, 0, 0, 1, 0, 
      0, 1, 0, 0, 0, 1, 0, 
      0, 1, 0, 0, 0, 1, 0, 
      0, 0, 1, 1, 1, 0, 0]

K1 = [1, 1, 1, 0, 0, 1, 1, 
      0, 1, 0, 0, 1, 0, 0, 
      0, 1, 0, 1, 0, 0, 0, 
      0, 1, 1, 0, 0, 0, 0, 
      0, 1, 1, 0, 0, 0, 0, 
      0, 1, 0, 1, 0, 0, 0, 
      0, 1, 0, 0, 1, 0, 0, 
      0, 1, 0, 0, 0, 1, 0, 
      1, 1, 1, 0, 0, 1, 1]

A2 = [0, 0, 0, 1, 0, 0, 0, 
      0, 0, 0, 1, 0, 0, 0, 
      0, 0, 0, 1, 0, 0, 0, 
      0, 0, 1, 0, 1, 0, 0, 
      0, 0, 1, 0, 1, 0, 0, 
      0, 1, 0, 0, 0, 1, 0, 
      0, 1, 1, 1, 1, 1, 0, 
      0, 1, 0, 0, 0, 1, 0, 
      0, 1, 0, 0, 0, 1, 0]

B2 = [1, 1, 1, 1, 1, 1, 0, 
      1, 0, 0, 0, 0, 0, 1, 
      1, 0, 0, 0, 0, 0, 1, 
      1, 0, 0, 0, 0, 0, 1, 
      1, 1, 1, 1, 1, 1, 0, 
      1, 0, 0, 0, 0, 0, 1, 
      1, 0, 0, 0, 0, 0, 1, 
      1, 0, 0, 0, 0, 0, 1, 
      1, 1, 1, 1, 1, 1, 0]

C2 = [0, 0, 1, 1, 1, 0, 0, 
      0, 1, 0, 0, 0, 1, 0, 
      1, 0, 0, 0, 0, 0, 1, 
      1, 0, 0, 0, 0, 0, 0, 
      1, 0, 0, 0, 0, 0, 0, 
      1, 0, 0, 0, 0, 0, 0, 
      1, 0, 0, 0, 0, 0, 1, 
      0, 1, 0, 0, 0, 1, 0, 
      0, 0, 1, 1, 1, 0, 0]

D2 = [1, 1, 1, 1, 1, 0, 0, 
      1, 0, 0, 0, 0, 1, 0, 
      1, 0, 0, 0, 0, 0, 1, 
      1, 0, 0, 0, 0, 0, 1, 
      1, 0, 0, 0, 0, 0, 1, 
      1, 0, 0, 0, 0, 0, 1, 
      1, 0, 0, 0, 0, 0, 1, 
      1, 0, 0, 0, 0, 1, 0, 
      1, 1, 1, 1, 1, 0, 0]

E2 = [1, 1, 1, 1, 1, 1, 1, 
      1, 0, 0, 0, 0, 0, 0, 
      1, 0, 0, 0, 0, 0, 0, 
      1, 0, 0, 0, 0, 0, 0, 
      1, 1, 1, 1, 1, 0, 0, 
      1, 0, 0, 0, 0, 0, 0, 
      1, 0, 0, 0, 0, 0, 0, 
      1, 0, 0, 0, 0, 0, 0, 
      1, 1, 1, 1, 1, 1, 1]

J2 = [0, 0, 0, 0, 0, 1, 0, 
      0, 0, 0, 0, 0, 1, 0, 
      0, 0, 0, 0, 0, 1, 0, 
      0, 0, 0, 0, 0, 1, 0, 
      0, 0, 0, 0, 0, 1, 0, 
      0, 0, 0, 0, 0, 1, 0, 
      0, 1, 0, 0, 0, 1, 0, 
      0, 1, 0, 0, 0, 1, 0, 
      0, 0, 1, 1, 1, 0, 0]

K2 = [1, 0, 0, 0, 0, 1, 0, 
      1, 0, 0, 0, 1, 0, 0, 
      1, 0, 0, 1, 0, 0, 0, 
      1, 0, 1, 0, 0, 0, 0, 
      1, 1, 0, 0, 0, 0, 0, 
      1, 0, 1, 0, 0, 0, 0, 
      1, 0, 0, 1, 0, 0, 0, 
      1, 0, 0, 0, 1, 0, 0, 
      1, 0, 0, 0, 0, 1, 0]

A3 = [0, 0, 0, 1, 0, 0, 0, 
      0, 0, 0, 1, 0, 0, 0, 
      0, 0, 1, 0, 1, 0, 0, 
      0, 0, 1, 0, 1, 0, 0, 
      0, 1, 0, 0, 0, 1, 0, 
      0, 1, 1, 1, 1, 1, 0, 
      1, 0, 0, 0, 0, 0, 1, 
      1, 0, 0, 0, 0, 0, 1, 
      1, 1, 0, 0, 0, 1, 1]

B3 = [1, 1, 1, 1, 1, 1, 0, 
      0, 1, 0, 0, 0, 0, 1, 
      0, 1, 0, 0, 0, 0, 1, 
      0, 1, 1, 1, 1, 1, 0, 
      0, 1, 0, 0, 0, 0, 1, 
      0, 1, 0, 0, 0, 0, 1, 
      0, 1, 0, 0, 0, 0, 1, 
      0, 1, 0, 0, 0, 0, 1, 
      1, 1, 1, 1, 1, 1, 0]

C3 = [0, 0, 1, 1, 1, 0, 1, 
      0, 1, 0, 0, 0, 1, 1, 
      1, 0, 0, 0, 0, 0, 1, 
      1, 0, 0, 0, 0, 0, 0, 
      1, 0, 0, 0, 0, 0, 0, 
      1, 0, 0, 0, 0, 0, 0, 
      1, 0, 0, 0, 0, 0, 1, 
      0, 1, 0, 0, 0, 1, 0, 
      0, 0, 1, 1, 1, 0, 0]

D3 = [1, 1, 1, 1, 0, 0, 0, 
      0, 1, 0, 0, 1, 0, 0, 
      0, 1, 0, 0, 0, 1, 0, 
      0, 1, 0, 0, 0, 1, 0, 
      0, 1, 0, 0, 0, 1, 0, 
      0, 1, 0, 0, 0, 1, 0, 
      0, 1, 0, 0, 0, 1, 0, 
      0, 1, 0, 0, 1, 0, 0, 
      1, 1, 1, 1, 0, 0, 0]

E3 = [1, 1, 1, 1, 1, 1, 1, 
      0, 1, 0, 0, 0, 0, 1, 
      0, 1, 0, 0, 1, 0, 0, 
      0, 1, 1, 1, 1, 0, 0, 
      0, 1, 0, 0, 1, 0, 0, 
      0, 1, 0, 0, 0, 0, 0, 
      0, 1, 0, 0, 0, 0, 0, 
      0, 1, 0, 0, 0, 0, 1, 
      1, 1, 1, 1, 1, 1, 1]

J3 = [0, 0, 0, 0, 1, 1, 1, 
      0, 0, 0, 0, 0, 1, 0, 
      0, 0, 0, 0, 0, 1, 0, 
      0, 0, 0, 0, 0, 1, 0, 
      0, 0, 0, 0, 0, 1, 0, 
      0, 0, 0, 0, 0, 1, 0, 
      0, 0, 0, 0, 0, 1, 0, 
      0, 1, 0, 0, 0, 1, 0, 
      0, 0, 1, 1, 1, 0, 0]

K3 = [1, 1, 1, 0, 0, 1, 1, 
      0, 1, 0, 0, 0, 1, 0, 
      0, 1, 0, 0, 1, 0, 0, 
      0, 1, 0, 1, 0, 0, 0, 
      0, 1, 1, 0, 0, 0, 0, 
      0, 1, 0, 1, 0, 0, 0, 
      0, 1, 0, 0, 1, 0, 0, 
      0, 1, 0, 0, 0, 1, 0, 
      1, 1, 1, 0, 0, 1, 1]

NAMES = ["A1", "B1", "C1", "D1", "E1", "J1", "K1", "A2", "B2", "C2", "D2", "E2", "J2", "K2", "A3", "B3", "C3", "D3", "E3", "J3", "K3"]

MAX_CLUSTERS = 25
INPUT_PATTERNS = 21
VEC_LEN = 63
VEC_XLEN = 5
VEC_YLEN = 5
DECAY_RATE = 0.96 # About 100 iterations.
MIN_ALPHA = 0.01
RADIUS_REDUCTION_POINT = 0.023 # Last 20% of iterations.

class SOM_Class3:
    def __init__(self, vectorLength, maxClusters, numPatterns, xLength, yLength, minimumAlpha, decayRate, reductionPoint, patternArray, namesArray):
        self.mVectorLen = vectorLength
        self.mMaxClusters = maxClusters
        self.mNumPatterns = numPatterns
        self.mXLength = xLength
        self.mYLength = yLength
        self.mMinAlpha = minimumAlpha
        self.mDecayRate = decayRate
        self.mReductionPoint = reductionPoint
        self.mAlpha = 0.6
        self.d = []
        self.w = []
        self.mPatterns = patternArray
        self.mNames = namesArray
        return

    def initialize_arrays(self):
        # create 2D array of random values between 0.0 and 1.0.
        for i in range(self.mMaxClusters):
            newArray = []
            for j in range(self.mVectorLen):
                newArray.append(random.random())

            self.w.append(newArray)

        return
    
    def compute_input(self, vectorArray):
        self.d = [0.0] * self.mMaxClusters
        
        for i in range(self.mMaxClusters):
            for j in range(self.mVectorLen):
                self.d[i] += math.pow((self.w[i][j] - vectorArray[j]), 2)
        
        return
    
    def get_minimum(self, nodeArray):
        minimum = 0
        foundNewMinimum = False
        done = False
    
        while not done:
            foundNewMinimum = False
            for i in range(self.mMaxClusters):
                if i != minimum:
                    if nodeArray[i] < nodeArray[minimum]:
                        minimum = i
                        foundNewMinimum = True
    
            if foundNewMinimum == False:
                done = True
    
        return minimum
    
    def update_weights(self, vectorNumber, dMin):
        y = 0
        pointA = 0
        pointB = 0
        done = False
        
        for i in range(self.mVectorLen):
            # Only include neighbors before radius reduction point is reached.
            if self.mAlpha > self.mReductionPoint:
                y = 1
                while not done:
                    if y == 1: # Top row of 3
                        if dMin > self.mXLength - 1:
                            pointA = dMin - self.mXLength - 1
                            pointB = dMin - self.mXLength + 1
                        else:
                            y = 2
                    
                    if y == 2: # Middle row of 3.
                        PointA = dMin - 1
                        # DMin is like an anchor position right between these two.
                        PointB = dMin + 1
                    
                    if y == 3: # Bottom row of 3.
                        if dMin < (self.mXLength * (self.mYLength - 1)):
                            pointA = dMin + self.mXLength - 1
                            pointB = dMin + self.mXLength + 1
                        else:
                            done = True
                    
                    if not done:
                        for j in range(pointA, pointB):
                            # Check if anchor is at left side.
                            if math.fmod(dMin, self.mXLength) == 0:
                                # Check if anchor is at top.
                                if j > pointA:
                                    self.w[j][i] = self.w[j][i] + (self.mAlpha * (self.mPatterns[vectorNumber][i] - self.w[j][i]))
                            
                            # Check if anchor is at right side.
                            elif math.fmod((dMin + 1), self.mXLength) == 0:
                                # Check if anchor is at top.
                                if j < pointB:
                                    self.w[j][i] = self.w[j][i] + (self.mAlpha * (self.mPatterns[vectorNumber][i] - self.w[j][i]))
                            
                            # Otherwise, anchor is not at either side.
                            else:
                                self.w[j][i] = self.w[j][i] + (self.mAlpha * (self.mPatterns[vectorNumber][i] - self.w[j][i]))
                    
                    if y == 3:
                        done = True
                    
                    y += 1
            
            elif self.mAlpha <= self.mReductionPoint:
                # Update only the winner.
                self.w[dMin][i] = self.w[dMin][i] + (self.mAlpha * (self.mPatterns[vectorNumber][i] - self.w[dMin][i]))
        
        return

    def training(self):
        iterations = 0
        reductionFlag = False
        reductionPoint = 0
        
        while self.mAlpha > self.mMinAlpha:
            iterations += 1
            sys.stdout.write("Training iteration: " + str(iterations) + "\n")
            
            for i in range(self.mNumPatterns):
                self.compute_input(self.mPatterns[i])
                
                dMin = self.get_minimum(self.d)
                
                self.update_weights(i, dMin)
            
            # Reduce the learning rate.
            self.mAlpha = self.mDecayRate * self.mAlpha
            
            # Reduce radius at specified point.
            if self.mAlpha < self.mReductionPoint:
                if reductionFlag == False:
                    reductionFlag = True
                    reductionPoint = iterations
        
        sys.stdout.write("Iterations: " + str(iterations) + "\n")
        
        sys.stdout.write("Neighborhood radius reduced after " + str(reductionPoint) + " iterations.\n")
        
        return
    
    def print_results(self):
        sys.stdout.write("Clusters for training input:\n")
        
        for i in range(self.mNumPatterns):
            self.compute_input(self.mPatterns[i])
            
            dMin = self.get_minimum(self.d)
            
            sys.stdout.write("Vector (")
            sys.stdout.write("Pattern " + str(i) + ", " + self.mNames[i])
            sys.stdout.write(") fits into category " + str(dMin) + "\n")
        return


if __name__ == '__main__':
    pattern = []
    pattern.append(A1)
    pattern.append(B1)
    pattern.append(C1)
    pattern.append(D1)
    pattern.append(E1)
    pattern.append(J1)
    pattern.append(K1)
    pattern.append(A2)
    pattern.append(B2)
    pattern.append(C2)
    pattern.append(D2)
    pattern.append(E2)
    pattern.append(J2)
    pattern.append(K2)
    pattern.append(A3)
    pattern.append(B3)
    pattern.append(C3)
    pattern.append(D3)
    pattern.append(E3)
    pattern.append(J3)
    pattern.append(K3)

    som = SOM_Class3(VEC_LEN, MAX_CLUSTERS, INPUT_PATTERNS, VEC_XLEN, VEC_YLEN, MIN_ALPHA, DECAY_RATE, RADIUS_REDUCTION_POINT, pattern, NAMES)
    som.initialize_arrays()
    som.training()
    som.print_results()
    