## Import Libraries

In [9]:
import os
import pandas as pd
import numpy as np
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from sklearn.preprocessing import StandardScaler
max_features = 50
maxlen = 100

## read dataset

In [10]:
df=pd.read_csv("data.csv")
df=df.drop(["game_season",'team_name','date_of_game','home/away', 'match_id','team_id','match_event_id','lat/lng'], axis=1)
df_goal=df['is_goal']
df_id=df['shot_id_number']




## preprocess

In [11]:
#convert to string
df.area_of_shot=df.area_of_shot.astype(str)
df.shot_basics=df.shot_basics.astype(str)
df.range_of_shot=df.range_of_shot.astype(str)
df.type_of_shot=df.type_of_shot.astype(str)
df.type_of_combined_shot=df.type_of_combined_shot.astype(str)
df_text=df[["area_of_shot","shot_basics","range_of_shot","type_of_shot","type_of_combined_shot"]].copy()
df_text["text"] = df_text["area_of_shot"].map(str) + df_text["shot_basics"]+df_text["range_of_shot"]+df_text["type_of_shot"]+df_text["type_of_combined_shot"]
df_text.drop(['area_of_shot','shot_basics', 'range_of_shot', 'type_of_shot','type_of_combined_shot'], axis=1)
df=df.drop(['area_of_shot','shot_basics', 'range_of_shot', 'type_of_shot','type_of_combined_shot'], axis=1)

puncts = [',', '.', '"', ':', ')', '(', '-', '!', '?', '|', ';', "'", '$', '&', '/', '[', ']', '>', '%', '=', '#', '*', '+', '\\', '•',  '~', '@', '£', 
 '·', '_', '{', '}', '©', '^', '®', '`',  '<', '→', '°', '€', '™', '›',  '♥', '←', '×', '§', '″', '′', 'Â', '█', '½', 'à', '…', 
 '“', '★', '”', '–', '●', 'â', '►', '−', '¢', '²', '¬', '░', '¶', '↑', '±', '¿', '▾', '═', '¦', '║', '―', '¥', '▓', '—', '‹', '─', 
 '▒', '：', '¼', '⊕', '▼', '▪', '†', '■', '’', '▀', '¨', '▄', '♫', '☆', 'é', '¯', '♦', '¤', '▲', 'è', '¸', '¾', 'Ã', '⋅', '‘', '∞', 
 '∙', '）', '↓', '、', '│', '（', '»', '，', '♪', '╩', '╚', '³', '・', '╦', '╣', '╔', '╗', '▬', '❤', 'ï', 'Ø', '¹', '≤', '‡', '√', ]

def clean_text(x):
    x = str(x)
    for punct in puncts:
        x = x.replace(punct, f' {punct} ')
    return x
df_text["text"] = df_text["text"].apply(lambda x: clean_text(x))
# lower
df_text["text"] = df_text["text"].apply(lambda x: x.lower())
df_text["text"] = df_text["text"].fillna("_##_").values
def add_features(df_text):
    
    df_text['text'] = df_text['text'].apply(lambda x:str(x))
    df_text['total_length'] = df_text['text'].apply(len)
    df_text['capitals'] = df_text['text'].apply(lambda comment: sum(1 for c in comment if c.isupper()))
    df_text['caps_vs_length'] = df_text.apply(lambda row: float(row['capitals'])/float(row['total_length']),
                                axis=1)
    df_text['num_words'] = df_text.text.str.count('\S+')
    df_text['num_unique_words'] = df_text['text'].apply(lambda comment: len(set(w for w in comment.split())))
    df_text['words_vs_unique'] = df_text['num_unique_words'] / df_text['num_words']  
    return df_text

train = add_features(df_text)
features = train[['caps_vs_length', 'words_vs_unique']].fillna(0)
ss = StandardScaler()
ss.fit(features)
features = ss.transform(features)

## Tokenize the sentences
tokenizer = Tokenizer(num_words=max_features)
tokenizer.fit_on_texts(list(df_text['text']))
train_X = tokenizer.texts_to_sequences(df_text['text'])

## pad sequences
train_X = pad_sequences(train_X, maxlen=maxlen)

#word index
word_index=tokenizer.word_index

## EDA

In [12]:
from mpl_toolkits.mplot3d import Axes3D
from sklearn.preprocessing import StandardScaler
import matplotlib.pyplot as plt # plotting
import numpy as np # linear algebra
import os # accessing directory structure
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
# Distribution graphs (histogram/bar graph) of column data
def plotPerColumnDistribution(df, nGraphShown, nGraphPerRow):
    nunique = df.nunique()
    df = df[[col for col in df if nunique[col] > 1 and nunique[col] < 50]] # For displaying purposes, pick columns that have between 1 and 50 unique values
    nRow, nCol = df.shape
    columnNames = list(df)
    nGraphRow = (nCol + nGraphPerRow - 1) / nGraphPerRow
    plt.figure(num = None, figsize = (6 * nGraphPerRow, 8 * nGraphRow), dpi = 80, facecolor = 'w', edgecolor = 'k')
    for i in range(min(nCol, nGraphShown)):
        plt.subplot(nGraphRow, nGraphPerRow, i + 1)
        columnDf = df.iloc[:, i]
        if (not np.issubdtype(type(columnDf.iloc[0]), np.number)):
            valueCounts = columnDf.value_counts()
            valueCounts.plot.bar()
        else:
            columnDf.hist()
        plt.ylabel('counts')
        plt.xticks(rotation = 90)
        plt.title(f'{columnNames[i]} (column {i})')
    plt.tight_layout(pad = 1.0, w_pad = 1.0, h_pad = 1.0)
    plt.show()
    

In [13]:
# Correlation matrix
def plotCorrelationMatrix(df, graphWidth):
    filename = df.dataframeName
    df = df.dropna('columns') # drop columns with NaN
    df = df[[col for col in df if df[col].nunique() > 1]] # keep columns where there are more than 1 unique values
    if df.shape[1] < 2:
        print(f'No correlation plots shown: The number of non-NaN or constant columns ({df.shape[1]}) is less than 2')
        return
    corr = df.corr()
    plt.figure(num=None, figsize=(graphWidth, graphWidth), dpi=80, facecolor='w', edgecolor='k')
    corrMat = plt.matshow(corr, fignum = 1)
    plt.xticks(range(len(corr.columns)), corr.columns, rotation=90)
    plt.yticks(range(len(corr.columns)), corr.columns)
    plt.gca().xaxis.tick_bottom()
    plt.colorbar(corrMat)
    plt.title(f'Correlation Matrix for {filename}', fontsize=15)
    plt.show()

In [14]:
nRowsRead = None
df1 = pd.read_csv('data.csv', delimiter=',', nrows = nRowsRead)
df1.dataframeName = 'data.csv'
nRow, nCol = df1.shape
print(f'There are {nRow} rows and {nCol} columns')

There are 30697 rows and 28 columns


In [15]:
#Distribution graphs (histogram/bar graph) of sampled columns:
#plotPerColumnDistribution(df1, 10, 5)


In [16]:
#Correlation matrix:
#plotCorrelationMatrix(df1, 8)

## word embeddings

In [17]:
def load_glove(word_index):
    EMBEDDING_FILE = 'glove.6B.300d.txt'
    def get_coefs(word,*arr): return word, np.asarray(arr, dtype='float32')[:300]
    embeddings_index = dict(get_coefs(*o.split(" ")) for o in open(EMBEDDING_FILE,encoding="utf8"))
    
    all_embs = np.stack(embeddings_index.values())
    emb_mean,emb_std = -0.005838499,0.48782197
    embed_size = all_embs.shape[1]

    # word_index = tokenizer.word_index
    nb_words = min(max_features, len(word_index))
    # Why random embedding for OOV? what if use mean?
    embedding_matrix = np.random.normal(emb_mean, emb_std, (nb_words, embed_size))
    #embedding_matrix = np.random.normal(emb_mean, 0, (nb_words, embed_size)) # std 0
    for word, i in word_index.items():
        if i >= max_features: continue
        embedding_vector = embeddings_index.get(word)
        if embedding_vector is not None: embedding_matrix[i] = embedding_vector
            
    return embedding_matrix 
    
def load_fasttext(word_index):    
    EMBEDDING_FILE = 'wiki-news-300d-1M.vec'
    def get_coefs(word,*arr): return word, np.asarray(arr, dtype='float32')
    embeddings_index = dict(get_coefs(*o.split(" ")) for o in open(EMBEDDING_FILE,encoding="utf8") if len(o)>100)

    all_embs = np.stack(embeddings_index.values())
    emb_mean,emb_std = all_embs.mean(), all_embs.std()
    embed_size = all_embs.shape[1]

    # word_index = tokenizer.word_index
    nb_words = min(max_features, len(word_index))
    embedding_matrix = np.random.normal(emb_mean, emb_std, (nb_words, embed_size))
    #embedding_matrix = np.random.normal(emb_mean, 0, (nb_words, embed_size))
    for word, i in word_index.items():
        if i >= max_features: continue
        embedding_vector = embeddings_index.get(word)
        if embedding_vector is not None: embedding_matrix[i] = embedding_vector

    return embedding_matrix

glove_embeddings = load_glove(word_index)
#paragram_embeddings = load_para(word_index)
fasttext_embeddings = load_fasttext(word_index)
embedding_matrix = np.mean([glove_embeddings,fasttext_embeddings], axis=0)


  


## Scaling

In [18]:
text=pd.DataFrame(train_X)
df=df.join(text)
df_test=df.loc[df.is_goal.isnull()]
df=df.drop(df[df.is_goal.isnull()].index) 
df=df.drop("shot_id_number", axis=1)
df_goal1=df['is_goal']
df=df.drop("is_goal", axis=1)
df=df.join(df_goal1)
df = df.fillna(df.mean())
from sklearn.preprocessing import MinMaxScaler
scaler = MinMaxScaler() 
scaled_values = scaler.fit_transform(df.iloc[:,:]) 
df.iloc[:,:] = scaled_values
df_test=df_test.drop('is_goal',axis=1)
df_test_shotid=df_test['shot_id_number']
df_test=df_test.drop('shot_id_number',axis=1)
df_test = df_test.fillna(df.mean())
from sklearn.preprocessing import MinMaxScaler
scaler = MinMaxScaler() 
scaled_values = scaler.fit_transform(df_test.iloc[:,:]) 
df_test.iloc[:,:] = scaled_values





  return self.partial_fit(X, y)
  return self.partial_fit(X, y)


## split the dataset

In [19]:
df_test = df_test.fillna(df.mean())
from sklearn.preprocessing import MinMaxScaler
scaler = MinMaxScaler() 
scaled_values = scaler.fit_transform(df_test.iloc[:,:]) 
df_test.iloc[:,:] = scaled_values


In [20]:
X = pd.DataFrame(df.iloc[:,0:-1 ])
y = df.iloc[:, -1].values
from sklearn.model_selection import train_test_split
X_train, X_valid, y_train, y_valid = train_test_split(X, y, test_size = 0.25, random_state = 42)


## training

In [21]:
from sklearn import metrics
def train_model(classifier, X_train, y_train, X_test, y_test,is_neural_net=False):
    classifier.fit(X_train, y_train,epochs=5)
    predictions = classifier.predict(X_test)
    if is_neural_net:
        predictions = predictions.argmax(axis=-1)
    
    return metrics.accuracy_score(predictions, y_test)

In [24]:
len(word_index)

123

## RCNN

In [25]:
from keras import activations
from keras import backend as K
from keras.engine.topology import Layer

def squash(x, axis=-1):
    s_squared_norm = K.sum(K.square(x), axis, keepdims=True) + K.epsilon()
    scale = K.sqrt(s_squared_norm)/ (0.5 + s_squared_norm)
    return scale * x


#define our own softmax function instead of K.softmax
def softmax(x, axis=-1):
    ex = K.exp(x - K.max(x, axis=axis, keepdims=True))
    return ex/K.sum(ex, axis=axis, keepdims=True)


#A Capsule Implement with Pure Keras
class Capsule(Layer):
    def __init__(self, num_capsule, dim_capsule, routings=3, share_weights=True, activation='squash', **kwargs):
        super(Capsule, self).__init__(**kwargs)
        self.num_capsule = num_capsule
        self.dim_capsule = dim_capsule
        self.routings = routings
        self.share_weights = share_weights
        if activation == 'squash':
            self.activation = squash
        else:
            self.activation = activations.get(activation)

    def build(self, input_shape):
        super(Capsule, self).build(input_shape)
        input_dim_capsule = input_shape[-1]
        if self.share_weights:
            self.W = self.add_weight(name='capsule_kernel',
                                     shape=(1, input_dim_capsule,
                                            self.num_capsule * self.dim_capsule),
                                     initializer='glorot_uniform',
                                     trainable=True)
        else:
            input_num_capsule = input_shape[-2]
            self.W = self.add_weight(name='capsule_kernel',
                                     shape=(input_num_capsule,
                                            input_dim_capsule,
                                            self.num_capsule * self.dim_capsule),
                                     initializer='glorot_uniform',
                                     trainable=True)

    def call(self, u_vecs):
        if self.share_weights:
            u_hat_vecs = K.conv1d(u_vecs, self.W)
        else:
            u_hat_vecs = K.local_conv1d(u_vecs, self.W, [1], [1])

        batch_size = K.shape(u_vecs)[0]
        input_num_capsule = K.shape(u_vecs)[1]
        u_hat_vecs = K.reshape(u_hat_vecs, (batch_size, input_num_capsule,
                                            self.num_capsule, self.dim_capsule))
        u_hat_vecs = K.permute_dimensions(u_hat_vecs, (0, 2, 1, 3))
        #final u_hat_vecs.shape = [None, num_capsule, input_num_capsule, dim_capsule]

        b = K.zeros_like(u_hat_vecs[:,:,:,0]) #shape = [None, num_capsule, input_num_capsule]
        for i in range(self.routings):
            c = softmax(b, 1)
            o = K.batch_dot(c, u_hat_vecs, [2, 2])
            if K.backend() == 'theano':
                o = K.sum(o, axis=1)
            if i < self.routings - 1:
                o = K.l2_normalize(o, -1)
                b = K.batch_dot(o, u_hat_vecs, [2, 3])
                if K.backend() == 'theano':
                    b = K.sum(b, axis=1)

        return self.activation(o)

    def compute_output_shape(self, input_shape):
        return (None, self.num_capsule, self.dim_capsule)

In [36]:
from keras.layers import K, Activation
from keras.engine import Layer
from keras.layers import LeakyReLU, Dense, Input, Embedding, Dropout, Bidirectional, GRU, Flatten, SpatialDropout1D
from keras.datasets import imdb
from keras.preprocessing import sequence
from keras.models import Model

#from vendor.Capsule.Capsule_Keras import *

gru_len = 256
Routings = 3
Num_capsule = 10
Dim_capsule = 16
dropout_p = 0.25
rate_drop_dense = 0.28
embed_size = 300

def get_model():
    input1 = Input(shape=(113,))
    embed_layer = Embedding(max_features,
                            embed_size,
                            input_length=113)(input1)
    embed_layer = SpatialDropout1D(rate_drop_dense)(embed_layer)

    x = Bidirectional(GRU(gru_len,
                          activation='relu',
                          dropout=dropout_p,
                          recurrent_dropout=dropout_p,
                          return_sequences=True))(embed_layer)
    capsule = Capsule(
        num_capsule=Num_capsule,
        dim_capsule=Dim_capsule,
        routings=Routings,
        share_weights=True)(x)

    capsule = Flatten()(capsule)
    capsule = Dropout(dropout_p)(capsule)
    capsule = LeakyReLU()(capsule)

    x = Flatten()(x)
    output = Dense(1, activation='softmax')(x)
    model1 = Model(inputs=input1, outputs=output)
    model1.compile(
        loss='binary_crossentropy',
        optimizer='adam',
        metrics=['accuracy'])
    model1.summary()

    return model1


model1 = get_model()
batch_size = 450
epochs = 5

model1.fit(X_train, y_train, batch_size=batch_size, epochs=epochs,
              validation_data=(X_valid, y_valid))

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
input_4 (InputLayer)         (None, 113)               0         
_________________________________________________________________
embedding_4 (Embedding)      (None, 113, 300)          15000     
_________________________________________________________________
spatial_dropout1d_3 (Spatial (None, 113, 300)          0         
_________________________________________________________________
bidirectional_3 (Bidirection (None, 113, 512)          855552    
_________________________________________________________________
flatten_6 (Flatten)          (None, 57856)             0         
_________________________________________________________________
dense_3 (Dense)              (None, 1)                 57857     
Total params: 928,409
Trainable params: 928,409
Non-trainable params: 0
_________________________________________________________________
Trai

<keras.callbacks.History at 0x1e322f51518>

In [37]:
predictions = model1.predict(X_valid)


## mean absolute error

In [29]:
from sklearn.metrics import mean_absolute_error
mae=mean_absolute_error(y_valid,predictions)
print(mae)

0.4957044901961818


## Prediction and Evaluation

In [34]:
goal_predict=model.predict_proba(df_test)[:,1]
prob= pd.DataFrame(goal_predict)
prob['is_goal'] = prob[0]
prob = prob.drop([0], axis = 1)
df_test_shotid = df_test_shotid.fillna(df_test_shotid.mean())
df_test_shotid=pd.DataFrame(df_test_shotid)
df_test_shotid=df_test_shotid.join(prob['is_goal'])







In [35]:
df_test_shotid.to_csv(r'C:\Ghost Drive\study\research\zs/Abhishek_Kathuria_20041998_code_3.csv')
