In [2]:
import pandas as pd
import numpy as np
import itertools
import json

import string as s
from collections import OrderedDict
from random import randint

from sklearn.svm import LinearSVC
from sklearn.model_selection import StratifiedKFold
from sklearn.svm import SVC
from sklearn import metrics
from matplotlib import pyplot as plt

In [3]:
df = pd.read_csv("match_chat_data.csv")

In [4]:
df = df[0:1000]

In [5]:
df.head()

Unnamed: 0.1,Unnamed: 0,Match Id,Match Duration,Radiant Chat,Dire Chat,Radiant Win
0,0,3548935602,2517,"[('GOTTEM', 1500), ('LOL', 1578), ('The triple...","[('GOTTEM', 1570), ('NIOCE MEDUSA', 2525)]",True
1,1,3548935303,2644,"[('peru', 8), ('peru', 33), ('peru', 1587), ('...","[('MATENLO AL PUDGE ESTA SOLO ARRIBA :V', -14)...",True
2,2,3548935205,3629,"[('is he coming back ?', 228), ('hes not comin...","[('+', 228), ('СУКА', 369), ('no', 377), ('thx...",False
3,3,3548935007,2514,"[('leave our jungle you monkey', -24), (""i'm a...","[('xD', -20), ('shut up tiger', -12), ('hahahh...",True
4,4,3548934300,2339,[],"[('aw', 383), ('coming', 525), ('babi', 680), ...",False


We now create the dictionary of possible words to create our feature vectors

In [6]:
def get_word_dict(df):
    """Get a dictionary of unique words where key is the word and value is the order it was added."""
    word_dict = {}
    count = 0
    
    for row in df.itertuples():
        #get radiant words
        radiant_chat = row[4]
        dire_chat = row[5]
        
        #remove brackets and parantheses, and single quotes
        radiant_chat = radiant_chat.replace("(", "").replace(")", "").replace("[", "").replace("]", "").replace("'", "")
        dire_chat = dire_chat.replace("(", "").replace(")", "").replace("[", "").replace("]", "").replace("'", "")
        
        all_chat = radiant_chat + ", " + dire_chat
        
        #remove punctuation and replace with spaces
        for i in s.punctuation:
            all_chat = all_chat.replace(i, " ")
        all_chat = all_chat.lower()
        for word in all_chat.split(" "):
            if not word.isdigit() and word not in word_dict:
                word_dict[word] = count
                count += 1
    
    return word_dict
        
        
        

In [7]:
word_dict = get_word_dict(df)
len(word_dict)

10716

Next we form our feature vectors for each teams chat so there will be number of games x 2 feature vectors as we will ahve a feature vector for each team in a game (dire and radiant). Feature vectors will be of the form 

In [8]:
def generate_feature_matrix(df, word_dict, portion_of_match=1):
    """Takes in the dataframe of match information (chat and outcome) and 
    creates feature matrix of chat feature vectors.
    df - dataframe of match data 
    word_dict - dictionary of all possible words used
    portion_of_match - number between (0,1) indicating which part of the match you want to take words from
                        with 1 being all the match and 0 being none of it.
    """
   
    feature_matrix = []
    labels = [] # 0 indicates a loss 1 is a win
    
    for row in df.itertuples():
        #get radiant and dire chats
        radiant_chat = row[4]
        dire_chat = row[5]
        radiant_win = row[6]
        match_duration = int(row[3])
        match_portion_max_time = portion_of_match * match_duration
        
        #for each feature vector, we append them in the order radiant then dire
        if radiant_win == True:
            labels.append(1)
            labels.append(0)
        else:
            labels.append(0)
            labels.append(1)
        
        #remove brackets and parantheses, and single quotes
        radiant_chat = radiant_chat.replace("(", "").replace(")", "").replace("[", "").replace("]", "").replace("'", "")
        dire_chat = dire_chat.replace("(", "").replace(")", "").replace("[", "").replace("]", "").replace("'", "")
        
        #remove punctuation and replace with spaces
        for i in s.punctuation:
            radiant_chat = radiant_chat.replace(i, " ")
            dire_chat = dire_chat.replace(i, " ")
        
        radiant_chat = radiant_chat.lower()
        dire_chat = dire_chat.lower()
        
        radiant_vector = np.zeros(len(word_dict))
        dire_vector = np.zeros(len(word_dict))
        
        skip_next = False
        for word in radiant_chat.split():
            if not word.isdigit() and skip_next == False:
                radiant_vector[word_dict[word]] = 1
            if word.isdigit() and int(word) <= match_portion_max_time:
                skip_next = False
            else:
                skip_next = True
                
        skip_next = False 
        for word in dire_chat.split():
            if not word.isdigit():
                dire_vector[word_dict[word]] = 1
            if word.isdigit() and int(word) <= match_portion_max_time:
                skip_next = False
            else:
                skip_next = True
        
        feature_matrix.append(radiant_vector)
        feature_matrix.append(dire_vector)
        
    feature_matrix = np.asmatrix(feature_matrix)
    labels = np.asarray(labels)
    return feature_matrix, labels
        
        
        

In [9]:
feature_matrix, labels = generate_feature_matrix(df, word_dict)

In [10]:
def cv_performance(clf, X, y, k=5):
    """
    Splits the data, X and y, into k-folds and runs k-fold crossvalidation:
    training a classifier on K-1 folds and testing on the remaining fold.
    Calculates the k-fold crossvalidation performance metric for classifier
    clf by averaging the performance across folds.
    Input:
    clf- an instance of SVC()
    X- (n,d) array of feature vectors, where n is the number of examples
       and d is the number of features
    y- (n,) array of binary labels {1,-1}
    k- int specificyin the number of folds (default=5)
    Returns: average 'test' performance across the k folds as np.float64
    """
    skf = StratifiedKFold(n_splits = k);
    iteration = 0;
    average = 0;
    for train_index, test_index in skf.split(X, y):

        X_train, X_test = X[train_index], X[test_index]
        y_train, y_test = y[train_index], y[test_index]
        clf.fit(X_train, y_train);
        
        print("for iteration " + str(iteration));
        accuracy = metrics.accuracy_score(y_test, clf.predict(X_test));
        average = average + accuracy;
        print(str(accuracy));
        iteration = iteration + 1;
    
    average = average / k;
    return average;

In [11]:
clf = SVC(kernel = 'linear', class_weight = 'balanced')

In [12]:
cv_performance(clf, feature_matrix, labels)

for iteration 0
0.625
for iteration 1
0.615
for iteration 2
0.5975
for iteration 3
0.6325
for iteration 4
0.64


0.622

We next try the same approach but this time training on a neural net with 1-layer and 100 neurons.

In [13]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(feature_matrix, labels)

In [14]:
from sklearn.preprocessing import StandardScaler
scaler = StandardScaler()
# Fit only to the training data
scaler.fit(X_train)

StandardScaler(copy=True, with_mean=True, with_std=True)

In [15]:
# Now apply the transformations to the data:
X_train = scaler.transform(X_train)
X_test = scaler.transform(X_test)

In [16]:
from sklearn.neural_network import MLPClassifier
mlp = MLPClassifier(hidden_layer_sizes=(100))

In [17]:
mlp.fit(X_train,y_train)

MLPClassifier(activation='relu', alpha=0.0001, batch_size='auto', beta_1=0.9,
       beta_2=0.999, early_stopping=False, epsilon=1e-08,
       hidden_layer_sizes=100, learning_rate='constant',
       learning_rate_init=0.001, max_iter=200, momentum=0.9,
       nesterovs_momentum=True, power_t=0.5, random_state=None,
       shuffle=True, solver='adam', tol=0.0001, validation_fraction=0.1,
       verbose=False, warm_start=False)

In [18]:
predictions = mlp.predict(X_test)

In [19]:
from sklearn.metrics import classification_report,confusion_matrix, accuracy_score
print(confusion_matrix(y_test,predictions))

[[115 137]
 [ 84 164]]


In [20]:
print(classification_report(y_test,predictions))

             precision    recall  f1-score   support

          0       0.58      0.46      0.51       252
          1       0.54      0.66      0.60       248

avg / total       0.56      0.56      0.55       500



In [21]:
print(accuracy_score(y_test, predictions))

0.558


The neural net with 1 hidden layer and 100 neurons gives us only a 56% accuracy. Lets try adding 2 more hidden layers 

In [22]:
mlp = MLPClassifier(hidden_layer_sizes=(100,100,100))

In [23]:
mlp.fit(X_train,y_train)

MLPClassifier(activation='relu', alpha=0.0001, batch_size='auto', beta_1=0.9,
       beta_2=0.999, early_stopping=False, epsilon=1e-08,
       hidden_layer_sizes=(100, 100, 100), learning_rate='constant',
       learning_rate_init=0.001, max_iter=200, momentum=0.9,
       nesterovs_momentum=True, power_t=0.5, random_state=None,
       shuffle=True, solver='adam', tol=0.0001, validation_fraction=0.1,
       verbose=False, warm_start=False)

In [24]:
predictions = mlp.predict(X_test)

In [25]:
print(accuracy_score(y_test, predictions))

0.552


Increasing the number of hidden layers did not do much to accuracy. Lets play with the number of neurons now in each layer and see what effect that has.

In [27]:
mlp = MLPClassifier(hidden_layer_sizes=(500,500,500))
mlp.fit(X_train,y_train)
predictions = mlp.predict(X_test)
print(accuracy_score(y_test, predictions))

0.588


Increasing the number of neurons decreased our accuracy by 4 percent. Lets go back to 100 neurons per layer and add in another layer.

In [28]:
mlp = MLPClassifier(hidden_layer_sizes=(100,100,100,100))
mlp.fit(X_train,y_train)
predictions = mlp.predict(X_test)
print(accuracy_score(y_test, predictions))

0.584


In [29]:
mlp = MLPClassifier(hidden_layer_sizes=(100,100,100))
mlp.fit(X_train,y_train)
predictions = mlp.predict(X_test)
print(accuracy_score(y_test, predictions))

0.586


# Match Prediction at Different Points of The Match

We now use our model to predict the match outcome based on chat from using only a quarter of the match, half the match, then three quarters of the match. We hypothesize that the accuracy will increase monotonically as losing players will express their frustrations more in chat as the game swings more in the winning teams favor.

In [40]:
feature_matrix_quarter_match, labels = generate_feature_matrix(df, word_dict, .25)
feature_matrix_half_match, labels = generate_feature_matrix(df, word_dict, .5)
feature_matrix_three_quarter_match, labels = generate_feature_matrix(df, word_dict, .75)

In [31]:
cv_performance(clf, feature_matrix_quarter_match, labels)

for iteration 0
0.545
for iteration 1
0.5675
for iteration 2
0.5625
for iteration 3
0.5675
for iteration 4
0.56


0.5605

In [32]:
cv_performance(clf, feature_matrix_half_match, labels)

for iteration 0
0.595
for iteration 1
0.5775
for iteration 2
0.575
for iteration 3
0.5975
for iteration 4
0.555


0.57999999999999996

In [41]:
cv_performance(clf, feature_matrix_three_quarter_match, labels)

for iteration 0
0.61
for iteration 1
0.58
for iteration 2
0.5625
for iteration 3
0.6125
for iteration 4
0.585


0.59000000000000008