## Library and data imports##

In [2]:
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
import pandas as pd
import numpy as np
from sklearn.linear_model import SGDClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score

# review = "Please note that this is purely my experience of the game during the stupid hours I have sunk into it. I won't be completely critical of the game and I'll give credit to a couple of things, most notably the gameplay. I like the animations and the motion of the players, they are fun to control. I play on normal gameplay speed and it feels great. Players are responsive and it is very fun trying to keep shape defensively (especially against ultimate difficulty) but with practice, you get there. Attacking is satisfying too, I like the different dribbling styles, although R1 dribbling at the time of this review is a little overpowered, but even still, I am happy that it's possible to dribble past players in this year's game and score some lovely solo goals. I play purely offline, as where I am currently located I just cannot get a good enough connection to have any kind of online experience. HOWEVER - despite EA releasing updates, not one of them has combated the biggest issue myself and many MANY others face, which is the abhorrent optimisation which is leading to stutters and for some, lag and game crashes. I would not want to know how many of those hours I have spent trialling fixes and it is the first time in my gaming experience to encounter a game that just won't play. Sometimes a workaround will fix things temporarily, but upon the next launch, it just doesn't work again. This to me is unacceptable. They are aware of the problem, yet nothing has changed now for well over a month of it being released. The hell I've been through to sort it should never be required to play any game. I paid £80, but I feel like EA owe me a year's salary for the amount of work I've done on their game for them. Crooks. Stutters in these kinds of games destroy any pleasurable experience you can get from it. Imagine playing an F1 game, but before every corner it stutters for 3 seconds and when it fixes itself, you've crashed into an entire neighbouring village and caused £500 million worth of damage, you'd say 'Forget it' and toss it in the bin. The same goes for football games. You need precision and reactions because timing is important. You can't do that with 3-second stutters intermittently during a match. How can nobody have seen this? I feel like Mugatu from Zoolander. It was a new era for the franchise, a fresh start, so I allowed myself to believe that things may be different, but it just isn't the case for any of the offline modes. If you think adding cut scenes of players winning the Balon d'Or (where nobody even speaks other than some announcer, which freaks me out) and an open-top bus parade of trophies you've won (which is cool the first time, but it is the same every single time after) then you don't understand why we play football games - WE WANT TO PLAY, NOT SIT THERE AND WATCH SILENT MOVIES. Is this their attempt at making it more immersive? Disaster. True, some promising ideas were introduced, like tactical vision and hiring coaches in manager mode, but here's the thing - that was bugged when it was released. Imagine working on a new feature, but for it not to work properly. What I mean is, that you can't fire coaches, you'll lock the game and have no chance to back out. I actually hope it doesn't get patched, because it doesn't matter, it tells us what we already know - they don't care. It's a shame because I think that if they'd just put more effort into it, this could've been a really good game. The foundation is there with the gameplay, and that's the heart of any good football game. But why not just stop being a bunch of fraggles and develop your offline game modes more? The amount of work they put into 'create a team' for example, which was just uninspiring, is what I am talking about. Put that work into actually making a comprehensive, immersive career mode. 'Create a team' was fantastic in the F1 game, it works in that circumstance, but for a manager mode on a football game? It isn't what we want. For those who can't be bothered to read all that: you'll feel just like Mugatu from Zoolander - It really is the same face."
# review2 = 'MWIII is the worst campaign of all time. 14 short missions and half are just spec ops missions from MWII, dropping you in a section of the warzone map and giving you objectives. It was like a warzone tutorial. The story is also far less than adequate. This seems like it should have been a DLC for MWII . I have never been more upset with a game. The multiplayer needs to be phenomenal for this game to be worth buying at all.'

# reviews = [review, review2]
df = pd.read_csv('dataset.csv')
reviews = df['review']
y_pre = df['voted_up']


## Dealing with imbalanced data ##
This would be where we deal with the fact that most of the reviews are positive and will probably do subsampling

In [12]:
from random import sample

up = df[df['voted_up'] == True]
down = df[df['voted_up'] == False]
print(len(up)/len(down))

df_sub = down
df_sub = pd.concat([df_sub, (up.sample(n=len(down)))])

print(len(down))
print(len(df_sub))

5.649185001096411
27362
54724


In [4]:
reviews = np.array(df_sub['review'])
y_pre = np.array(df_sub['voted_up'])

## Tokenization ##
Can cover the following hyperparameters (and their tuning):
- How do we deal with contractions?
- What kind of normalization do we do? (convert accents to ascii? stemming? lemmatization?)
- Do we use stopwords? Which ones?
- Other CountVectorizer hyperparams
- How do we handle punctuation?  
- How much should we subsample? (Need to determine performance metric first)

In [5]:
# preprocessing so the model can understand the reviews
pp_reviews = []
y = []

# filters out reviews with non-ascii characters -- we get an error if we don't include this. Need to narrow down the exact cause if possible.
for i in range(0, len(reviews)-1):
    if str(reviews[i]).isascii():
        pp_reviews.append(str(reviews[i]))
        y.append(y_pre[i])

In [6]:
def get_testing_accuracy(X, y):
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
    # X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, test_size=0.25, random_state=42)

    sgd_classifier = SGDClassifier(random_state=42)
    sgd_classifier.fit(X_train, y_train)
    y_pred = sgd_classifier.predict(X_test)
    return accuracy_score(y_test, y_pred)

In [7]:
def do_vectorization_transformation(count_vect, X):
    # tokenizes the reviews
    X_train_counts = count_vect.fit_transform(np.array(X))

    # transforms them so we deal with term frequencies rather than term counts
    tfidf_transformer = TfidfTransformer()
    X = tfidf_transformer.fit_transform(X_train_counts)
    return X

In [8]:
# configuring hyperparameters for the count vectorizer

# baseline
count_vect = CountVectorizer()
X = do_vectorization_transformation(count_vect, pp_reviews)
print("baseline: " + str(get_testing_accuracy(X, y)))

baseline: 0.973395026026605


In [12]:
# max_df/min_df
for min_df in np.arange(0, 1.1, 0.1):
    for max_df in np.arange(0, 1.1, 0.1):
        try:
            count_vect = CountVectorizer(min_df=min_df, max_df=max_df)
            X = do_vectorization_transformation(count_vect, pp_reviews)
            print("min_df: " + str(min_df) + ", max_df: " + str(max_df) + ": " + str(get_testing_accuracy(X, y)))
        except:
            print("min_df: " + str(min_df) + ", max_df: " + str(max_df) + ": " + "invalid combination")

min_df: 0.0, max_df: 0.0: invalid combination
min_df: 0.0, max_df: 0.1: 0.9713541666666666
min_df: 0.0, max_df: 0.2: 0.9721257716049383
min_df: 0.0, max_df: 0.30000000000000004: 0.9754050925925926
min_df: 0.0, max_df: 0.4: 0.9729938271604939
min_df: 0.0, max_df: 0.5: 0.9729938271604939
min_df: 0.0, max_df: 0.6000000000000001: 0.9734760802469136
min_df: 0.0, max_df: 0.7000000000000001: 0.9734760802469136
min_df: 0.0, max_df: 0.8: 0.9734760802469136
min_df: 0.0, max_df: 0.9: 0.9734760802469136
min_df: 0.0, max_df: 1.0: 0.9734760802469136
min_df: 0.1, max_df: 0.0: invalid combination
min_df: 0.1, max_df: 0.1: invalid combination
min_df: 0.1, max_df: 0.2: 0.6706211419753086
min_df: 0.1, max_df: 0.30000000000000004: 0.6829668209876543
min_df: 0.1, max_df: 0.4: 0.6921296296296297
min_df: 0.1, max_df: 0.5: 0.6990740740740741
min_df: 0.1, max_df: 0.6000000000000001: 0.7091049382716049
min_df: 0.1, max_df: 0.7000000000000001: 0.7091049382716049
min_df: 0.1, max_df: 0.8: 0.7091049382716049
min_d

With all other factors equal, the best min_df/max_df combination was 0.0, 0.1 with a testing accuracy of 0.9803 when using an SVM

In [20]:
# with/without stopwords
# taken from NLTK stopword set, stripped of punctuation
stopwords = ['i', 'me', 'my', 'myself', 'we', 'our', 'ours', 'ourselves', 'you', 'youre', 'youve', 'youll', 'youd', 'your', 'yours', 'yourself', 'yourselves', 'he', 'him', 'his', 'himself', 'she', 'shes', 'her', 'hers', 'herself', 'it', 'its', 'its', 'itself', 'they', 'them', 'their', 'theirs', 'themselves', 'what', 'which', 'who', 'whom', 'this', 'that', 'thatll', 'these', 'those', 'am', 'is', 'are', 'was', 'were', 'be', 'been', 'being', 'have', 'has', 'had', 'having', 'do', 'does', 'did', 'doing', 'a', 'an', 'the', 'and', 'but', 'if', 'or', 'because', 'as', 'until', 'while', 'of', 'at', 'by', 'for', 'with', 'about', 'against', 'between', 'into', 'through', 'during', 'before', 'after', 'above', 'below', 'to', 'from', 'up', 'down', 'in', 'out', 'on', 'off', 'over', 'under', 'again', 'further', 'then', 'once', 'here', 'there', 'when', 'where', 'why', 'how', 'all', 'any', 'both', 'each', 'few', 'more', 'most', 'other', 'some', 'such', 'no', 'nor', 'not', 'only', 'own', 'same', 'so', 'than', 'too', 'very', 's', 't', 'can', 'will', 'just', 'don', 'dont', 'should', 'shouldve', 'now', 'd', 'll', 'm', 'o', 're', 've', 'y', 'ain', 'aren', 'arent', 'couldn', 'couldnt', 'didn', 'didnt', 'doesn', 'doesnt', 'hadn', 'hadnt', 'hasn', 'hasnt', 'haven', 'havent', 'isn', 'isnt', 'ma', 'mightn', 'mightnt', 'mustn', 'mustnt', 'needn', 'neednt', 'shan', 'shant', 'shouldn', 'shouldnt', 'wasn', 'wasnt', 'weren', 'werent', 'won', 'wont', 'wouldn', 'wouldnt']
count_vect = CountVectorizer(stop_words=stopwords)
X = do_vectorization_transformation(count_vect, pp_reviews)
print("with stopwords: " + str(get_testing_accuracy(X, y)))

with stopwords: 0.9703896604938271


The testing performance with stopwords was slightly better than without, with an improvement of 0.0002

In [21]:
# n-gram values
for min_n in np.arange(1, 11):
    for max_n in np.arange(min_n, 11):
        try:
            count_vect = CountVectorizer(ngram_range=(min_n, max_n))
            X = do_vectorization_transformation(count_vect, pp_reviews)
            print("min_df: " + str(min_n) + ", max_df: " + str(max_n) + ": " + str(get_testing_accuracy(X, y)))
        except:
            print("min_df: " + str(min_n) + ", max_df: " + str(max_n) + ": " + "invalid combination")

min_df: 1, max_df: 1: 0.9734760802469136
min_df: 1, max_df: 2: 0.9779128086419753
min_df: 1, max_df: 3: 0.9740547839506173
min_df: 1, max_df: 4: 0.9710648148148148
min_df: 1, max_df: 5: 0.9683641975308642
min_df: 1, max_df: 6: 0.9664351851851852
min_df: 1, max_df: 7: 0.9653742283950617
min_df: 1, max_df: 8: 0.9645061728395061
min_df: 1, max_df: 9: 0.9638310185185185
min_df: 1, max_df: 10: 0.9627700617283951
min_df: 2, max_df: 2: 0.9616126543209876
min_df: 2, max_df: 3: 0.9559220679012346
min_df: 2, max_df: 4: 0.9529320987654321
min_df: 2, max_df: 5: 0.9506172839506173
min_df: 2, max_df: 6: 0.9488811728395061
min_df: 2, max_df: 7: 0.9481095679012346
min_df: 2, max_df: 8: 0.9478202160493827
min_df: 2, max_df: 9: 0.9477237654320988
min_df: 2, max_df: 10: 0.9478202160493827
min_df: 3, max_df: 3: 0.9345100308641975
min_df: 3, max_df: 4: 0.9333526234567902
min_df: 3, max_df: 5: 0.9333526234567902
min_df: 3, max_df: 6: 0.9333526234567902
min_df: 3, max_df: 7: 0.9333526234567902
min_df: 3, max

With all other factors equal, the best n-gram combination was (1, 2) with a testing accuracy of 0.9870 when using an SVM

In [28]:
# using the optimal configuration from all previous hyperparameter testing
stopwords = ['i', 'me', 'my', 'myself', 'we', 'our', 'ours', 'ourselves', 'you', 'youre', 'youve', 'youll', 'youd', 'your', 'yours', 'yourself', 'yourselves', 'he', 'him', 'his', 'himself', 'she', 'shes', 'her', 'hers', 'herself', 'it', 'its', 'its', 'itself', 'they', 'them', 'their', 'theirs', 'themselves', 'what', 'which', 'who', 'whom', 'this', 'that', 'thatll', 'these', 'those', 'am', 'is', 'are', 'was', 'were', 'be', 'been', 'being', 'have', 'has', 'had', 'having', 'do', 'does', 'did', 'doing', 'a', 'an', 'the', 'and', 'but', 'if', 'or', 'because', 'as', 'until', 'while', 'of', 'at', 'by', 'for', 'with', 'about', 'against', 'between', 'into', 'through', 'during', 'before', 'after', 'above', 'below', 'to', 'from', 'up', 'down', 'in', 'out', 'on', 'off', 'over', 'under', 'again', 'further', 'then', 'once', 'here', 'there', 'when', 'where', 'why', 'how', 'all', 'any', 'both', 'each', 'few', 'more', 'most', 'other', 'some', 'such', 'no', 'nor', 'not', 'only', 'own', 'same', 'so', 'than', 'too', 'very', 's', 't', 'can', 'will', 'just', 'don', 'dont', 'should', 'shouldve', 'now', 'd', 'll', 'm', 'o', 're', 've', 'y', 'ain', 'aren', 'arent', 'couldn', 'couldnt', 'didn', 'didnt', 'doesn', 'doesnt', 'hadn', 'hadnt', 'hasn', 'hasnt', 'haven', 'havent', 'isn', 'isnt', 'ma', 'mightn', 'mightnt', 'mustn', 'mustnt', 'needn', 'neednt', 'shan', 'shant', 'shouldn', 'shouldnt', 'wasn', 'wasnt', 'weren', 'werent', 'won', 'wont', 'wouldn', 'wouldnt']
count_vect = CountVectorizer(ngram_range=(1, 2),min_df=0.0, max_df=0.1, stop_words=stopwords)
X = do_vectorization_transformation(count_vect, pp_reviews)
print("testing accuracy with optimal configuration with each individual preprocessing hyperparameter: " + str(get_testing_accuracy(X, y)))

testing accuracy with optimal configuration with each preprocessing hyperparameter: 0.9838476729448522


## Model Training ##

Will need to have consideration of how we measure performance (it likely isn't pure accuracy. Precision? Recall?)

In [10]:
from sklearn.metrics import confusion_matrix

count_vect = CountVectorizer(ngram_range=(1, 2))
X = do_vectorization_transformation(count_vect, pp_reviews)

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
# X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, test_size=0.25, random_state=42)

sgd_classifier = SGDClassifier(random_state=42)
sgd_classifier.fit(X_train, y_train)
y_pred = sgd_classifier.predict(X_test)
print(confusion_matrix(y_test, y_pred))
print(get_testing_accuracy(X, y))

[[5109  200]
 [  22 5043]]
0.9786003470213996


In [11]:
from sklearn.neighbors import KNeighborsClassifier
X_subtrain, X_val, y_subtrain, y_val = train_test_split(X_train, y_train, test_size=0.125, random_state=123)
for k in range(1, 10):
    neigh = KNeighborsClassifier(n_neighbors=k)
    neigh.fit(X_subtrain,y_subtrain)
    y_predict = neigh.predict(X_val)
    print(k, accuracy_score(y_val, y_predict))

1 0.970310391363023
2 0.970310391363023
3 0.9568151147098516
4 0.9568151147098516
5 0.9552727973780606
6 0.9556583767110083
7 0.9550800077115866
8 0.9539232697127434
9 0.9527665317139001
