In [None]:
import nltk
import pandas as pd
from nltk.corpus import stopwords
from keras.models import Sequential
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.layers import Dense, Embedding, LSTM, SpatialDropout1D, Conv1D, MaxPooling1D, MaxPooling1D, GlobalMaxPooling1D
from sklearn.model_selection import train_test_split 
import matplotlib.pyplot as plt
import re
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
import tensorflow as tf
import numpy as np
nltk.download('punkt')
nltk.download('wordnet')
nltk.download('stopwords')
import chardet
from keras.layers import Dense, Embedding, LSTM, Dropout, Bidirectional
from keras.constraints import maxnorm
from tensorflow.keras.optimizers import Adamax 
from keras import regularizers
nltk.download('omw-1.4')

In [None]:
# Location of the classified-data CSV; defined once so encoding detection and
# loading can never end up reading two different files.
DATA_PATH = '../input/nlp-data/nlp_vader_textblob_classified_data.csv'

# The file's encoding is not known up front, so sniff it from the raw bytes.
# NOTE: this reads the whole file into memory; feed chardet only a prefix if
# the file grows large.
with open(DATA_PATH, 'rb') as f:
    enc = chardet.detect(f.read())

tweetData = pd.read_csv(DATA_PATH, encoding=enc['encoding'], index_col=False)
tweetData

In [None]:
# Class labels of every row; used as the prediction target of the all-teams model.
labels = tweetData['final_class']
def featureEngineering(tweet):
    """Normalise a raw tweet into lower-case, space-separated words.

    The text is lower-cased; URLs, ``$``-prefixed symbols and ``@``-prefixed
    handles are blanked out; every character that is not a letter or an
    apostrophe becomes a space; finally one-character words are dropped and
    runs of whitespace collapse to single spaces.

    Args:
        tweet: Raw tweet text. Non-string values (for example the float NaN
            pandas uses for a missing cell) are treated as empty text.

    Returns:
        The cleaned text as one string ('' when nothing is left).
    """
    if not isinstance(tweet, str):
        return ''
    tweetMod = tweet.lower()
    # Raw strings keep the regex escapes intact; the previous non-raw
    # literals relied on invalid escape sequences such as '\/' and '\$'.
    # URLs
    tweetMod = re.sub(r'https?://[a-zA-Z0-9@:%._/+~#=?&;-]*', ' ', tweetMod)
    # Words starting with '$'
    tweetMod = re.sub(r'\$[a-zA-Z0-9]*', ' ', tweetMod)
    # User handles (words starting with '@')
    tweetMod = re.sub(r'@[a-zA-Z0-9]*', ' ', tweetMod)
    # Everything that is not a letter or an apostrophe
    tweetMod = re.sub(r"[^a-zA-Z']", ' ', tweetMod)
    # split() also collapses the whitespace introduced above
    return ' '.join(w for w in tweetMod.split() if len(w) > 1)


# Clean the raw text of every row and keep it next to the original content
tweetData['modTweet'] = list(map(featureEngineering, tweetData['content']))

# Filled on first use with the English stop-word set and the lemmatizer, so
# they are built once instead of once per tweet.
_lemmatizeResources = {}


def lemmatizeTweet(tweet):
    """Tokenise a cleaned tweet, drop stop words and lemmatize the rest.

    Args:
        tweet: Text to process (the output of featureEngineering in this
            notebook).

    Returns:
        The surviving words, lemmatized first as nouns and then as verbs,
        joined by single spaces.
    """
    if not _lemmatizeResources:
        _lemmatizeResources['stop'] = frozenset(stopwords.words('english'))
        _lemmatizeResources['wnl'] = nltk.stem.WordNetLemmatizer()
    stop = _lemmatizeResources['stop']
    wnl = _lemmatizeResources['wnl']

    # Keep purely alphabetic tokens that are not stop words
    words = [
        word for word in word_tokenize(tweet)
        if word.isalpha() and word not in stop
    ]
    # Lemmatize words (first noun, then verb)
    lemmatized = [wnl.lemmatize(wnl.lemmatize(word, 'n'), 'v') for word in words]
    return " ".join(lemmatized)

# Lemmatized, stop-word-free version of every cleaned tweet
tweetData['lemmatizedText'] = tweetData['modTweet'].apply(lemmatizeTweet)

In [None]:
# One-hot encode the 3 classes straight from the source column instead of from
# the mutable `labels` variable: re-running this cell can then never encode
# labels that were already encoded.
labels = tf.keras.utils.to_categorical(tweetData['final_class'], 3, dtype="float32")

# Turn each lemmatized text into a sequence of word indices (vocabulary capped
# by num_words), then pad all sequences to a common length.
tokenizer = Tokenizer(num_words=37320, split=' ')
tokenizer.fit_on_texts(tweetData['lemmatizedText'].values)
X = tokenizer.texts_to_sequences(tweetData['lemmatizedText'].values)
X = pad_sequences(X)

# 70/30 train/test split with a fixed seed for reproducibility
X_train, X_test, Y_train, Y_test = train_test_split(X, labels, test_size=0.3, random_state=42)

In [None]:
# Sanity check: feature-matrix shape next to label-matrix shape, per split
print(*(arr.shape for arr in (X_train, Y_train)))
print(*(arr.shape for arr in (X_test, Y_test)))

In [None]:
# Embedding -> dropout -> two stacked bidirectional LSTMs (dropout in between)
# -> 3-way softmax. input_dim equals the tokenizer's num_words.
model_dropout = Sequential()
model_dropout.add(Embedding(input_dim=37320, output_dim=40, input_length=X.shape[1]))
model_dropout.add(Dropout(0.5))
model_dropout.add(Bidirectional(LSTM(units=256, return_sequences=True, kernel_initializer='he_uniform')))
model_dropout.add(Dropout(0.5))
model_dropout.add(Bidirectional(LSTM(units=128, return_sequences=False, kernel_initializer='he_uniform')))
model_dropout.add(Dense(3, activation='softmax'))
# `learning_rate` is the supported keyword; `lr` is only a deprecated alias
optimizer = Adamax(learning_rate=0.01)
model_dropout.compile(loss='categorical_crossentropy', optimizer=optimizer, metrics=['categorical_accuracy'])
# summary() prints the table itself and returns None, so it must not be
# wrapped in print() (that emitted a stray "None").
model_dropout.summary()

In [None]:
# Train the all-teams model; the held-out split doubles as validation data,
# so every "val_*" metric in `history` is measured on X_test / Y_test.
history = model_dropout.fit(
    X_train,
    Y_train,
    batch_size=512,
    epochs=20,
    validation_data=(X_test, Y_test),
)

In [None]:
# Accuracy per epoch on an explicit figure, so curves from other plots can
# never end up in this image regardless of the matplotlib backend.
fig, ax = plt.subplots()
ax.plot(history.history['categorical_accuracy'], label='train')
ax.plot(history.history['val_categorical_accuracy'], label='test')
ax.set(title='All teams: accuracy', xlabel='Epochs', ylabel='Accuracy')
ax.legend(loc='upper left')
fig.savefig('./overall_acc.png')

In [None]:
# Loss per epoch on an explicit figure, so curves from other plots can never
# end up in this image regardless of the matplotlib backend.
fig, ax = plt.subplots()
ax.plot(history.history['loss'], label='train')
ax.plot(history.history['val_loss'], label='test')
ax.set(title='All teams: loss', xlabel='Epochs', ylabel='Loss')
ax.legend(loc='upper left')
fig.savefig('./overall_loss.png')

In [None]:
# Persist the all-teams model twice: under an extension-less path (format
# chosen by Keras — presumably a SavedModel directory, depends on the
# installed version) and as a single HDF5 file.
model_dropout.save(filepath='./overall')
model_dropout.save(filepath='./overall.h5')

In [None]:
# One independent frame per team. `.copy()` detaches each subset from
# tweetData, so assigning columns on it later neither raises
# SettingWithCopyWarning nor depends on view-vs-copy behaviour.
manuData = tweetData.loc[tweetData['primaryTeam'] == 'Manchester United'].copy()
mancData = tweetData.loc[tweetData['primaryTeam'] == 'ManCity'].copy()
chelseaData = tweetData.loc[tweetData['primaryTeam'] == 'Chelsea'].copy()
arsenalData = tweetData.loc[tweetData['primaryTeam'] == 'Arsenal'].copy()
liverpoolData = tweetData.loc[tweetData['primaryTeam'] == 'Liverpool'].copy()
totData = tweetData.loc[tweetData['primaryTeam'] == 'Tottenham'].copy()

# **Manchester United**

In [None]:
# Preview the Manchester United subset (bare last expression -> rich DataFrame display)
manuData

In [None]:
# Class labels of the Manchester United rows
labels = manuData['final_class']

# featureEngineering() and lemmatizeTweet() are defined once, in the
# preprocessing cell of the all-teams model, and are reused here rather than
# being redefined (a second identical `def` only shadows the first).
manuData['modTweet'] = [featureEngineering(tweet) for tweet in manuData['content']]
manuData['lemmatizedText'] = manuData["modTweet"].apply(lambda x:lemmatizeTweet(x))

In [None]:
# One-hot encode the 3 classes straight from the source column instead of from
# the mutable `labels` variable: re-running this cell can then never encode
# labels that were already encoded.
labels = tf.keras.utils.to_categorical(manuData['final_class'], 3, dtype="float32")

# Team-specific tokenizer: word indices are fitted on this subset only
tokenizer = Tokenizer(num_words=37320, split=' ')
tokenizer.fit_on_texts(manuData['lemmatizedText'].values)
X = tokenizer.texts_to_sequences(manuData['lemmatizedText'].values)
X = pad_sequences(X)

# 70/30 train/test split with a fixed seed for reproducibility
X_train, X_test, Y_train, Y_test = train_test_split(X, labels, test_size=0.3, random_state=42)

In [None]:
# Sanity check: feature-matrix shape next to label-matrix shape, per split
print(*(arr.shape for arr in (X_train, Y_train)))
print(*(arr.shape for arr in (X_test, Y_test)))

In [None]:
# Fresh Manchester United model, same architecture as the all-teams one:
# embedding -> dropout -> two stacked bidirectional LSTMs -> 3-way softmax.
model_dropout = Sequential()
model_dropout.add(Embedding(input_dim=37320, output_dim=40, input_length=X.shape[1]))
model_dropout.add(Dropout(0.5))
model_dropout.add(Bidirectional(LSTM(units=256, return_sequences=True, kernel_initializer='he_uniform')))
model_dropout.add(Dropout(0.5))
model_dropout.add(Bidirectional(LSTM(units=128, return_sequences=False, kernel_initializer='he_uniform')))
model_dropout.add(Dense(3, activation='softmax'))
# `learning_rate` is the supported keyword; `lr` is only a deprecated alias
optimizer = Adamax(learning_rate=0.01)
model_dropout.compile(loss='categorical_crossentropy', optimizer=optimizer, metrics=['categorical_accuracy'])
# summary() prints the table itself and returns None; no print() wrapper
model_dropout.summary()

In [None]:
# Train the Manchester United model; "val_*" metrics come from the held-out split
history = model_dropout.fit(
    X_train,
    Y_train,
    batch_size=512,
    epochs=20,
    validation_data=(X_test, Y_test),
)

In [None]:
# Accuracy per epoch on an explicit figure, so curves from other plots can
# never end up in this image regardless of the matplotlib backend.
fig, ax = plt.subplots()
ax.plot(history.history['categorical_accuracy'], label='train')
ax.plot(history.history['val_categorical_accuracy'], label='test')
ax.set(title='Manchester United: accuracy', xlabel='Epochs', ylabel='Accuracy')
ax.legend(loc='upper left')
fig.savefig('./manu_acc.png')

In [None]:
# Loss per epoch on an explicit figure, so curves from other plots can never
# end up in this image regardless of the matplotlib backend.
fig, ax = plt.subplots()
ax.plot(history.history['loss'], label='train')
ax.plot(history.history['val_loss'], label='test')
ax.set(title='Manchester United: loss', xlabel='Epochs', ylabel='Loss')
ax.legend(loc='upper left')
fig.savefig('./manu_loss.png')

In [None]:
# Persist the Manchester United model under an extension-less path and as HDF5
model_dropout.save(filepath='./manU')
model_dropout.save(filepath='./manU.h5')

# **Manchester City**

In [None]:
# Preview the Manchester City subset (bare last expression -> rich DataFrame display)
mancData

In [None]:
# Class labels plus cleaned / lemmatized text for the Manchester City rows.
# NOTE(review): tweetData already carried both text columns when this subset
# was taken, so these lines appear to recompute identical values — confirm.
labels = mancData['final_class']
mancData['modTweet'] = list(map(featureEngineering, mancData['content']))
mancData['lemmatizedText'] = mancData['modTweet'].apply(lemmatizeTweet)

In [None]:
# One-hot encode the 3 classes straight from the source column instead of from
# the mutable `labels` variable: re-running this cell can then never encode
# labels that were already encoded.
labels = tf.keras.utils.to_categorical(mancData['final_class'], 3, dtype="float32")

# Team-specific tokenizer: word indices are fitted on this subset only
tokenizer = Tokenizer(num_words=37320, split=' ')
tokenizer.fit_on_texts(mancData['lemmatizedText'].values)
X = tokenizer.texts_to_sequences(mancData['lemmatizedText'].values)
X = pad_sequences(X)

# 70/30 train/test split with a fixed seed for reproducibility
X_train, X_test, Y_train, Y_test = train_test_split(X, labels, test_size=0.3, random_state=42)

In [None]:
# Sanity check: feature-matrix shape next to label-matrix shape, per split
print(*(arr.shape for arr in (X_train, Y_train)))
print(*(arr.shape for arr in (X_test, Y_test)))

In [None]:
# Fresh Manchester City model, same architecture as the all-teams one:
# embedding -> dropout -> two stacked bidirectional LSTMs -> 3-way softmax.
model_dropout = Sequential()
model_dropout.add(Embedding(input_dim=37320, output_dim=40, input_length=X.shape[1]))
model_dropout.add(Dropout(0.5))
model_dropout.add(Bidirectional(LSTM(units=256, return_sequences=True, kernel_initializer='he_uniform')))
model_dropout.add(Dropout(0.5))
model_dropout.add(Bidirectional(LSTM(units=128, return_sequences=False, kernel_initializer='he_uniform')))
model_dropout.add(Dense(3, activation='softmax'))
# `learning_rate` is the supported keyword; `lr` is only a deprecated alias
optimizer = Adamax(learning_rate=0.01)
model_dropout.compile(loss='categorical_crossentropy', optimizer=optimizer, metrics=['categorical_accuracy'])
# summary() prints the table itself and returns None; no print() wrapper
model_dropout.summary()

In [None]:
# Train the Manchester City model; "val_*" metrics come from the held-out split
history = model_dropout.fit(
    X_train,
    Y_train,
    batch_size=512,
    epochs=20,
    validation_data=(X_test, Y_test),
)

In [None]:
# Accuracy per epoch on an explicit figure, so curves from other plots can
# never end up in this image regardless of the matplotlib backend.
fig, ax = plt.subplots()
ax.plot(history.history['categorical_accuracy'], label='train')
ax.plot(history.history['val_categorical_accuracy'], label='test')
ax.set(title='Manchester City: accuracy', xlabel='Epochs', ylabel='Accuracy')
ax.legend(loc='upper left')
fig.savefig('./manc_acc.png')

In [None]:
# Loss per epoch on an explicit figure, so curves from other plots can never
# end up in this image regardless of the matplotlib backend.
fig, ax = plt.subplots()
ax.plot(history.history['loss'], label='train')
ax.plot(history.history['val_loss'], label='test')
ax.set(title='Manchester City: loss', xlabel='Epochs', ylabel='Loss')
ax.legend(loc='upper left')
fig.savefig('./manc_loss.png')

In [None]:
# Persist the Manchester City model under an extension-less path and as HDF5
model_dropout.save(filepath='./manC')
model_dropout.save(filepath='./manC.h5')

# **Arsenal**

In [None]:
# Preview the Arsenal subset (bare last expression -> rich DataFrame display)
arsenalData

In [None]:
# Class labels plus cleaned / lemmatized text for the Arsenal rows.
# NOTE(review): tweetData already carried both text columns when this subset
# was taken, so these lines appear to recompute identical values — confirm.
labels = arsenalData['final_class']
arsenalData['modTweet'] = list(map(featureEngineering, arsenalData['content']))
arsenalData['lemmatizedText'] = arsenalData['modTweet'].apply(lemmatizeTweet)

In [None]:
# One-hot encode the 3 classes straight from the source column instead of from
# the mutable `labels` variable: re-running this cell can then never encode
# labels that were already encoded.
labels = tf.keras.utils.to_categorical(arsenalData['final_class'], 3, dtype="float32")

# Team-specific tokenizer: word indices are fitted on this subset only
tokenizer = Tokenizer(num_words=37320, split=' ')
tokenizer.fit_on_texts(arsenalData['lemmatizedText'].values)
X = tokenizer.texts_to_sequences(arsenalData['lemmatizedText'].values)
X = pad_sequences(X)

# 70/30 train/test split with a fixed seed for reproducibility
X_train, X_test, Y_train, Y_test = train_test_split(X, labels, test_size=0.3, random_state=42)

In [None]:
# Sanity check: feature-matrix shape next to label-matrix shape, per split
print(*(arr.shape for arr in (X_train, Y_train)))
print(*(arr.shape for arr in (X_test, Y_test)))

In [None]:
# Fresh Arsenal model, same architecture as the all-teams one:
# embedding -> dropout -> two stacked bidirectional LSTMs -> 3-way softmax.
model_dropout = Sequential()
model_dropout.add(Embedding(input_dim=37320, output_dim=40, input_length=X.shape[1]))
model_dropout.add(Dropout(0.5))
model_dropout.add(Bidirectional(LSTM(units=256, return_sequences=True, kernel_initializer='he_uniform')))
model_dropout.add(Dropout(0.5))
model_dropout.add(Bidirectional(LSTM(units=128, return_sequences=False, kernel_initializer='he_uniform')))
model_dropout.add(Dense(3, activation='softmax'))
# `learning_rate` is the supported keyword; `lr` is only a deprecated alias
optimizer = Adamax(learning_rate=0.01)
model_dropout.compile(loss='categorical_crossentropy', optimizer=optimizer, metrics=['categorical_accuracy'])
# summary() prints the table itself and returns None; no print() wrapper
model_dropout.summary()

In [None]:
# Train the Arsenal model; "val_*" metrics come from the held-out split
history = model_dropout.fit(
    X_train,
    Y_train,
    batch_size=512,
    epochs=20,
    validation_data=(X_test, Y_test),
)

In [None]:
# Accuracy per epoch on an explicit figure, so curves from other plots can
# never end up in this image regardless of the matplotlib backend.
fig, ax = plt.subplots()
ax.plot(history.history['categorical_accuracy'], label='train')
ax.plot(history.history['val_categorical_accuracy'], label='test')
ax.set(title='Arsenal: accuracy', xlabel='Epochs', ylabel='Accuracy')
ax.legend(loc='upper left')
fig.savefig('./arsenal_acc.png')

In [None]:
# Loss per epoch on an explicit figure, so curves from other plots can never
# end up in this image regardless of the matplotlib backend.
fig, ax = plt.subplots()
ax.plot(history.history['loss'], label='train')
ax.plot(history.history['val_loss'], label='test')
ax.set(title='Arsenal: loss', xlabel='Epochs', ylabel='Loss')
ax.legend(loc='upper left')
fig.savefig('./arsenal_loss.png')

In [None]:
# Persist the Arsenal model under an extension-less path and as HDF5
model_dropout.save(filepath='./arsenal')
model_dropout.save(filepath='./arsenal.h5')

# **Chelsea**

In [None]:
# Preview the Chelsea subset (bare last expression -> rich DataFrame display)
chelseaData

In [None]:
# Class labels plus cleaned / lemmatized text for the Chelsea rows.
# NOTE(review): tweetData already carried both text columns when this subset
# was taken, so these lines appear to recompute identical values — confirm.
labels = chelseaData['final_class']
chelseaData['modTweet'] = list(map(featureEngineering, chelseaData['content']))
chelseaData['lemmatizedText'] = chelseaData['modTweet'].apply(lemmatizeTweet)

In [None]:
# One-hot encode the 3 classes straight from the source column instead of from
# the mutable `labels` variable: re-running this cell can then never encode
# labels that were already encoded.
labels = tf.keras.utils.to_categorical(chelseaData['final_class'], 3, dtype="float32")

# Team-specific tokenizer: word indices are fitted on this subset only
tokenizer = Tokenizer(num_words=37320, split=' ')
tokenizer.fit_on_texts(chelseaData['lemmatizedText'].values)
X = tokenizer.texts_to_sequences(chelseaData['lemmatizedText'].values)
X = pad_sequences(X)

# 70/30 train/test split with a fixed seed for reproducibility
X_train, X_test, Y_train, Y_test = train_test_split(X, labels, test_size=0.3, random_state=42)

In [None]:
# Sanity check: feature-matrix shape next to label-matrix shape, per split
print(*(arr.shape for arr in (X_train, Y_train)))
print(*(arr.shape for arr in (X_test, Y_test)))

In [None]:
# Fresh Chelsea model, same architecture as the all-teams one:
# embedding -> dropout -> two stacked bidirectional LSTMs -> 3-way softmax.
model_dropout = Sequential()
model_dropout.add(Embedding(input_dim=37320, output_dim=40, input_length=X.shape[1]))
model_dropout.add(Dropout(0.5))
model_dropout.add(Bidirectional(LSTM(units=256, return_sequences=True, kernel_initializer='he_uniform')))
model_dropout.add(Dropout(0.5))
model_dropout.add(Bidirectional(LSTM(units=128, return_sequences=False, kernel_initializer='he_uniform')))
model_dropout.add(Dense(3, activation='softmax'))
# `learning_rate` is the supported keyword; `lr` is only a deprecated alias
optimizer = Adamax(learning_rate=0.01)
model_dropout.compile(loss='categorical_crossentropy', optimizer=optimizer, metrics=['categorical_accuracy'])
# summary() prints the table itself and returns None; no print() wrapper
model_dropout.summary()

In [None]:
# Train the Chelsea model; "val_*" metrics come from the held-out split
history = model_dropout.fit(
    X_train,
    Y_train,
    batch_size=512,
    epochs=20,
    validation_data=(X_test, Y_test),
)

In [None]:
# Accuracy per epoch on an explicit figure, so curves from other plots can
# never end up in this image regardless of the matplotlib backend.
fig, ax = plt.subplots()
ax.plot(history.history['categorical_accuracy'], label='train')
ax.plot(history.history['val_categorical_accuracy'], label='test')
ax.set(title='Chelsea: accuracy', xlabel='Epochs', ylabel='Accuracy')
ax.legend(loc='upper left')
fig.savefig('./chelsea_acc.png')

In [None]:
# Loss per epoch on an explicit figure, so curves from other plots can never
# end up in this image regardless of the matplotlib backend.
fig, ax = plt.subplots()
ax.plot(history.history['loss'], label='train')
ax.plot(history.history['val_loss'], label='test')
ax.set(title='Chelsea: loss', xlabel='Epochs', ylabel='Loss')
ax.legend(loc='upper left')
fig.savefig('./chelsea_loss.png')

In [None]:
# Persist the Chelsea model under an extension-less path and as HDF5.
# The first path was misspelt './chesea', which did not match the '.h5' file
# or the naming used for every other team.
model_dropout.save('./chelsea')
model_dropout.save('./chelsea.h5')

# **Liverpool**

In [None]:
# Preview the Liverpool subset (bare last expression -> rich DataFrame display)
liverpoolData

In [None]:
# Class labels plus cleaned / lemmatized text for the Liverpool rows.
# NOTE(review): tweetData already carried both text columns when this subset
# was taken, so these lines appear to recompute identical values — confirm.
labels = liverpoolData['final_class']
liverpoolData['modTweet'] = list(map(featureEngineering, liverpoolData['content']))
liverpoolData['lemmatizedText'] = liverpoolData['modTweet'].apply(lemmatizeTweet)

In [None]:
# One-hot encode the 3 classes straight from the source column instead of from
# the mutable `labels` variable: re-running this cell can then never encode
# labels that were already encoded.
labels = tf.keras.utils.to_categorical(liverpoolData['final_class'], 3, dtype="float32")

# Team-specific tokenizer: word indices are fitted on this subset only
tokenizer = Tokenizer(num_words=37320, split=' ')
tokenizer.fit_on_texts(liverpoolData['lemmatizedText'].values)
X = tokenizer.texts_to_sequences(liverpoolData['lemmatizedText'].values)
X = pad_sequences(X)

# 70/30 train/test split with a fixed seed for reproducibility
X_train, X_test, Y_train, Y_test = train_test_split(X, labels, test_size=0.3, random_state=42)

In [None]:
# Sanity check: feature-matrix shape next to label-matrix shape, per split
print(*(arr.shape for arr in (X_train, Y_train)))
print(*(arr.shape for arr in (X_test, Y_test)))

In [None]:
# Fresh Liverpool model, same architecture as the all-teams one:
# embedding -> dropout -> two stacked bidirectional LSTMs -> 3-way softmax.
model_dropout = Sequential()
model_dropout.add(Embedding(input_dim=37320, output_dim=40, input_length=X.shape[1]))
model_dropout.add(Dropout(0.5))
model_dropout.add(Bidirectional(LSTM(units=256, return_sequences=True, kernel_initializer='he_uniform')))
model_dropout.add(Dropout(0.5))
model_dropout.add(Bidirectional(LSTM(units=128, return_sequences=False, kernel_initializer='he_uniform')))
model_dropout.add(Dense(3, activation='softmax'))
# `learning_rate` is the supported keyword; `lr` is only a deprecated alias
optimizer = Adamax(learning_rate=0.01)
model_dropout.compile(loss='categorical_crossentropy', optimizer=optimizer, metrics=['categorical_accuracy'])
# summary() prints the table itself and returns None; no print() wrapper
model_dropout.summary()

In [None]:
# Train the Liverpool model; "val_*" metrics come from the held-out split
history = model_dropout.fit(
    X_train,
    Y_train,
    batch_size=512,
    epochs=20,
    validation_data=(X_test, Y_test),
)

In [None]:
# Accuracy per epoch on an explicit figure, so curves from other plots can
# never end up in this image regardless of the matplotlib backend.
fig, ax = plt.subplots()
ax.plot(history.history['categorical_accuracy'], label='train')
ax.plot(history.history['val_categorical_accuracy'], label='test')
ax.set(title='Liverpool: accuracy', xlabel='Epochs', ylabel='Accuracy')
ax.legend(loc='upper left')
fig.savefig('./liverpool_acc.png')

In [None]:
# Loss per epoch on an explicit figure, so curves from other plots can never
# end up in this image regardless of the matplotlib backend.
fig, ax = plt.subplots()
ax.plot(history.history['loss'], label='train')
ax.plot(history.history['val_loss'], label='test')
ax.set(title='Liverpool: loss', xlabel='Epochs', ylabel='Loss')
ax.legend(loc='upper left')
fig.savefig('./liverpool_loss.png')

In [None]:
# Persist the Liverpool model under an extension-less path and as HDF5
model_dropout.save(filepath='./liverpool')
model_dropout.save(filepath='./liverpool.h5')

# **Tottenham**

In [None]:
# Preview the Tottenham subset (bare last expression -> rich DataFrame display)
totData

In [None]:
# Class labels plus cleaned / lemmatized text for the Tottenham rows.
# NOTE(review): tweetData already carried both text columns when this subset
# was taken, so these lines appear to recompute identical values — confirm.
labels = totData['final_class']
totData['modTweet'] = list(map(featureEngineering, totData['content']))
totData['lemmatizedText'] = totData['modTweet'].apply(lemmatizeTweet)

In [None]:
# One-hot encode the 3 classes straight from the source column instead of from
# the mutable `labels` variable: re-running this cell can then never encode
# labels that were already encoded.
labels = tf.keras.utils.to_categorical(totData['final_class'], 3, dtype="float32")

# Team-specific tokenizer: word indices are fitted on this subset only
tokenizer = Tokenizer(num_words=37320, split=' ')
tokenizer.fit_on_texts(totData['lemmatizedText'].values)
X = tokenizer.texts_to_sequences(totData['lemmatizedText'].values)
X = pad_sequences(X)

# 70/30 train/test split with a fixed seed for reproducibility
X_train, X_test, Y_train, Y_test = train_test_split(X, labels, test_size=0.3, random_state=42)

In [None]:
# Sanity check: feature-matrix shape next to label-matrix shape, per split
print(*(arr.shape for arr in (X_train, Y_train)))
print(*(arr.shape for arr in (X_test, Y_test)))

In [None]:
# Fresh Tottenham model, same architecture as the all-teams one:
# embedding -> dropout -> two stacked bidirectional LSTMs -> 3-way softmax.
model_dropout = Sequential()
model_dropout.add(Embedding(input_dim=37320, output_dim=40, input_length=X.shape[1]))
model_dropout.add(Dropout(0.5))
model_dropout.add(Bidirectional(LSTM(units=256, return_sequences=True, kernel_initializer='he_uniform')))
model_dropout.add(Dropout(0.5))
model_dropout.add(Bidirectional(LSTM(units=128, return_sequences=False, kernel_initializer='he_uniform')))
model_dropout.add(Dense(3, activation='softmax'))
# `learning_rate` is the supported keyword; `lr` is only a deprecated alias
optimizer = Adamax(learning_rate=0.01)
model_dropout.compile(loss='categorical_crossentropy', optimizer=optimizer, metrics=['categorical_accuracy'])
# summary() prints the table itself and returns None; no print() wrapper
model_dropout.summary()

In [None]:
# Train the Tottenham model; "val_*" metrics come from the held-out split
history = model_dropout.fit(
    X_train,
    Y_train,
    batch_size=512,
    epochs=20,
    validation_data=(X_test, Y_test),
)

In [None]:
# Accuracy per epoch on an explicit figure, so curves from other plots can
# never end up in this image regardless of the matplotlib backend.
fig, ax = plt.subplots()
ax.plot(history.history['categorical_accuracy'], label='train')
ax.plot(history.history['val_categorical_accuracy'], label='test')
ax.set(title='Tottenham: accuracy', xlabel='Epochs', ylabel='Accuracy')
ax.legend(loc='upper left')
fig.savefig('./tot_acc.png')

In [None]:
# Loss per epoch on an explicit figure, so curves from other plots can never
# end up in this image regardless of the matplotlib backend.
fig, ax = plt.subplots()
ax.plot(history.history['loss'], label='train')
ax.plot(history.history['val_loss'], label='test')
ax.set(title='Tottenham: loss', xlabel='Epochs', ylabel='Loss')
ax.legend(loc='upper left')
fig.savefig('./tot_loss.png')

In [None]:
# Persist the Tottenham model under an extension-less path and as HDF5
model_dropout.save(filepath='./tot')
model_dropout.save(filepath='./tot.h5')