In [10]:
import os
import time
import pandas as pd
import numpy as np
import csv
import string
import matplotlib.pyplot as plt
import seaborn as sns
import random
import itertools
import collections
from collections import Counter

import nltk 
from nltk.corpus import stopwords
from nltk.tokenize import TweetTokenizer
from nltk.stem.wordnet import WordNetLemmatizer
from nltk.stem.porter import PorterStemmer
from nltk.stem.porter import * 

import warnings
warnings.filterwarnings("ignore")

from sklearn.pipeline import Pipeline
from sklearn.pipeline import FeatureUnion
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score
from sklearn.model_selection import KFold, cross_val_score
from sklearn.ensemble import RandomForestClassifier
from sklearn.naive_bayes import MultinomialNB
from sklearn.svm import LinearSVC
import joblib
from sklearn.model_selection import GridSearchCV
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.feature_extraction.text import TfidfVectorizer, HashingVectorizer, CountVectorizer
from sklearn.preprocessing import MinMaxScaler
from sklearn.preprocessing import LabelEncoder

In [11]:
# load data
training = pd.read_csv('feat_eng_train_data.csv')

# remove rows with none values
training = training.dropna(0, 'any')

# Features TODO: correct feature names
features = ['tokens', 'neu_scores', 'neg_scores', 'compound_scores', 'pos_scores']
label = ['score']

# Saving features and label data in X and y for train-test split
X = training[[col for col in training.columns if col in features]]
y = training[label]

# splitting data into training and validation set 
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.30, random_state=0)

training.head(3)

Unnamed: 0,score,tokens,neg_scores,neu_scores,pos_scores,compound_scores,NOUN,PRON,VERB,ADJ,ADV
0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0
1,0.0,thought sleep option tomorrow realiz evalu mor...,0.0,1.0,0.0,0.0,1.0,0.0,1.0,1.0,0.0
2,0.0,life cool,0.0,0.303,0.697,0.3182,1.0,0.0,1.0,0.0,0.0


In [12]:
# Helper functions from gracecarrillo

class TextSelector(BaseEstimator, TransformerMixin):
    """
    Transformer to select a single column from the data frame to perform additional transformations on
    Use on text columns in the data
    """
    def __init__(self, key):
        self.key = key

    def fit(self, X, y=None):
        return self

    def transform(self, X):
        return X[self.key]
    
class NumberSelector(BaseEstimator, TransformerMixin):
    """
    Transformer to select a single column from the data frame to perform additional transformations on
    Use on numeric columns in the data
    """
    def __init__(self, key):
        self.key = key

    def fit(self, X, y=None):
        return self

    def transform(self, X):
        return X[[self.key]]
    
# Pipeline to convert tweets to a matrix of TF-IDF features.
tfidf = Pipeline([
                ('selector', TextSelector(key='tokens')),
                ('tfidf', TfidfVectorizer())
            ])

# Pipeline to convert tweets to a matrix of token counts
countvect = Pipeline([
                ('selector', TextSelector(key='tokens')),
                ('countvect', CountVectorizer())
            ])

# Applying tfidf anf countvec to features
neu_scores =  Pipeline([
                ('selector', NumberSelector(key='neu_scores')),
                ('minmax', MinMaxScaler())
            ])
neg_scores =  Pipeline([
                ('selector', NumberSelector(key='neg_scores')),
                ('minmax', MinMaxScaler())
            ])
pos_scores =  Pipeline([
                ('selector', NumberSelector(key='pos_scores')),
                ('minmax', MinMaxScaler())
            ])

compound_scores =  Pipeline([
                ('selector', NumberSelector(key='compound_scores')),
                ('minmax', MinMaxScaler())
            ])

In [13]:
# defining different sets of text processors
def features_union(textProcessor):
    return FeatureUnion([('tokens', textProcessor),
                      ('neu_scores', neu_scores),
                      ('neg_scores', neg_scores),
                      ('pos_scores', pos_scores),
                      ('compound_scores', compound_scores)])
# Normalise labels
le = LabelEncoder().fit(y_train.values.ravel()) #if error occurs, try removing or adding values before ravel

y_train = le.transform(y_train.values.ravel())
y_test = le.transform(y_test.values.ravel())

In [14]:
# Naive-Bayes Classifier

# instantiate classifier
clf = MultinomialNB()

# combine features
features_count = features_union(countvect)

# define pipeline object 
nb_pipeline = Pipeline([('features', features_count),
                       ('nb', clf)])

# Fit classifier
nb_pipeline.fit(X_train, y_train)

# score
nb_pipeline.score(X_test, y_test)

0.7382150043115838

In [15]:
# SVM Classifier

# instantiate classifier
svm = LinearSVC()

#  combine features
features_tfidf = features_union(tfidf)

# define pipeline object
svm_pipeline = Pipeline([('features', features_tfidf),
                       ('svm', svm)])

# Fit classifier
svm_pipeline.fit(X_train, y_train.ravel())

# score
svm_pipeline.score(X_test, y_test.ravel())

0.7342627191721759

In [17]:
import time

# cross valiadation for Naive-Bayes Classifier

# instantiate pipeline object
nb_pipeline = Pipeline([('feats', features_tfidf),  ('clf', MultinomialNB())])

# parameter grid (3x3x2x2x3x3x2) combinations
parameters = {
    'feats__tokens__tfidf__max_df': (0.5, 0.75, 1.0),
    'feats__tokens__tfidf__ngram_range': ((1, 1), (1, 2), (2, 2)), 
    'feats__tokens__tfidf__use_idf': (False, True),
    'feats__tokens__tfidf__binary':(False, True),
    'feats__tokens__tfidf__binary':('l1', 'l2', None),
    'clf__alpha': (1.0, 5.0, 10.0),
    'clf__fit_prior': (True, False),     
}

# instantiate GridSearchCV object with pipeline and parameters with 3-folds cross-validation
nb_grid = GridSearchCV(nb_pipeline, parameters, cv=3)

# start time 
nb_start = time.time()

# Fit 
nb_grid.fit(X_train, y_train)

# end time 
nb_end = time.time()
print(f"Time taken to run: {round((nb_end - nb_start)/60,1)} minutes")

# Check score
print(nb_grid.score(X_test, y_test))

nb_cv_results = pd.DataFrame(nb_grid.cv_results_)

Time taken to run: 12.1 minutes
0.7510060362173038


In [21]:
# cross validation for SVM Classifier

# instantiate pipeline
svm_count_pipeline = Pipeline([('feats', features_count),  ('clf', LinearSVC())])

# parameter grid (3x3x2x3x7x2) combinations
parameters = {
    'feats__tokens__countvect__max_df': (0.5, 0.75, 1.0),
    'feats__tokens__countvect__ngram_range': ((1, 1), (1, 2), (2, 2)), 
    #'feats__tokens__countvect__use_idf': (False, True),
    'clf__loss': ('hinge', 'squared_hinge'),
    'clf__C': (0.1, 0.5, 0.6, 1, 4, 5, 10, 100),
    'clf__class_weight': (None, 'balanced')                                    
}

# instantiate GridSearchCV object with pipeline and parameters with 3-folds cross-validation
svm_grid = GridSearchCV(svm_count_pipeline, parameters, cv=3)

# start time 
svm_start = time.time()

# fit
svm_grid.fit(X_train, y_train)

# end time 
svm_end = time.time()
print(f"Time taken to run: {round((svm_end - svm_start)/60,1)} minutes")

# score
svm_grid.score(X_test, y_test)

svm_cv_results = pd.DataFrame(svm_grid.cv_results_)

Time taken to run: 25.5 minutes


In [None]:
!pip install tensorflow

In [None]:
!pip install keras

In [None]:
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.utils.np_utils import to_categorical
from keras.models import Sequential
from keras import regularizers
from keras.layers import Dense, Embedding, LSTM
import torch

In [None]:
# Parameter indicating the number of words
nb_words = 10000  

# create the tokenizer (tweets have been preprocessed so no need for filters)
tk = Tokenizer(num_words=nb_words)

# fit the tokenizer on tweets
tk.fit_on_texts(training.tokens)

# integer encode tweets
tweets_seq = tk.texts_to_sequences(training.tokens)

# TODO need to update based, uncomment line below to see what the max is
# print(training['word count'].describe())
max_len = 39

# Convert sequences into 2-D Numpy arrays
features = pad_sequences(tweets_seq, maxlen=max_len)

In [None]:
training["label"] = training["label"].astype("category")
# print(training.label.describe())

labels = pd.get_dummies(training['label']).values
X_train, X_test, Y_train, Y_test = train_test_split(features, labels, test_size = 0.2, random_state = 42)

In [None]:
#--- Parameters----#

# encodes input sequence dense vectors 
embed_dim = 128

# transforms the vector sequence into a single vector
lstm_out = 200

# batch size of 32 is a good starting point
batch_size = 32

# epochs
nb_epoch = 10

#------# Build the LSTM model #-----------------#
reg_model = Sequential()
reg_model.add(Embedding(2500, embed_dim, input_length = features.shape[1], dropout = 0.2))
reg_model.add(LSTM(lstm_out, dropout_U = 0.2, dropout_W = 0.2))
reg_model.add(Dense(2, kernel_regularizer=regularizers.l2(0.001), activation='softmax'))

# Compile model
reg_model.compile( optimizer='adam', # optimazer
              loss = 'categorical_crossentropy', # loss function
              metrics = ['accuracy']) # list of metrics

reg_model.name = 'LSTM with Regularisation model'
print(reg_model.summary())

In [None]:
# Fit the model
reg_history = reg_model.fit(X_train, Y_train, 
                    validation_split=0.33, 
                    batch_size = batch_size, 
                    nb_epoch = nb_epoch, verbose = True)

In [None]:
# TODO: update to be the file path that we want

# save model and architecture
reg_model.save('LSTM_regmodel.h5')

#-- LSTM model ----#
y_preds_LSTM = model.predict(X_test)

# Save predictions for evaluation as numpy arrays
np.save('y_predsLSTM.npy', y_preds_LSTM)

#-- LSTM with regularisation model ----#
y_preds_LSTMreg = reg_model.predict(X_test)

# Save predictions for evaluation as numpy arrays
np.save('y_predsLSTMreg.npy', y_preds_LSTMreg)

# Save test data
np.save('y_testLSTM.npy', Y_test)