In [67]:
import csv
import string

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from sklearn.base import clone
from sklearn.model_selection import train_test_split

# Load the feature-engineered training data. TODO: confirm the final file name.
training = pd.read_csv('feat_eng_train_data.csv')

# Drop every row that contains at least one missing value. `axis` and `how`
# are passed by keyword because pandas 2.0+ no longer accepts them
# positionally.
training = training.dropna(axis=0, how='any')

# Model inputs and target column. TODO: confirm the final feature names.
features = ['tokens', 'neu_scores', 'neg_scores', 'compound_scores', 'pos_scores']
label = ['score']

# X keeps the feature columns in the order they appear in the file;
# y is a single-column DataFrame because `label` is a list.
X = training[[col for col in training.columns if col in features]]
y = training[label]

# Hold out 30% of the rows for validation; the fixed seed keeps the split
# reproducible across runs.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.30, random_state=0)

training.head(3)

Unnamed: 0,score,tokens,neg_scores,neu_scores,pos_scores,compound_scores,NOUN,PRON,VERB,ADJ,ADV
0,0,awww that bummer shoulda got david carr third day,0.245,0.755,0.0,-0.3818,1.0,0.0,1.0,1.0,0.0
1,0,upset cant updat facebook text might cri resul...,0.286,0.714,0.0,-0.4588,1.0,0.0,2.0,1.0,1.0
2,0,dive mani time ball manag save 50 rest go bound,0.0,0.738,0.262,0.4939,2.0,0.0,1.0,1.0,0.0


In [78]:
from sklearn.pipeline import Pipeline
from sklearn.pipeline import FeatureUnion
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score
from sklearn.model_selection import KFold, cross_val_score
from sklearn.ensemble import RandomForestClassifier
from sklearn.naive_bayes import MultinomialNB
from sklearn.svm import LinearSVC
import joblib
from sklearn.model_selection import GridSearchCV
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.feature_extraction.text import TfidfVectorizer, HashingVectorizer, CountVectorizer
from sklearn.preprocessing import MinMaxScaler
from sklearn.preprocessing import LabelEncoder

In [79]:
# Helper functions from gracecarrillo

class TextSelector(BaseEstimator, TransformerMixin):
    """Pipeline step that extracts one text column from a data frame.

    The step is stateless: ``fit`` learns nothing and ``transform`` returns
    ``X[key]`` (single-bracket indexing), so downstream text vectorizers
    receive the column itself rather than a one-column frame.

    Parameters
    ----------
    key : label of the column to extract.
    """

    def __init__(self, key):
        self.key = key

    def fit(self, X, y=None):
        """Nothing to learn; return this transformer unchanged."""
        return self

    def transform(self, X):
        """Return the column of ``X`` stored under ``self.key``."""
        selected = X[self.key]
        return selected
    
class NumberSelector(BaseEstimator, TransformerMixin):
    """Pipeline step that extracts one numeric column from a data frame.

    The step is stateless: ``fit`` learns nothing and ``transform`` returns
    ``X[[key]]`` (list indexing), so the result stays a one-column frame,
    which is the two-dimensional input that scalers expect.

    Parameters
    ----------
    key : label of the column to extract.
    """

    def __init__(self, key):
        self.key = key

    def fit(self, X, y=None):
        """Nothing to learn; return this transformer unchanged."""
        return self

    def transform(self, X):
        """Return ``X`` restricted to the single column ``self.key``."""
        wanted_columns = [self.key]
        return X[wanted_columns]
    
def scaled_number_pipeline(key):
    """Build a pipeline that selects numeric column `key` and min-max scales it.

    Parameters
    ----------
    key : str
        Name of the numeric column to select.

    Returns
    -------
    Pipeline
        Unfitted pipeline with the steps ``'selector'`` (NumberSelector) and
        ``'minmax'`` (MinMaxScaler with default settings).
    """
    return Pipeline([
        ('selector', NumberSelector(key=key)),
        ('minmax', MinMaxScaler()),
    ])


# Pipeline to convert tweets to a matrix of TF-IDF features.
tfidf = Pipeline([
    ('selector', TextSelector(key='tokens')),
    ('tfidf', TfidfVectorizer()),
])

# Pipeline to convert tweets to a matrix of token counts.
countvect = Pipeline([
    ('selector', TextSelector(key='tokens')),
    ('countvect', CountVectorizer()),
])

# One select-and-scale pipeline per numeric sentiment-score column. Scaling
# also gives non-negative inputs on the fitted data (compound_scores can be
# negative in the raw file), which the MultinomialNB model used later expects.
neu_scores = scaled_number_pipeline('neu_scores')
neg_scores = scaled_number_pipeline('neg_scores')
pos_scores = scaled_number_pipeline('pos_scores')
compound_scores = scaled_number_pipeline('compound_scores')

In [83]:
# defining different sets of text processors
def features_union(textProcessor):
    """Join a text pipeline with the four scaled sentiment-score pipelines.

    Every branch is cloned before it is placed in the union. Without the
    clone, all unions built by this function would share the same module-level
    score pipelines (and the caller's text pipeline), so fitting one model
    would silently refit transformers that live inside another.

    Parameters
    ----------
    textProcessor : estimator
        Unfitted transformer (e.g. the `tfidf` or `countvect` pipeline) that
        turns the 'tokens' column into a feature matrix. It must be cloneable
        by ``sklearn.base.clone``.

    Returns
    -------
    FeatureUnion
        Unfitted union with the branches 'tokens', 'neu_scores', 'neg_scores',
        'pos_scores' and 'compound_scores', in that order.
    """
    branches = [
        ('tokens', textProcessor),
        ('neu_scores', neu_scores),
        ('neg_scores', neg_scores),
        ('pos_scores', pos_scores),
        ('compound_scores', compound_scores),
    ]
    return FeatureUnion([(name, clone(branch)) for name, branch in branches])
# Normalise labels: LabelEncoder maps the raw class values to 0..n_classes-1.
# np.ravel is used instead of the .ravel() method because y_train/y_test start
# out as single-column DataFrames, which have no .ravel(); np.ravel flattens
# DataFrames and arrays alike, so this cell can also be re-run safely.
le = LabelEncoder().fit(np.ravel(y_train))

y_train = le.transform(np.ravel(y_train))
y_test = le.transform(np.ravel(y_test))

In [84]:
# Naive-Bayes classifier on token counts plus the scaled sentiment scores

# feature extraction: token counts joined with the numeric score columns
features_count = features_union(countvect)

# multinomial Naive Bayes with default hyper-parameters
clf = MultinomialNB()

# chain feature extraction and classifier into a single estimator
nb_pipeline = Pipeline(steps=[
    ('features', features_count),
    ('nb', clf),
])

# train on the training split
nb_pipeline.fit(X_train, y_train)

# accuracy on the held-out split (shown as the cell output)
nb_pipeline.score(X_test, y_test)

0.7624393183341571

In [85]:
# SVM classifier on TF-IDF features plus the scaled sentiment scores

# Instantiate the classifier. A fixed random_state makes the solver's internal
# shuffling, and therefore the reported score, reproducible between runs.
svm = LinearSVC(random_state=0)

# combine features
features_tfidf = features_union(tfidf)

# define pipeline object
svm_pipeline = Pipeline([('features', features_tfidf),
                         ('svm', svm)])

# Fit classifier. y_train / y_test are already 1-D encoded arrays at this
# point, so they are passed as-is, exactly like in the Naive-Bayes cell.
svm_pipeline.fit(X_train, y_train)

# accuracy on the held-out split (shown as the cell output)
svm_pipeline.score(X_test, y_test)

0.7667725139982532

In [88]:
import time

# cross validation for Naive-Bayes Classifier

In [98]:
# cross validation for Naive-Bayes Classifier

In [108]:
!pip install tensorflow

Collecting tensorflow
  Downloading tensorflow-2.4.1-cp38-cp38-manylinux2010_x86_64.whl (394.4 MB)
[K     |████████████████████████████████| 394.4 MB 637 kB/s eta 0:00:011   |█▉                              | 22.1 MB 9.7 MB/s eta 0:00:39     |██                              | 24.4 MB 9.7 MB/s eta 0:00:39     |███████████████                 | 183.8 MB 27.4 MB/s eta 0:00:08     |█████████████████████▉          | 268.6 MB 19.4 MB/s eta 0:00:07     |███████████████████████▎        | 287.3 MB 28.2 MB/s eta 0:00:04     |████████████████████████        | 295.6 MB 28.2 MB/s eta 0:00:04     |████████████████████████▏       | 298.3 MB 28.2 MB/s eta 0:00:04     |██████████████████████████▋     | 328.1 MB 20.1 MB/s eta 0:00:04

In [105]:
!pip install keras



In [106]:
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.utils.np_utils import to_categorical
from keras.models import Sequential
from keras import regularizers
from keras.layers import Dense, Embedding, LSTM
import torch

ImportError: Keras requires TensorFlow 2.2 or higher. Install TensorFlow via `pip install tensorflow`

In [109]:
# Vocabulary size handed to the tokenizer
nb_words = 10000

# Fixed sequence length used for padding
# TODO need to update based, uncomment line below to see what the max is
# print(training['word count'].describe())
max_len = 39

# Build the tokenizer (tweets are already preprocessed, so no custom filters
# are configured) and learn its word index from the tweets
tk = Tokenizer(num_words=nb_words)
tk.fit_on_texts(training['tokens'])

# Replace every tweet by its sequence of integer word indices
tweets_seq = tk.texts_to_sequences(training['tokens'])

# Convert the sequences into a 2-D NumPy array of width max_len.
# NOTE: this rebinds `features`, which earlier held the list of feature
# column names.
features = pad_sequences(tweets_seq, maxlen=max_len)

NameError: name 'Tokenizer' is not defined

In [None]:
# The target column in the training file is 'score' (there is no 'label'
# column), so the categorical 'label' column is derived from it here.
training["label"] = training["score"].astype("category")
# print(training.label.describe())

# One-hot encode the classes: one column per distinct score value.
labels = pd.get_dummies(training['label']).values

# 80/20 split of the padded sequences and their one-hot labels.
# NOTE: this rebinds X_train / X_test, which earlier held the DataFrame
# splits used by the scikit-learn pipelines.
X_train, X_test, Y_train, Y_test = train_test_split(features, labels, test_size=0.2, random_state=42)

In [None]:
# Imported here rather than in the first cell because Keras is only installed
# part-way through the notebook.
from keras.layers import SpatialDropout1D

#--- Parameters----#

# size of the dense vector each word index is embedded into
embed_dim = 128

# number of LSTM units, i.e. the size of the single vector the sequence is reduced to
lstm_out = 200

# batch size of 32 is a good starting point
batch_size = 32

# epochs
nb_epoch = 10

#------# Build the LSTM model #-----------------#

print('lets goooo...')

# The model name is passed to the constructor because it cannot be assigned
# after construction; it avoids spaces, which are not valid in TensorFlow
# scope names.
model = Sequential(name='LSTM_model')

# Embedding layer. Its input dimension must cover every index the tokenizer
# can emit (0 .. nb_words - 1); a smaller value makes lookups go out of range.
model.add(Embedding(nb_words, embed_dim,
                    input_length=features.shape[1]))
# Keras 2 removed the `dropout` argument of Embedding; SpatialDropout1D is the
# closest replacement for regularising the embedded sequence.
model.add(SpatialDropout1D(0.2))
# Recurrent layer. Keras 2 names: `dropout` (was dropout_W, applied to the
# inputs) and `recurrent_dropout` (was dropout_U, applied to the recurrent state).
model.add(LSTM(lstm_out, dropout=0.2, recurrent_dropout=0.2))
# Output layer: one softmax unit per one-hot label column.
model.add(Dense(labels.shape[1], activation='softmax'))

# Compile model
model.compile(optimizer='adam',                  # optimizer
              loss='categorical_crossentropy',   # loss function
              metrics=['accuracy'])              # list of metrics

# summary() prints the table itself and returns None, so it is not wrapped in print().
model.summary()

In [None]:
# Fit the model. A third of the training split is held back for validation.
# `epochs` is the Keras 2 name of the former `nb_epoch` argument.
history = model.fit(X_train, Y_train,
                    validation_split=0.33,
                    batch_size=batch_size,
                    epochs=nb_epoch,
                    verbose=1)