In [54]:
import pandas as pd
import string
import spacy
from spacy.lang.en.stop_words import STOP_WORDS
from spacy.lang.en import English
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.base import TransformerMixin
from sklearn.pipeline import Pipeline
from tqdm import tqdm

In [45]:
DEBUG = False

### Data Loading

In [46]:
df_train_pos = pd.read_table('../data/train_pos.txt', header=None, names=['tweet'])
df_train_neg = pd.read_table('../data/train_neg.txt', header=None, names=['tweet'])
df_test = pd.read_table('../data/test_data.txt', header=None, names=['tweet'])

In [47]:
df_train_pos['sentiment'] = 1
df_train_neg['sentiment'] = 0

df_train = pd.concat([df_train_pos, df_train_neg])

### Preprocessing Pipeline

In [48]:
# spacy pipeline
# English pipeline optimized for CPU. 
# Components: tok2vec, tagger, parser, senter, ner, attribute_ruler, lemmatizer.
# https://spacy.io/models/en
nlp = spacy.load('en_core_web_sm')

# punctuation and stopwords
punctuations = string.punctuation
stop_words = spacy.lang.en.stop_words.STOP_WORDS

def tweet_cleaner(sentence):
    doc = nlp(sentence)
    tokens = []
    for token in doc:
        if token.lemma_ != '-PRON-':
            temp = token.lemma_.lower().strip()
        else:
            temp = token.lower_
        tokens.append(temp)
    clean_tokens = []
    for token in tokens:
        if (token not in punctuations) and (token not in stop_words):
            clean_tokens.append(token)
    return clean_tokens

In [57]:
# custom transformer using spaCy
class predictors(TransformerMixin):
    def transform(self, X, **transform_params):
        """Override the transform method to clean text"""
        collector = []
        for text in tqdm(X, total=len(X), desc='Cleaning text:\t'):
            collector.append(clean_text(text))
        return collector
    
    def fit(self, X, y=None, **fit_params):
        return self
    
    def get_params(self, deep=True):
        return {}

# basic function to clean the text
def clean_text(text):
    """Removing spaces and converting the text into lowercase"""
    return text.strip().lower()    

In [50]:
# different vectorizers
bow_vector = CountVectorizer(tokenizer=tweet_cleaner, ngram_range=(1,1))
tfidf_vector = TfidfVectorizer(tokenizer=tweet_cleaner)

In [58]:
from sklearn.model_selection import train_test_split

X = df_train['tweet']
y = df_train['sentiment']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.3, random_state = 42)
print(f'X_train dimension: {X_train.shape}')
print(f'y_train dimension: {y_train.shape}')
print(f'X_test dimension: {X_test.shape}')
print(f'y_train dimension: {y_test.shape}')

X_train dimension: (137879,)
y_train dimension: (137879,)
X_test dimension: (59091,)
y_train dimension: (59091,)


In [69]:
import time
from sklearn.linear_model import LogisticRegression
from sklearn.neural_network import MLPClassifier

# classifier = (verbose=1, solver='lbfgs', max_iter=10000)
classifier = MLPClassifier(hidden_layer_sizes=(256,128,64), verbose=True)

# Create pipeline using Bag of Words
components = [
    ("cleaner", predictors()),
    ("vectorizer", bow_vector),
    ("classifier", classifier)
        ]
pipe = Pipeline(components)

# Test with 1/100 of the data to estimate the time needed
before = time.time()
pipe.fit(X_train[:len(X_train)//100], y_train[:len(y_train)//100])
after = time.time()
print(f'\n\nTime needed for a 100th ({len(X_train)//100} samples): {after-before} s')
print(f'Time needed for the whole dataset ({len(X_train)} samples): {(after-before)*100} s\n\n')

# Model generation
pipe.fit(X_train, y_train)

Cleaning text:	: 100%|██████████| 137/137 [00:00<00:00, 122939.59it/s]




Iteration 1, loss = 0.69247731
Iteration 2, loss = 0.66372001
Iteration 3, loss = 0.63898151
Iteration 4, loss = 0.61510484
Iteration 5, loss = 0.58993782
Iteration 6, loss = 0.56279569
Iteration 7, loss = 0.53407789
Iteration 8, loss = 0.50367636
Iteration 9, loss = 0.47140838
Iteration 10, loss = 0.43733561
Iteration 11, loss = 0.40177655
Iteration 12, loss = 0.36550256
Iteration 13, loss = 0.32892263
Iteration 14, loss = 0.29260392
Iteration 15, loss = 0.25714077
Iteration 16, loss = 0.22307336
Iteration 17, loss = 0.19102697
Iteration 18, loss = 0.16147059
Iteration 19, loss = 0.13458109
Iteration 20, loss = 0.11075820
Iteration 21, loss = 0.09007257
Iteration 22, loss = 0.07244212
Iteration 23, loss = 0.05780148
Iteration 24, loss = 0.04582880
Iteration 25, loss = 0.03621006
Iteration 26, loss = 0.02856020
Iteration 27, loss = 0.02252354
Iteration 28, loss = 0.01780056
Iteration 29, loss = 0.01411686
Iteration 30, loss = 0.01125537
Iteration 31, loss = 0.00904044
Iteration 32, los

In [67]:
from sklearn import metrics

# Predicting with test dataset
predicted = pipe.predict(X_test[:1000])

# Model accuracy score
print(f'Logistic Regression Accuracy: {metrics.accuracy_score(y_test[:1000], predicted)}')
print(f'Logistic Regression Precision: {metrics.precision_score(y_test[:1000], predicted)}')
print(f'Logistic Regression Recall: {metrics.recall_score(y_test[:1000], predicted)}')

Cleaning text:	: 100%|██████████| 1000/1000 [00:00<00:00, 1006794.05it/s]


Logistic Regression Accuracy: 0.733
Logistic Regression Precision: 0.7376425855513308
Logistic Regression Recall: 0.7504835589941973
