In [None]:
import pandas as pd
import string
import spacy
from spacy.lang.en.stop_words import STOP_WORDS
from spacy.lang.en import English
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.base import TransformerMixin
from sklearn.pipeline import Pipeline
from tqdm import tqdm

In [None]:
# Notebook-wide debug switch.
# NOTE(review): no cell visible here reads DEBUG — confirm where (or whether) it is consumed.
DEBUG = False

### Data Loading

In [None]:
# Read the three tweet files (positive train, negative train, unlabeled test).
# Each is parsed with header=None into a single-column frame named 'tweet'.
df_train_pos, df_train_neg, df_test = (
    pd.read_table(path, header=None, names=['tweet'])
    for path in (
        '../data/train_pos.txt',
        '../data/train_neg.txt',
        '../data/test_data.txt',
    )
)

In [None]:
# Attach the target column: 1 for tweets from the positive file, 0 for the negative file.
df_train_pos['sentiment'] = 1
df_train_neg['sentiment'] = 0

# ignore_index=True gives the combined frame a fresh 0..n-1 index. Without it
# both halves keep their own row numbers, so every label occurs twice and any
# label-based lookup on df_train (or on splits derived from it) is ambiguous.
df_train = pd.concat([df_train_pos, df_train_neg], ignore_index=True)

### Preprocessing Pipeline

In [None]:
# Download the small English spaCy model that spacy.load('en_core_web_sm') needs.
# NOTE(review): `python3` is resolved by the shell and may not be the interpreter
# backing this kernel — confirm the model is installed into the kernel's environment.
!python3 -m spacy download en_core_web_sm

In [None]:
# Load spaCy's small English pipeline (optimized for CPU).
# According to https://spacy.io/models/en its components are:
# tok2vec, tagger, parser, senter, ner, attribute_ruler, lemmatizer.
nlp = spacy.load('en_core_web_sm')

# Filters applied by tweet_cleaner: spaCy's English stop words and the
# ASCII punctuation characters (kept as one plain string).
stop_words = STOP_WORDS
punctuations = string.punctuation

def tweet_cleaner(sentence):
    """Tokenize ``sentence`` with spaCy and return its filtered, normalized tokens.

    Each token is replaced by its lower-cased, stripped lemma; tokens whose
    lemma is the literal placeholder '-PRON-' fall back to their lower-cased
    surface form instead. Tokens found in ``punctuations`` or ``stop_words``
    are then dropped.

    Note that ``punctuations`` is a plain string, so the punctuation test is a
    substring check: it also rejects the empty string and any multi-character
    token that happens to be a contiguous slice of ``string.punctuation``.

    Returns:
        list of str, in document order.
    """
    normalized = [
        tok.lower_ if tok.lemma_ == '-PRON-' else tok.lemma_.lower().strip()
        for tok in nlp(sentence)
    ]
    return [
        word
        for word in normalized
        if not (word in punctuations or word in stop_words)
    ]

In [None]:
# stateless text-cleaning step for use inside a scikit-learn Pipeline
class predictors(TransformerMixin):
    """Transformer that normalizes every input text with ``clean_text``.

    Nothing is learned during ``fit`` and no hyper-parameters are exposed
    through ``get_params``.
    """

    def fit(self, X, y=None, **fit_params):
        """Do nothing and return the transformer itself."""
        return self

    def transform(self, X, **transform_params):
        """Return a list with ``clean_text`` applied to each element of ``X``.

        ``X`` must support ``len()`` because the length is passed to the
        tqdm progress bar as its total.
        """
        progress = tqdm(X, total=len(X), desc='Cleaning text:\t')
        return [clean_text(document) for document in progress]

    def get_params(self, deep=True):
        """Return an empty parameter dict (there is nothing to tune)."""
        return {}

# minimal text normalization helper
def clean_text(text):
    """Return ``text`` in lowercase with leading and trailing whitespace removed."""
    trimmed = text.strip()
    return trimmed.lower()

In [None]:
# Two interchangeable text vectorizers, both tokenizing with tweet_cleaner:
# raw unigram counts (bag of words) and TF-IDF weights.
bow_vector = CountVectorizer(
    ngram_range=(1, 1),
    tokenizer=tweet_cleaner,
)
tfidf_vector = TfidfVectorizer(
    tokenizer=tweet_cleaner,
)

In [None]:
from sklearn.model_selection import train_test_split

# Features are the raw tweet texts, targets the 0/1 sentiment labels.
X = df_train['tweet']
y = df_train['sentiment']

# Hold out 30% of the rows for evaluation; random_state fixes the shuffle.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.3, random_state = 42)
print(f'X_train dimension: {X_train.shape}')
print(f'y_train dimension: {y_train.shape}')
print(f'X_test dimension: {X_test.shape}')
# This line reports the test labels, so it is labelled y_test.
print(f'y_test dimension: {y_test.shape}')

In [None]:
import time
from sklearn.linear_model import LogisticRegression
from sklearn.neural_network import MLPClassifier

# Alternative baseline, kept for reference (presumably LogisticRegression — confirm):
# classifier = LogisticRegression(verbose=1, solver='lbfgs', max_iter=10000)
# random_state pins weight initialization and batch shuffling so that
# re-running the notebook reproduces the same model.
classifier = MLPClassifier(hidden_layer_sizes=(256,128,64), verbose=True, random_state=42)

# Create pipeline using Bag of Words: lowercase/strip -> token counts -> MLP
components = [
    ("cleaner", predictors()),
    ("vectorizer", bow_vector),
    ("classifier", classifier)
        ]
pipe = Pipeline(components)

# Test with 1/100 of the data to estimate the time needed.
# perf_counter is a monotonic clock, so the measurement cannot be skewed by
# system clock adjustments.
sample_size = len(X_train) // 100
before = time.perf_counter()
pipe.fit(X_train[:sample_size], y_train[:sample_size])
after = time.perf_counter()
print(f'\n\nTime needed for a 100th ({sample_size} samples): {after-before} s')
# Linear extrapolation of the dry run to the full training set.
print(f'Time needed for the whole dataset ({len(X_train)} samples): {(after-before)*100} s\n\n')

# Model generation: refit the whole pipeline on the full training split
pipe.fit(X_train, y_train)

In [None]:
from sklearn import metrics

# Score a 1000-sample subset of the hold-out split.
X_eval, y_eval = X_test[:1000], y_test[:1000]
predicted = pipe.predict(X_eval)

# Label the report with the estimator that actually sits in the pipeline, so
# the printed name cannot drift from the model being evaluated.
model_name = type(pipe.named_steps['classifier']).__name__
print(f'{model_name} Accuracy: {metrics.accuracy_score(y_eval, predicted)}')
print(f'{model_name} Precision: {metrics.precision_score(y_eval, predicted)}')
print(f'{model_name} Recall: {metrics.recall_score(y_eval, predicted)}')

In [None]:
import torch
import torch.nn as nn
import torchvision.transforms as transforms
import torchvision.datasets as dsets
from torch.utils.data import DataLoader, TensorDataset


batch_size = 100
n_iters = 3000
# Number of epochs needed to reach roughly n_iters optimization steps.
# Clamped to at least one so a large training set cannot truncate it to zero.
num_epochs = max(1, int(n_iters / (len(y_train) / batch_size)))

# The splits keep the shuffled row labels of the original frame, so texts and
# labels are paired by position (zip) instead of by label lookup, and each
# loader iterates over its own split's length.
train_loader = DataLoader(
    [[text, label] for text, label in zip(X_train.tolist(), y_train.tolist())],
    shuffle=True,
    batch_size=batch_size,
)
test_loader = DataLoader(
    [[text, label] for text, label in zip(X_test.tolist(), y_test.tolist())],
    shuffle=True,
    batch_size=batch_size,
)



class LSTMModel(nn.Module):
    """Stacked LSTM followed by a linear read-out of the last time step.

    Args:
        input_dim: number of features per time step.
        hidden_dim: size of the LSTM hidden state.
        layer_dim: number of stacked LSTM layers.
        output_dim: size of the vector produced by the read-out layer.
    """

    def __init__(self, input_dim, hidden_dim, layer_dim, output_dim):
        super().__init__()
        # Kept as attributes so callers can inspect the configuration.
        self.hidden_dim = hidden_dim
        self.layer_dim = layer_dim

        # batch_first=True: input/output tensors are (batch, seq, feature).
        self.lstm = nn.LSTM(input_dim, hidden_dim, layer_dim, batch_first=True)

        # Read-out layer applied to the final time step's hidden state.
        self.fc = nn.Linear(hidden_dim, output_dim)

    def forward(self, x):
        """Run the LSTM over ``x`` and project the last time step.

        Args:
            x: tensor of shape (batch, seq, input_dim).

        Returns:
            Tensor of shape (batch, output_dim).
        """
        # When no initial state is passed, nn.LSTM starts from all-zero hidden
        # and cell states matching the input's device and dtype. The states are
        # fresh on every call, so no gradient flows between batches.
        out, _ = self.lstm(x)

        # out is (batch, seq, hidden_dim); keep only the last time step.
        return self.fc(out[:, -1, :])

In [None]:
# Positional lookup: X_train keeps the shuffled row labels of the source frame,
# so a label lookup for 7 may be missing from this split or ambiguous.
X_train.iloc[7]

In [None]:
# Positional lookup: y_train keeps the shuffled row labels of the source frame,
# so a label lookup for 0 may be missing from this split or ambiguous.
y_train.iloc[0]

In [None]:
# Inspect the text-cleaning step registered under 'cleaner' in the pipeline.
# NOTE(review): assumes the pipeline step is what this cell was meant to show — confirm.
pipe.named_steps['cleaner']

In [None]:
# Apply the cleaning transformer on its own (outside the pipeline) to the training texts.
clean = predictors().transform(X_train)

In [None]:
# Vectorize the cleaned texts with bow_vector; transform() does not refit, so
# this requires bow_vector to have been fitted already (e.g. via pipe.fit).
vect = bow_vector.transform(clean)

In [None]:
# NOTE(review): same call on the same input as the cell that builds `vect` — confirm this duplicate is needed.
vect2 = bow_vector.transform(clean)

In [None]:
from flair.data import Sentence
from flair.nn import Classifier

# make a sentence
sentence = Sentence('I love Berlin .')

# load the flair model registered under the name 'sentiment'
tagger = Classifier.load('sentiment')

# run the sentiment classifier over the sentence; the return value is not
# used here, the annotations are read back from `sentence` itself
tagger.predict(sentence)

# print the sentence with all annotations
print(sentence)

In [None]:
# Same check with a negatively worded example.
sentence = Sentence('I hate Berlin .')

# run the sentiment classifier over the sentence
# NOTE(review): `a` is not read by any visible cell — confirm it is needed.
a = tagger.predict(sentence)

# print the sentence with all annotations
print(sentence)

In [None]:
# Predicted label of the sentence classified last; the accuracy loop compares
# this value against the string 'POSITIVE'.
sentence.tag

In [None]:
%time
nb_correct = 0

df = pd.concat([df_train_pos, df_train_neg])[0:1000]

to_int = lambda x: 1 if x == 'POSITIVE' else 0

for tweet, sentiment in zip(df['tweet'], df['sentiment']):
    sentence = Sentence(tweet)

    tagger.predict(sentence)
    if sentiment == to_int(sentence.tag):
        nb_correct += 1


In [None]:
# Sanity check: distinct label values present in the positive frame.
df_train_pos['sentiment'].unique()

In [None]:
# Display the same first-1000-row slice that the flair accuracy cell assigns to `df`.
pd.concat([df_train_pos, df_train_neg])[0:1000]

In [None]:
# Number of tweets in `df` whose flair prediction matched the stored sentiment label.
nb_correct

In [None]:
# transform() iterates over its argument, so a bare string would be cleaned
# character by character; wrap the single tweet in a list and pick it by position.
predictors().transform([df['tweet'].iloc[0]])

In [None]:
# Wrap one specific tweet in a flair Sentence for manual inspection.
# NOTE(review): `df` is built as a 1000-row slice in the accuracy cell, so the
# label 19432 is likely absent there — confirm which frame was intended.
s = Sentence(df['tweet'][19432])

In [None]:
# Classify the sentence, then display it together with its annotations.
tagger.predict(s)
s

In [None]:
# Show the negative-file tweet stored under row label 2.
df_train_neg['tweet'][2]

In [None]:
# Prints a run of 122 'a' characters.
# NOTE(review): purpose is not evident from the notebook (presumably a visual length check) — confirm or remove.
print('a'*122)

In [None]:
# Number of rows parsed from the test file.
len(df_test)