In [9]:
import pandas as pd
import string
import spacy
from spacy.lang.en.stop_words import STOP_WORDS
from spacy.lang.en import English
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.base import TransformerMixin
from sklearn.pipeline import Pipeline
from tqdm import tqdm

In [10]:
DEBUG = False

### Data Loading

In [11]:
df_train_pos = pd.read_table('../data/train_pos.txt', header=None, names=['tweet'])
df_train_neg = pd.read_table('../data/train_neg.txt', header=None, names=['tweet'])
df_test = pd.read_table('../data/test_data.txt', header=None, names=['tweet'])

In [12]:
df_train_pos['sentiment'] = 1
df_train_neg['sentiment'] = 0

df_train = pd.concat([df_train_pos, df_train_neg])

### Preprocessing Pipeline

In [15]:
!python3 -m spacy download en_core_web_sm

Collecting en-core-web-sm==3.5.0
  Downloading https://github.com/explosion/spacy-models/releases/download/en_core_web_sm-3.5.0/en_core_web_sm-3.5.0-py3-none-any.whl (12.8 MB)
[2K     [38;2;114;156;31m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m12.8/12.8 MB[0m [31m26.8 MB/s[0m eta [36m0:00:00[0m31m29.0 MB/s[0m eta [36m0:00:01[0m
[38;5;2m✔ Download and installation successful[0m
You can now load the package via spacy.load('en_core_web_sm')


In [17]:
# spacy pipeline
# English pipeline optimized for CPU. 
# Components: tok2vec, tagger, parser, senter, ner, attribute_ruler, lemmatizer.
# https://spacy.io/models/en
nlp = spacy.load('en_core_web_sm')

# punctuation and stopwords
punctuations = string.punctuation
stop_words = spacy.lang.en.stop_words.STOP_WORDS

def tweet_cleaner(sentence):
    doc = nlp(sentence)
    tokens = []
    for token in doc:
        if token.lemma_ != '-PRON-':
            temp = token.lemma_.lower().strip()
        else:
            temp = token.lower_
        tokens.append(temp)
    clean_tokens = []
    for token in tokens:
        if (token not in punctuations) and (token not in stop_words):
            clean_tokens.append(token)
    return clean_tokens

In [18]:
# custom transformer using spaCy
class predictors(TransformerMixin):
    def transform(self, X, **transform_params):
        """Override the transform method to clean text"""
        collector = []
        for text in tqdm(X, total=len(X), desc='Cleaning text:\t'):
            collector.append(clean_text(text))
        return collector
    
    def fit(self, X, y=None, **fit_params):
        return self
    
    def get_params(self, deep=True):
        return {}

# basic function to clean the text
def clean_text(text):
    """Removing spaces and converting the text into lowercase"""
    return text.strip().lower()    

In [19]:
# different vectorizers
bow_vector = CountVectorizer(tokenizer=tweet_cleaner, ngram_range=(1,1))
tfidf_vector = TfidfVectorizer(tokenizer=tweet_cleaner)

In [22]:
from sklearn.model_selection import train_test_split

X = df_train['tweet']
y = df_train['sentiment']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.3, random_state = 42)
print(f'X_train dimension: {X_train.shape}')
print(f'y_train dimension: {y_train.shape}')
print(f'X_test dimension: {X_test.shape}')
print(f'y_train dimension: {y_test.shape}')

X_train dimension: (137879,)
y_train dimension: (137879,)
X_test dimension: (59091,)
y_train dimension: (59091,)


In [23]:
import time
from sklearn.linear_model import LogisticRegression
from sklearn.neural_network import MLPClassifier

# classifier = (verbose=1, solver='lbfgs', max_iter=10000)
classifier = MLPClassifier(hidden_layer_sizes=(256,128,64), verbose=True)

# Create pipeline using Bag of Words
components = [
    ("cleaner", predictors()),
    ("vectorizer", bow_vector),
    ("classifier", classifier)
        ]
pipe = Pipeline(components)

# Test with 1/100 of the data to estimate the time needed
before = time.time()
pipe.fit(X_train[:len(X_train)//100], y_train[:len(y_train)//100])
after = time.time()
print(f'\n\nTime needed for a 100th ({len(X_train)//100} samples): {after-before} s')
print(f'Time needed for the whole dataset ({len(X_train)} samples): {(after-before)*100} s\n\n')

# Model generation
pipe.fit(X_train, y_train)

Cleaning text:	: 100%|████████████████████████████████████████████████████████████████████████████████████████| 1378/1378 [00:00<00:00, 277259.47it/s]


Iteration 1, loss = 0.68061569
Iteration 2, loss = 0.59776759
Iteration 3, loss = 0.47261996
Iteration 4, loss = 0.31587777
Iteration 5, loss = 0.17399640
Iteration 6, loss = 0.08650218
Iteration 7, loss = 0.04766630
Iteration 8, loss = 0.03104038
Iteration 9, loss = 0.02054378
Iteration 10, loss = 0.01666669
Iteration 11, loss = 0.01401668
Iteration 12, loss = 0.01275923
Iteration 13, loss = 0.01189569
Iteration 14, loss = 0.01168823
Iteration 15, loss = 0.01049410
Iteration 16, loss = 0.01043669
Iteration 17, loss = 0.01033658
Iteration 18, loss = 0.00988739
Iteration 19, loss = 0.00995561
Iteration 20, loss = 0.00987296
Iteration 21, loss = 0.00930896
Iteration 22, loss = 0.00870731
Iteration 23, loss = 0.00867273
Iteration 24, loss = 0.00837874
Iteration 25, loss = 0.00851081
Iteration 26, loss = 0.00817281
Iteration 27, loss = 0.00804274
Iteration 28, loss = 0.00829492
Iteration 29, loss = 0.00934906
Iteration 30, loss = 0.00796220
Iteration 31, loss = 0.00994762
Iteration 32, los

Cleaning text:	: 100%|███████████████████████████████████████████████████████████████████████████████████| 137879/137879 [00:00<00:00, 2488602.56it/s]


In [25]:
from sklearn import metrics

# Predicting with test dataset
predicted = pipe.predict(X_test[:1000])

# Model accuracy score
print(f'Logistic Regression Accuracy: {metrics.accuracy_score(y_test[:1000], predicted)}')
print(f'Logistic Regression Precision: {metrics.precision_score(y_test[:1000], predicted)}')
print(f'Logistic Regression Recall: {metrics.recall_score(y_test[:1000], predicted)}')

Cleaning text:	: 100%|████████████████████████████████████████████████████████████████████████████████████████| 1000/1000 [00:00<00:00, 536493.22it/s]


Logistic Regression Accuracy: 0.803
Logistic Regression Precision: 0.8041825095057035
Logistic Regression Recall: 0.8181818181818182


In [27]:
import torch
import torch.nn as nn
import torchvision.transforms as transforms
import torchvision.datasets as dsets
from torch.utils.data import DataLoader, TensorDataset


batch_size = 100
n_iters = 3000
num_epochs = n_iters / (len(y_train) / batch_size)
num_epochs = int(num_epochs)

train_loader = torch.utils.data.DataLoader([ [X_train[i], y_train[i]] for i in range(len(y_train))], shuffle=True, batch_size=100)
test_loader = torch.utils.data.DataLoader([ [X_test[i], y_test[i]] for i in range(len(y_train))], shuffle=True, batch_size=100)



class LSTMModel(nn.Module):
    def __init__(self, input_dim, hidden_dim, layer_dim, output_dim):
        super(LSTMModel, self).__init__()
        # Hidden dimensions
        self.hidden_dim = hidden_dim

        # Number of hidden layers
        self.layer_dim = layer_dim

        # Building your LSTM
        # batch_first=True causes input/output tensors to be of shape
        # (batch_dim, seq_dim, feature_dim)
        self.lstm = nn.LSTM(input_dim, hidden_dim, layer_dim, batch_first=True)

        # Readout layer
        self.fc = nn.Linear(hidden_dim, output_dim)

    def forward(self, x):
        # Initialize hidden state with zeros
        h0 = torch.zeros(self.layer_dim, x.size(0), self.hidden_dim).requires_grad_()

        # Initialize cell state
        c0 = torch.zeros(self.layer_dim, x.size(0), self.hidden_dim).requires_grad_()

        # 28 time steps
        # We need to detach as we are doing truncated backpropagation through time (BPTT)
        # If we don't, we'll backprop all the way to the start even after going through another batch
        out, (hn, cn) = self.lstm(x, (h0.detach(), c0.detach()))

        # Index hidden state of last time step
        # out.size() --> 100, 28, 100
        # out[:, -1, :] --> 100, 100 --> just want last time step hidden states! 
        out = self.fc(out[:, -1, :]) 
        # out.size() --> 100, 10
        return out

KeyError: 6

In [32]:
X_train[7]

7    rt <user> <user> <user> yes she is ! u tell it...
7    workin hard or hardly workin rt <user> at hard...
Name: tweet, dtype: object

In [30]:
y_train[0]

0

In [33]:
cleaner

NameError: name 'cleaner' is not defined

In [35]:
clean = predictors().transform(X_train)

Cleaning text:	: 100%|███████████████████████████████████████████████████████████████████████████████████| 137879/137879 [00:00<00:00, 2386688.02it/s]


In [36]:
vect = bow_vector.transform(clean)

TypeError: 'CountVectorizer' object is not callable

In [40]:
vect2 = bow_vector.transform(clean)

In [42]:
from flair.data import Sentence
from flair.nn import Classifier

# make a sentence
sentence = Sentence('I love Berlin .')

# load the NER tagger
tagger = Classifier.load('sentiment')

# run NER over sentence
tagger.predict(sentence)

# print the sentence with all annotations
print(sentence)

2023-06-23 10:39:00,814 https://nlp.informatik.hu-berlin.de/resources/models/sentiment-curated-distilbert/sentiment-en-mix-distillbert_4.pt not found in cache, downloading to /var/folders/z9/kcp8xgns0fl1624stv5v38_m0000gn/T/tmpibic9w6u


100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████| 253M/253M [00:26<00:00, 10.2MB/s]

2023-06-23 10:39:27,058 copying /var/folders/z9/kcp8xgns0fl1624stv5v38_m0000gn/T/tmpibic9w6u to cache at /Users/artiomgesp/.flair/models/sentiment-en-mix-distillbert_4.pt





2023-06-23 10:39:27,272 removing temp file /var/folders/z9/kcp8xgns0fl1624stv5v38_m0000gn/T/tmpibic9w6u


Downloading (…)okenizer_config.json:   0%|          | 0.00/28.0 [00:00<?, ?B/s]

Downloading (…)lve/main/config.json:   0%|          | 0.00/483 [00:00<?, ?B/s]

Downloading (…)solve/main/vocab.txt: 0.00B [00:00, ?B/s]

Downloading (…)/main/tokenizer.json: 0.00B [00:00, ?B/s]

Sentence[4]: "I love Berlin ." → POSITIVE (0.9983)


In [56]:
sentence = Sentence('I hate Berlin .')

# run NER over sentence
a = tagger.predict(sentence)

# print the sentence with all annotations
print(sentence)

Sentence[4]: "I hate Berlin ." → NEGATIVE (0.9988)


In [58]:
sentence.tag

'NEGATIVE'

In [84]:
%time
nb_correct = 0

df = pd.concat([df_train_pos, df_train_neg])[0:1000]

to_int = lambda x: 1 if x == 'POSITIVE' else 0

for tweet, sentiment in zip(df['tweet'], df['sentiment']):
    sentence = Sentence(tweet)

    tagger.predict(sentence)
    if sentiment == to_int(sentence.tag):
        nb_correct += 1


CPU times: user 6 µs, sys: 5 µs, total: 11 µs
Wall time: 17.2 µs


In [72]:
df_train_pos['sentiment'].unique()

array([1])

In [83]:
pd.concat([df_train_pos, df_train_neg])[0:1000]

Unnamed: 0,tweet,sentiment
0,<user> i dunno justin read my mention or not ....,1
1,"because your logic is so dumb , i won't even c...",1
2,<user> just put casper in a box ! looved the...,1
3,<user> <user> thanks sir > > don't trip lil ma...,1
4,visiting my brother tmr is the bestest birthda...,1
...,...,...
995,"<user> they are sold out , but if u have extra...",1
996,"i wanna love u , every day and every night !",1
997,relax ... rt <user> i just don't want to start...,1
998,"thanks to my pin up studio team yesterday , we...",1


In [85]:
nb_correct

599

In [90]:
predictors().transform(df['tweet'][0])

Cleaning text:	: 100%|██████████████████████████████████████████████████████████████████████████████████████████| 127/127 [00:00<00:00, 442422.43it/s]


['<',
 'u',
 's',
 'e',
 'r',
 '>',
 '',
 'i',
 '',
 'd',
 'u',
 'n',
 'n',
 'o',
 '',
 'j',
 'u',
 's',
 't',
 'i',
 'n',
 '',
 'r',
 'e',
 'a',
 'd',
 '',
 'm',
 'y',
 '',
 'm',
 'e',
 'n',
 't',
 'i',
 'o',
 'n',
 '',
 'o',
 'r',
 '',
 'n',
 'o',
 't',
 '',
 '.',
 '',
 'o',
 'n',
 'l',
 'y',
 '',
 'j',
 'u',
 's',
 't',
 'i',
 'n',
 '',
 'a',
 'n',
 'd',
 '',
 'g',
 'o',
 'd',
 '',
 'k',
 'n',
 'o',
 'w',
 's',
 '',
 'a',
 'b',
 'o',
 'u',
 't',
 '',
 't',
 'h',
 'a',
 't',
 '',
 ',',
 '',
 'b',
 'u',
 't',
 '',
 'i',
 '',
 'h',
 'o',
 'p',
 'e',
 '',
 'y',
 'o',
 'u',
 '',
 'w',
 'i',
 'l',
 'l',
 '',
 'f',
 'o',
 'l',
 'l',
 'o',
 'w',
 '',
 'm',
 'e',
 '',
 '#',
 'b',
 'e',
 'l',
 'i',
 'e',
 'v',
 'e',
 '',
 '1',
 '5']

In [97]:
s = Sentence(df['tweet'][19432])

KeyError: 19432

In [94]:
tagger.predict(s)
s

Sentence[30]: "<user> i dunno justin read my mention or not . only justin and god knows about that , but i hope you will follow me #believe 15" → POSITIVE (0.5695)

In [108]:
df_train_neg['tweet'][2]

'1-3 vs celtics in the regular season = were fucked if we play them in the playoffs'

In [103]:
print('a'*122)

aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa


In [110]:
len(df_test)

10000