In [1]:
import pandas as pd
import numpy as np
from copy import deepcopy

## Data preparation

In [2]:
df = pd.read_csv('./Twitter_Data.csv')

In [3]:
df.head(5)

Unnamed: 0,clean_text,category
0,when modi promised “minimum government maximum...,-1.0
1,talk all the nonsense and continue all the dra...,0.0
2,what did just say vote for modi welcome bjp t...,1.0
3,asking his supporters prefix chowkidar their n...,1.0
4,answer who among these the most powerful world...,1.0


In [4]:
df.shape

(162980, 2)

In [5]:
df[df.isnull().any(axis=1)]

Unnamed: 0,clean_text,category
148,,0.0
130448,the foundation stone northeast gas grid inaugu...,
155642,dear terrorists you can run but you cant hide ...,
155698,offense the best defence with mission shakti m...,
155770,have always heard politicians backing out thei...,
158693,modi government plans felicitate the faceless ...,
158694,,-1.0
159442,chidambaram gives praises modinomics,
159443,,0.0
160559,the reason why modi contested from seats 2014 ...,


In [6]:
df = df.dropna(axis=0)

In [7]:
#Check if all rows are non-null
len(df[df.isnull().any(axis=1)]) == 0

True

In [8]:
#New df shape
df.shape

(162969, 2)

In [10]:
df['category'].value_counts()

 1.0    72249
 0.0    55211
-1.0    35509
Name: category, dtype: int64

In [12]:
obs_num = min(df['category'].value_counts())

df_class_plus1 = df[df['category'] == 1]
df_class_0 = df[df['category'] == 0]
df_class_minus1 = df[df['category'] == -1]

sample_plus1 = df_class_plus1.sample(obs_num)
sample_0 = df_class_0.sample(obs_num)
sample_minus1 = df_class_minus1.sample(obs_num)

#Concat and shuffle
df_new = pd.concat([sample_plus1, sample_0, sample_minus1], axis=0).sample(frac=1)

In [13]:
df_new['category'].value_counts()

 1.0    35509
 0.0    35509
-1.0    35509
Name: category, dtype: int64

In [15]:
from sklearn.model_selection import KFold
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(df_new['clean_text'], df_new['category'], 
                                                    test_size = 0.3, random_state = 123)

# prepare cross validation
# kf = KFold(n_splits=2, random_state=123, shuffle=True)

## Text Classification Methods

- BERT
https://simpletransformers.ai/docs/classification-models/
https://towardsdatascience.com/a-beginners-guide-to-use-bert-for-the-first-time-2e99b8c5423
- SpaCy Tokenization and Lemmatization + NB/SVM

### Approach 1: SpaCy

In [14]:
import string
import spacy
from spacy.lang.en.stop_words import STOP_WORDS
from spacy.lang.en import English

# Get nltk set of english words
import nltk
nltk.download('words')

[nltk_data] Downloading package words to /Users/bryankoh/nltk_data...
[nltk_data]   Package words is already up-to-date!


True

Creating custom tokenizer using SpaCy to tokenize and lemmatize our text

In [16]:
# Create our list of punctuation marks
punctuations = string.punctuation

# Create our list of stopwords
nlp = spacy.load("en_core_web_sm")
stop_words = spacy.lang.en.stop_words.STOP_WORDS

# Set nltk vocab
vocab = set(nltk.corpus.words.words())

# Creating our tokenizer function
def spacy_tokenizer(sentence):
    # Creating our token object, which is used to create documents with linguistic annotations.
    mytokens = nlp(sentence)

    # Lemmatizing each token
    mytokens = [ token.lemma_ for token in mytokens if token.lemma_ in vocab]

    # Removing stop words
    mytokens = [ token for token in mytokens if token not in stop_words ]

    # return preprocessed list of tokens
    return mytokens

Approach 1a: Include 1-grams only

In [18]:
from sklearn.feature_extraction.text import CountVectorizer
spacy_vect_1a = CountVectorizer(tokenizer = spacy_tokenizer, ngram_range=(1,1))

In [19]:
train_dtf_1a = spacy_vect_1a.fit_transform(X_train)

In [20]:
X_train_dtf_1a = pd.DataFrame(train_dtf_1a.toarray(), columns=spacy_vect_1a.get_feature_names())

In [21]:
test_dtf_1a = spacy_vect_1a.transform(X_test)

In [22]:
X_test_dtf_1a = pd.DataFrame(test_dtf_1a.toarray(), columns=spacy_vect_1a.get_feature_names())

Approach 1b: Include 1-grams and 2-grams

In [25]:
spacy_vect_1b = CountVectorizer(tokenizer = spacy_tokenizer, ngram_range=(1,2))

In [26]:
train_dtf_1b = spacy_vect_1b.fit_transform(X_train)

In [27]:
X_train_dtf_1b = pd.DataFrame(train_dtf_1b.toarray(), columns=spacy_vect_1b.get_feature_names())

In [28]:
test_dtf_1b = spacy_vect_1b.transform(X_test)

In [29]:
X_test_dtf_1b = pd.DataFrame(test_dtf_1b.toarray(), columns=spacy_vect_1b.get_feature_names())

Approach 1c: Remove infrequent terms

In [30]:
spacy_vect_1c = CountVectorizer(tokenizer = spacy_tokenizer, max_features=1000)
train_dtf_1c = spacy_vect_1c.fit_transform(X_train)
X_train_dtf_1c = pd.DataFrame(train_dtf_1c.toarray(), columns=spacy_vect_1c.get_feature_names())
test_dtf_1c = spacy_vect_1c.transform(X_test)
X_test_dtf_1c = pd.DataFrame(test_dtf_1c.toarray(), columns=spacy_vect_1c.get_feature_names())

Helper function for obtaining training and test metrics

Opting for MultinomialNB, LinearSVC and LogisticRegression based on this article:
https://towardsdatascience.com/multi-class-text-classification-with-scikit-learn-12f1e60e0a9f

In [41]:
from sklearn import metrics

#Insert document-term matrices, y-values and model
def test_model(X_train_dtm, y_train, X_test_dtm, y_test, model):
    model.fit(X_train_dtm, y_train)
    y_pred = model.predict(X_test_dtm)
    
    print('Training accuracy: ', metrics.accuracy_score(y_train, model.predict(X_train_dtm)))
    print('Test accuracy: ', metrics.accuracy_score(y_test, y_pred))
    
from sklearn.naive_bayes import MultinomialNB    
def test_nb(X_train_dtm, y_train, X_test_dtm, y_test):
    nb = MultinomialNB()
    test_model(X_train_dtm, y_train, X_test_dtm, y_test, nb)
    
from sklearn.svm import LinearSVC
def test_svc(X_train_dtm, y_train, X_test_dtm, y_test):
    svc = LinearSVC()
    test_model(X_train_dtm, y_train, X_test_dtm, y_test, svc)
    
from sklearn.linear_model import LogisticRegression
def test_lr(X_train_dtm, y_train, X_test_dtm, y_test):
    lr = LogisticRegression(random_state=0)
    test_model(X_train_dtm, y_train, X_test_dtm, y_test, lr)

In [43]:
print('Approach 1a: 1-grams only\n')
test_nb(X_train_dtf_1a, y_train, X_test_dtf_1a, y_test)
print('------------------------')
test_svc(X_train_dtf_1a, y_train, X_test_dtf_1a, y_test)
print('------------------------')
test_lr(X_train_dtf_1a, y_train, X_test_dtf_1a, y_test)

Approach 1a: 1-grams only

Training accuracy:  0.7656501448342452
Test accuracy:  0.7062799211489721
------------------------




Training accuracy:  0.8807397274970497
Test accuracy:  0.7950186176038049
------------------------


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression


Training accuracy:  0.8623538246969209
Test accuracy:  0.7978347257423574


In [None]:
print('Approach 1b: 1-grams and 2-grams only\n')
test_nb(X_train_dtf_1b, y_train, X_test_dtf_1b, y_test)
print('------------------------')
test_svc(X_train_dtf_1b, y_train, X_test_dtf_1b, y_test)
print('------------------------')
test_lr(X_train_dtf_1b, y_train, X_test_dtf_1b, y_test)

Approach 1b: 1-grams and 2-grams only



In [None]:
print('Approach 1c: Remove infrequent terms\n')
test_nb(X_train_dtf_1c, y_train, X_test_dtf_1c, y_test)
print('------------------------')
test_svc(X_train_dtf_1c, y_train, X_test_dtf_1c, y_test)
print('------------------------')
test_lr(X_train_dtf_1c, y_train, X_test_dtf_1c, y_test)

In [249]:
from sklearn.svm import LinearSVC

#Linear SVC
svc = LinearSVC()
test_model(X_train_dtf, y_train, X_test_dtf, y_test, svc)



Training accuracy:  0.9328095238095238
Test accuracy:  0.7663333333333333


In [250]:
from sklearn.linear_model import LogisticRegression

lr = LogisticRegression(random_state=0)
test_model(X_train_dtf, y_train, X_test_dtf, y_test, lr)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression


Training accuracy:  0.900952380952381
Test accuracy:  0.772


### Approach 2: BERT

In [32]:
# Map category values to start from 0
df_bert = deepcopy(df_new)
df_bert['category'] = df_bert['category'].map({-1: 0, 0: 1, 1: 2})

In [33]:
from simpletransformers.classification import ClassificationModel
# Create a ClassificationModel
model = ClassificationModel('roberta', 'roberta-base', num_labels=3, use_cuda=False)

Some weights of the model checkpoint at roberta-base were not used when initializing RobertaForSequenceClassification: ['lm_head.bias', 'lm_head.dense.weight', 'lm_head.dense.bias', 'lm_head.layer_norm.weight', 'lm_head.layer_norm.bias', 'lm_head.decoder.weight']
- This IS expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at roberta-base and are newly initialized: ['classifier.dense.weight', 'classifier.dense.bias', 'classifier.out_proj.weight', 'classifier.out

In [34]:
X_bert_train, X_bert_test, y_bert_train, y_bert_test = train_test_split(df_bert['clean_text'], df_bert['category'], 
                                                    test_size = 0.3, random_state = 123)

In [35]:
df_bert_train = pd.concat([X_bert_train, y_bert_train], axis=1)
df_bert_test = pd.concat([X_bert_test, y_bert_test], axis=1)

In [37]:
model.train_model(df_bert_train)

  "Dataframe headers not specified. Falling back to using column 0 as text and column 1 as labels."


HBox(children=(FloatProgress(value=0.0, max=4.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, description='Epoch', max=1.0, style=ProgressStyle(description_width='i…

HBox(children=(FloatProgress(value=0.0, description='Running Epoch 0 of 1', max=9321.0, style=ProgressStyle(de…







KeyboardInterrupt: 

In [38]:
result, model_outputs, wrong_predictions = model.eval_model(eval_df)

NameError: name 'eval_df' is not defined