<a href="https://colab.research.google.com/github/adesam146/nlpcw/blob/sam_preprocessing/NLP_CW.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [2]:
import numpy as np
import pandas as pd

In [3]:
train = pd.read_csv("OffensEval_task_data/start-kit/training-v1/offenseval-training-v1.tsv", delimiter="\t")
print(train.shape)
print(train.head())

(13240, 5)
      id                                              tweet subtask_a  \
0  86426  @USER She should ask a few native Americans wh...       OFF   
1  90194  @USER @USER Go home you’re drunk!!! @USER #MAG...       OFF   
2  16820  Amazon is investigating Chinese employees who ...       NOT   
3  62688  @USER Someone should'veTaken" this piece of sh...       OFF   
4  43605  @USER @USER Obama wanted liberals &amp; illega...       NOT   

  subtask_b subtask_c  
0       UNT       NaN  
1       TIN       IND  
2       NaN       NaN  
3       UNT       NaN  
4       NaN       NaN  


In [4]:
train.count()

id           13240
tweet        13240
subtask_a    13240
subtask_b     4400
subtask_c     3876
dtype: int64

In [5]:
total = train['id'].count().item()
off_count = train[train['subtask_a'] == "OFF"]['id'].count()

print("Number of offensive", off_count)
print("Number of inoffensive", total - off_count)

Number of offensive 4400
Number of inoffensive 8840


**The above shows that the training dataset is not very balanced (in offensive is about twice as much). How could this be addressed. Get more data? Augment offensive comments by adding neutral words to create more data or concat offensive and inoffensive comments to make new offensive comments?**

In [6]:
training_percent = 0.8
training_size = int(training_percent * total)
validation_size = total - training_size

corpus = train['tweet'].to_numpy()
labels = train['subtask_a']
labels[labels == 'OFF'] = 1
labels[labels == 'NOT'] = 0
labels = labels.to_numpy(dtype=np.float)

indices = list(range(total))
np.random.shuffle(indices)
training_sents = corpus[indices[:training_size]]
training_labels = labels[indices[:training_size]]

validation_sents = corpus[indices[training_size:]]
validation_labels = labels[indices[training_size:]]

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  import sys
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  


In [7]:
import re

def get_tokenised_corpus(corpus):
    """
    This assumes the corpus can be iterated through and
    retains the order in which the sentences appeared in the corpus
    """
    tokenized_corpus = [] # Let us put the tokenized corpus in a list
    for sentence in corpus:
        tokenized_sentence = []
        for token in re.split(r'\s', sentence.lower()): # simplest split is 
            if token:
              # To avoid the empty string
              tokenized_sentence.append(token)
        tokenized_corpus.append(tokenized_sentence)
    
    return tokenized_corpus

In [8]:
from sklearn import svm
from sklearn.linear_model import SGDClassifier
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer

vectorizer = CountVectorizer(stop_words='english')
vec_training = vectorizer.fit_transform(training_sents)

tf_transformer = TfidfTransformer(use_idf=False)
vec_training = tf_transformer.fit_transform(vec_training)

In [9]:
clf = SGDClassifier(loss='hinge', penalty='l1',
                          alpha=1e-3, random_state=42,
                          max_iter=5, tol=None, class_weight={1.0: 2})

clf.fit(vec_training, training_labels)



SGDClassifier(alpha=0.001, average=False, class_weight={1.0: 2},
       early_stopping=False, epsilon=0.1, eta0=0.0, fit_intercept=True,
       l1_ratio=0.15, learning_rate='optimal', loss='hinge', max_iter=5,
       n_iter=None, n_iter_no_change=5, n_jobs=None, penalty='l1',
       power_t=0.5, random_state=42, shuffle=True, tol=None,
       validation_fraction=0.1, verbose=0, warm_start=False)

In [10]:
vec_valid = tf_transformer.transform(vectorizer.transform(validation_sents))
predictions = clf.predict(vec_valid)

In [11]:
from sklearn import metrics

print(metrics.classification_report(validation_labels, predictions))
print("Accuracy:", metrics.accuracy_score(validation_labels, predictions))

              precision    recall  f1-score   support

         0.0       0.74      0.93      0.82      1757
         1.0       0.71      0.35      0.47       891

   micro avg       0.73      0.73      0.73      2648
   macro avg       0.73      0.64      0.65      2648
weighted avg       0.73      0.73      0.70      2648

Accuracy: 0.7337613293051359


In [12]:
from textblob import TextBlob
train_pos_sents = []
train_pos_labels = []
train_neg_sents = []
train_neg_labels = []

for sent, label in zip(training_sents, training_labels):
    if TextBlob(sent).sentiment.polarity > -0.2:
        train_pos_sents.append(sent)
        train_pos_labels.append(label)
    else:
        train_neg_sents.append(sent)
        train_neg_labels.append(label)

In [59]:
clf1 = SGDClassifier(loss='hinge', penalty='l1',
                          alpha=1e-3, random_state=42,
                          max_iter=5, tol=None, class_weight={1.0: 2})
clf1.fit(tf_transformer.transform(vectorizer.transform(train_pos_sents)), train_pos_labels)

SGDClassifier(alpha=0.001, average=False, class_weight={1.0: 2},
       early_stopping=False, epsilon=0.1, eta0=0.0, fit_intercept=True,
       l1_ratio=0.15, learning_rate='optimal', loss='hinge', max_iter=5,
       n_iter=None, n_iter_no_change=5, n_jobs=None, penalty='l1',
       power_t=0.5, random_state=42, shuffle=True, tol=None,
       validation_fraction=0.1, verbose=0, warm_start=False)

In [60]:
clf2 = SGDClassifier(loss='hinge', penalty='l1',
                          alpha=1e-3, random_state=42,
                          max_iter=5, tol=None, class_weight={0.0: 1.15})
clf2.fit(tf_transformer.transform(vectorizer.transform(train_neg_sents)), train_neg_labels)

SGDClassifier(alpha=0.001, average=False, class_weight={0.0: 1.15},
       early_stopping=False, epsilon=0.1, eta0=0.0, fit_intercept=True,
       l1_ratio=0.15, learning_rate='optimal', loss='hinge', max_iter=5,
       n_iter=None, n_iter_no_change=5, n_jobs=None, penalty='l1',
       power_t=0.5, random_state=42, shuffle=True, tol=None,
       validation_fraction=0.1, verbose=0, warm_start=False)

In [15]:
valid_pos_sents = []
valid_pos_labels = []
valid_neg_sents = []
valid_neg_labels = []

for sent, label in zip(validation_sents, validation_labels):
    if TextBlob(sent).sentiment.polarity > -0.2:
        valid_pos_sents.append(sent)
        valid_pos_labels.append(label)
    else:
        valid_neg_sents.append(sent)
        valid_neg_labels.append(label)

In [61]:
preds1 = clf1.predict(tf_transformer.transform(vectorizer.transform(valid_pos_sents)))
preds2 = clf2.predict(tf_transformer.transform(vectorizer.transform(valid_neg_sents)))

predictions_joined = np.concatenate((preds1, preds2), axis=0)

In [62]:
print("Classifier for positive sentiments")
print(metrics.classification_report(valid_pos_labels, preds1))
print("Accuracy:", metrics.accuracy_score(valid_pos_labels, preds1))

print()

print("Classifier for negative sentiments")
print(metrics.classification_report(valid_neg_labels, preds2))
print("Accuracy:", metrics.accuracy_score(valid_neg_labels, preds2))

print()

print("Overall classifier")
print(metrics.classification_report(validation_labels, predictions_joined))
print("Accuracy:", metrics.accuracy_score(validation_labels, predictions_joined))

Classifier for positive sentiments
              precision    recall  f1-score   support

         0.0       0.75      0.97      0.85      1604
         1.0       0.75      0.21      0.32       648

   micro avg       0.75      0.75      0.75      2252
   macro avg       0.75      0.59      0.59      2252
weighted avg       0.75      0.75      0.70      2252

Accuracy: 0.7517761989342806

Classifier for negative sentiments
              precision    recall  f1-score   support

         0.0       0.58      0.79      0.67       153
         1.0       0.83      0.64      0.72       243

   micro avg       0.70      0.70      0.70       396
   macro avg       0.71      0.72      0.70       396
weighted avg       0.73      0.70      0.70       396

Accuracy: 0.6994949494949495

Overall classifier
              precision    recall  f1-score   support

         0.0       0.67      0.87      0.75      1757
         1.0       0.36      0.15      0.21       891

   micro avg       0.62      0.62