# Offensive Language Classification
This project aims at identifying whether the offensive language in the tweet is targeted towards a person, group, or organization (SUBTASK B)


## Importing necessary Libraries

In [5]:
import pandas as pd
import numpy as np
import nltk 
from nltk.tokenize import RegexpTokenizer
from nltk.corpus import stopwords, wordnet
from nltk import pos_tag
from nltk.stem import WordNetLemmatizer 
import time
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report
from sklearn.model_selection import cross_val_predict
from sklearn.model_selection import train_test_split,KFold
from sklearn.utils import shuffle
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

from tqdm import tqdm

from collections import Counter

import matplotlib.pyplot as plt
import pandas as pd
import numpy as np
import scipy
import re
import os

## Reading dataset into Pandas Dataframe

In [2]:
data = pd.read_csv('C:/Users/wjaya/Downloads/OLIDv1.0/olid-training-v1.0.tsv', sep = '\t')

data = data.fillna('NULL')
data.head(10)

Unnamed: 0,id,tweet,subtask_a,subtask_b,subtask_c
0,86426,@USER She should ask a few native Americans wh...,OFF,UNT,
1,90194,@USER @USER Go home you’re drunk!!! @USER #MAG...,OFF,TIN,IND
2,16820,Amazon is investigating Chinese employees who ...,NOT,,
3,62688,"@USER Someone should'veTaken"" this piece of sh...",OFF,UNT,
4,43605,@USER @USER Obama wanted liberals &amp; illega...,NOT,,
5,97670,@USER Liberals are all Kookoo !!!,OFF,TIN,OTH
6,77444,@USER @USER Oh noes! Tough shit.,OFF,UNT,
7,52415,@USER was literally just talking about this lo...,OFF,TIN,GRP
8,45157,@USER Buy more icecream!!!,NOT,,
9,13384,@USER Canada doesn’t need another CUCK! We alr...,OFF,TIN,IND


## Data pre-processing

### Creating sub-functions to perform data-processing:
The sub-functions aim to:
- remove Tagged users, URLS, and ampersands
- get the Part-of-Speech tag for words within the tweet
- tokenize and lemmatize tweets
- count number of users tagged in a tweet


In [3]:
lemmatizer = WordNetLemmatizer() 
stops = stopwords.words('english')
tokenizer = RegexpTokenizer(r'\w+')
def script_preprocessing(df):
    
    def removewords(text): # Remove these words.
        
        text = text.replace('@USER','',50)
        text = text.strip('URL')
        text = text.replace('&amp','',10)
        return text

    def get_wordnet_pos(tag):

        if tag.startswith('J'):
            return wordnet.ADJ
        elif tag.startswith('V'):
            return wordnet.VERB
        elif tag.startswith('N'):
            return wordnet.NOUN
        elif tag.startswith('R'):
            return wordnet.ADV
        else:
            return wordnet.NOUN
    
    def pos_lemma(text):
        
        tokens = [i for i in tokenizer.tokenize(str(removewords(text)).lower()) if i not in stops] # if we don't convert text to str, it rises TypeError: expected string or bytes-like object   
        tagged = pos_tag(tokens)
        lemlist = [lemmatizer.lemmatize(i[0], get_wordnet_pos(i[1])) for i in tagged]
        lemmas = ' '.join(lemlist).lower()

        return lemmas
    
    def countuser(text):
        splitted_text = text.lower().split()
        user_count = 0
        for word in splitted_text 
            word = re.sub("[#@]","",word)
            word = re.sub("!"," !",word)
            word = re.sub("[?]"," ?",word)
            if(word == 'user'):
                user_count += 1
        return user_count
                        
    def finalize(df):
        
        df['pos_lemmatized'] = [pos_lemma(i) for i in df['tweet']]
        df['user_count'] = [countuser(i) for i in df['tweet']]
        return df
        
    return finalize(df)

In [4]:
result = script_preprocessing(data)
result.head()

Unnamed: 0,id,tweet,subtask_a,subtask_b,subtask_c,pos_lemmatized,user_count
0,86426,@USER She should ask a few native Americans wh...,OFF,UNT,,ask native american take,1
1,90194,@USER @USER Go home you’re drunk!!! @USER #MAG...,OFF,TIN,IND,go home drunk maga trump2020,3
2,16820,Amazon is investigating Chinese employees who ...,NOT,,,amazon investigate chinese employee sell inter...,0
3,62688,"@USER Someone should'veTaken"" this piece of sh...",OFF,UNT,,someone vetaken piece shit volcano,1
4,43605,@USER @USER Obama wanted liberals &amp; illega...,NOT,,,obama want liberal illegals move red state,2


In [5]:
# Calculating ratio for number of users tagged in a post as number of users in the post vs maximum number of users in a post
max_ = result['user_count'].max()
result['user_count'] = result['user_count'].apply(lambda x:x/max_)
result.head()

Unnamed: 0,id,tweet,subtask_a,subtask_b,subtask_c,pos_lemmatized,user_count
0,86426,@USER She should ask a few native Americans wh...,OFF,UNT,,ask native american take,0.02
1,90194,@USER @USER Go home you’re drunk!!! @USER #MAG...,OFF,TIN,IND,go home drunk maga trump2020,0.06
2,16820,Amazon is investigating Chinese employees who ...,NOT,,,amazon investigate chinese employee sell inter...,0.0
3,62688,"@USER Someone should'veTaken"" this piece of sh...",OFF,UNT,,someone vetaken piece shit volcano,0.02
4,43605,@USER @USER Obama wanted liberals &amp; illega...,NOT,,,obama want liberal illegals move red state,0.04


In [6]:
#Removing rows having a NULL value for the target variable
ind = []
for i in range(len(result['subtask_b'])):
    if result['subtask_b'][i] == 'NULL':
        ind.append(i)
result.drop(axis=0,labels = ind,inplace = True)
result.head()

Unnamed: 0,id,tweet,subtask_a,subtask_b,subtask_c,pos_lemmatized,user_count
0,86426,@USER She should ask a few native Americans wh...,OFF,UNT,,ask native american take,0.02
1,90194,@USER @USER Go home you’re drunk!!! @USER #MAG...,OFF,TIN,IND,go home drunk maga trump2020,0.06
3,62688,"@USER Someone should'veTaken"" this piece of sh...",OFF,UNT,,someone vetaken piece shit volcano,0.02
5,97670,@USER Liberals are all Kookoo !!!,OFF,TIN,OTH,liberal kookoo,0.02
6,77444,@USER @USER Oh noes! Tough shit.,OFF,UNT,,oh no tough shit,0.04


### Creating dataframes for X and y:
X contains the POS Lemmatized tweets and y contains the target column subtask_b which is 'UNT' when untargeted and 'TIN' when the tweet is targeted

In [7]:
X = result[['pos_lemmatized',"user_count"]]
y = result['subtask_b']
label_to_number = {'UNT':0,'TIN':1}
number_to_label = {v:k for k,v in label_to_number.items()}
Y = result.subtask_b.apply(lambda x:label_to_number[x])

### Creating a function to upsample and downsample the data to reduce class-imbalance

In [8]:
def same_size_data(X_train, count_train, y_train, ratio_down_over_up=0.5):
    X_train = list(X_train)
    count_train = list(count_train)
    y_train = list(y_train)
  
    n_cat = len(Counter(y_train))
  
    sorted_counter = Counter(y_train).most_common()
    max_cat = sorted_counter[0][1]
    min_cat = sorted_counter[-1][1]

    target = min_cat + (1-ratio_down_over_up)*(max_cat - min_cat)

    for i in range(n_cat):
        diff = int(sorted_counter[i][1] - target)
        k = 0
        if diff > 0:
            rm = 0    
            while rm <= diff:
                if(y_train[k] == sorted_counter[i][0]):
                    X_train.pop(k)
                    y_train.pop(k)
                    count_train.pop(k)
                    rm += 1
                    k -=1
                k += 1
        else:
            ad = 0
            while ad <= -diff:
                if(y_train[k] == sorted_counter[i][0]):
                    X_train.append(X_train[k])
                    y_train.append(y_train[k])
                    count_train.append(count_train[k])
                    ad += 1
                k += 1

    return X_train, count_train, y_train

### Splitting the dataset into training dataset and testing dataset and performing shuffling

In [9]:
X_train_b, count_train_b, y_train_b = same_size_data(X.pos_lemmatized, X.user_count, Y, 0.2)
X_train_b, count_train_b, y_train_b = shuffle(X_train_b, count_train_b, y_train_b)

### Creating a bag of words model taking in the top 5000 words

In [10]:
# Bag of words model.
all_words_list = []
for sent in data['pos_lemmatized']:
    for word in sent.split(' '):
        all_words_list.append(word)
all_words = nltk.FreqDist(all_words_list)
word_items = all_words.most_common(5000)

word_features = [word for (word,count) in word_items]
print(word_features[:30])

#word_features = tokenizer.texts_to_sequences(word_features)

['liberal', 'gun', 'like', 'control', 'get', 'people', 'go', 'shit', 'antifa', 'say', 'fuck', 'maga', 'know', 'conservative', 'think', 'trump', 'one', 'make', 'u', 'want', 'right', 'need', 'good', 'woman', 'would', 'democrat', 'lie', 'see', 'time', 'take']


### Using TF-IDF Vectorizer to obtain TF-IDF for each word in the training dataset

In [11]:
from sklearn.feature_extraction.text import TfidfVectorizer
def TFIDFmatrix(X_train,word_features):
    vectorizer = TfidfVectorizer()
    fit = vectorizer.fit(word_features)
    X = vectorizer.transform(X_train)
    X = X.todense()
    return X
    
X_train_b = TFIDFmatrix(X_train_b,word_features)


### Creating a function to perform 5-fold cross validation for models implemented and print its respective model performance 

In [12]:
def training(X,y,clf):
    kf = KFold(n_splits=5, shuffle=True, random_state = 52)
    iteration_index = 0
    acc_list_scores = []
    f1_list_scores = []
    pre_list_scores=[]
    re_list_scores=[]
    for train_indexes, test_indexes in kf.split(X, y):
        iteration_index += 1

        X_train = X[train_indexes]
        y_train = y[train_indexes]

        X_test = X[test_indexes]
        y_test = y[test_indexes]

        #logreg = linear_model.LogisticRegression(C=1e5)
        clf.fit(X_train, y_train)
        y_predict = logreg.predict(X_test)

        current_acc = accuracy_score(y_test, y_predict)
        current_pre=precision_score(y_test,y_predict,average='macro',labels=np.unique(y_predict))
        current_recall=recall_score(y_test, y_predict,average='macro',labels=np.unique(y_predict))    
        current_f1 = f1_score(y_test, y_predict, average='macro', labels=np.unique(y_predict))
        
  
        print("Iteration #{0}: Accuracy : {1}, Precision : {2}, Recall : {2}, F-score : {2}".format(
            iteration_index,current_acc, current_pre,current_recall,current_f1))
        acc_list_scores.append(current_acc)
        pre_list_scores.append(current_pre)
        re_list_scores.append(current_recall)
        f1_list_scores.append(current_f1)
        

    print("Accuracy: {0}".format(np.mean(acc_list_scores)))
    print("Precision: {0}".format(np.mean(pre_list_scores)))
    print("Recall: {0}".format(np.mean(re_list_scores)))
    print("F1-measure: {0}".format(np.mean(f1_list_scores)))
    
    return clf


### Linear Regression Model

In [13]:
from sklearn import linear_model
from sklearn.preprocessing import StandardScaler
logreg = linear_model.LogisticRegression(C=1e5, solver='liblinear')
LRmodel = training(X_train_b,np.array(y_train_b),logreg)

Iteration #1: Accuracy : 0.8908807482462977, Precision : 0.9091973798236224, Recall : 0.9091973798236224, F-score : 0.9091973798236224
Iteration #2: Accuracy : 0.8845553822152886, Precision : 0.9002695417789757, Recall : 0.9002695417789757, F-score : 0.9002695417789757
Iteration #3: Accuracy : 0.8806552262090483, Precision : 0.9056720098643649, Recall : 0.9056720098643649, F-score : 0.9056720098643649
Iteration #4: Accuracy : 0.8884555382215289, Precision : 0.9046080557843796, Recall : 0.9046080557843796, F-score : 0.9046080557843796
Iteration #5: Accuracy : 0.9024960998439937, Precision : 0.9227441285537701, Recall : 0.9227441285537701, F-score : 0.9227441285537701
Accuracy: 0.8894085989472315
Precision: 0.9084982231610225
Recall: 0.8893290249394947
F1-measure: 0.8878918502457024


### Gaussian Naive Bayes Model

In [14]:
from sklearn.naive_bayes import GaussianNB

nb = GaussianNB()
NBmodel = training(X_train_b,np.array(y_train_b),nb)

Iteration #1: Accuracy : 0.9883086515978177, Precision : 0.9886535552193646, Recall : 0.9886535552193646, F-score : 0.9886535552193646
Iteration #2: Accuracy : 0.9914196567862714, Precision : 0.990909090909091, Recall : 0.990909090909091, F-score : 0.990909090909091
Iteration #3: Accuracy : 0.9867394695787831, Precision : 0.9874074074074074, Recall : 0.9874074074074074, F-score : 0.9874074074074074
Iteration #4: Accuracy : 0.9906396255850234, Precision : 0.9905660377358491, Recall : 0.9905660377358491, F-score : 0.9905660377358491
Iteration #5: Accuracy : 0.9024960998439937, Precision : 0.9227441285537701, Recall : 0.9227441285537701, F-score : 0.9227441285537701
Accuracy: 0.9719207006783778
Precision: 0.9760560439650965
Recall: 0.970595297507133
F1-measure: 0.9713649438261728


### Gradient Boosting Classifier Model

In [15]:
from sklearn.ensemble import GradientBoostingClassifier
gb = GradientBoostingClassifier(n_estimators=100, learning_rate=0.1, max_depth=5)
GBmodel = training(X_train_b,np.array(y_train_b),gb)

Iteration #1: Accuracy : 0.9883086515978177, Precision : 0.9886535552193646, Recall : 0.9886535552193646, F-score : 0.9886535552193646
Iteration #2: Accuracy : 0.9914196567862714, Precision : 0.990909090909091, Recall : 0.990909090909091, F-score : 0.990909090909091
Iteration #3: Accuracy : 0.9867394695787831, Precision : 0.9874074074074074, Recall : 0.9874074074074074, F-score : 0.9874074074074074
Iteration #4: Accuracy : 0.9906396255850234, Precision : 0.9905660377358491, Recall : 0.9905660377358491, F-score : 0.9905660377358491
Iteration #5: Accuracy : 0.9024960998439937, Precision : 0.9227441285537701, Recall : 0.9227441285537701, F-score : 0.9227441285537701
Accuracy: 0.9719207006783778
Precision: 0.9760560439650965
Recall: 0.970595297507133
F1-measure: 0.9713649438261728


### Support Vector Machine Model

In [16]:
### SVM
from sklearn.svm import LinearSVC, SVC
svm = SVC(gamma='scale')
svmmodel= training(X_train_b,np.array(y_train_b),svm)

Iteration #1: Accuracy : 0.9883086515978177, Precision : 0.9886535552193646, Recall : 0.9886535552193646, F-score : 0.9886535552193646
Iteration #2: Accuracy : 0.9914196567862714, Precision : 0.990909090909091, Recall : 0.990909090909091, F-score : 0.990909090909091
Iteration #3: Accuracy : 0.9867394695787831, Precision : 0.9874074074074074, Recall : 0.9874074074074074, F-score : 0.9874074074074074
Iteration #4: Accuracy : 0.9906396255850234, Precision : 0.9905660377358491, Recall : 0.9905660377358491, F-score : 0.9905660377358491
Iteration #5: Accuracy : 0.9024960998439937, Precision : 0.9227441285537701, Recall : 0.9227441285537701, F-score : 0.9227441285537701
Accuracy: 0.9719207006783778
Precision: 0.9760560439650965
Recall: 0.970595297507133
F1-measure: 0.9713649438261728


### Random Forest Model

In [17]:
rf = RandomForestClassifier(n_estimators=30)
rfModel=training(X_train_b,np.array(y_train_b),rf)

Iteration #1: Accuracy : 0.9883086515978177, Precision : 0.9886535552193646, Recall : 0.9886535552193646, F-score : 0.9886535552193646
Iteration #2: Accuracy : 0.9914196567862714, Precision : 0.990909090909091, Recall : 0.990909090909091, F-score : 0.990909090909091
Iteration #3: Accuracy : 0.9867394695787831, Precision : 0.9874074074074074, Recall : 0.9874074074074074, F-score : 0.9874074074074074
Iteration #4: Accuracy : 0.9906396255850234, Precision : 0.9905660377358491, Recall : 0.9905660377358491, F-score : 0.9905660377358491
Iteration #5: Accuracy : 0.9024960998439937, Precision : 0.9227441285537701, Recall : 0.9227441285537701, F-score : 0.9227441285537701
Accuracy: 0.9719207006783778
Precision: 0.9760560439650965
Recall: 0.970595297507133
F1-measure: 0.9713649438261728


### Voting Classifier Model

In [18]:
from sklearn.ensemble import VotingClassifier

eclf = VotingClassifier(estimators=[('logreg', logreg), ('nb', nb), ('gb', gb), ('svm', svm), ('rf', rf)])
eclfModel=training(X_train_b,np.array(y_train_b),eclf)

Iteration #1: Accuracy : 0.9883086515978177, Precision : 0.9886535552193646, Recall : 0.9886535552193646, F-score : 0.9886535552193646
Iteration #2: Accuracy : 0.9914196567862714, Precision : 0.990909090909091, Recall : 0.990909090909091, F-score : 0.990909090909091
Iteration #3: Accuracy : 0.9867394695787831, Precision : 0.9874074074074074, Recall : 0.9874074074074074, F-score : 0.9874074074074074
Iteration #4: Accuracy : 0.9906396255850234, Precision : 0.9905660377358491, Recall : 0.9905660377358491, F-score : 0.9905660377358491
Iteration #5: Accuracy : 0.9024960998439937, Precision : 0.9227441285537701, Recall : 0.9227441285537701, F-score : 0.9227441285537701
Accuracy: 0.9719207006783778
Precision: 0.9760560439650965
Recall: 0.970595297507133
F1-measure: 0.9713649438261728


### Bagging Classifier Model

In [19]:
from sklearn.ensemble import BaggingClassifier
from sklearn.tree import DecisionTreeClassifier
cart = DecisionTreeClassifier()
num_trees = 100
bc = BaggingClassifier(base_estimator=cart, n_estimators=num_trees, random_state=7)
bcModel=training(X_train_b,np.array(y_train_b),bc)

Iteration #1: Accuracy : 0.9883086515978177, Precision : 0.9886535552193646, Recall : 0.9886535552193646, F-score : 0.9886535552193646
Iteration #2: Accuracy : 0.9914196567862714, Precision : 0.990909090909091, Recall : 0.990909090909091, F-score : 0.990909090909091
Iteration #3: Accuracy : 0.9867394695787831, Precision : 0.9874074074074074, Recall : 0.9874074074074074, F-score : 0.9874074074074074
Iteration #4: Accuracy : 0.9906396255850234, Precision : 0.9905660377358491, Recall : 0.9905660377358491, F-score : 0.9905660377358491
Iteration #5: Accuracy : 0.9024960998439937, Precision : 0.9227441285537701, Recall : 0.9227441285537701, F-score : 0.9227441285537701
Accuracy: 0.9719207006783778
Precision: 0.9760560439650965
Recall: 0.970595297507133
F1-measure: 0.9713649438261728


### Creating a function to test the model on the test dataset and print performance metrics

In [20]:
def testify(x_test,y_test,model):
    y_predict = model.predict(x_test)
    acc = accuracy_score(y_test,y_predict)
    f1 = f1_score(y_test, y_predict, average = 'macro')
    print('acc:',acc)
    print('f1:',f1)
    

### Testing all the above models with the test dataset

In [21]:
# Import Test_data.
testdata = pd.read_csv('C:/Users/wjaya/Downloads/OLIDv1.0/testset-levelb.tsv', sep = '\t',engine='python',encoding = 'utf-8-sig')
testdata = testdata.fillna('NULL')
testdata = script_preprocessing(testdata)
x_test = TFIDFmatrix(testdata['pos_lemmatized'],word_features)
y_test = pd.read_csv('C:/Users/wjaya/Downloads/OLIDv1.0/labels-levelb.csv',sep = ',',engine='python',header = None,encoding = 'utf-8-sig')
y_test = y_test[1].apply(lambda x:label_to_number[x])


testify(x_test,y_test,LRmodel)
testify(x_test,y_test,NBmodel)
testify(x_test,y_test,GBmodel)
testify(x_test,y_test,svmmodel)
testify(x_test,y_test,rfModel)
testify(x_test,y_test,eclfModel)
testify(x_test,y_test,bcModel)


acc: 0.7541666666666667
f1: 0.585855926998333
acc: 0.7416666666666667
f1: 0.560854680675245
acc: 0.7458333333333333
f1: 0.6118666984808717
acc: 0.8791666666666667
f1: 0.6133118506583699
acc: 0.8583333333333333
f1: 0.7013614404918753
acc: 0.8625
f1: 0.6500375591003491
acc: 0.825
f1: 0.696969696969697


### Best model: Random Forest Classifier

In [22]:
best_model=rfModel
y_predict = best_model.predict(x_test)
acc = accuracy_score(y_test,y_predict)
f1 = f1_score(y_test, y_predict, average = 'macro')
print('Best model is Random Forest Classifier')
print('acc:',acc)
print('f1:',f1)

Best model is Random Forest Classifier
acc: 0.8583333333333333
f1: 0.7013614404918753


### Testing with a random tweet by Donald Trump

In [38]:
#test tweet online:
tweet="The Red Hen Restaurant should focus more on cleaning its filthy canopies, doors and windows (badly needs a paint job) rather than refusing to serve a fine person like Sarah Huckabee Sanders. I always had a rule, if a restaurant is dirty on the outside, it is dirty on the inside!"
print(tweet)
test_tweet=np.array([tweet])

The Red Hen Restaurant should focus more on cleaning its filthy canopies, doors and windows (badly needs a paint job) rather than refusing to serve a fine person like Sarah Huckabee Sanders. I always had a rule, if a restaurant is dirty on the outside, it is dirty on the inside!


In [39]:
df = pd.DataFrame(test_tweet )
df.columns=['tweet']

In [40]:
df = script_preprocessing(df)
x_test = TFIDFmatrix(df['pos_lemmatized'],word_features)


In [41]:
y_predict = best_model.predict(x_test)

### This tweet is categorized as hate speech since it is targeted:
The tweet is attacking The Red Hen Restaurant

In [42]:
if int(y_predict)==0:
    print('not targeted')
else: print('targeted')

targeted
