In [1]:
# Taken from https://medium.com/@bedigunjit/simple-guide-to-text-classification-nlp-using-svm-and-naive-bayes-with-python-421db3a72d34

In [2]:
import pandas as pd
import numpy as np
from nltk.tokenize import word_tokenize
from nltk import pos_tag
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from sklearn.preprocessing import LabelEncoder
from collections import defaultdict
from nltk.corpus import wordnet as wn
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn import model_selection, naive_bayes, svm
from sklearn.metrics import classification_report, f1_score, accuracy_score, confusion_matrix

#Set Random seed
np.random.seed(500)

# Add the Data using pandas
#Corpus = pd.read_csv(r"./data/corpus.csv",encoding='latin-1')
Corpus = pd.read_csv(r"./Dataset/SixClass/train.csv",encoding='latin-1')
Corpus.head(10)

Unnamed: 0.1,Unnamed: 0,text,label,target
0,38746,#trumprussia sean spicer is a blithering idiot...,Others,4
1,47265,If you call yourself a Christian yet you suppo...,Religion,5
2,36632,Small red lights in dark rooms.,Others,4
3,29064,If u find yourself pouting that no male report...,Notcb,3
4,33884,Messi carried these retards to three consecuti...,Others,4
5,26525,#MKR France Vs Ireland Vs Paleo Pete...LETS RU...,Notcb,3
6,41797,Today's front page of the New York Times. A fu...,Religion,5
7,33241,too bad I'm a size 11 men and got basketball t...,Others,4
8,11082,@_sarahjessiee @Khalil_Perry fuck that dumb as...,Ethnicity,1
9,44179,That Christian woman did the right thing,Religion,5


In [3]:
# Step - 1: Data Pre-processing - This will help in getting better results through the classification algorithms

# Step - 1a : Remove blank rows if any.
Corpus['text'].dropna(inplace=True)

# Step - 1b : Change all the text to lower case. This is required as python interprets 'dog' and 'DOG' differently
Corpus['text'] = [entry.lower() for entry in Corpus['text']]

# Step - 1c : Tokenization : In this each entry in the corpus will be broken into set of words
Corpus['text']= [word_tokenize(entry) for entry in Corpus['text']]

# Step - 1d : Remove Stop words, Non-Numeric and perfom Word Stemming/Lemmenting.

# WordNetLemmatizer requires Pos tags to understand if the word is noun or verb or adjective etc. By default it is set to Noun
tag_map = defaultdict(lambda : wn.NOUN)
tag_map['J'] = wn.ADJ
tag_map['V'] = wn.VERB
tag_map['R'] = wn.ADV


for index,entry in enumerate(Corpus['text']):
    # Declaring Empty List to store the words that follow the rules for this step
    Final_words = []
    # Initializing WordNetLemmatizer()
    word_Lemmatized = WordNetLemmatizer()
    # pos_tag function below will provide the 'tag' i.e if the word is Noun(N) or Verb(V) or something else.
    for word, tag in pos_tag(entry):
        # Below condition is to check for Stop words and consider only alphabets
        if word not in stopwords.words('english') and word.isalpha():
            word_Final = word_Lemmatized.lemmatize(word,tag_map[tag[0]])
            Final_words.append(word_Final)
    # The final processed set of words for each iteration will be stored in 'text_final'
    Corpus.loc[index,'text_final'] = str(Final_words)

#print(Corpus['text_final'].head())


In [4]:
# Step - 2: Split the model into Train and Test Data set
Train_X, Test_X, Train_Y, Test_Y = model_selection.train_test_split(Corpus['text_final'],Corpus['label'],test_size=0.3)

In [5]:
# Step - 3: Label encode the target variable  - This is done to transform Categorical data of string type in the data set into numerical values
Encoder = LabelEncoder()
Train_Y = Encoder.fit_transform(Train_Y)
Test_Y = Encoder.fit_transform(Test_Y)

In [6]:
# Step - 4: Vectorize the words by using TF-IDF Vectorizer - This is done to find how important a word in document is in comaprison to the corpus
Tfidf_vect = TfidfVectorizer(max_features=5000)
Tfidf_vect.fit(Corpus['text_final'])

Train_X_Tfidf = Tfidf_vect.transform(Train_X)
Test_X_Tfidf = Tfidf_vect.transform(Test_X)

In [10]:
# Step - 5: Now we can run different algorithms to classify out data check for accuracy

# Classifier - Algorithm - Naive Bayes
# fit the training dataset on the classifier
Naive = naive_bayes.MultinomialNB()
Naive.fit(Train_X_Tfidf,Train_Y)

# predict the labels on validation dataset
predictions_NB = Naive.predict(Test_X_Tfidf)

# Use accuracy_score function to get the accuracy
print("Naive Bayes Accuracy Score -> ",accuracy_score(predictions_NB, Test_Y)*100)
print("Naive Bayes F1 Score (macro avg) -> ",f1_score(Test_Y, predictions_NB, average='macro')*100)
print("Naive Bayes F1 Score (multi-class) -> ",f1_score(Test_Y, predictions_SVM, average=None)*100)


# Classifier - Algorithm - SVM
# fit the training dataset on the classifier
SVM = svm.SVC(C=1.0, kernel='linear', degree=3, gamma='auto')
SVM.fit(Train_X_Tfidf,Train_Y)

# predict the labels on validation dataset
predictions_SVM = SVM.predict(Test_X_Tfidf)

# Use accuracy_score function to get the accuracy
print("\nSVM Accuracy Score -> ",accuracy_score(predictions_SVM, Test_Y)*100)
print("Logistic Regression F1 Score (macro avg) -> ",f1_score(Test_Y, predictions_NB, average='macro')*100)
print("Logistic Regreassion F1 Score (multi-class) -> ",f1_score(Test_Y, predictions_SVM, average=None)*100)


Naive Bayes Accuracy Score ->  77.73378362641202
Naive Bayes F1 Score (macro avg) ->  76.3315394089895
Naive Bayes F1 Score (multi-class) ->  [96.27009646 96.94454923 86.56826568 56.05431912 65.75949367 94.39834025]

SVM Accuracy Score ->  82.70641667637126
Logistic Regression F1 Score (macro avg) ->  76.3315394089895
Logistic Regreassion F1 Score (multi-class) ->  [96.27009646 96.94454923 86.56826568 56.05431912 65.75949367 94.39834025]


In [8]:
print(classification_report(Test_Y,predictions_NB))
print('Confusion Matrix:\n',confusion_matrix(Test_Y, predictions_NB))

              precision    recall  f1-score   support

           0       0.78      0.97      0.86      1546
           1       0.81      0.92      0.86      1329
           2       0.84      0.81      0.83      1406
           3       0.67      0.43      0.53      1433
           4       0.64      0.56      0.60      1411
           5       0.85      0.96      0.90      1462

    accuracy                           0.78      8587
   macro avg       0.77      0.78      0.76      8587
weighted avg       0.77      0.78      0.76      8587

Confusion Matrix:
 [[1494   12    5   14   13    8]
 [  33 1226    7    7   23   33]
 [  27   48 1138   96   71   26]
 [ 174   92   89  622  328  128]
 [ 178  117   99  170  789   58]
 [  10   11    9   21    5 1406]]


In [9]:
print(classification_report(Test_Y,predictions_SVM))
print('Confusion Matrix:\n',confusion_matrix(Test_Y, predictions_SVM))

              precision    recall  f1-score   support

           0       0.96      0.97      0.96      1546
           1       0.97      0.97      0.97      1329
           2       0.90      0.83      0.87      1406
           3       0.61      0.52      0.56      1433
           4       0.59      0.74      0.66      1411
           5       0.95      0.93      0.94      1462

    accuracy                           0.83      8587
   macro avg       0.83      0.83      0.83      8587
weighted avg       0.83      0.83      0.83      8587

Confusion Matrix:
 [[1497    2    5   19   23    0]
 [   3 1285    7    8   25    1]
 [   3    5 1173  103  118    4]
 [  52   13   53  743  517   55]
 [   9   11   63  284 1039    5]
 [   0    6    3   61   27 1365]]
