In [1]:
from collections import defaultdict
import os
import pandas as pd
import numpy as np
from nltk.tokenize import word_tokenize
from nltk import pos_tag
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from nltk.corpus import wordnet as wn
from sklearn.preprocessing import LabelEncoder
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn import model_selection, naive_bayes, svm
from sklearn.metrics import accuracy_score

np.random.seed(0)

## Introduction
This is an extended version of a comprehensive
[medium article](https://medium.com/@bedigunjit/simple-guide-to-text-classification-nlp-using-svm-and-naive-bayes-with-python-421db3a72d34)
on this topic written by Gunjit Bedi.

### Dataset
Amazon Review Data set is stored in ```data/corpus.csv``` and loaded into a pandas dataframe.

In [2]:
corpus = pd.read_csv(os.path.join('data', 'corpus.csv'), encoding='latin-1')
corpus.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10000 entries, 0 to 9999
Data columns (total 2 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   text    10000 non-null  object
 1   label   10000 non-null  object
dtypes: object(2)
memory usage: 156.4+ KB


## Data Preprocessing

In [3]:
# Step - 1a : Remove blank rows if any.
corpus['text'].dropna(inplace=True)

In [4]:
# Step - 1b : Change all the text to lower case. This is required as python interprets 'dog' and 'DOG' differently
corpus['text'] = [entry.lower() for entry in corpus['text']]

In [5]:
# Step - 1c : Tokenization : In this each entry in the corpus will be broken into set of words
corpus['text']= [word_tokenize(entry) for entry in corpus['text']]

In [6]:
# Step - 1d : Remove Stop words, Non-Numeric and perfom Word Stemming/Lemmenting.
# WordNetLemmatizer requires Pos tags to understand if the word is noun or verb or adjective etc. 
#By default it is set to Noun
tag_map = defaultdict(lambda : wn.NOUN)
tag_map['J'] = wn.ADJ
tag_map['V'] = wn.VERB
tag_map['R'] = wn.ADV

for index,entry in enumerate(corpus['text']):
    # Declaring Empty List to store the words that follow the rules for this step
    final_words = []
    # Initializing WordNetLemmatizer()
    word_Lemmatized = WordNetLemmatizer()
    # pos_tag function below will provide the 'tag' i.e if the word is Noun(N) or Verb(V) or something else.
    for word, tag in pos_tag(entry):
        # Below condition is to check for Stop words and consider only alphabets
        if word not in stopwords.words('english') and word.isalpha():
            word = word_Lemmatized.lemmatize(word,tag_map[tag[0]])
            final_words.append(word)
    # The final processed set of words for each iteration will be stored in 'text_final'
    corpus.loc[index,'text_final'] = str(final_words)

## Prepare Training

In [7]:
# Step - 2: Split the model into Train and Test Data set
train_X, test_X, train_Y, test_Y = model_selection.train_test_split(corpus['text_final'],corpus['label'],test_size=0.3)

In [8]:
# Step - 3: Label encode the target variable  - This is done to transform Categorical data of string type in the data set into numerical values
encoder = LabelEncoder()
train_Y = encoder.fit_transform(train_Y)
test_Y = encoder.fit_transform(test_Y)

In [9]:
# Step - 4: Vectorize the words by using TF-IDF Vectorizer - This is done to find how important a word in document is in comaprison to the corpus
tfidf_vect = TfidfVectorizer(max_features=5000)
tfidf_vect.fit(corpus['text_final'])

train_X_Tfidf = tfidf_vect.transform(train_X)
test_X_Tfidf = tfidf_vect.transform(test_X)


## Classification

In [14]:
# Step - 5: Now we can run different algorithms to classify out data check for accuracy

In [10]:
# Classifier - Algorithm - Naive Bayes
# fit the training dataset on the classifier
naive = naive_bayes.MultinomialNB()
naive.fit(train_X_Tfidf,train_Y)

# predict the labels on validation dataset
predictions_NB = naive.predict(test_X_Tfidf)

# Use accuracy_score function to get the accuracy
print("Naive Bayes Accuracy Score -> ",accuracy_score(predictions_NB, test_Y)*100)


Naive Bayes Accuracy Score ->  82.76666666666667


In [11]:
# Classifier - Algorithm - SVM
# fit the training dataset on the classifier
svm = svm.SVC(C=1.0, kernel='linear', degree=3, gamma='auto')
svm.fit(train_X_Tfidf, train_Y)

# predict the labels on validation dataset
predictions_SVM = svm.predict(test_X_Tfidf)

# Use accuracy_score function to get the accuracy
print("SVM Accuracy Score -> ",accuracy_score(predictions_SVM, test_Y)*100)

SVM Accuracy Score ->  86.03333333333333
