# A guide to Text Classification(NLP) using SVM and Naive Bayes with Python

-  [Step 1](#step-1)
-  [Step 2](#step-2)
-  [Step 3](#step-3)
-  [Step 4-6](#step-4---6)
-  [Step 7](#step-7)

# Step 1
### install required packages

In [30]:
#if you don't have the required packages, install them by removing the # from the following lines
#%pip install subprocess
#%pip install pkg_resources


In [31]:
#check if the required packages are installed, run the following code
import subprocess
import pkg_resources
import sys
packages = ['pandas', 'numpy', 'nltk', 'scikit-learn', 'Pyarrow']
for package in packages:
    try:
        dist = pkg_resources.get_distribution(package)
        print(f'{package} is installed')
    except pkg_resources.DistributionNotFound:
        print(f'{package} is NOT installed')
        subprocess.check_call([sys.executable, "-m", "pip", "install", package])


pandas is installed
numpy is installed
nltk is installed
scikit-learn is installed
Pyarrow is installed


In [32]:
#import packages
import pandas as pd
import numpy as np
from nltk.tokenize import word_tokenize
from nltk import pos_tag
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from sklearn.preprocessing import LabelEncoder
from collections import defaultdict
from nltk.corpus import wordnet as wn
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn import model_selection, naive_bayes, svm
from sklearn.metrics import accuracy_score

# Step 2
### Add the Data using pandas

In [33]:
#Set Random seed
np.random.seed(500)

In [34]:
# Add the Data using pandas
df = pd.read_csv('./corpus.csv',encoding='latin-1')
Corpus = df.copy()
Corpus.head()

Unnamed: 0,text,label
0,Stuning even for the non-gamer: This sound tr...,__label__2
1,The best soundtrack ever to anything.: I'm re...,__label__2
2,Amazing!: This soundtrack is my favorite musi...,__label__2
3,Excellent Soundtrack: I truly like this sound...,__label__2
4,"Remember, Pull Your Jaw Off The Floor After H...",__label__2


# Step 3  
### Data Pre-Processing

In [35]:
#!pip install nltk

In [36]:
import nltk
nltk.download('punkt')
nltk.download('wordnet')
nltk.download('stopwords')
!python -m nltk.downloader averaged_perceptron_tagger

/Users/axu/opt/anaconda3/bin/python: Error while finding module specification for 'nltk.downloader' (ModuleNotFoundError: No module named 'nltk')


[nltk_data] Error loading punkt: <urlopen error [SSL:
[nltk_data]     CERTIFICATE_VERIFY_FAILED] certificate verify failed:
[nltk_data]     unable to get local issuer certificate (_ssl.c:1006)>
[nltk_data] Error loading wordnet: <urlopen error [SSL:
[nltk_data]     CERTIFICATE_VERIFY_FAILED] certificate verify failed:
[nltk_data]     unable to get local issuer certificate (_ssl.c:1006)>
[nltk_data] Error loading stopwords: <urlopen error [SSL:
[nltk_data]     CERTIFICATE_VERIFY_FAILED] certificate verify failed:
[nltk_data]     unable to get local issuer certificate (_ssl.c:1006)>


In [37]:
# Step - 3: Data Pre-processing - This will help in getting better results through the classification algorithms

# Step - 3a : Remove blank rows if any.
Corpus['text'].dropna(inplace=True)

# Step - 3b : Change all the text to lower case. This is required as python interprets 'dog' and 'DOG' differently
#Corpus['text'] = list(map(str, Corpus['text']))
Corpus['text'] = [entry.lower() for entry in Corpus['text']]

# Step - 3c : Tokenization : In this each entry in the corpus will be broken into set of words
Corpus['text']= [word_tokenize(entry) for entry in Corpus['text']]

# Step - 3d : Remove Stop words, Non-Numeric and perfom Word Stemming/Lemmenting.

# WordNetLemmatizer requires Pos tags to understand if the word is noun or verb or adjective etc. By default it is set to Noun
tag_map = defaultdict(lambda : wn.NOUN)
tag_map['J'] = wn.ADJ
tag_map['V'] = wn.VERB
tag_map['R'] = wn.ADV

for index,entry in enumerate(Corpus['text']):
    # Declaring Empty List to store the words that follow the rules for this step
    Final_words = []
    # Initializing WordNetLemmatizer()
    word_Lemmatized = WordNetLemmatizer()
    # pos_tag function below will provide the 'tag' i.e if the word is Noun(N) or Verb(V) or something else.
    for word, tag in pos_tag(entry):
        # Below condition is to check for Stop words and consider only alphabets
        if word not in stopwords.words('english') and word.isalpha():
            word_Final = word_Lemmatized.lemmatize(word,tag_map[tag[0]])
            Final_words.append(word_Final)
    # The final processed set of words for each iteration will be stored in 'text_final'
    Corpus.loc[index,'text_final'] = str(Final_words)

#print(Corpus['text_final'].head())

In [40]:
print(Corpus['text'].head())

0    [stuning, even, for, the, non-gamer, :, this, ...
1    [the, best, soundtrack, ever, to, anything, .,...
2    [amazing, !, :, this, soundtrack, is, my, favo...
3    [excellent, soundtrack, :, i, truly, like, thi...
4    [remember, ,, pull, your, jaw, off, the, floor...
Name: text, dtype: object


In [46]:
#count how many words there are in the text column
print(f'Number of words in the text column: {Corpus["text"].apply(len).sum()}')
print(f'Number of words in the text_final column: {Corpus["text_final"].apply(len).sum()}')
#sum(len(item) for item in Corpus['text'])
#sum(len(item) for item in Corpus['text_final'])
#Corpus['text'].count()

Number of words in the text column: 923113
Number of words in the text_final column: 3684037


3684037

# Step 4 - 6
### prepare the data for modelling

In [26]:
# Step - 4: Split the model into Train and Test Data set
Train_X, Test_X, Train_Y, Test_Y = model_selection.train_test_split(Corpus['text_final'],Corpus['label'],test_size=0.3)

# Step - 5: Label encode the target variable  - This is done to transform Categorical data of string type in the data set into numerical values
Encoder = LabelEncoder()
Train_Y = Encoder.fit_transform(Train_Y)
Test_Y = Encoder.fit_transform(Test_Y)

# Step - 6: Vectorize the words by using TF-IDF Vectorizer - This is done to find how important a word in document is in comaprison to the corpus
Tfidf_vect = TfidfVectorizer(max_features=5000)
Tfidf_vect.fit(Corpus['text_final'])

Train_X_Tfidf = Tfidf_vect.transform(Train_X)
Test_X_Tfidf = Tfidf_vect.transform(Test_X)

In [51]:
print(Train_X_Tfidf.shape)
print(Test_X_Tfidf.shape)

(350, 4519)
(150, 4519)


# Step 7
### testing different models for accuracy

In [17]:
# Step - 7: Now we can run different algorithms to classify out data check for accuracy

#for model running time, we will use the time package
import time

# Classifier - Algorithm - Naive Bayes
# fit the training dataset on the classifier

Naive = naive_bayes.MultinomialNB()

# Measure the start time
start_time = time.time()
Naive.fit(Train_X_Tfidf,Train_Y)
# Measure the end time after training
end_time = time.time()

training_runtime_Naive = time.time() - start_time

print(f"Training time for Naive: {training_runtime_Naive} seconds")

# predict the labels on validation dataset

# Measure the start time for predictions
start_time = time.time()
predictions_NB = Naive.predict(Test_X_Tfidf)
# Measure the end time for predictions
end_time = time.time()
# Use accuracy_score function to get the accuracy
prediction_runtime_Naive = end_time - start_time
print(f"Prediction time for Naive: {prediction_runtime_Naive} seconds")
print("Naive Bayes Accuracy Score -> ",accuracy_score(predictions_NB, Test_Y)*100)


# Classifier - Algorithm - SVM
# fit the training dataset on the classifier
SVM = svm.SVC(C=1.0, kernel='linear', degree=3, gamma='auto')
# Measure the start time
start_time = time.time()
SVM.fit(Train_X_Tfidf,Train_Y)
# Measure the end time after training
end_time = time.time()
training_runtime_SVM = time.time() - start_time
print(f"Training time for SVM: {training_runtime_SVM} seconds")
# Measure the start time
start_time = time.time()
# predict the labels on validation dataset
predictions_SVM = SVM.predict(Test_X_Tfidf)
# Measure the end time after training
end_time = time.time()
prediction_runtime_SVM = time.time() - start_time
print(f"Prediction time for SVM: {prediction_runtime_SVM} seconds")
# Use accuracy_score function to get the accuracy
print("SVM Accuracy Score -> ",accuracy_score(predictions_SVM, Test_Y)*100)


Training time for Naive: 0.0029108524322509766 seconds
Prediction time for Naive: 0.0007252693176269531 seconds
Naive Bayes Accuracy Score ->  80.0
Training time for SVM: 0.03368091583251953 seconds
Prediction time for SVM: 0.01083993911743164 seconds
SVM Accuracy Score ->  84.0
