In [9]:
import pandas as pd
import numpy as np
from nltk.tokenize import word_tokenize
from nltk import pos_tag
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from sklearn.preprocessing import LabelEncoder
from collections import defaultdict
from nltk.corpus import wordnet as wn
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn import model_selection, naive_bayes, svm
from sklearn.metrics import accuracy_score
import pickle 

# Seed NumPy's global random number generator so repeated runs are reproducible.
np.random.seed(300)

# Load the data set into a DataFrame.
Corpus = pd.read_csv(
    r"cleanded_data.csv",
    encoding='latin-1',
)

# Step 1: Data pre-processing - normalise the raw text before it is handed
# to the classification algorithms.

# Steps 1b + 1c: lower-case every headline (Python treats 'dog' and 'DOG' as
# different strings) and split it into a list of word tokens, in a single pass.
Corpus['headlines'] = [
    word_tokenize(headline.lower()) for headline in Corpus['headlines']
]

# Step - 1d : Remove stop words and non-alphabetic tokens, then lemmatize the rest.

# WordNetLemmatizer needs a WordNet part-of-speech constant to know whether a
# word is a noun, verb, adjective or adverb. The key is the first letter of
# the tag returned by pos_tag; anything not listed falls back to noun.
tag_map = defaultdict(lambda : wn.NOUN)
tag_map['J'] = wn.ADJ
tag_map['V'] = wn.VERB
tag_map['R'] = wn.ADV

# Build these once instead of once per token / per row: stopwords.words()
# returns a fresh list on every call, and a set gives constant-time lookups.
stop_words = set(stopwords.words('english'))
word_Lemmatized = WordNetLemmatizer()


def _lemmatize_tokens(tokens):
    """Return the lemmas of the alphabetic, non-stop-word tokens in *tokens*.

    *tokens* is one tokenized headline (a list of strings). Each token is
    POS-tagged, filtered, and lemmatized with the POS looked up in tag_map.
    """
    return [
        word_Lemmatized.lemmatize(word, tag_map[tag[0]])
        for word, tag in pos_tag(tokens)
        if word not in stop_words and word.isalpha()
    ]


# Store the processed words of every headline in 'text_final' as the string
# form of the list. Assigning the whole column at once is positional, so it
# does not depend on the DataFrame's index labels matching 0..n-1.
Corpus['text_final'] = [
    str(_lemmatize_tokens(entry)) for entry in Corpus['headlines']
]

print(Corpus['text_final'].head())

# Hold out 20% of the rows for testing; the rest is used for training.
Train_X, Test_X, Train_Y, Test_Y = model_selection.train_test_split(Corpus['text_final'],
                                                                    Corpus['outcome'],test_size=0.2)

# Encode the class labels as integers. The encoder is fitted ONCE, on the
# full label column, and then applied to both splits. Re-fitting it on the
# test labels (fit_transform) could assign different integers to the same
# class whenever one split happens to miss a class.
Encoder = LabelEncoder()
Encoder.fit(Corpus['outcome'])
Train_Y = Encoder.transform(Train_Y)
Test_Y = Encoder.transform(Test_Y)

# Learn a TF-IDF vocabulary capped at the 1000 highest-ranked terms.
# NOTE(review): the vectorizer is fitted on the whole corpus, test rows
# included, so vocabulary/IDF statistics from the test set reach the
# features. Fitting on Train_X only would avoid that but changes the
# reported scores - confirm before changing.
Tfidf_vect = TfidfVectorizer(max_features=1000)
Tfidf_vect.fit(Corpus['text_final'])

# Turn both splits into TF-IDF feature matrices.
Train_X_Tfidf = Tfidf_vect.transform(Train_X)
Test_X_Tfidf = Tfidf_vect.transform(Test_X)

from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC

# Train three classifiers on the same TF-IDF training matrix.
# fit() returns the estimator itself, so each name holds the fitted model.

# Multinomial Naive Bayes
Naive = naive_bayes.MultinomialNB().fit(Train_X_Tfidf, Train_Y)

# k-nearest neighbours, voting over the 100 closest training samples
knn = KNeighborsClassifier(n_neighbors=100).fit(Train_X_Tfidf, Train_Y)

# Support vector machine with a linear kernel
svc = SVC(C=0.1, kernel='linear', gamma='scale').fit(Train_X_Tfidf, Train_Y)

# Predict the held-out test set with each model.
predictions_NB = Naive.predict(Test_X_Tfidf)
predictions_Knn = knn.predict(Test_X_Tfidf)
predictions_svm = svc.predict(Test_X_Tfidf)

# Score each model on the held-out test set. accuracy_score takes the true
# labels first and the predictions second; the result is the fraction of
# test samples whose prediction matches the true label.
Naivescore = accuracy_score(Test_Y, predictions_NB)
Knnscore = accuracy_score(Test_Y, predictions_Knn)
Svmscore = accuracy_score(Test_Y, predictions_svm)

# Report the scores as percentages, reusing the values computed above
# instead of scoring every model a second time.
print("Naive Bayes Accuracy Score -> ",Naivescore*100)
print("KNN Accuracy Score -> ",Knnscore*100)
print("SVM Bayes Accuracy Score -> ",Svmscore*100)

# Pickle the Naive Bayes model together with the encoded test labels, its
# accuracy score and the fitted TF-IDF vectorizer, as a single tuple.
tuple_modal = (Naive,Test_Y,Naivescore,Tfidf_vect)
# The context manager flushes and closes the file even if dump() raises.
with open('modal.sav', 'wb') as model_file:
    pickle.dump(tuple_modal, model_file)

0    ['pfizer', 'covid', 'vaccine', 'low', 'men', '...
1    ['vaccine', 'covid', 'longer', 'need', 'put', ...
2    ['vaccine', 'covid', 'longer', 'need', 'put', ...
3    ['human', 'volunteer', 'test', 'covid', 'vacci...
4    ['people', 'die', 'lagos', 'igeria', 'fter', '...
Name: text_final, dtype: object
Naive Bayes Accuracy Score ->  86.46616541353383
KNN Accuracy Score ->  79.69924812030075
SVM Bayes Accuracy Score ->  79.69924812030075
