## Data Modelling using Supervised learning - SVM & Naive Bayes Classifier
_________________________________________________


In [1]:
# Importing Dependencies
import pandas as pd
from pathlib import Path
import matplotlib.pyplot as plt
import numpy as np

In [2]:
# Importing dependencies for vectorization and ML/DL
from nltk.tokenize import word_tokenize
from nltk import pos_tag
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from sklearn.preprocessing import LabelEncoder
from collections import defaultdict
from nltk.corpus import wordnet as wn
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn import model_selection, naive_bayes, svm
from sklearn.metrics import accuracy_score

In [3]:
# Reading cleaned file with text for model creation
# news_csv = Path('/content/drive/MyDrive/nlp_cleaned_news.csv')
news_csv = Path('nlp_cleaned_news.csv')
news_df = pd.read_csv(news_csv)

In [4]:
# Dropping extra column
news_df = news_df.drop(['Unnamed: 0'], axis=1)
news_df.head()

Unnamed: 0,text,class
0,budget fight loom republicans flip fiscal scri...,1
1,military accept transgender recruit monday pen...,1
2,senior republican senator let mr mueller job w...,1
3,fbi russia probe help australian diplomat tip ...,1
4,trump want postal service charge amazon shipme...,1


In [5]:
# Dropping null rows
news_df.dropna(inplace = True)
news_df.count()

text     44678
class    44678
dtype: int64

### Splitting data into Train and Test and Vectorization
Target  = news_df['class'], Feature = news_df['text']


In [6]:
# Preparing Train and Test Data sets
Train_X, Test_X, Train_Y, Test_Y = model_selection.train_test_split(news_df['text'],news_df['class'], random_state=42)

In [7]:
# Word Vectorization

Tfidf_vect = TfidfVectorizer(max_features=5000)
Tfidf_vect.fit(news_df['text'])
Train_X_Tfidf = Tfidf_vect.transform(Train_X)
Test_X_Tfidf = Tfidf_vect.transform(Test_X)

In [8]:
# Check Vectorized text
print(Tfidf_vect.vocabulary_)



In [10]:
print(Train_X_Tfidf)

  (0, 4996)	0.04230605118651004
  (0, 4974)	0.016532816545229232
  (0, 4944)	0.03584928075125761
  (0, 4943)	0.020957271273067995
  (0, 4880)	0.021117770278464704
  (0, 4879)	0.02540968315259541
  (0, 4843)	0.02761601675501945
  (0, 4842)	0.020506740642218926
  (0, 4824)	0.04790264400477398
  (0, 4819)	0.04004275476710438
  (0, 4725)	0.04648322336890898
  (0, 4657)	0.027311657373322504
  (0, 4655)	0.09212836990956026
  (0, 4654)	0.3351900096090298
  (0, 4636)	0.04033981697479623
  (0, 4579)	0.03715980235961014
  (0, 4536)	0.031532863857601914
  (0, 4533)	0.05151436190283342
  (0, 4531)	0.03565998834227472
  (0, 4528)	0.032680931595486566
  (0, 4502)	0.03398754450081242
  (0, 4498)	0.11254941861264751
  (0, 4493)	0.02938424670236177
  (0, 4461)	0.05130183544018545
  (0, 4454)	0.03202714940773645
  :	:
  (33507, 1382)	0.24314206303210478
  (33507, 1321)	0.0678267927305061
  (33507, 1301)	0.043527438760573176
  (33507, 1275)	0.04955460776295754
  (33507, 1250)	0.06337707700945819
  (33507

### SVM Model

In [11]:
# Classifier - Algorithm - SVM

# fitting the training dataset on the classifier
SVM = svm.SVC(C=1.0, kernel='linear', degree=3, gamma='auto')
SVM.fit(Train_X_Tfidf,Train_Y)
# Predicting the labels on validation dataset
predictions_SVM = SVM.predict(Test_X_Tfidf)
# Use accuracy_score function to get the accuracy
print("SVM Accuracy Score -> ",accuracy_score(predictions_SVM, Test_Y)*100)

SVM Accuracy Score ->  99.4449418084154


In [13]:
# Calculate classification report
from sklearn.metrics import classification_report
print(classification_report(Test_Y, predictions_SVM,
                            target_names=["Fake", "Real"]))

              precision    recall  f1-score   support

        Fake       1.00      0.99      0.99      5819
        Real       0.99      0.99      0.99      5351

    accuracy                           0.99     11170
   macro avg       0.99      0.99      0.99     11170
weighted avg       0.99      0.99      0.99     11170



In [14]:
# Pickling the model
import pickle
pickle.dump(SVM,open('model_svm.pkl', 'wb'))

### Naive Bayes Model

In [15]:
# Classifier - Algorithm - Naive Bayes

# fitting the training dataset on the NB classifier
Naive = naive_bayes.MultinomialNB()
Naive.fit(Train_X_Tfidf,Train_Y)
# predicting the labels on validation dataset
predictions_NB = Naive.predict(Test_X_Tfidf)
# Use accuracy_score function to get the accuracy
print("Naive Bayes Accuracy Score -> ",accuracy_score(predictions_NB, Test_Y)*100)

Naive Bayes Accuracy Score ->  92.8379588182632


In [16]:
# Calculate classification report
print(classification_report(Test_Y, predictions_NB,
                            target_names=["Fake", "Real"]))

              precision    recall  f1-score   support

        Fake       0.93      0.94      0.93      5819
        Real       0.93      0.92      0.92      5351

    accuracy                           0.93     11170
   macro avg       0.93      0.93      0.93     11170
weighted avg       0.93      0.93      0.93     11170



In [17]:
# Pickling the model
pickle.dump(Naive,open('model_naive.pkl', 'wb'))

### Testing SVM model further since it has better accuracy and recall

In [18]:
# load the SVM model from disk
import pickle
loaded_model = pickle.load(open('model_svm.pkl', 'rb'))

In [21]:
# Function for data cleaning
import spacy
nlp = spacy.load(r"C:\Users\jyots\Downloads\en_core_web_sm-3.5.0\en_core_web_sm\en_core_web_sm-3.5.0")

def data_cleaning(text):

    # changing to lower case
    text = text.lower()

    # Getting document ready for NLP tasks
    doc = nlp(text)

    # Empty list for storing cleaned data
    clean_text = ""

    # Remove stop words and lemmatize
    lemmas = [token.lemma_ for token in doc if not token.is_stop and token.is_alpha]

    # Join the lemmas back into a string
    clean_text = ' '.join(lemmas)

    return(clean_text)

In [22]:
Test_X

2362     trump weigh son russia attorney statement whit...
842      house approve funding child healthcare program...
28897    nra commercial stupid melt brain video day pla...
2813     senior russian diplomat meet washington week w...
6312     new north dakota governor expect controversial...
                               ...                        
20294    cambodian opposition party boycott parliament ...
15192    hungry south sudanese refugee risk death retur...
18553    bombast north korea genteel foreign minister s...
9087     senior house democrat expect russia target ins...
18627    turkey call citizen leave northern iraq flight...
Name: text, Length: 11170, dtype: object

#### Test inputs - Choose one and run before running input preperation and prediction

In [23]:
# True
check = "WASHINGTON (Reuters) - The U.S. House of Representatives on Friday approved legislation to continue a federal insurance program for millions of lower-income children and pregnant women, but with an ongoing funding battle it could be weeks before the program gets more money."

In [34]:
# Fake
check = "This NRA Commercial Is So Stupid It Will Melt Your Brain (VIDEO) A few days after the Planned Parenthood shooting in Colorado Springs that left three dead and twelve injured, the National Rifle Association (NRA) released the dumbest pro-gun commercial likely ever seen. In the months since the commercial was released and in the aftermath of more mass shootings, it has popped up on my television screen multiple times. Every single time I see it my brain melts a l"

In [30]:
# True
check= "BERLIN (Reuters) U.S. President Donald Trump is undermining international stability with his decision to recognise Jerusalem as Israel s capital and move the U.S. embassy there, the leader of Germany s Social Democrats (SPD) said on Wednesday. Affirming his support for a two-state solution for Israelis and Palestinians, Martin Schulz said Trump s decision, taken despite warnings from a wide range of U.S. allies, risked setting back the peace process in the Middle East. Trump is due to announce later on Wednesday that the United States recognises Jerusalem as the capital of Israel and will move its embassy there, breaking with longtime U.S. policy and possibly stirring unrest."

In [36]:
# True
check="WASHINGTON (Reuters) - U.S. President Donald Trump said he will meet on Tuesday with the Democratic leaders of the U.S. Senate and House of Representatives, Chuck Schumer and Nancy Pelosi, to discuss keeping the government open but cited differences with them. â€œMeeting with â€œChuck and Nancyâ€ today about keeping government open and working. Problem is they want illegal immigrants flooding into our Country unchecked, are weak on Crime and want to substantially RAISE Taxes. I donâ€™t see a deal!â€ Trump said in a Twitter post. "

In [42]:
# Fake
check="President Trump spoke at the first ever Celebrate Freedom Rally last night delivering a barn burner of a speech to veterans and wounded warriors from Walter Reid Hospital. It was one of the best  red meat  speeches our President has ever delivered. We think you ll enjoy it Go to the 21:45 point for the amazing story of Harry F. Miller:Harry lied about his age to join the American forces. He was just 15 when he joined the US military during World War II. Harry was a US hero at The Battle of the Bulge!Harry s story:During the battle, the 1st Army Headquarters instructed the tank crews to go down to an ordnance depot and take whatever they needed for the tanks.  We had to take good parts off of one tank and put it on another. We finally got three tanks and a tank destroyer that would operate and run, and had a gun,  he said in an interview with VA. They took the three working tanks and sent them to their C Company. Those three tanks ended up taking out three German tanks, which ended up being members of the 1st SS Panzer Division Leibstandarte SS Adolf Hitlerbeing, Hitler s old body guards."

In [54]:
# Fake
check="President Donald Trump told televangelist Pat Robertson in a new interview that he will make it safe to say  Merry Christmas You ll be saying Merry Christmas again soon Pat Robertson, the  700 Club  host and Christian Broadcasting Network founder, asked President Trump if he is going to  take on heavyweights.  The president said that he doesn t want to take on lightweights. He then turned to the topic of religious liberty: We have to bring our country back. Our country was going in the wrong direction. And by the way, what they were doing to religious liberty: They were destroying religious liberty. He then added this firm statement on the effort to bring back Christian liberty in America:  You will be saying  Merry Christmas  again very soon Robertson responded:  We ll count on it. President Trump previously stated that Democrats had been taking the wrong steps to improve the lives of all Americans: You couldn t build, you couldn t do anything One of the focuses of the Trump agenda has been stripping away regulations so Americans can be more productive. In addition, the president believes in stripping away all of the supposed politically correct phrases including  Happy Holidays . He s drawn a red line on those who insist on shaming anyone who chooses to say  Merry Christmas .While on the campaign trail, candidate Trump criticized the use of  Happy Holidays  as a substitute for  Merry Christmas .  His support among Evangelical Christians has remained high, with many including Robertson praising his appointment of Neil Gorsuch to the Supreme Court."

#### Calling data cleaning function with test data for data preperation and prediction

In [61]:
clean_text = data_cleaning(check)

In [62]:
# Displaying clean text
clean_text 

'united nations food agency world price index fall october low level year drive decline sugar cereal vegetable oil meat food agriculture organization price index track globally trade food commodity average point october previous month agency say report friday'

In [63]:
# Converting clean text to series
clean_text_ser = pd.Series(clean_text)

In [64]:
# Displaying series
clean_text_ser

0    united nations food agency world price index f...
dtype: object

In [65]:
# Vectorizing clean data
vectorized_input_data = Tfidf_vect.transform(clean_text_ser)
# Predicting outcome
prediction = loaded_model.predict(vectorized_input_data)
# Printing outcome based on predicted value
if prediction[0] == 0:
    print("Prediction of the News :  Fake")
else:
    print("Prediction of the News : Real")

Prediction of the News :  Fake


In [29]:
# Saving test data for future use
Test_X.to_csv("test_X2")