In [1]:
#data manipulation
import pandas as pd
import numpy as np
import re
import string

##Machine learning and text processing libraries
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import PorterStemmer
from nltk.stem import WordNetLemmatizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn.metrics import classification_report
from sklearn.naive_bayes import MultinomialNB
from sklearn.linear_model import LogisticRegression
from sklearn.svm import LinearSVC
from wordcloud import WordCloud


#libraries used for visualizations
import matplotlib.pyplot as plt
import seaborn as sns
import warnings
warnings.filterwarnings("ignore")

In [2]:
#importing the csv file
dftrain=pd.read_csv(r"C:\Users\shaik mahaboob basha\Downloads\Fake-news-project\Fake news project\train_news.csv")
dftrain.head(10)

Unnamed: 0.1,Unnamed: 0,id,headline,written_by,news,label
0,0,9653,Ethics Questions Dogged Agriculture Nominee as...,Eric Lipton and Steve Eder,"WASHINGTON — In Sonny Perdue’s telling, Geo...",0
1,1,10041,U.S. Must Dig Deep to Stop Argentina’s Lionel ...,David Waldstein,HOUSTON — Venezuela had a plan. It was a ta...,0
2,2,19113,Cotton to House: ’Do Not Walk the Plank and Vo...,Pam Key,"Sunday on ABC’s “This Week,” while discussing ...",0
3,3,6868,"Paul LePage, Besieged Maine Governor, Sends Co...",Jess Bidgood,"AUGUSTA, Me. — The beleaguered Republican g...",0
4,4,7596,A Digital 9/11 If Trump Wins,Finian Cunningham,Finian Cunningham has written extensively on...,1
5,5,3196,Whatever the Outcome on November 8th the US Wi...,,Taming the corporate media beast Whatever the ...,1
6,6,5134,Rapid Evolution Saved This Fish From Pollution...,JoAnna Klein,The State of New Jersey says you can’t eat the...,0
7,7,1504,Alabama Prison Officials Retaliate Against Pri...,Brian Sonenstein,Advocates say prison officials at the Kilby Co...,1
8,8,13559,,steventexas,People have made up their minds on president.\...,1
9,9,4203,Can We Live in a Constant State of Love?,Gillian,Leave a reply \nToni Emerson – When we fall in...,1


In [3]:
dftrain.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 20800 entries, 0 to 20799
Data columns (total 6 columns):
 #   Column      Non-Null Count  Dtype 
---  ------      --------------  ----- 
 0   Unnamed: 0  20800 non-null  int64 
 1   id          20800 non-null  int64 
 2   headline    20242 non-null  object
 3   written_by  18843 non-null  object
 4   news        20761 non-null  object
 5   label       20800 non-null  int64 
dtypes: int64(3), object(3)
memory usage: 731.3+ KB


In [4]:
#shape of the data-set
dftrain.shape

(20800, 6)

In [5]:
#Basic statistical Data
dftrain.describe().T

Unnamed: 0,count,mean,std,min,25%,50%,75%,max
Unnamed: 0,20800.0,10399.5,6004.587135,0.0,5199.75,10399.5,15599.25,20799.0
id,20800.0,10399.5,6004.587135,0.0,5199.75,10399.5,15599.25,20799.0
label,20800.0,0.500625,0.500012,0.0,0.0,1.0,1.0,1.0


In [6]:
dftrain.describe(include='object').T


Unnamed: 0,count,unique,top,freq
headline,20242,19803,The Dark Agenda Behind Globalism And Open Borders,5
written_by,18843,4201,Pam Key,243
news,20761,20386,,75


In [7]:
#values and count of the target variable
dftrain.label.value_counts()

1    10413
0    10387
Name: label, dtype: int64

In [8]:
# dropping the null-values 
dftrain.dropna(inplace= True)
print(dftrain.shape)
print(dftrain.isnull().sum())

(18285, 6)
Unnamed: 0    0
id            0
headline      0
written_by    0
news          0
label         0
dtype: int64


In [9]:
dftrain = dftrain.drop(["headline", "written_by", "id","Unnamed: 0"], axis=1)
dftrain.head(5)

Unnamed: 0,news,label
0,"WASHINGTON — In Sonny Perdue’s telling, Geo...",0
1,HOUSTON — Venezuela had a plan. It was a ta...,0
2,"Sunday on ABC’s “This Week,” while discussing ...",0
3,"AUGUSTA, Me. — The beleaguered Republican g...",0
4,Finian Cunningham has written extensively on...,1


In [10]:
# cleaning the text data for vectorization
# defining the function
def clean_txt(text):
    text = text.lower() #Converting the text to lower case
    text = re.sub('\[.*?\]','',text) #Replacing email addresses
    text = re.sub('\\W', ' ', text) #Removing Punctuations
    text = re.sub('https?://\S+|www\.S+', '', text)  #Replace URLs with 'webaddress'
    text = re.sub('<.*?>+', '', text) #Removing the HTML tags
    text = re.sub('[%s]' % re.escape(string.punctuation), '', text)#Removing Punctuations
    text = re.sub('\n', '', text) #Removing new lines
    text = re.sub('\w*\d\w*', '', text)
    tokenized_text = word_tokenize(text) #word_tokenization
    stop_words = set(stopwords.words('english') + ['u','ur','im','doin', 'ü', 'â', 'e', 'ur', 'doin', 'ure','READ MORE']) #declaring stop Stop_Words
    WL = WordNetLemmatizer() #declaring lemmatizer
    text = [WL.lemmatize(word) for word in tokenized_text if word not in stop_words if word.isalpha()] # lemmatization and removal of stop_words
    return " " .join(text)

In [11]:
# applying the clean_txt function to the "news" column
dftrain['news'] = dftrain['news'].apply(clean_txt)
dftrain.head(5)

Unnamed: 0,news,label
0,washington sonny perdue telling georgian growi...,0
1,houston venezuela plan tactical approach desig...,0
2,sunday abc week discussing republican plan rep...,0
3,augusta beleaguered republican governor maine ...,0
4,finian cunningham written extensively internat...,1


In [12]:
# vectorizing the text data using TfidfVectorizer 
tfidf = TfidfVectorizer(max_features = 15000)
x = tfidf.fit_transform(dftrain['news'])
y = dftrain['label']

In [13]:
#Creating train_test_split 
x_train,x_test,y_train,y_test=train_test_split(x,y,random_state=42,test_size=.25,stratify=y)

In [14]:
# find the shape of x and y
x.shape, y.shape

((18285, 15000), (18285,))

In [15]:
# find the shape of x_train and y_train
x_train.shape, y_train.shape

((13713, 15000), (13713,))

In [16]:
# importing machine learning models
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.naive_bayes import MultinomialNB

from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import GradientBoostingClassifier

#Importing error metrics
from sklearn.metrics import classification_report,confusion_matrix,accuracy_score,roc_curve,auc
from sklearn.model_selection import GridSearchCV,cross_val_score

In [17]:
# Intializing the model
RF = RandomForestClassifier()
LR = LogisticRegression()
DT = DecisionTreeClassifier()
GBC = GradientBoostingClassifier()
MNB = MultinomialNB()

# appending models with their respective declarations
models= []
models.append(('RandomForestClassifier', RF))
models.append(('LogisticRegression',LR))
models.append(('GradientBoostingClassifier',GBC))
models.append(('DecisionTreeClassifier',DT))
models.append(('MultinomialNB', MNB))

# Creating empty list
Model=[]
score=[]
cvs=[]
roc_auc_score=[]
Precision=[]

# creating a loop to run the data through the models
for name,model in models:

    # model fitting
    Model.append(name)
    model.fit(x_train,y_train)
    print(model)
    pre=model.predict(x_test)
    print('\n')
    # accuracy score
    aucc_score=accuracy_score(y_test,pre)
    print('accuracy_score: ',aucc_score)
    score.append(aucc_score*100)
    print('\n')
    # cross-validation score
    cv_score=cross_val_score(model,x,y,cv=10,scoring='roc_auc').mean()
    print('Cross Val Score : ', cv_score)
    cvs.append(cv_score*100)
    print('\n')
    # classification report
    print('classification_report\n',classification_report(y_test,pre))
    # roc_auc
    false_positive_rate,true_positive_rate, thresholds=roc_curve(y_test,pre)
    roc_auc=auc(false_positive_rate, true_positive_rate)
    print('roc auc score : ', roc_auc)
    roc_auc_score.append(roc_auc*100)
    print('\n')
    # confusion matrix
    print('Confusion Matrix:\n',confusion_matrix(y_test,pre))
    print('\n')
    print("..........................................................")
    print('\n')

RandomForestClassifier()


accuracy_score:  0.9302274715660542


Cross Val Score :  0.987944659501735


classification_report
               precision    recall  f1-score   support

           0       0.91      0.97      0.94      2591
           1       0.96      0.88      0.92      1981

    accuracy                           0.93      4572
   macro avg       0.94      0.92      0.93      4572
weighted avg       0.93      0.93      0.93      4572

roc auc score :  0.9237634992872271


Confusion Matrix:
 [[2519   72]
 [ 247 1734]]


..........................................................


LogisticRegression()


accuracy_score:  0.9431321084864392


Cross Val Score :  0.9890506405905395


classification_report
               precision    recall  f1-score   support

           0       0.95      0.95      0.95      2591
           1       0.93      0.93      0.93      1981

    accuracy                           0.94      4572
   macro avg       0.94      0.94      0.94      4572
wei

In [18]:
scores=pd.DataFrame({'Model': Model,'Accuracy Score': score,'Cross Val Score':cvs,'Roc_Auc_curve':roc_auc_score})

In [19]:
scores

Unnamed: 0,Model,Accuracy Score,Cross Val Score,Roc_Auc_curve
0,RandomForestClassifier,93.022747,98.794466,92.37635
1,LogisticRegression,94.313211,98.905064,94.204203
2,GradientBoostingClassifier,91.951006,98.134386,91.935857
3,DecisionTreeClassifier,87.882765,88.421198,87.692866
4,MultinomialNB,89.370079,97.386897,88.315804
