In [1]:
import numpy as np
import pandas as pd
import re
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn.preprocessing import LabelEncoder
from sklearn.tree import DecisionTreeClassifier
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
import pickle

In [2]:
import nltk
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\abdul\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [3]:
print(stopwords.words('english'))

['i', 'me', 'my', 'myself', 'we', 'our', 'ours', 'ourselves', 'you', "you're", "you've", "you'll", "you'd", 'your', 'yours', 'yourself', 'yourselves', 'he', 'him', 'his', 'himself', 'she', "she's", 'her', 'hers', 'herself', 'it', "it's", 'its', 'itself', 'they', 'them', 'their', 'theirs', 'themselves', 'what', 'which', 'who', 'whom', 'this', 'that', "that'll", 'these', 'those', 'am', 'is', 'are', 'was', 'were', 'be', 'been', 'being', 'have', 'has', 'had', 'having', 'do', 'does', 'did', 'doing', 'a', 'an', 'the', 'and', 'but', 'if', 'or', 'because', 'as', 'until', 'while', 'of', 'at', 'by', 'for', 'with', 'about', 'against', 'between', 'into', 'through', 'during', 'before', 'after', 'above', 'below', 'to', 'from', 'up', 'down', 'in', 'out', 'on', 'off', 'over', 'under', 'again', 'further', 'then', 'once', 'here', 'there', 'when', 'where', 'why', 'how', 'all', 'any', 'both', 'each', 'few', 'more', 'most', 'other', 'some', 'such', 'no', 'nor', 'not', 'only', 'own', 'same', 'so', 'than', '

In [4]:
news_data = pd.read_csv('news_articles.csv')
news_data.head()

Unnamed: 0,author,published,title,text,language,site_url,main_img_url,type,label,title_without_stopwords,text_without_stopwords,hasImage
0,Barracuda Brigade,2016-10-26T21:41:00.000+03:00,muslims busted they stole millions in govt ben...,print they should pay all the back all the mon...,english,100percentfedup.com,http://bb4sp.com/wp-content/uploads/2016/10/Fu...,bias,Real,muslims busted stole millions govt benefits,print pay back money plus interest entire fami...,1.0
1,reasoning with facts,2016-10-29T08:47:11.259+03:00,re why did attorney general loretta lynch plea...,why did attorney general loretta lynch plead t...,english,100percentfedup.com,http://bb4sp.com/wp-content/uploads/2016/10/Fu...,bias,Real,attorney general loretta lynch plead fifth,attorney general loretta lynch plead fifth bar...,1.0
2,Barracuda Brigade,2016-10-31T01:41:49.479+02:00,breaking weiner cooperating with fbi on hillar...,red state \r\nfox news sunday reported this m...,english,100percentfedup.com,http://bb4sp.com/wp-content/uploads/2016/10/Fu...,bias,Real,breaking weiner cooperating fbi hillary email ...,red state fox news sunday reported morning ant...,1.0
3,Fed Up,2016-11-01T05:22:00.000+02:00,pin drop speech by father of daughter kidnappe...,email kayla mueller was a prisoner and torture...,english,100percentfedup.com,http://100percentfedup.com/wp-content/uploads/...,bias,Real,pin drop speech father daughter kidnapped kill...,email kayla mueller prisoner tortured isis cha...,1.0
4,Fed Up,2016-11-01T21:56:00.000+02:00,fantastic trumps point plan to reform healthc...,email healthcare reform to make america great ...,english,100percentfedup.com,http://100percentfedup.com/wp-content/uploads/...,bias,Real,fantastic trumps point plan reform healthcare ...,email healthcare reform make america great sin...,1.0


In [5]:
news_data.dropna(subset=['label'], inplace=True)

In [6]:
news_data.shape

(2095, 12)

In [7]:
news_data.info()

<class 'pandas.core.frame.DataFrame'>
Index: 2095 entries, 0 to 2094
Data columns (total 12 columns):
 #   Column                   Non-Null Count  Dtype  
---  ------                   --------------  -----  
 0   author                   2095 non-null   object 
 1   published                2095 non-null   object 
 2   title                    2095 non-null   object 
 3   text                     2050 non-null   object 
 4   language                 2095 non-null   object 
 5   site_url                 2095 non-null   object 
 6   main_img_url             2095 non-null   object 
 7   type                     2095 non-null   object 
 8   label                    2095 non-null   object 
 9   title_without_stopwords  2094 non-null   object 
 10  text_without_stopwords   2046 non-null   object 
 11  hasImage                 2095 non-null   float64
dtypes: float64(1), object(11)
memory usage: 212.8+ KB


In [8]:
news_data.isnull().sum()

author                      0
published                   0
title                       0
text                       45
language                    0
site_url                    0
main_img_url                0
type                        0
label                       0
title_without_stopwords     1
text_without_stopwords     49
hasImage                    0
dtype: int64

In [9]:
news_data = news_data.fillna('')

In [10]:
news_data.isnull().sum()

author                     0
published                  0
title                      0
text                       0
language                   0
site_url                   0
main_img_url               0
type                       0
label                      0
title_without_stopwords    0
text_without_stopwords     0
hasImage                   0
dtype: int64

In [11]:
news_data['finalcontent'] = news_data['author']+' '+news_data['title']
news_data['finalcontent']

0       Barracuda Brigade muslims busted they stole mi...
1       reasoning with facts re why did attorney gener...
2       Barracuda Brigade breaking weiner cooperating ...
3       Fed Up pin drop speech by father of daughter k...
4       Fed Up fantastic trumps  point plan to reform ...
                              ...                        
2090    -NO AUTHOR- prof canoes reek of genocide white...
2091    -NO AUTHOR- teens walk free after gangrape con...
2092    -NO AUTHOR- school named for munichmassacre ma...
2093            -NO AUTHOR- russia unveils satan  missile
2094    -NO AUTHOR- check out hillarythemed haunted house
Name: finalcontent, Length: 2095, dtype: object

In [12]:
X = news_data.drop(columns='label',axis=1)
Y = news_data['label']

In [13]:
# print(X)
print(Y)

0       Real
1       Real
2       Real
3       Real
4       Real
        ... 
2090    Real
2091    Real
2092    Real
2093    Fake
2094    Fake
Name: label, Length: 2095, dtype: object


## PreProcessing

In [14]:
le = LabelEncoder()
Y = le.fit_transform(Y)
news_data['label'] = Y

In [15]:
port_stem = PorterStemmer()

In [16]:
def stemming(finalcontent):
  stemmed_content = re.sub('[^a-zA-Z]',' ',finalcontent)
  stemmed_content = stemmed_content.lower()
  stemmed_content = stemmed_content.split(' ')

  # for word in stemmed_content:
  #   if not word in stopwords.words('english'):
  #     word = port_stem.stem(word)

  # list comprehension
  stemmed_content = [port_stem.stem(word) for word in stemmed_content if not word in stopwords.words('english')]
  stemmed_content = ' '.join(stemmed_content)
  return stemmed_content

In [17]:
news_data['finalcontent'].head()

0    Barracuda Brigade muslims busted they stole mi...
1    reasoning with facts re why did attorney gener...
2    Barracuda Brigade breaking weiner cooperating ...
3    Fed Up pin drop speech by father of daughter k...
4    Fed Up fantastic trumps  point plan to reform ...
Name: finalcontent, dtype: object

In [18]:
news_data['finalcontent'] = news_data['finalcontent'].apply(stemming)

In [19]:
news_data['finalcontent'].head()

0    barracuda brigad muslim bust stole million gov...
1    reason fact attorney gener loretta lynch plead...
2    barracuda brigad break weiner cooper fbi hilla...
3    fed pin drop speech father daughter kidnap kil...
4    fed fantast trump  point plan reform healthcar...
Name: finalcontent, dtype: object

In [20]:
X = news_data['finalcontent'].values
Y = news_data['label'].values
print(Y)

[1 1 1 ... 1 0 0]


In [21]:
print(Y)

[1 1 1 ... 1 0 0]


In [22]:
Y.shape

(2095,)

In [23]:
vectorizer = TfidfVectorizer()
vectorizer.fit(X)

with open('vector_model.pkl', 'wb') as file:
    pickle.dump(vectorizer, file)


In [24]:
X = vectorizer.transform(X)
print(X)

<Compressed Sparse Row sparse matrix of dtype 'float64'
	with 18414 stored elements and shape (2095, 5205)>
  Coords	Values
  (0, 384)	0.36018680277652093
  (0, 439)	0.36018680277652093
  (0, 588)	0.36018680277652093
  (0, 638)	0.3697652106033948
  (0, 1915)	0.381488234342381
  (0, 2929)	0.2756337034699588
  (0, 3035)	0.30865815877480496
  (0, 4404)	0.39660184056030895
  (1, 292)	0.35712965583089573
  (1, 1574)	0.3500140331264624
  (1, 1647)	0.4022804759810463
  (1, 1848)	0.31651672615411075
  (1, 2702)	0.35712965583089573
  (1, 2737)	0.32840760281518205
  (1, 3446)	0.4022804759810463
  (1, 3723)	0.30118671988245904
  (2, 384)	0.3915149697639707
  (2, 571)	0.31731911702158105
  (2, 588)	0.3915149697639707
  (2, 965)	0.45425147642696156
  (2, 1406)	0.2554544628555447
  (2, 1620)	0.2545826103585902
  (2, 2093)	0.18378977799177154
  (2, 2309)	0.28610861334357734
  (2, 5027)	0.3750868190493317
  :	:
  (2090, 3570)	0.4086148555991827
  (2090, 3581)	0.3890807899658911
  (2090, 3752)	0.438271

Splitting Dataset

In [25]:
X_train,X_test,Y_train,Y_test = train_test_split(X,Y,test_size=0.2,stratify=Y,random_state=2)

Training Model

# Logistic Regression

In [26]:
model = LogisticRegression()

In [27]:
model.fit(X_train,Y_train)

Evaluation

In [28]:
X_train_Prediction = model.predict(X_train)
training_data_accuracy = accuracy_score(X_train_Prediction,Y_train)

In [29]:
print(training_data_accuracy)

0.9373508353221957


In [30]:
X_test_Prediction = model.predict(X_test)
testing_data_accuracy = accuracy_score(X_test_Prediction,Y_test)

In [31]:
print(testing_data_accuracy)

0.7923627684964201


# Decision Tree Classifier

In [32]:
clf = DecisionTreeClassifier()
clf.fit(X_train,Y_train)

In [33]:
X_train_Prediction = clf.predict(X_train)
training_data_accuracy = accuracy_score(X_train_Prediction,Y_train)

In [34]:
print(training_data_accuracy)

1.0


In [35]:
X_test_Prediction = clf.predict(X_test)
testing_data_accuracy = accuracy_score(X_test_Prediction,Y_test)

In [36]:
print(testing_data_accuracy)

0.8353221957040573


# SVM Model

In [37]:
svm_model = SVC(kernel='linear')
svm_model.fit(X_train, Y_train)

In [38]:
X_train_Prediction = svm_model.predict(X_train)
training_data_accuracy = accuracy_score(X_train_Prediction,Y_train)

In [39]:
print(training_data_accuracy)

0.9850835322195705


In [40]:
X_test_Prediction = svm_model.predict(X_test)
testing_data_accuracy = accuracy_score(X_test_Prediction,Y_test)

In [41]:
print(testing_data_accuracy)

0.8663484486873508


In [42]:
import pickle
with open('svm_model.pkl', 'wb') as file:
    pickle.dump(svm_model, file)

# K-Neighbours Neighbours

In [43]:
knn_model = KNeighborsClassifier(n_neighbors=5)
knn_model.fit(X_train, Y_train)

In [44]:
predictions = knn_model.predict(X_test)

In [45]:
X_train_Prediction = knn_model.predict(X_train)
training_data_accuracy = accuracy_score(X_train_Prediction,Y_train)

In [46]:
print(training_data_accuracy)

0.8490453460620525


In [47]:
X_test_Prediction = knn_model.predict(X_test)
testing_data_accuracy = accuracy_score(X_test_Prediction,Y_test)

In [48]:
print(testing_data_accuracy)

0.7732696897374701
