In [62]:
# Mount Google Drive at /content/drive so files stored there can be read by path.
# This import only exists inside Google Colab.
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [63]:
import pandas as pd

# Absolute path of the dataset CSV inside the mounted Drive folder.
file_path = '/content/drive/My Drive/News Detection/fakenews_Dataset.csv'
# Load the whole CSV into a DataFrame with pandas' default parsing options.
df = pd.read_csv(file_path)

In [64]:
import numpy as np
import re
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score

In [65]:
# Preview the first rows of the frame as loaded.
df.head()

Unnamed: 0.1,Unnamed: 0,title,text,label
0,0,LAW ENFORCEMENT ON HIGH ALERT Following Threat...,No comment is expected from Barack Obama Membe...,1
1,1,,Did they post their votes for Hillary already?,1
2,2,UNBELIEVABLE! OBAMA’S ATTORNEY GENERAL SAYS MO...,"Now, most of the demonstrators gathered last ...",1
3,3,"Bobby Jindal, raised Hindu, uses story of Chri...",A dozen politically active pastors came here f...,0
4,4,SATAN 2: Russia unvelis an image of its terrif...,"The RS-28 Sarmat missile, dubbed Satan 2, will...",1


In [66]:
# (rows, columns) of the frame as loaded.
df.shape

(72134, 4)

In [67]:
# Column dtypes and non-null counts; reveals which columns have missing values.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 72134 entries, 0 to 72133
Data columns (total 4 columns):
 #   Column      Non-Null Count  Dtype 
---  ------      --------------  ----- 
 0   Unnamed: 0  72134 non-null  int64 
 1   title       71576 non-null  object
 2   text        72095 non-null  object
 3   label       72134 non-null  int64 
dtypes: int64(2), object(2)
memory usage: 2.2+ MB


In [68]:
# Number of missing values per column before any cleaning.
df.isna().sum()

Unnamed: 0      0
title         558
text           39
label           0
dtype: int64

In [69]:
# Replace every missing value with an empty string so the text processing
# further down only ever receives str values. Rebinds df to the filled copy.
df = df.fillna('')

In [70]:
# Re-count missing values to confirm the fill left none behind.
df.isna().sum()

Unnamed: 0    0
title         0
text          0
label         0
dtype: int64

In [71]:
# Shape check: filling values should not have changed the row or column count.
df.shape

(72134, 4)

In [72]:
# List the column names before deciding which ones to drop.
df.columns

Index(['Unnamed: 0', 'title', 'text', 'label'], dtype='object')

In [73]:
# Keep only the headline and its label: discard the leftover index column and
# the article body, neither of which is used as a feature below.
# Note: running this cell a second time raises KeyError because the columns are
# already gone.
df = df.drop(columns=['Unnamed: 0', 'text'])

In [74]:
# Confirm that only the title and label columns remain.
df.head()

Unnamed: 0,title,label
0,LAW ENFORCEMENT ON HIGH ALERT Following Threat...,1
1,,1
2,UNBELIEVABLE! OBAMA’S ATTORNEY GENERAL SAYS MO...,1
3,"Bobby Jindal, raised Hindu, uses story of Chri...",0
4,SATAN 2: Russia unvelis an image of its terrif...,1


In [75]:
#stemming
ps = PorterStemmer()

# Filled on first call of stemming(): the stop-word corpus is downloaded in a
# later cell, so it must not be read at the moment this cell is executed.
_english_stopwords = None

def stemming(title):
  """Normalise a headline into a space-separated string of stemmed keywords.

  Every character that is not an ASCII letter is replaced by a space, the text
  is lower-cased and split on whitespace, English stop words are removed and
  each remaining word is reduced with the Porter stemmer.

  Args:
    title: Raw headline text (str).

  Returns:
    The stemmed words joined by single spaces; an empty string when no word
    survives the filtering (e.g. for an empty title).
  """
  global _english_stopwords
  if _english_stopwords is None:
    # Build the lookup once. Previously the stop-word list was requested again
    # for every word of every title and searched linearly; a frozenset makes
    # each membership test constant-time and gives the same filtering result.
    _english_stopwords = frozenset(stopwords.words('english'))
  words = re.sub('[^a-zA-Z]', ' ', title).lower().split()
  return ' '.join(ps.stem(word) for word in words if word not in _english_stopwords)

In [76]:
# Fetch the NLTK stop-word corpus that stemming() reads; must run before
# stemming() is first called.
import nltk
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [77]:
# Overwrite each raw title with its cleaned, stemmed form. The raw titles are
# no longer available in df after this cell.
df['title'] = df['title'].apply(stemming)

In [78]:
# Inspect the stemmed titles.
df['title']

0        law enforc high alert follow threat cop white ...
1                                                         
2        unbeliev obama attorney gener say charlott rio...
3        bobbi jindal rais hindu use stori christian co...
4        satan russia unv imag terrifi new supernuk wes...
                               ...                        
72129    russian steal research trump hack u democrat p...
72130    watch giuliani demand democrat apolog trump ra...
72131         migrant refus leav train refuge camp hungari
72132    trump tussl give unpopular mexican leader much...
72133           goldman sach endors hillari clinton presid
Name: title, Length: 72134, dtype: object

In [79]:
# Features: the stemmed title strings. Target: the label column.
# Both are taken as plain NumPy arrays.
X = df['title'].values
y = df['label'].values

In [80]:
# Learn the TF-IDF vocabulary from the titles and convert them to a sparse
# weight matrix in one step. X is rebound from raw strings to that matrix, so
# this cell cannot be re-run without first re-running the cell that defines X.
# NOTE(review): the vectorizer is fitted on every row before the train/test
# split further down, so IDF statistics include the test titles — consider
# fitting on the training split only.
vector = TfidfVectorizer()
X = vector.fit_transform(X)

In [81]:
# Dump the sparse matrix as "(row, column) weight" entries for a quick look.
print(X)

  (0, 19106)	0.19134939529376566
  (0, 18648)	0.1297506867782943
  (0, 17363)	0.2542650376115143
  (0, 17260)	0.24871262252022117
  (0, 9699)	0.22829788917209384
  (0, 7887)	0.26746434949988324
  (0, 6730)	0.48553136502134386
  (0, 6425)	0.28932771754845743
  (0, 5509)	0.31820565801047196
  (0, 3679)	0.24871262252022117
  (0, 1802)	0.33473541566384035
  (0, 407)	0.3190180925014663
  (2, 18648)	0.13443733492985524
  (2, 18034)	0.35962437110547785
  (2, 16446)	0.1999703023632961
  (2, 15094)	0.1609967301122813
  (2, 14591)	0.3580030298678158
  (2, 13591)	0.22687620695463123
  (2, 12744)	0.27904818164471595
  (2, 12011)	0.16878852994653004
  (2, 11864)	0.2231406266784195
  (2, 8020)	0.2692285294185893
  (2, 6880)	0.2652283770602196
  (2, 2919)	0.3639616996972358
  (2, 2673)	0.30809679188606154
  :	:
  (72130, 1768)	0.49293214478810593
  (72130, 764)	0.39870380407772993
  (72131, 17617)	0.39738745004026604
  (72131, 14186)	0.3534104623564766
  (72131, 14182)	0.32391500471431983
  (72131, 1

In [82]:
# Hold out 20% of the rows for testing. Stratifying on y keeps the label
# proportions the same in both splits; the fixed seed makes the split repeatable.
X_train, X_test, Y_train, Y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    stratify=y,
    random_state=2,
)

In [83]:
# Training feature matrix: (samples, vocabulary terms).
X_train.shape

(57707, 19639)

In [84]:
# Test feature matrix: same number of columns as the training matrix.
X_test.shape

(14427, 19639)

In [85]:
# One label per training row.
Y_train.shape

(57707,)

In [86]:
# Logistic-regression classifier with scikit-learn's default settings,
# fitted on the training split only.
model = LogisticRegression()
model.fit(X_train,Y_train)

In [87]:
# on training set
train_y_pred = model.predict(X_train)
# accuracy_score is declared as (y_true, y_pred): pass the ground truth first.
# The value is the same either way for plain accuracy, but the documented order
# keeps the call correct if it is later swapped for an asymmetric metric.
print(accuracy_score(Y_train, train_y_pred))

0.9199403885143916


In [88]:
# on testing set
testing_y_pred = model.predict(X_test)
# accuracy_score is declared as (y_true, y_pred): pass the ground truth first.
# The value is the same either way for plain accuracy, but the documented order
# keeps the call correct if it is later swapped for an asymmetric metric.
print(accuracy_score(Y_test, testing_y_pred))

0.900603035974215


In [89]:
# Take the first row of the test matrix and predict its label.
# prediction is indexed with [0] by the next cell to get the single label.
input_data = X_test[0]
prediction = model.predict(input_data)

In [90]:
# Turn the predicted label into a human-readable verdict.
# NOTE(review): this treats label 0 as fake. The sample rows shown by df.head()
# pair label 1 with sensational headlines and label 0 with a conventional news
# headline, so the mapping may be inverted — confirm against the dataset's
# documentation.
print('The News Is Fake' if prediction[0] == 0 else 'The News is Real')

The News is Real


In [91]:
# Stemmed title of row 0 of the full frame. This is not necessarily the sample
# predicted above: X_test[0] is the first row of the shuffled test split, not
# row 0 of df.
df['title'][0]

'law enforc high alert follow threat cop white blacklivesmatt fyf terrorist video'

In [92]:
import pickle

In [93]:
# Persist the fitted vectorizer. The with-block closes (and flushes) the file
# even if pickling raises; the bare open() call never closed its handle.
with open('vector.pkl', 'wb') as vector_file:
    pickle.dump(vector, vector_file)

In [94]:
# Persist the trained classifier. The with-block closes (and flushes) the file
# even if pickling raises; the bare open() call never closed its handle.
with open('model.pkl', 'wb') as model_file:
    pickle.dump(model, model_file)

In [96]:
# Reload the vectorizer from disk, closing the file as soon as it has been read.
# Unpickling executes code stored in the file: only load pickles you created.
with open('vector.pkl', 'rb') as vector_file:
    vector_form = pickle.load(vector_file)

In [98]:
# Reload the classifier from disk, closing the file as soon as it has been read.
# Unpickling executes code stored in the file: only load pickles you created.
with open('model.pkl', 'rb') as model_file:
    load_model = pickle.load(model_file)

In [99]:
def fake_news(news):
  """Predict the label of one raw headline with the reloaded pipeline.

  The headline goes through the same stemming() clean-up used on the training
  titles, is converted to TF-IDF features with the reloaded vectorizer
  (vector_form) and is then classified by the reloaded model (load_model).

  Args:
    news: Raw headline text (str).

  Returns:
    The result of load_model.predict for the one-item input.
  """
  cleaned = stemming(news)
  # transform() expects an iterable of documents, hence the one-element list.
  features = vector_form.transform([cleaned])
  return load_model.predict(features)

In [103]:
# Try the reloaded pipeline on a raw headline whose opening words match row 3
# of the df.head() preview (labelled 0 there).
val = fake_news("""Bobby Jindal, raised Hindu, uses story of Christian conversion to woo evangelicals for potential 2016 bid""")

In [104]:
# Compare the single predicted label as a scalar. Testing the whole array
# against a list only works while exactly one element is returned.
# NOTE(review): this treats label 0 as fake. The sample rows shown by df.head()
# pair label 1 with sensational headlines and label 0 with a conventional news
# headline, so the mapping may be inverted — confirm against the dataset's
# documentation.
if val[0] == 0:
  print("The news is Fake")
else:
  print('The News is Real')

The news is Fake
