In [7]:
import numpy as np
import pandas as pd
import re
import nltk
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import confusion_matrix, accuracy_score

In [8]:
nltk.download('stopwords')

[nltk_data] Error loading stopwords: <urlopen error [Errno 11001]
[nltk_data]     getaddrinfo failed>


False

In [9]:
print(stopwords.words('english'))

['a', 'about', 'above', 'after', 'again', 'against', 'ain', 'all', 'am', 'an', 'and', 'any', 'are', 'aren', "aren't", 'as', 'at', 'be', 'because', 'been', 'before', 'being', 'below', 'between', 'both', 'but', 'by', 'can', 'couldn', "couldn't", 'd', 'did', 'didn', "didn't", 'do', 'does', 'doesn', "doesn't", 'doing', 'don', "don't", 'down', 'during', 'each', 'few', 'for', 'from', 'further', 'had', 'hadn', "hadn't", 'has', 'hasn', "hasn't", 'have', 'haven', "haven't", 'having', 'he', "he'd", "he'll", 'her', 'here', 'hers', 'herself', "he's", 'him', 'himself', 'his', 'how', 'i', "i'd", 'if', "i'll", "i'm", 'in', 'into', 'is', 'isn', "isn't", 'it', "it'd", "it'll", "it's", 'its', 'itself', "i've", 'just', 'll', 'm', 'ma', 'me', 'mightn', "mightn't", 'more', 'most', 'mustn', "mustn't", 'my', 'myself', 'needn', "needn't", 'no', 'nor', 'not', 'now', 'o', 'of', 'off', 'on', 'once', 'only', 'or', 'other', 'our', 'ours', 'ourselves', 'out', 'over', 'own', 're', 's', 'same', 'shan', "shan't", 'she

In [10]:
# load the dataset
true_news = pd.read_csv(r"C:\Users\T.B\Downloads\True.csv")

fake_news = pd.read_csv(r"C:\Users\T.B\Downloads\Fake.csv")

In [11]:
# merging the datasets
true_news["label"] = 1
fake_news["label"] = 0
news_dataset = pd.concat([true_news, fake_news], axis=0)

# shuffle the dataset
news_dataset = news_dataset.sample(frac=1)

# reset the index
news_dataset.reset_index(inplace=True, drop=True)


In [12]:
news_dataset.head()

Unnamed: 0,title,text,subject,date,label
0,Japan's Aso says will seek U.S. understanding ...,TOKYO (Reuters) - Finance Minister Taro Aso sa...,politicsNews,"January 24, 2017",1
1,"Factbox: Trump on Twitter (Oct 20) - Tax cut, ...",The following statements were posted to the ve...,politicsNews,"October 20, 2017",1
2,JOHN BOLTON GETS IT: ‘This is the First Attemp...,Ambassador Bolton gets it! He and so many othe...,politics,"Dec 5, 2017",0
3,SUNDAY SCREENING: Counter Intelligence – ‘The ...,21st Century Wire says Our weekly documentary ...,Middle-east,"January 29, 2017",0
4,"About 30 killed when train derails, catches fi...",KINSHASA (Reuters) - About 30 people were kill...,worldnews,"November 13, 2017",1


In [13]:
news_dataset.shape

(44898, 5)

In [14]:
news_dataset.isnull().sum()

title      0
text       0
subject    0
date       0
label      0
dtype: int64

In [15]:
news_dataset.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 44898 entries, 0 to 44897
Data columns (total 5 columns):
 #   Column   Non-Null Count  Dtype 
---  ------   --------------  ----- 
 0   title    44898 non-null  object
 1   text     44898 non-null  object
 2   subject  44898 non-null  object
 3   date     44898 non-null  object
 4   label    44898 non-null  int64 
dtypes: int64(1), object(4)
memory usage: 1.7+ MB


In [16]:
news_dataset.describe()

Unnamed: 0,label
count,44898.0
mean,0.477015
std,0.499477
min,0.0
25%,0.0
50%,0.0
75%,1.0
max,1.0


In [17]:
# Spliting features and target
X = news_dataset.drop(columns="label", axis=1)
Y = news_dataset["label"]



In [18]:
print(X)

                                                   title  \
0      Japan's Aso says will seek U.S. understanding ...   
1      Factbox: Trump on Twitter (Oct 20) - Tax cut, ...   
2      JOHN BOLTON GETS IT: ‘This is the First Attemp...   
3      SUNDAY SCREENING: Counter Intelligence – ‘The ...   
4      About 30 killed when train derails, catches fi...   
...                                                  ...   
44893  Leaked memo fuels accusations of ethnic bias i...   
44894   BREAKING: Active Shooter Reported Outside Los...   
44895  BUILD THE WALL! How Terrorists Have Been Comin...   
44896   Trump’s Revolting ‘Hot Mic’ Moment Has Just B...   
44897  US BOOTS: US Marines Deployed For Ground Comba...   

                                                    text       subject  \
0      TOKYO (Reuters) - Finance Minister Taro Aso sa...  politicsNews   
1      The following statements were posted to the ve...  politicsNews   
2      Ambassador Bolton gets it! He and so many othe... 

In [19]:
print(Y)

0        1
1        1
2        0
3        0
4        1
        ..
44893    1
44894    0
44895    0
44896    0
44897    0
Name: label, Length: 44898, dtype: int64


In [20]:
port_stem = PorterStemmer()
def stemming(content):
    stemmed_content = re.sub('[^a-zA-Z]', ' ', content)
    stemmed_content = stemmed_content.lower()
    stemmed_content = stemmed_content.split()
    stemmed_content = [port_stem.stem(word) for word in stemmed_content if not word in stopwords.words('english')]
    stemmed_content = ' '.join(stemmed_content)
    return stemmed_content



In [21]:
import time
import swifter

if 'content' not in news_dataset.columns:
	news_dataset['content'] = news_dataset['title'].fillna('') + ' ' + news_dataset['text'].fillna('')

start = time.time()
news_dataset['content'] = news_dataset['content'].swifter.apply(stemming)
elapsed = time.time() - start
print(f"Stemming completed in {elapsed:.2f} seconds")


Pandas Apply: 100%|██████████| 44898/44898 [1:24:47<00:00,  8.83it/s]  


Stemming completed in 5418.26 seconds


In [22]:
print(news_dataset["content"])

0        japan aso say seek u understand tpp benefit to...
1        factbox trump twitter oct tax cut unit kingdom...
2        john bolton get first attempt coup etat americ...
3        sunday screen counter intellig strategi tensio...
4        kill train derail catch fire congo kinshasa re...
                               ...                        
44893    leak memo fuel accus ethnic bia afghan govern ...
44894    break activ shooter report outsid lo angel pol...
44895    build wall terrorist come across border year v...
44896    trump revolt hot mic moment blown campaign gop...
44897    us boot us marin deploy ground combat iraq def...
Name: content, Length: 44898, dtype: object


In [23]:
X = news_dataset["content"].values
Y = news_dataset["label"].values

In [24]:
# vectorization
vectorizer = TfidfVectorizer()
vectorizer.fit(X) 
X = vectorizer.transform(X)   


In [25]:
# spliting the data into training and testing data
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.2, random_state=2)

In [26]:
# training the model
model = LogisticRegression()

In [27]:
model.fit(X_train, Y_train)

0,1,2
,penalty,'l2'
,dual,False
,tol,0.0001
,C,1.0
,fit_intercept,True
,intercept_scaling,1
,class_weight,
,random_state,
,solver,'lbfgs'
,max_iter,100


In [28]:
# evaluation
X_test_prediction = model.predict(X_test)
test_data_accuracy = accuracy_score(X_test_prediction, Y_test)
print("Accuracy score of the test data : ", test_data_accuracy)

Accuracy score of the test data :  0.9871937639198218


In [29]:
#confusion matrix
cm = confusion_matrix(Y_test, X_test_prediction)
print("Confusion Matrix : \n", cm)

Confusion Matrix : 
 [[4623   76]
 [  39 4242]]


In [30]:
# predict a new news article
X_new = ["The economy is improving and stock markets are up."]
X_new = vectorizer.transform(X_new)
prediction = model.predict(X_new)
print(prediction)

if prediction[0] == 0:
    print("The news is Fake")
else:
    print("The news is Real")


[0]
The news is Fake
