In [3]:
import numpy as np
import pandas as pd
import re
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score

In [4]:
!pip install nltk



In [5]:
# Fetch the NLTK 'stopwords' corpus that stopwords.words('english') reads.
import nltk
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\RANJANA\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [6]:
# Show the English stop-word list that stemming() filters out.
print(stopwords.words('english'))

['i', 'me', 'my', 'myself', 'we', 'our', 'ours', 'ourselves', 'you', "you're", "you've", "you'll", "you'd", 'your', 'yours', 'yourself', 'yourselves', 'he', 'him', 'his', 'himself', 'she', "she's", 'her', 'hers', 'herself', 'it', "it's", 'its', 'itself', 'they', 'them', 'their', 'theirs', 'themselves', 'what', 'which', 'who', 'whom', 'this', 'that', "that'll", 'these', 'those', 'am', 'is', 'are', 'was', 'were', 'be', 'been', 'being', 'have', 'has', 'had', 'having', 'do', 'does', 'did', 'doing', 'a', 'an', 'the', 'and', 'but', 'if', 'or', 'because', 'as', 'until', 'while', 'of', 'at', 'by', 'for', 'with', 'about', 'against', 'between', 'into', 'through', 'during', 'before', 'after', 'above', 'below', 'to', 'from', 'up', 'down', 'in', 'out', 'on', 'off', 'over', 'under', 'again', 'further', 'then', 'once', 'here', 'there', 'when', 'where', 'why', 'how', 'all', 'any', 'both', 'each', 'few', 'more', 'most', 'other', 'some', 'such', 'no', 'nor', 'not', 'only', 'own', 'same', 'so', 'than', '

In [7]:
# Load the labelled news articles from train.csv in the working directory.
# NOTE(review): the recorded outputs show 5200 rows with ids starting at 20800
# and a held-out accuracy near 0.49 -- verify that this file and its `labels`
# column are the intended labelled training data.
news_dataset = pd.read_csv('train.csv')

In [8]:
# Inspect the dtype pandas inferred for each column.
data_type = news_dataset.dtypes
print(data_type)

id         int64
title     object
author    object
text      object
labels     int64
dtype: object


In [26]:
# (rows, columns) of the loaded frame.
news_dataset.shape

(5200, 5)

In [27]:
# Count the missing values in each column.
news_dataset.isnull().sum()

id          0
title     122
author    503
text        7
labels      0
dtype: int64

In [28]:
# replacing the null values with empty string
# fillna('') is applied to every column, so missing title/author/text cells
# become '' and can be concatenated as ordinary strings.
news_dataset = news_dataset.fillna('')

In [29]:
# merging the author name and news title
# The new 'content' column is "<author> <title>"; rows with an empty author
# therefore start with a leading space.
news_dataset['content'] = news_dataset['author']+' '+news_dataset['title']

In [30]:
# Preview the merged author + title strings.
print(news_dataset['content'])

0       David Streitfeld Specter of Trump Loosens Tong...
1        Russian warships ready to strike terrorists n...
2       Common Dreams #NoDAPL: Native American Leaders...
3       Daniel Victor Tim Tebow Will Attempt Another C...
4       Truth Broadcast Network Keiser Report: Meme Wa...
                              ...                        
5195    Jody Rosen The Bangladeshi Traffic Jam That Ne...
5196    Sheryl Gay Stolberg John Kasich Signs One Abor...
5197    Mike McPhate California Today: What, Exactly, ...
5198     300 US Marines To Be Deployed To Russian Bord...
5199    Teddy Wayne Awkward Sex, Onscreen and Off - Th...
Name: content, Length: 5200, dtype: object


In [32]:
# Split the frame into the target column and everything else.
# `columns=` already names the axis, so no separate `axis` argument is needed.
Y = news_dataset['labels']
X = news_dataset.drop(columns='labels')

In [33]:
# Display the feature frame and the label series produced by the split.
print(X)
print(Y)

         id                                              title  \
0     20800  Specter of Trump Loosens Tongues, if Not Purse...   
1     20801  Russian warships ready to strike terrorists ne...   
2     20802  #NoDAPL: Native American Leaders Vow to Stay A...   
3     20803  Tim Tebow Will Attempt Another Comeback, This ...   
4     20804                    Keiser Report: Meme Wars (E995)   
...     ...                                                ...   
5195  25995  The Bangladeshi Traffic Jam That Never Ends - ...   
5196  25996  John Kasich Signs One Abortion Bill in Ohio bu...   
5197  25997  California Today: What, Exactly, Is in Your Su...   
5198  25998  300 US Marines To Be Deployed To Russian Borde...   
5199  25999  Awkward Sex, Onscreen and Off - The New York T...   

                       author  \
0            David Streitfeld   
1                               
2               Common Dreams   
3               Daniel Victor   
4     Truth Broadcast Network   
...      

In [34]:
# Module-level Porter stemmer instance used by stemming().
port_stem = PorterStemmer()

In [35]:
# Cached English stop words; filled on first use so the corpus is read once
# instead of once per token.
_ENGLISH_STOP_WORDS = None


def _english_stop_words():
    """Return the English stop words as a frozenset, loading them only once."""
    global _ENGLISH_STOP_WORDS
    if _ENGLISH_STOP_WORDS is None:
        _ENGLISH_STOP_WORDS = frozenset(stopwords.words('english'))
    return _ENGLISH_STOP_WORDS


def stemming(content):
    """Normalise a text string into space-joined Porter stems.

    Every character outside a-z/A-Z is replaced by a space, the text is
    lower-cased and split on whitespace, English stop words are dropped, and
    each remaining token is stemmed with the module-level ``port_stem``.

    Args:
        content: The text to normalise (str).

    Returns:
        str: The stemmed tokens joined by single spaces; an empty string when
        no tokens remain after filtering.
    """
    stop_words = _english_stop_words()
    tokens = re.sub('[^a-zA-Z]', ' ', content).lower().split()
    return ' '.join(
        port_stem.stem(token) for token in tokens if token not in stop_words
    )

In [36]:
# Replace each content string with its stemmed, stop-word-free form.
# The column is overwritten, so the raw author + title text is no longer
# available under 'content' after this cell runs.
news_dataset['content'] = news_dataset['content'].apply(stemming)

In [38]:
#separating the data and label
# Rebinds X and Y: X is now the array of stemmed content strings and Y the
# array of labels, both taken from the frame via .values.
X = news_dataset['content'].values
Y = news_dataset['labels'].values

In [39]:
# Show the stemmed content strings that will be vectorized.
print(X)

['david streitfeld specter trump loosen tongu purs string silicon valley new york time'
 'russian warship readi strike terrorist near aleppo'
 'common dream nodapl nativ american leader vow stay winter file lawsuit polic'
 ... 'mike mcphate california today exactli sushi new york time'
 'us marin deploy russian border norway'
 'teddi wayn awkward sex onscreen new york time']


In [40]:
# Show the label array.
print(Y)

[1 0 1 ... 0 0 1]


In [41]:
# Shape of the label array.
Y.shape

(5200,)

In [42]:
# converting the textual data to numerical data
# Learn the vocabulary and IDF weights and encode the documents in one call.
# NOTE(review): the vectorizer is fitted on the whole corpus, before the
# train/test split done in a later cell, so the IDF weights also reflect the
# held-out documents.
vectorizer = TfidfVectorizer()
X = vectorizer.fit_transform(X)

In [43]:
# Print the sparse TF-IDF matrix; each line is (row, feature index) and weight.
print(X)

  (0, 9086)	0.08756899768504829
  (0, 8618)	0.2951833364547817
  (0, 8391)	0.11042633230471807
  (0, 8258)	0.35551158241433845
  (0, 8217)	0.08565162006825207
  (0, 7825)	0.35551158241433845
  (0, 7815)	0.3392484573767603
  (0, 7632)	0.35551158241433845
  (0, 7416)	0.29990759064553757
  (0, 6435)	0.35551158241433845
  (0, 5528)	0.0847731814904578
  (0, 4746)	0.35551158241433845
  (0, 1960)	0.20372848702181867
  (1, 8826)	0.4600630644689105
  (1, 8135)	0.35260162515090443
  (1, 7823)	0.36689240142839846
  (1, 6999)	0.30028557064663775
  (1, 6583)	0.40045820302192875
  (1, 5485)	0.37981022352550325
  (1, 186)	0.36689240142839846
  (2, 8964)	0.295064720779198
  (2, 8765)	0.28434502467517675
  (2, 7741)	0.2982257606996168
  (2, 6196)	0.2207192031076798
  (2, 5596)	0.3312058036253688
  :	:
  (5196, 795)	0.21762533251043967
  (5196, 24)	0.2798172726761391
  (5197, 9086)	0.12439214769277508
  (5197, 8237)	0.3546424451054057
  (5197, 8217)	0.1216685043258708
  (5197, 7942)	0.46551291399479516


In [44]:
# Hold out 20% of the samples for evaluation. Stratifying on Y keeps the label
# proportions the same in both splits; random_state makes the split repeatable.
X_train, X_test, Y_train, Y_test = train_test_split(
    X,
    Y,
    test_size=0.2,
    stratify=Y,
    random_state=2,
)

In [45]:
# Logistic regression classifier constructed with its default arguments.
model = LogisticRegression()

In [46]:
# Train the classifier on the training split.
model.fit(X_train, Y_train)

In [47]:
# accuracy score on the training data
# accuracy_score is documented as accuracy_score(y_true, y_pred), so the
# ground-truth labels are passed first and the predictions second.
X_train_prediction = model.predict(X_train)
training_data_accuracy = accuracy_score(Y_train, X_train_prediction)

In [48]:
# Report the fraction of training samples classified correctly.
print('Accuracy score of the training data : ', training_data_accuracy)

Accuracy score of the training data :  0.8783653846153846


In [49]:
# accuracy score on the test data
# accuracy_score is documented as accuracy_score(y_true, y_pred), so the
# ground-truth labels are passed first and the predictions second.
# NOTE(review): the recorded run reports about 0.49 here against about 0.88 on
# the training split, which looks close to chance for a 0/1 label -- check the
# label source before trusting the model.
X_test_prediction = model.predict(X_test)
test_data_accuracy = accuracy_score(Y_test, X_test_prediction)

In [50]:
# Report the fraction of held-out samples classified correctly.
print('Accuracy score of the test data : ', test_data_accuracy)

Accuracy score of the test data :  0.4913461538461538


In [69]:
# Classify a single held-out sample (row 3 of the test matrix).
X_new = X_test[3]

prediction = model.predict(X_new)
print(prediction)

# A predicted label of 0 is reported as real; any other label as fake.
print('The news is Real' if prediction[0] == 0 else 'The news is Fake')

[0]
The news is Real


In [67]:
# Ground-truth label of test row 3, for comparison with the model's prediction.
print(Y_test[3])

0
