About the Dataset:

id: unique id for a news article
title: the title of a news article
author: author of the news article
text: the text of the article; could be incomplete
label: a label that marks the article as potentially unreliable
1: unreliable(Fake)
0: reliable(not Fake)



:

# **Importing necessary libraries**

In [72]:
import numpy as np
import pandas as pd
import re
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score


In [15]:
import nltk
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [16]:
# printing the stopwords in English
print(stopwords.words('english'))

['i', 'me', 'my', 'myself', 'we', 'our', 'ours', 'ourselves', 'you', "you're", "you've", "you'll", "you'd", 'your', 'yours', 'yourself', 'yourselves', 'he', 'him', 'his', 'himself', 'she', "she's", 'her', 'hers', 'herself', 'it', "it's", 'its', 'itself', 'they', 'them', 'their', 'theirs', 'themselves', 'what', 'which', 'who', 'whom', 'this', 'that', "that'll", 'these', 'those', 'am', 'is', 'are', 'was', 'were', 'be', 'been', 'being', 'have', 'has', 'had', 'having', 'do', 'does', 'did', 'doing', 'a', 'an', 'the', 'and', 'but', 'if', 'or', 'because', 'as', 'until', 'while', 'of', 'at', 'by', 'for', 'with', 'about', 'against', 'between', 'into', 'through', 'during', 'before', 'after', 'above', 'below', 'to', 'from', 'up', 'down', 'in', 'out', 'on', 'off', 'over', 'under', 'again', 'further', 'then', 'once', 'here', 'there', 'when', 'where', 'why', 'how', 'all', 'any', 'both', 'each', 'few', 'more', 'most', 'other', 'some', 'such', 'no', 'nor', 'not', 'only', 'own', 'same', 'so', 'than', '

# **Data Pre-processing**

In [30]:
df = pd.read_csv('/content/train.csv', lineterminator='\n')


In [32]:
df.shape

(20800, 5)

In [33]:
# print the first 5 rows of the dataframe
df.head()

Unnamed: 0,id,title,author,text,label
0,0,House Dem Aide: We Didn’t Even See Comey’s Let...,Darrell Lucus,House Dem Aide: We Didn’t Even See Comey’s Let...,1
1,1,"FLYNN: Hillary Clinton, Big Woman on Campus - ...",Daniel J. Flynn,Ever get the feeling your life circles the rou...,0
2,2,Why the Truth Might Get You Fired,Consortiumnews.com,"Why the Truth Might Get You Fired October 29, ...",1
3,3,15 Civilians Killed In Single US Airstrike Hav...,Jessica Purkiss,Videos 15 Civilians Killed In Single US Airstr...,1
4,4,Iranian woman jailed for fictional unpublished...,Howard Portnoy,Print \nAn Iranian woman has been sentenced to...,1


In [34]:
# counting the number of missing values in the dataset
df.isnull().sum()

id           0
title      558
author    1957
text        39
label        0
dtype: int64

In [35]:
df2 = df.fillna('')

In [36]:
# merging the author name and news title
df2['content'] = df2['author']+' '+df2['title']

In [37]:
df2.head()

Unnamed: 0,id,title,author,text,label,content
0,0,House Dem Aide: We Didn’t Even See Comey’s Let...,Darrell Lucus,House Dem Aide: We Didn’t Even See Comey’s Let...,1,Darrell Lucus House Dem Aide: We Didn’t Even S...
1,1,"FLYNN: Hillary Clinton, Big Woman on Campus - ...",Daniel J. Flynn,Ever get the feeling your life circles the rou...,0,"Daniel J. Flynn FLYNN: Hillary Clinton, Big Wo..."
2,2,Why the Truth Might Get You Fired,Consortiumnews.com,"Why the Truth Might Get You Fired October 29, ...",1,Consortiumnews.com Why the Truth Might Get You...
3,3,15 Civilians Killed In Single US Airstrike Hav...,Jessica Purkiss,Videos 15 Civilians Killed In Single US Airstr...,1,Jessica Purkiss 15 Civilians Killed In Single ...
4,4,Iranian woman jailed for fictional unpublished...,Howard Portnoy,Print \nAn Iranian woman has been sentenced to...,1,Howard Portnoy Iranian woman jailed for fictio...


In [38]:
# separating the data & label
X = df2.drop(columns='label', axis=1)
Y = df2['label']

In [39]:
print(X)
print(Y)

          id                                              title  \
0          0  House Dem Aide: We Didn’t Even See Comey’s Let...   
1          1  FLYNN: Hillary Clinton, Big Woman on Campus - ...   
2          2                  Why the Truth Might Get You Fired   
3          3  15 Civilians Killed In Single US Airstrike Hav...   
4          4  Iranian woman jailed for fictional unpublished...   
...      ...                                                ...   
20795  20795  Rapper T.I.: Trump a ’Poster Child For White S...   
20796  20796  N.F.L. Playoffs: Schedule, Matchups and Odds -...   
20797  20797  Macy’s Is Said to Receive Takeover Approach by...   
20798  20798  NATO, Russia To Hold Parallel Exercises In Bal...   
20799  20799                          What Keeps the F-35 Alive   

                                          author  \
0                                  Darrell Lucus   
1                                Daniel J. Flynn   
2                             Consortiu

# Stemming:

# Stemming is the process of reducing a word to its Root word



In [53]:
port_stem = PorterStemmer()

In [54]:
def stemming(content):
    stemmed_content = re.sub('[^a-zA-Z]',' ',content)#removes all those things which are not alphabets
    stemmed_content = stemmed_content.lower()
    stemmed_content = stemmed_content.split()
    stemmed_content = [port_stem.stem(word) for word in stemmed_content if not word in stopwords.words('english')]
    stemmed_content = ' '.join(stemmed_content)
    return stemmed_content

In [55]:
df2['content'] = df2['content'].apply(stemming)

In [56]:
print(df2['content'])

0        darrel lucu hou dem aid even see comey letter ...
1        daniel j flynn flynn hillari clinton big woman...
2                   consortiumnew com truth might get fire
3        jessica purkiss civilian kill singl us airstri...
4        howard portnoy iranian woman jail fiction unpu...
                               ...                        
20795    jerom hudson rapper trump poster child white s...
20796    benjamin hoffman n f l playoff schedul matchup...
20797    michael j de la merc rachel abram maci said re...
20798    alex ansari nato russia hold parallel exerci b...
20799                            david swanson keep f aliv
Name: content, Length: 20800, dtype: object


In [57]:
#separating the data and label
X = df2['content'].values
Y = df2['label'].values

In [58]:
print(X)

['darrel lucu hou dem aid even see comey letter jason chaffetz tweet'
 'daniel j flynn flynn hillari clinton big woman campu breitbart'
 'consortiumnew com truth might get fire' ...
 'michael j de la merc rachel abram maci said receiv takeov approach hudson bay new york time'
 'alex ansari nato russia hold parallel exerci balkan'
 'david swanson keep f aliv']


In [59]:
print(Y)

[1 0 1 ... 0 1 1]


In [60]:
Y.shape

(20800,)

# **Tfid Vectorizer:**



1. **Tokenization:**
   - The first step is tokenization, where each document (string) is split into individual words or tokens. For example, the sentence "This is a sample sentence" might be tokenized into ['This', 'is', 'a', 'sample', 'sentence'].

2. **Counting Term Frequencies (TF):**
   - Next, the vectorizer counts the frequency of each token (word) in each document. This count represents the Term Frequency (TF) component. TF is a measure of how often a word occurs in a document.

3. **Inverse Document Frequency (IDF):**
   - After calculating TF for each word in each document, the vectorizer calculates the Inverse Document Frequency (IDF). IDF measures the importance of a word in the entire corpus (collection of documents). Words that occur frequently across documents are given lower IDF values, while rare words are given higher IDF values.

4. **TF-IDF Calculation:**
   - The TF-IDF value for each word in each document is then computed using the formula:
     TF-IDF(word, document) = TF(word, document) * IDF(word)

 - This calculation combines the local importance of a word in a document (TF) with its global importance across documents (IDF). Words that are common in a specific document but rare in the overall corpus will have higher TF-IDF values.

5. **Vectorization:**
   - Finally, the TF-IDF values for each word in each document are organized into a matrix representation, where each row corresponds to a document and each column corresponds to a word (token).
   - The resulting matrix is often sparse, meaning most entries are zero because most words are not present in most documents. This sparse matrix efficiently represents the textual data in numerical form.



In [61]:
# converting the textual data to numerical data
vectorizer = TfidfVectorizer()# tf = term frequency ,idf = inverse document frequency
vectorizer.fit(X)

X = vectorizer.transform(X)

In [50]:
print(X)

  (0, 15686)	0.28485063562728646
  (0, 13473)	0.2565896679337957
  (0, 8909)	0.3635963806326075
  (0, 8630)	0.29212514087043684
  (0, 7692)	0.24785219520671603
  (0, 7005)	0.21874169089359144
  (0, 4973)	0.233316966909351
  (0, 3792)	0.2705332480845492
  (0, 3600)	0.3598939188262559
  (0, 2959)	0.2468450128533713
  (0, 2483)	0.3676519686797209
  (0, 267)	0.27010124977708766
  (1, 16799)	0.30071745655510157
  (1, 6816)	0.1904660198296849
  (1, 5503)	0.7143299355715573
  (1, 3568)	0.26373768806048464
  (1, 2813)	0.19094574062359204
  (1, 2223)	0.3827320386859759
  (1, 1894)	0.15521974226349364
  (1, 1497)	0.2939891562094648
  (2, 15611)	0.41544962664721613
  (2, 9620)	0.49351492943649944
  (2, 5968)	0.3474613386728292
  (2, 5389)	0.3866530551182615
  (2, 3103)	0.46097489583229645
  :	:
  (20797, 13122)	0.2482526352197606
  (20797, 12344)	0.27263457663336677
  (20797, 12138)	0.24778257724396507
  (20797, 10306)	0.08038079000566466
  (20797, 9588)	0.174553480255222
  (20797, 9518)	0.295420

**Splitting the dataset to training & test data**

In [62]:
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size = 0.2, stratify=Y, random_state=2)

In [84]:
print(X_test)

  (0, 12678)	0.29107782729348297
  (0, 9730)	0.30786337013252507
  (0, 7589)	0.22945562970572514
  (0, 6744)	0.1608801854194978
  (0, 6223)	0.2882572087812303
  (0, 5876)	0.2882572087812303
  (0, 5172)	0.2131649612475011
  (0, 4301)	0.3250119504189488
  (0, 3362)	0.33019724434846126
  (0, 2933)	0.2453491148373786
  (0, 1651)	0.3037338874695836
  (0, 896)	0.21351305870993978
  (0, 235)	0.34298179151837543
  (1, 16853)	0.09117761343372983
  (1, 15164)	0.08946281236254729
  (1, 13919)	0.42524648908354634
  (1, 13065)	0.36773046084789346
  (1, 12618)	0.24868518461414146
  (1, 12163)	0.3796661151115819
  (1, 11925)	0.37327055071909065
  (1, 10216)	0.08813410128297053
  (1, 8729)	0.42524648908354634
  (1, 3970)	0.23098933893199997
  (1, 3306)	0.2834482751186189
  (2, 16725)	0.34432065054490785
  :	:
  (4158, 16639)	0.2796201559111399
  (4158, 13827)	0.2713157796360236
  (4158, 13791)	0.29432702492466434
  (4158, 13648)	0.21479309786895925
  (4158, 13473)	0.2496550139728973
  (4158, 10965)	0.

In [85]:
print(Y_test)

[1 0 1 ... 1 1 0]


**Training the Model: Logistic Regression**

In [63]:
model = LogisticRegression()

In [64]:
model.fit(X_train, Y_train)

**Evaluation**

**accuracy score**

In [65]:
# accuracy score on the training data
X_train_prediction = model.predict(X_train)
training_data_accuracy = accuracy_score(X_train_prediction, Y_train)

In [66]:
print('Accuracy score of the training data : ', training_data_accuracy)

Accuracy score of the training data :  0.9866586538461538


In [67]:
# accuracy score on the test data
X_test_prediction = model.predict(X_test)
test_data_accuracy = accuracy_score(X_test_prediction, Y_test)

In [68]:
print('Accuracy score of the test data : ', test_data_accuracy)

Accuracy score of the test data :  0.9790865384615385


**Making a Predictive System**

In [97]:
def preprocess_input(text_input):
    stemmed_input = stemming(text_input)
    return stemmed_input

def predict_news(text_input):
    preprocessed_input = preprocess_input(text_input)


    X_new = vectorizer.transform([preprocessed_input])


    prediction = model.predict(X_new)


    return prediction[0]


user_input = input('Enter the news author and title : ')


prediction_result = predict_news(user_input)


if prediction_result == 0:
    print('The news is Real')
else:
    print('The news is Fake')


Enter the news text: FLYNN: Hillary Clinton Big Woman on Campus - Breitbart  Daniel J. Flynn
The news is Real


In [98]:
def preprocess_input(text_input):
    stemmed_input = stemming(text_input)
    return stemmed_input

def predict_news(text_input):
    preprocessed_input = preprocess_input(text_input)


    X_new = vectorizer.transform([preprocessed_input])


    prediction = model.predict(X_new)


    return prediction[0]


user_input = input('Enter the news author and title : ')


prediction_result = predict_news(user_input)


if prediction_result == 0:
    print('The news is Real')
else:
    print('The news is Fake')


Enter the news author and title : House Dem Aide: We Didn’t Even See Comey’s Letter Until Jason Chaffetz Tweeted It Darrell Lucus
The news is Fake


**Predicting labels of  test data**

In [100]:
df3 = pd.read_csv("/content/test.csv")

In [101]:
df3.shape

(5200, 4)

In [102]:
df3.head()

Unnamed: 0,id,title,author,text
0,20800,"Specter of Trump Loosens Tongues, if Not Purse...",David Streitfeld,"PALO ALTO, Calif. — After years of scorning..."
1,20801,Russian warships ready to strike terrorists ne...,,Russian warships ready to strike terrorists ne...
2,20802,#NoDAPL: Native American Leaders Vow to Stay A...,Common Dreams,Videos #NoDAPL: Native American Leaders Vow to...
3,20803,"Tim Tebow Will Attempt Another Comeback, This ...",Daniel Victor,"If at first you don’t succeed, try a different..."
4,20804,Keiser Report: Meme Wars (E995),Truth Broadcast Network,42 mins ago 1 Views 0 Comments 0 Likes 'For th...


In [103]:
df3.isnull().sum()

id          0
title     122
author    503
text        7
dtype: int64

In [104]:
df3 = df3.fillna('')

In [105]:
df3['content'] = df3['author']+df3['title']

In [106]:
df3.head()

Unnamed: 0,id,title,author,text,content
0,20800,"Specter of Trump Loosens Tongues, if Not Purse...",David Streitfeld,"PALO ALTO, Calif. — After years of scorning...",David StreitfeldSpecter of Trump Loosens Tongu...
1,20801,Russian warships ready to strike terrorists ne...,,Russian warships ready to strike terrorists ne...,Russian warships ready to strike terrorists ne...
2,20802,#NoDAPL: Native American Leaders Vow to Stay A...,Common Dreams,Videos #NoDAPL: Native American Leaders Vow to...,Common Dreams#NoDAPL: Native American Leaders ...
3,20803,"Tim Tebow Will Attempt Another Comeback, This ...",Daniel Victor,"If at first you don’t succeed, try a different...",Daniel VictorTim Tebow Will Attempt Another Co...
4,20804,Keiser Report: Meme Wars (E995),Truth Broadcast Network,42 mins ago 1 Views 0 Comments 0 Likes 'For th...,Truth Broadcast NetworkKeiser Report: Meme War...


In [107]:
# Preprocess the test data
df3['content'] = df3['content'].apply(stemming)

# Vectorize the preprocessed test data
X_test = vectorizer.transform(df3['content'])

# Apply the model to make predictions
predictions = model.predict(X_test)

# Add predictions to the DataFrame
df3['predicted_label'] = predictions

# Display the results
print(df3[['content', 'predicted_label']])

                                                content  predicted_label
0     david streitfeldspect trump loosen tongu purs ...                0
1     russian warship readi strike terrorist near al...                1
2     common dream nodapl nativ american leader vow ...                1
3     daniel victortim tebow attempt anoth comeback ...                0
4         truth broadcast networkkeis report meme war e                1
...                                                 ...              ...
5195  jodi rosenth bangladeshi traffic jam never end...                0
5196  sheryl gay stolbergjohn kasich sign one abort ...                0
5197  mike mcphatecalifornia today exactli sushi new...                0
5198              us marin deploy russian border norway                1
5199      teddi wayneawkward sex onscreen new york time                0

[5200 rows x 2 columns]
