In [1]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

/kaggle/input/fake-news-detection/True.csv
/kaggle/input/fake-news-detection/Fake.csv


In [2]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

import re
import string
import nltk
from nltk.corpus import stopwords

from sklearn import model_selection
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer




In [3]:
true_news = pd.read_csv('/kaggle/input/fake-news-detection/True.csv')
fake_news = pd.read_csv('/kaggle/input/fake-news-detection/Fake.csv')

In [4]:
true_news.head()

Unnamed: 0,title,text,subject,date
0,"As U.S. budget fight looms, Republicans flip t...",WASHINGTON (Reuters) - The head of a conservat...,politicsNews,"December 31, 2017"
1,U.S. military to accept transgender recruits o...,WASHINGTON (Reuters) - Transgender people will...,politicsNews,"December 29, 2017"
2,Senior U.S. Republican senator: 'Let Mr. Muell...,WASHINGTON (Reuters) - The special counsel inv...,politicsNews,"December 31, 2017"
3,FBI Russia probe helped by Australian diplomat...,WASHINGTON (Reuters) - Trump campaign adviser ...,politicsNews,"December 30, 2017"
4,Trump wants Postal Service to charge 'much mor...,SEATTLE/WASHINGTON (Reuters) - President Donal...,politicsNews,"December 29, 2017"


In [5]:
fake_news.head()

Unnamed: 0,title,text,subject,date
0,Donald Trump Sends Out Embarrassing New Year’...,Donald Trump just couldn t wish all Americans ...,News,"December 31, 2017"
1,Drunk Bragging Trump Staffer Started Russian ...,House Intelligence Committee Chairman Devin Nu...,News,"December 31, 2017"
2,Sheriff David Clarke Becomes An Internet Joke...,"On Friday, it was revealed that former Milwauk...",News,"December 30, 2017"
3,Trump Is So Obsessed He Even Has Obama’s Name...,"On Christmas day, Donald Trump announced that ...",News,"December 29, 2017"
4,Pope Francis Just Called Out Donald Trump Dur...,Pope Francis used his annual Christmas Day mes...,News,"December 25, 2017"


In above datasets there is no traget attribute which is important and we'll need that for testing in later part of this project, so lets add target attribute

when the news is true we'll represent it as 1

when the news is false we'll represent it as 0

In [6]:
true_news['target'] = 1
fake_news['target'] = 0

In [7]:
print('true news shape:', true_news.shape)
print('fake news shape:', fake_news.shape)

true news shape: (21417, 5)
fake news shape: (23481, 5)


As we have different dataset for true and fake news, the next thing we can do is concat both data set and 

In [8]:
news_data = pd.concat([true_news,fake_news],axis = 0)
news_data.head()

Unnamed: 0,title,text,subject,date,target
0,"As U.S. budget fight looms, Republicans flip t...",WASHINGTON (Reuters) - The head of a conservat...,politicsNews,"December 31, 2017",1
1,U.S. military to accept transgender recruits o...,WASHINGTON (Reuters) - Transgender people will...,politicsNews,"December 29, 2017",1
2,Senior U.S. Republican senator: 'Let Mr. Muell...,WASHINGTON (Reuters) - The special counsel inv...,politicsNews,"December 31, 2017",1
3,FBI Russia probe helped by Australian diplomat...,WASHINGTON (Reuters) - Trump campaign adviser ...,politicsNews,"December 30, 2017",1
4,Trump wants Postal Service to charge 'much mor...,SEATTLE/WASHINGTON (Reuters) - President Donal...,politicsNews,"December 29, 2017",1


In [9]:
df = news_data.drop(['title','subject','date'],axis =1 )
df.head()

Unnamed: 0,text,target
0,WASHINGTON (Reuters) - The head of a conservat...,1
1,WASHINGTON (Reuters) - Transgender people will...,1
2,WASHINGTON (Reuters) - The special counsel inv...,1
3,WASHINGTON (Reuters) - Trump campaign adviser ...,1
4,SEATTLE/WASHINGTON (Reuters) - President Donal...,1


### Text Data Preprocessing

In [10]:
def clean_text(text):
    text = text.lower()
    text = re.sub('\[.*?\]', '', text) # will sub "[?]" with a blank str in texts 
    text = re.sub('https?://\S+|www\.\S+', '', text)
    text = re.sub('<.*?>+', '', text)
    text = re.sub('[%s]' % re.escape(string.punctuation), '', text)
    text = re.sub('\n', '', text)
    text = re.sub('\w*\d\w*', '', text) # words containing digit
    return text

df['text'] = df['text'].apply(lambda x : clean_text(x))

In [11]:
df['text'].head()[0]

'washington reuters  the head of a conservative republican faction in the us congress who voted this month for a huge expansion of the national debt to pay for tax cuts called himself a “fiscal conservative” on sunday and urged budget restraint in  in keeping with a sharp pivot under way among republicans us representative mark meadows speaking on cbs’ “face the nation” drew a hard line on federal spending which lawmakers are bracing to do battle over in january when they return from the holidays on wednesday lawmakers will begin trying to pass a federal budget in a fight likely to be linked to other issues such as immigration policy even as the november congressional election campaigns approach in which republicans will seek to keep control of congress president donald trump and his republicans want a big budget increase in military spending while democrats also want proportional increases for nondefense “discretionary” spending on programs that support education scientific research i

Tokenization

In [12]:
tokenizer = nltk.tokenize.RegexpTokenizer(r'\w+')
df['text'] = df['text'].apply(lambda x : tokenizer.tokenize(x))
df['text'].head()
                                         

0    [washington, reuters, the, head, of, a, conser...
1    [washington, reuters, transgender, people, wil...
2    [washington, reuters, the, special, counsel, i...
3    [washington, reuters, trump, campaign, adviser...
4    [seattlewashington, reuters, president, donald...
Name: text, dtype: object

Stop word removal

In [13]:
stop_words = set(stopwords.words('english'))
df['text'] =df['text'].apply(lambda x:[word for word in x if word not in stop_words])
df['text'].head()

0    [washington, reuters, head, conservative, repu...
1    [washington, reuters, transgender, people, all...
2    [washington, reuters, special, counsel, invest...
3    [washington, reuters, trump, campaign, adviser...
4    [seattlewashington, reuters, president, donald...
Name: text, dtype: object

In [14]:
def combine_text(list_of_text):
    combine_text = ' '.join(list_of_text)
    return combine_text
df['text'] = df['text'].apply(lambda x : combine_text(x))
df['text'].head()

0    washington reuters head conservative republica...
1    washington reuters transgender people allowed ...
2    washington reuters special counsel investigati...
3    washington reuters trump campaign adviser geor...
4    seattlewashington reuters president donald tru...
Name: text, dtype: object

Defining Dependent and Independent 

In [15]:
x = df['text']
y = df['target']

### Train - Test Split

In [16]:
x_train, x_test, y_train, y_test = train_test_split(x,y,test_size = 0.3,random_state = 42)

In [17]:
print(x_train.shape)
print(x_test.shape)
print(y_train.shape)
print(y_test.shape)

(31428,)
(13470,)
(31428,)
(13470,)


### Count Vectorizer

In [18]:
count_vectorizer = CountVectorizer()
train_x_vector = count_vectorizer.fit_transform(x_train)
test_x_vector = count_vectorizer.transform(x_test)

In [19]:
print(df['text'].head()[0])
print(train_x_vector[0].todense())

washington reuters head conservative republican faction us congress voted month huge expansion national debt pay tax cuts called fiscal conservative sunday urged budget restraint keeping sharp pivot way among republicans us representative mark meadows speaking cbs face nation drew hard line federal spending lawmakers bracing battle january return holidays wednesday lawmakers begin trying pass federal budget fight likely linked issues immigration policy even november congressional election campaigns approach republicans seek keep control congress president donald trump republicans want big budget increase military spending democrats also want proportional increases nondefense discretionary spending programs support education scientific research infrastructure public health environmental protection trump administration already willing say going increase nondefense discretionary spending percent meadows chairman small influential house freedom caucus said program democrats saying enough n

In [20]:
from sklearn.linear_model import LogisticRegression
LR = LogisticRegression()
LR.fit(train_x_vector,y_train)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression


LogisticRegression()

In [21]:
pred_lr = LR.predict(test_x_vector)

In [22]:
LR.score(test_x_vector, y_test)

0.995916852264291

In [23]:
from sklearn.metrics import classification_report

In [24]:
print(classification_report(y_test, pred_lr))

              precision    recall  f1-score   support

           0       1.00      0.99      1.00      6996
           1       0.99      1.00      1.00      6474

    accuracy                           1.00     13470
   macro avg       1.00      1.00      1.00     13470
weighted avg       1.00      1.00      1.00     13470

