# Importing the libraries


In [2]:
import pandas as pd
import numpy as np
import re
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score


In [3]:

import nltk

nltk.download('stopwords')

[nltk_data] Downloading package stopwords to /home/yusuf/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [4]:
print(stopwords.words('english'))

['a', 'about', 'above', 'after', 'again', 'against', 'ain', 'all', 'am', 'an', 'and', 'any', 'are', 'aren', "aren't", 'as', 'at', 'be', 'because', 'been', 'before', 'being', 'below', 'between', 'both', 'but', 'by', 'can', 'couldn', "couldn't", 'd', 'did', 'didn', "didn't", 'do', 'does', 'doesn', "doesn't", 'doing', 'don', "don't", 'down', 'during', 'each', 'few', 'for', 'from', 'further', 'had', 'hadn', "hadn't", 'has', 'hasn', "hasn't", 'have', 'haven', "haven't", 'having', 'he', "he'd", "he'll", 'her', 'here', 'hers', 'herself', "he's", 'him', 'himself', 'his', 'how', 'i', "i'd", 'if', "i'll", "i'm", 'in', 'into', 'is', 'isn', "isn't", 'it', "it'd", "it'll", "it's", 'its', 'itself', "i've", 'just', 'll', 'm', 'ma', 'me', 'mightn', "mightn't", 'more', 'most', 'mustn', "mustn't", 'my', 'myself', 'needn', "needn't", 'no', 'nor', 'not', 'now', 'o', 'of', 'off', 'on', 'once', 'only', 'or', 'other', 'our', 'ours', 'ourselves', 'out', 'over', 'own', 're', 's', 'same', 'shan', "shan't", 'she

# Data Preprocessing

In [5]:
fake = pd.read_csv('datasets/fake.csv')
true = pd.read_csv('datasets/true.csv')

### Merging the datasets

In [6]:
fake['label'] = 0
true['label'] = 1

In [7]:
print(fake.shape)
print(true.shape)

(23481, 5)
(21417, 5)


In [8]:
df_combined = pd.concat([true, fake], ignore_index = True)
df_combined.to_csv('datasets/news.csv', index = False)

In [9]:
df = pd.read_csv('datasets/news.csv')

In [10]:
df.shape

(44898, 5)

### To check for missing and duplicate values

In [11]:
df.isnull().sum()

title      0
text       0
subject    0
date       0
label      0
dtype: int64

In [12]:
df.duplicated().sum()

np.int64(209)

In [13]:
df.drop_duplicates()

Unnamed: 0,title,text,subject,date,label
0,"As U.S. budget fight looms, Republicans flip t...",WASHINGTON (Reuters) - The head of a conservat...,politicsNews,"December 31, 2017",1
1,U.S. military to accept transgender recruits o...,WASHINGTON (Reuters) - Transgender people will...,politicsNews,"December 29, 2017",1
2,Senior U.S. Republican senator: 'Let Mr. Muell...,WASHINGTON (Reuters) - The special counsel inv...,politicsNews,"December 31, 2017",1
3,FBI Russia probe helped by Australian diplomat...,WASHINGTON (Reuters) - Trump campaign adviser ...,politicsNews,"December 30, 2017",1
4,Trump wants Postal Service to charge 'much mor...,SEATTLE/WASHINGTON (Reuters) - President Donal...,politicsNews,"December 29, 2017",1
...,...,...,...,...,...
44893,McPain: John McCain Furious That Iran Treated ...,21st Century Wire says As 21WIRE reported earl...,Middle-east,"January 16, 2016",0
44894,JUSTICE? Yahoo Settles E-mail Privacy Class-ac...,21st Century Wire says It s a familiar theme. ...,Middle-east,"January 16, 2016",0
44895,Sunnistan: US and Allied ‘Safe Zone’ Plan to T...,Patrick Henningsen 21st Century WireRemember ...,Middle-east,"January 15, 2016",0
44896,How to Blow $700 Million: Al Jazeera America F...,21st Century Wire says Al Jazeera America will...,Middle-east,"January 14, 2016",0


In [15]:
df.shape

(44898, 5)

In [None]:
#X = df.drop(columns = 'label', axis = 1)
#y = df['label']

### Stemming

In [20]:
port_stem = PorterStemmer()

In [24]:
def stemming(content):
    stemmed_content = re.sub('[^a-zA-Z]', ' ', content)  
    stemmed_content = stemmed_content.lower()
    stemmed_content = stemmed_content.split()
    stop_words = set(stopwords.words('english'))  
    stemmed_content = [port_stem.stem(word) for word in stemmed_content if word not in stop_words]
    stemmed_content = ' '.join(stemmed_content)
    return stemmed_content

In [25]:
df['text'] = df['text'].apply(stemming)

In [26]:
df['title'] = df['title'].apply(stemming)

In [28]:
df['subject'] = df['subject'].apply(stemming)

In [31]:
df = df.drop(columns = 'date', axis = 1)

### Vectorizing

In [33]:
df['combined_text'] = df['title'] + ' ' + df['text'] + ' ' + df['subject']

In [34]:
df = df.drop(columns = ['text', 'subject', 'title'], axis = 1)

In [38]:
X = df['combined_text']
y = df['label']

In [42]:
vectorizer = TfidfVectorizer()

vectorizer.fit(X)

X = vectorizer.transform(X)

## train test split

In [44]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2, stratify = y, random_state = 2)

### Training the model

In [46]:
model = LogisticRegression()

model.fit(X_train, y_train)

0,1,2
,penalty,'l2'
,dual,False
,tol,0.0001
,C,1.0
,fit_intercept,True
,intercept_scaling,1
,class_weight,
,random_state,
,solver,'lbfgs'
,max_iter,100


### Metric

In [48]:
pred = model.predict(X_test)
print(accuracy_score(pred, y_test))

0.9898663697104677


### Save the model

In [None]:
import pickle

with open('model.pkl', 'wb') as f:
    pickle.dump(model, f)