In [1]:
import numpy as np
import pandas as pd
import re
import nltk
from nltk.corpus import stopwords
from gensim.parsing.preprocessing import preprocess_string
from nltk.stem.porter import PorterStemmer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
from nltk.stem.porter import PorterStemmer
from sklearn.metrics import confusion_matrix
from sklearn.metrics import classification_report
import warnings
warnings.filterwarnings('ignore')

In [2]:
df = pd.read_csv('news_articles.csv',na_filter='')
df = df[df['language']=='english']
data = df[['title','text','label']]
data['label'] = np.where(data['label']=='Real',1,0)

In [3]:
data['News'] = data['title'] + data['text']
data['News'] = data['News'].apply(lambda x: preprocess_string(x))

In [4]:
sw = stopwords.words('english')

In [5]:
def sw_remove(x):
    return [i for i in x if i not in sw]

In [6]:
data['News'] = data['News'].apply(lambda x:sw_remove(x))

In [7]:
data['title_01'] = data['title'].apply(lambda x:1 if len(x)>=1 else 0)
data['text_01'] = data['text'].apply(lambda x:1 if len(x)>=1 else 0)

In [8]:
port_stem = PorterStemmer()

In [9]:
# data['News'] = data['News'].apply(lambda x: x.split())

In [10]:
def f(x):
    return ' '.join([port_stem.stem(token) for token in x])

In [11]:
data['News'] = data['News'].apply(lambda x: f(x))

In [12]:
X = data['News']
y = data['label']

In [13]:
# converting the textual data to numerical data
vectorizer = TfidfVectorizer()
vectorizer.fit(X)
X = vectorizer.transform(X)
X = X.toarray()

In [14]:
col = data[['title_01','text_01']].values.reshape(-1,2)
X = np.concatenate((col, X), axis=1)

In [15]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2, stratify=y, random_state=2)

# logistic regression

In [16]:
model = LogisticRegression()
model.fit(X_train, y_train)
# accuracy score on the test data
y_pred = model.predict(X_test)
test_data_accuracy = accuracy_score(y_test,y_pred)
print('Accuracy score of the test data : ', test_data_accuracy)

Accuracy score of the test data :  0.7277227722772277


# RandomForestClassifier

In [17]:
# Fitting Random Forest Classification to the Training set
from sklearn.ensemble import RandomForestClassifier
rfc = RandomForestClassifier(random_state = 42)
rfc.fit(X_train, y_train)

# Predicting the Test set results
y_pred = rfc.predict(X_test)
print(accuracy_score(y_test,y_pred))

0.7475247524752475


In [31]:
testdf = pd.DataFrame({'Actual':y_test,'Prediction':y_pred})

In [32]:
import pandas as pd
pd.set_option('display.max_rows', 2200)
pd.set_option('display.max_columns', 500)
pd.set_option('display.width', 1000)

testdf.replace([1,0],['Real','Fake'])

Unnamed: 0,Actual,Prediction
1469,Real,Real
42,Fake,Fake
559,Fake,Fake
165,Fake,Fake
1236,Fake,Fake
2011,Real,Fake
1918,Real,Fake
1614,Real,Fake
64,Fake,Fake
173,Fake,Fake


# DecisionTreeClassifier

In [18]:
# Fitting Decision Tree Classification to the Training set
from sklearn.tree import DecisionTreeClassifier
classifier = DecisionTreeClassifier(random_state = 0)
classifier.fit(X_train, y_train)

# Predicting the Test set results
y_pred = classifier.predict(X_test)
print(accuracy_score(y_test,y_pred))

0.7103960396039604


# Performance tuning

In [19]:
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import GridSearchCV

In [20]:
param_grid = { 
    'n_estimators': [200, 500],
    'max_features': ['auto', 'sqrt', 'log2'],
    'max_depth' : [4,5,6,7,8],
    'criterion' :['gini', 'entropy']
}

In [21]:
CV_rfc = GridSearchCV(estimator=rfc, param_grid=param_grid, cv= 5,n_jobs=-1,verbose=3)
CV_rfc.fit(X_train, y_train)

Fitting 5 folds for each of 60 candidates, totalling 300 fits


GridSearchCV(cv=5, estimator=RandomForestClassifier(random_state=42), n_jobs=-1,
             param_grid={'criterion': ['gini', 'entropy'],
                         'max_depth': [4, 5, 6, 7, 8],
                         'max_features': ['auto', 'sqrt', 'log2'],
                         'n_estimators': [200, 500]},
             verbose=3)

In [22]:
CV_rfc.best_params_

{'criterion': 'gini',
 'max_depth': 8,
 'max_features': 'auto',
 'n_estimators': 500}

In [24]:
rfc1=RandomForestClassifier(random_state=42, max_features='auto', n_estimators= 500, max_depth=8, criterion='gini')

In [25]:
rfc1.fit(X_train, y_train)

# Predicting the Test set results
y_pred = rfc1.predict(X_test)
print(accuracy_score(y_test,y_pred))

0.6089108910891089
