In [1]:
#Import Required Libraries
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix, accuracy_score

In [2]:
#Reading Dataset
dataset = pd.read_csv('news.csv')

In [3]:
#Checking Information of our dataset
dataset.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 6335 entries, 0 to 6334
Data columns (total 4 columns):
 #   Column      Non-Null Count  Dtype 
---  ------      --------------  ----- 
 0   Unnamed: 0  6335 non-null   int64 
 1   title       6335 non-null   object
 2   text        6335 non-null   object
 3   label       6335 non-null   object
dtypes: int64(1), object(3)
memory usage: 198.1+ KB


In [4]:
#Take a glance at our dataset
dataset.head()

Unnamed: 0.1,Unnamed: 0,title,text,label
0,8476,You Can Smell Hillary’s Fear,"Daniel Greenfield, a Shillman Journalism Fello...",FAKE
1,10294,Watch The Exact Moment Paul Ryan Committed Pol...,Google Pinterest Digg Linkedin Reddit Stumbleu...,FAKE
2,3608,Kerry to go to Paris in gesture of sympathy,U.S. Secretary of State John F. Kerry said Mon...,REAL
3,10142,Bernie supporters on Twitter erupt in anger ag...,"— Kaydee King (@KaydeeKing) November 9, 2016 T...",FAKE
4,875,The Battle of New York: Why This Primary Matters,It's primary day in New York and front-runners...,REAL


In [5]:
#Dropping the missing values of nan values
dataset = dataset.dropna()

In [6]:
#Again have a glance to our dataset
dataset.head()

Unnamed: 0.1,Unnamed: 0,title,text,label
0,8476,You Can Smell Hillary’s Fear,"Daniel Greenfield, a Shillman Journalism Fello...",FAKE
1,10294,Watch The Exact Moment Paul Ryan Committed Pol...,Google Pinterest Digg Linkedin Reddit Stumbleu...,FAKE
2,3608,Kerry to go to Paris in gesture of sympathy,U.S. Secretary of State John F. Kerry said Mon...,REAL
3,10142,Bernie supporters on Twitter erupt in anger ag...,"— Kaydee King (@KaydeeKing) November 9, 2016 T...",FAKE
4,875,The Battle of New York: Why This Primary Matters,It's primary day in New York and front-runners...,REAL


In [7]:
#Extract Dependent and Independent Variable
X = dataset.iloc[:,:-1].values
Y = dataset.iloc[:,-1].values

In [8]:
#Let's Check 
X[0]

array([8476, 'You Can Smell Hillary’s Fear',
      dtype=object)

In [9]:
Y[0]

'FAKE'

In [10]:
#Converting Text Data into Numerical Values
from sklearn.feature_extraction.text import CountVectorizer
cv = CountVectorizer(max_features=12000)
mat_body = cv.fit_transform(X[:,1]).todense()
cv_head = CountVectorizer(max_features=12000)
mat_head = cv_head.fit_transform(X[:,1]).todense()

In [11]:
#Matrix of Count of words
mat_body

matrix([[0, 0, 0, ..., 0, 0, 0],
        [0, 0, 0, ..., 0, 0, 0],
        [0, 0, 0, ..., 0, 0, 0],
        ...,
        [0, 0, 0, ..., 0, 0, 0],
        [0, 0, 0, ..., 0, 0, 0],
        [0, 0, 0, ..., 0, 0, 0]], dtype=int64)

In [12]:
mat_head

matrix([[0, 0, 0, ..., 0, 0, 0],
        [0, 0, 0, ..., 0, 0, 0],
        [0, 0, 0, ..., 0, 0, 0],
        ...,
        [0, 0, 0, ..., 0, 0, 0],
        [0, 0, 0, ..., 0, 0, 0],
        [0, 0, 0, ..., 0, 0, 0]], dtype=int64)

In [None]:
#Stacking top on body
X_mat = np.hstack((mat_head , mat_body))

In [None]:
#Dividing dataset into training and testing set
from sklearn.model_selection import train_test_split
X_train, X_test, Y_train, Y_test = train_test_split(X_mat, Y, test_size=0.2, random_state=42)

In [None]:
#Applying Standard Scaling to get Optimized Result
from sklearn.preprocessing import StandardScaler 
sc = StandardScaler()
X_train = sc.fit_transform(X_train)
X_test = sc.fit_transform(X_test)

In [None]:
#Now Our dataset is ready to perform Machine Learning Algorithm Random Forest Classifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report
from sklearn.model_selection import cross_val_score

rfc = RandomForestClassifier(n_estimators = 201)
rfc.fit(X_train, Y_train)
Y_pred = rfc.predict(X_test)
classification_report(Y_test, Y_pred)

In [None]:
#Creating a Confusion Matrix
confusion_matrix(Y_test, Y_pred)

In [None]:
#Checking accuracy of the model
accuracy_score(Y_test, Y_pred)

In [None]:
#To increase accuracy, use Cross Validation Score
from sklearn.model_selection import cross_val_score
rfc_eval = cross_val_score(estimator = rfc, X = X_train, y = Y_train, cv = 10)
rfc_eval.mean()