# 1. Importing the libraries

In [1]:
import numpy as np 
import pandas as pd 
import matplotlib.pyplot as plt 
import mglearn
import seaborn as sns
%matplotlib inline

# 2. Imporiting the Dataset

In [2]:
df=pd.read_csv('news.csv')
df.head()


Unnamed: 0.1,Unnamed: 0,title,text,label
0,8476,You Can Smell Hillary’s Fear,"Daniel Greenfield, a Shillman Journalism Fello...",FAKE
1,10294,Watch The Exact Moment Paul Ryan Committed Pol...,Google Pinterest Digg Linkedin Reddit Stumbleu...,FAKE
2,3608,Kerry to go to Paris in gesture of sympathy,U.S. Secretary of State John F. Kerry said Mon...,REAL
3,10142,Bernie supporters on Twitter erupt in anger ag...,"— Kaydee King (@KaydeeKing) November 9, 2016 T...",FAKE
4,875,The Battle of New York: Why This Primary Matters,It's primary day in New York and front-runners...,REAL


In [3]:
text=df['text']
labels=df['label']

In [4]:
print("Type of text: {}".format(type(text)))
print("Length of text: {}".format(len(text)))
print("Text:\n{}".format(text[:5]))

Type of text: <class 'pandas.core.series.Series'>
Length of text: 6335
Text:
0    Daniel Greenfield, a Shillman Journalism Fello...
1    Google Pinterest Digg Linkedin Reddit Stumbleu...
2    U.S. Secretary of State John F. Kerry said Mon...
3    — Kaydee King (@KaydeeKing) November 9, 2016 T...
4    It's primary day in New York and front-runners...
Name: text, dtype: object


In [5]:
label={'FAKE':0,'REAL':1}
labels=[label[item] for item in labels]
print(labels)

[0, 0, 1, 0, 1, 0, 0, 1, 1, 1, 1, 1, 0, 0, 1, 1, 0, 0, 1, 1, 1, 0, 1, 1, 0, 1, 1, 1, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 1, 0, 1, 1, 0, 0, 1, 1, 0, 0, 0, 1, 0, 1, 0, 1, 0, 1, 1, 0, 1, 1, 1, 1, 1, 0, 1, 0, 1, 1, 0, 1, 1, 1, 0, 0, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 0, 0, 1, 1, 0, 0, 1, 1, 1, 1, 0, 1, 1, 1, 1, 0, 0, 0, 0, 1, 1, 0, 0, 0, 1, 1, 0, 0, 1, 1, 0, 1, 0, 0, 1, 1, 0, 0, 0, 1, 0, 1, 1, 0, 0, 0, 0, 1, 0, 0, 1, 0, 1, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 1, 0, 1, 1, 1, 0, 1, 1, 0, 0, 1, 0, 1, 0, 0, 1, 0, 0, 0, 1, 0, 0, 1, 0, 1, 0, 1, 1, 1, 0, 1, 1, 1, 1, 0, 1, 1, 0, 1, 1, 0, 0, 1, 0, 0, 1, 1, 0, 1, 0, 0, 0, 1, 0, 0, 0, 1, 0, 1, 0, 0, 1, 1, 1, 1, 0, 0, 0, 1, 1, 1, 0, 0, 0, 1, 0, 1, 0, 1, 1, 0, 0, 1, 1, 0, 1, 1, 0, 1, 1, 1, 1, 0, 1, 0, 1, 1, 0, 1, 0, 1, 0, 1, 1, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 1, 0, 1, 1, 0, 1, 0, 1, 1, 0, 0, 0, 0, 1, 1, 1, 1, 1, 0, 1, 0, 0, 1, 1, 1, 1, 1, 0, 1, 0, 1, 0, 1, 1, 1, 0, 1, 1, 1, 0, 0, 0, 0, 1, 1, 1, 1, 0, 0, 0, 1, 0, 1, 1, 0, 1, 1, 0, 1, 0, 

# 3. Splitting the Model in train and test models

In [7]:
from sklearn.model_selection import train_test_split
text_train,text_test,y_train,y_test=train_test_split(text,labels,test_size=0.2,random_state=42)
print("text_train.shape:{}".format(text_train.shape))
print("text_test.shape:{}".format(text_test.shape))

text_train.shape:(5068,)
text_test.shape:(1267,)


# 4. Using bag of words

In [8]:
from sklearn.feature_extraction.text import CountVectorizer
vect=CountVectorizer().fit(text_train)
X_train=vect.transform(text_train)
print("X_train:\n{}".format(repr(X_train)))

X_train:
<5068x61667 sparse matrix of type '<class 'numpy.int64'>'
	with 1713334 stored elements in Compressed Sparse Row format>


In [9]:
feature_names=vect.get_feature_names()
print("Total No of feature names:{}".format(len(feature_names)))

Total No of feature names:61667


In [11]:
from sklearn.model_selection import cross_val_score
from sklearn.linear_model import LogisticRegression
scores=cross_val_score(LogisticRegression(),X_train,y_train,cv=5)


Mean cross val score:0.9159455675819863


In [12]:
print("Mean cross val score:{:.2f}".format(np.mean(scores)))

Mean cross val score:0.92


In [13]:
# using grid search for best parameters
from sklearn.model_selection import GridSearchCV
param_grid={"C":[0.001,0.01,0.1,1,10,100]}
grid=GridSearchCV(LogisticRegression(),param_grid,cv=5)
grid.fit(X_train,y_train)
print("The best score: {}".format(grid.best_score_))
print("The best parameter: {}".format(grid.best_params_))

The best score: 0.9200905000282326
The best parameter: {'C': 0.1}


In [17]:
# Building model and predicting
logreg=LogisticRegression(C=0.1).fit(X_train,y_train)
X_test=vect.transform(text_test)
y_pred=logreg.predict(X_test)

In [19]:
from sklearn.metrics import accuracy_score
print("Test Score: {:.2f}".format(accuracy_score(y_test,y_pred)))

Test Score: 0.92


In [20]:
# Using tfidfvectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.pipeline import make_pipeline
pipe=make_pipeline(TfidfVectorizer(norm=None),LogisticRegression())
param_grid={'logisticregression__C':[0.001,0.01,0.1,1,10,100]}
grid=GridSearchCV(pipe,param_grid,cv=5)
grid.fit(text_train,y_train)
print("Best Score: {}".format(grid.best_score_))
print("Best Parameters: {}".format(grid.best_params_))
print("Text Score: {}".format(grid.score(text_test,y_test)))

Best Score: 0.9275877108438427
Best Parameters: {'logisticregression__C': 0.01}
Text Score: 0.930544593528019


In [21]:
# Using tfidfvectorizer and n-grams bag of words
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.pipeline import make_pipeline
pipe=make_pipeline(TfidfVectorizer(min_df=5),LogisticRegression())
param_grid={'tfidfvectorizer__ngram_range':[(1,1),(1,2),(1,3)],'logisticregression__C':[0.001,0.01,0.1,1,10,100]}
grid=GridSearchCV(pipe,param_grid,cv=5)
grid.fit(text_train,y_train)
print("Best Score: {}".format(grid.best_score_))
print("Best Parameters: {}".format(grid.best_params_))
print("Text Score: {}".format(grid.score(text_test,y_test)))

Best Score: 0.9364669552231251
Best Parameters: {'logisticregression__C': 100, 'tfidfvectorizer__ngram_range': (1, 3)}
Text Score: 0.9329123914759274
