In [4]:
import pandas as pd
import nltk
from nltk import FreqDist
import sklearn

## 1. Reading the two CSVs (Real News & Fake News)

In [5]:
real_news = pd.read_csv("True.csv")
fake_news = pd.read_csv("Fake.csv")

In [6]:
real_news['is_fake'] = 0
real_news.head()

Unnamed: 0,title,text,subject,date,is_fake
0,"As U.S. budget fight looms, Republicans flip t...",WASHINGTON (Reuters) - The head of a conservat...,politicsNews,"December 31, 2017",0
1,U.S. military to accept transgender recruits o...,WASHINGTON (Reuters) - Transgender people will...,politicsNews,"December 29, 2017",0
2,Senior U.S. Republican senator: 'Let Mr. Muell...,WASHINGTON (Reuters) - The special counsel inv...,politicsNews,"December 31, 2017",0
3,FBI Russia probe helped by Australian diplomat...,WASHINGTON (Reuters) - Trump campaign adviser ...,politicsNews,"December 30, 2017",0
4,Trump wants Postal Service to charge 'much mor...,SEATTLE/WASHINGTON (Reuters) - President Donal...,politicsNews,"December 29, 2017",0


In [7]:
fake_news['is_fake'] = 1
fake_news.head()

Unnamed: 0,title,text,subject,date,is_fake
0,Donald Trump Sends Out Embarrassing New Year’...,Donald Trump just couldn t wish all Americans ...,News,"December 31, 2017",1
1,Drunk Bragging Trump Staffer Started Russian ...,House Intelligence Committee Chairman Devin Nu...,News,"December 31, 2017",1
2,Sheriff David Clarke Becomes An Internet Joke...,"On Friday, it was revealed that former Milwauk...",News,"December 30, 2017",1
3,Trump Is So Obsessed He Even Has Obama’s Name...,"On Christmas day, Donald Trump announced that ...",News,"December 29, 2017",1
4,Pope Francis Just Called Out Donald Trump Dur...,Pope Francis used his annual Christmas Day mes...,News,"December 25, 2017",1


In [8]:
len(real_news)

21417

In [9]:
len(fake_news)

23481

### Concatenating the 2 dataframes with an additional is_fake flag as column to differentiate between the 2 news

In [10]:
all_news_df = pd.concat([real_news,fake_news])

#### Shuffling after conatenating

In [11]:
all_news_df = all_news_df.sample(frac=1).reset_index(drop=True)

In [12]:
len(all_news_df)

44898

In [13]:
all_news_df.head()

Unnamed: 0,title,text,subject,date,is_fake
0,Togo forces fire tear gas to disperse Gnassing...,LOME (Reuters) - Togo security forces fired te...,worldnews,"September 7, 2017",0
1,Jason Chaffetz Runs Away Like A Coward After ...,Things got very uncomfortable for Jason Chaffe...,News,"February 10, 2017",1
2,EU's Juncker courts eurosceptic easterners wit...,BRUSSELS (Reuters) - European Commission head ...,worldnews,"October 4, 2017",0
3,Thousands of Somalis gather to mourn bomb victims,MOGADISHU (Reuters) - Thousands of Somalis pra...,worldnews,"October 20, 2017",0
4,OBAMA WARNS: Crackdown On Terrorism In U.S. Wo...,Never mind that Iran continues to violate Obam...,left-news,"Dec 21, 2015",1


#### Dropping the columns subject and date which have no role in detecting fake news

In [14]:
all_news_df = all_news_df.drop(['subject','date','title'], axis=1)

In [15]:
all_news_df.head()

Unnamed: 0,text,is_fake
0,LOME (Reuters) - Togo security forces fired te...,0
1,Things got very uncomfortable for Jason Chaffe...,1
2,BRUSSELS (Reuters) - European Commission head ...,0
3,MOGADISHU (Reuters) - Thousands of Somalis pra...,0
4,Never mind that Iran continues to violate Obam...,1


#### Converting DataFrame to Dictionary

In [16]:
all_news_dict = all_news_df.reset_index().to_dict(orient='list')

#### Tokenizing the sentences using nltk word tokenizer and also converting all the words to lower case

In [18]:
text_tokens = []
for el in all_news_dict['text']:
    tokens = nltk.word_tokenize(el)
    words = [w.lower() for w in tokens]
    text_tokens.append(words)

In [19]:
len(text_tokens)

44898

#### Splitting the data into train and test with 70:30 split

In [20]:
import math
text_tokens_train = text_tokens[0:int(math.floor(len(text_tokens)*0.7))]
text_tokens_test = text_tokens[int(math.ceil(len(text_tokens)*0.7)):int(len(text_tokens))]

In [21]:
print(len(text_tokens_train),len(text_tokens_test))

31428 13469


#### Flattening the list of lists into a single list to create the corpus

In [22]:
text_tokens_train_flat = [item for sublist in text_tokens_train for item in sublist]

## A. Identifying the top 2000 Bigrams from Training data to be used as features

In [23]:
from nltk.collocations import *
bigram_measures = nltk.collocations.BigramAssocMeasures()
finder = BigramCollocationFinder.from_words(text_tokens_train_flat)

#### Filter to remove Special characters

In [24]:
import re
def alpha_filter(w):
  # pattern to match a word of non-alphabetical characters
    pattern = re.compile('^[^a-z]+$')
    if (pattern.match(w)):
        return True
    else:
        return False

#### Stop Words removal filter

In [25]:
nltk.download('stopwords')
nltkstopwords = nltk.corpus.stopwords.words('english')

#To accommodate this tokenization, we will add some stopwords that have the apostrophe together with the contraction.
morestopwords = ['could','also','may','would','might','must','need','sha','wo','y',"'s","'d","'ll","'t","'m","'re","'ve"]

stopwords = nltkstopwords + morestopwords

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\bhavi\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


### Identifying the top 2000 bigrams based on Mutual Information score to use these top 2000 bigrams as features

In [26]:
finder.apply_word_filter(alpha_filter)
finder.apply_word_filter(lambda w: w in stopwords)

In [28]:
finder.apply_freq_filter(5)  ### Minimum Frequency of 5
scored_mi = finder.score_ngrams(bigram_measures.pmi)   ### GETTING the Mutual Information Scores
for bscore in scored_mi[:50]:
    print (bscore)

(('//t.co/ltdtbehhgh', 'pic.twitter.com/t2s8ufif5o'), 21.44156887771257)
(('666b', '314a'), 21.44156887771257)
(('6a7a', '4d6c'), 21.44156887771257)
(('84b4', 'f787'), 21.44156887771257)
(('86f7', 'a737'), 21.44156887771257)
(('circumstances.oleg', 'erovinkin'), 21.44156887771257)
(('darz', 'aab'), 21.44156887771257)
(('doxycycline', 'hyclate'), 21.44156887771257)
(('gwede', 'mantashe'), 21.44156887771257)
(('helter', 'skelter'), 21.44156887771257)
(('kevork', 'djansezian/getty'), 21.44156887771257)
(('laissez', 'faire'), 21.44156887771257)
(('maale', 'adumim'), 21.44156887771257)
(('moqtada', 'al-sadr'), 21.44156887771257)
(('palos', 'verdes'), 21.44156887771257)
(('pic.twitter.com/pxbrcgypwm', "'gitmo"), 21.44156887771257)
(('porir', 'dwip'), 21.44156887771257)
(('shailene', 'woodley'), 21.44156887771257)
(('sobhan', 'chowdhury'), 21.44156887771257)
(('tomomi', 'inada'), 21.44156887771257)
(('dady', 'chery'), 21.178534471878773)
(('hammam', 'al-alil'), 21.178534471878773)
(('jakobsd'

In [29]:
scored_mi[:2000]

[(('//t.co/ltdtbehhgh', 'pic.twitter.com/t2s8ufif5o'), 21.44156887771257),
 (('666b', '314a'), 21.44156887771257),
 (('6a7a', '4d6c'), 21.44156887771257),
 (('84b4', 'f787'), 21.44156887771257),
 (('86f7', 'a737'), 21.44156887771257),
 (('circumstances.oleg', 'erovinkin'), 21.44156887771257),
 (('darz', 'aab'), 21.44156887771257),
 (('doxycycline', 'hyclate'), 21.44156887771257),
 (('gwede', 'mantashe'), 21.44156887771257),
 (('helter', 'skelter'), 21.44156887771257),
 (('kevork', 'djansezian/getty'), 21.44156887771257),
 (('laissez', 'faire'), 21.44156887771257),
 (('maale', 'adumim'), 21.44156887771257),
 (('moqtada', 'al-sadr'), 21.44156887771257),
 (('palos', 'verdes'), 21.44156887771257),
 (('pic.twitter.com/pxbrcgypwm', "'gitmo"), 21.44156887771257),
 (('porir', 'dwip'), 21.44156887771257),
 (('shailene', 'woodley'), 21.44156887771257),
 (('sobhan', 'chowdhury'), 21.44156887771257),
 (('tomomi', 'inada'), 21.44156887771257),
 (('dady', 'chery'), 21.178534471878773),
 (('hammam', 

#### Extracting the top 2000 Bigrams only from the Bigram MI Score tuples

In [30]:
scored_mi_bigrams = []
for el in scored_mi[:2000]:
    scored_mi_bigrams.append(el[0])

#### Displaying the top 2000 Bigrams which will be used as Features

In [32]:
scored_mi_bigrams

[('//t.co/ltdtbehhgh', 'pic.twitter.com/t2s8ufif5o'),
 ('666b', '314a'),
 ('6a7a', '4d6c'),
 ('84b4', 'f787'),
 ('86f7', 'a737'),
 ('circumstances.oleg', 'erovinkin'),
 ('darz', 'aab'),
 ('doxycycline', 'hyclate'),
 ('gwede', 'mantashe'),
 ('helter', 'skelter'),
 ('kevork', 'djansezian/getty'),
 ('laissez', 'faire'),
 ('maale', 'adumim'),
 ('moqtada', 'al-sadr'),
 ('palos', 'verdes'),
 ('pic.twitter.com/pxbrcgypwm', "'gitmo"),
 ('porir', 'dwip'),
 ('shailene', 'woodley'),
 ('sobhan', 'chowdhury'),
 ('tomomi', 'inada'),
 ('dady', 'chery'),
 ('hammam', 'al-alil'),
 ('jakobsd', 'ttir'),
 ('jaret', 'seiberg'),
 ('nykea', 'aldridge'),
 ('qais', 'al-khazali'),
 ('semper', 'fi'),
 ('ahl', 'al-haq'),
 ('aqeel', 'al-tayyar'),
 ('bluecross', 'blueshield'),
 ('djamel', 'beghal'),
 ('haron', 'monis'),
 ('kel', 'inen'),
 ('lahouij', 'bouhlel'),
 ('mata', 'pires'),
 ('mondaiale', 'commerciale'),
 ('psi', 'upsilon'),
 ('samy', 'amimour'),
 ('satya', 'nadella'),
 ('aboul', 'gheit'),
 ('anirvan', 'ghos

## Preparing the Train and Test Dataframes using the Top 2000 bigrams of Train data as Features (HOLD OUT method of model performance evaluation)

### 1. Preparing Train Dataset

#### Converting all the documents in the dataset to bigrams - currently, the documents have their sentences tokenized into words

In [33]:
text_bigrams_train = []
for el in text_tokens_train:
    bigrams = [(el[i],el[i+1]) for i in range(0,len(el)-1)]
    text_bigrams_train.append(bigrams)

In [34]:
len(text_bigrams_train)

31428

In [36]:
text_bigrams_train[0:2]

[[('lome', '('),
  ('(', 'reuters'),
  ('reuters', ')'),
  (')', '-'),
  ('-', 'togo'),
  ('togo', 'security'),
  ('security', 'forces'),
  ('forces', 'fired'),
  ('fired', 'tear'),
  ('tear', 'gas'),
  ('gas', 'at'),
  ('at', 'hundreds'),
  ('hundreds', 'of'),
  ('of', 'anti-government'),
  ('anti-government', 'protesters'),
  ('protesters', 'carrying'),
  ('carrying', 'out'),
  ('out', 'a'),
  ('a', 'late'),
  ('late', 'night'),
  ('night', 'sit-in'),
  ('sit-in', 'at'),
  ('at', 'an'),
  ('an', 'intersection'),
  ('intersection', 'in'),
  ('in', 'central'),
  ('central', 'lome'),
  ('lome', 'as'),
  ('as', 'part'),
  ('part', 'of'),
  ('of', 'a'),
  ('a', 'bid'),
  ('bid', 'to'),
  ('to', 'end'),
  ('end', 'the'),
  ('the', '50-year-old'),
  ('50-year-old', 'gnassingbe'),
  ('gnassingbe', 'family'),
  ('family', 'dynasty'),
  ('dynasty', ','),
  (',', 'witnesses'),
  ('witnesses', 'said'),
  ('said', 'on'),
  ('on', 'thursday'),
  ('thursday', '.'),
  ('.', 'the'),
  ('the', 'move')

#### Using the top 2000 bigrams as features and creating 1/0 flags as values in the training dataset
#### The 1/0 flags denote the presence of the bigram feature in the document

In [37]:
text_bigrams_train_dict = {}
for feature_el in scored_mi_bigrams:
    flags = []
    for doc_el in text_bigrams_train:
        if feature_el in doc_el:
            flags.append(1)
        else:
            flags.append(0)
    text_bigrams_train_dict[feature_el] = flags

In [38]:
text_bigrams_train_dict[('laissez', 'faire')]

[0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,


In [39]:
len(text_bigrams_train_dict[('laissez', 'faire')])

31428

In [49]:
text_bigrams_train_dict['is_fake'] = all_news_dict['is_fake'][0:int(math.floor(len(text_tokens)*0.7))]

#### Writing to Train DF

In [50]:
text_bigrams_train_df = pd.DataFrame.from_dict(text_bigrams_train_dict)

In [54]:
#### Storing in CSV
text_bigrams_train_df.to_csv("train_df.csv",index = False)

### 2. Preparing Test Dataset

#### Converting all the documents in the Test dataset to bigrams - currently, the documents have their sentences tokenized into words

In [51]:
text_bigrams_test = []
for el in text_tokens_test:
    bigrams = [(el[i],el[i+1]) for i in range(0,len(el)-1)]
    text_bigrams_test.append(bigrams)

In [52]:
len(text_bigrams_test)

13469

In [53]:
text_bigrams_test[0:2]

[[('united', 'nations'),
  ('nations', '('),
  ('(', 'reuters'),
  ('reuters', ')'),
  (')', '-'),
  ('-', 'u.s.'),
  ('u.s.', 'secretary'),
  ('secretary', 'of'),
  ('of', 'state'),
  ('state', 'rex'),
  ('rex', 'tillerson'),
  ('tillerson', 'and'),
  ('and', 'iranian'),
  ('iranian', 'foreign'),
  ('foreign', 'minister'),
  ('minister', 'javad'),
  ('javad', 'zarif'),
  ('zarif', 'spoke'),
  ('spoke', 'directly'),
  ('directly', 'during'),
  ('during', 'talks'),
  ('talks', 'at'),
  ('at', 'the'),
  ('the', 'united'),
  ('united', 'nations'),
  ('nations', 'on'),
  ('on', 'wednesday'),
  ('wednesday', 'in'),
  ('in', 'what'),
  ('what', 'a'),
  ('a', 'european'),
  ('european', 'diplomat'),
  ('diplomat', 'described'),
  ('described', 'as'),
  ('as', 'a'),
  ('a', 'very'),
  ('very', 'difficult'),
  ('difficult', 'meeting'),
  ('meeting', '.'),
  ('.', 'tillerson'),
  ('tillerson', 'and'),
  ('and', 'zarif'),
  ('zarif', 'spoke'),
  ('spoke', 'directly'),
  ('directly', 'for'),
  ('f

#### Using the top 2000 bigrams as features and creating 1/0 flags as values in the training dataset
#### The 1/0 flags denote the presence of the bigram feature in the document

In [55]:
text_bigrams_test_dict = {}
for feature_el in scored_mi_bigrams:
    flags = []
    for doc_el in text_bigrams_test:
        if feature_el in doc_el:
            flags.append(1)
        else:
            flags.append(0)
    text_bigrams_test_dict[feature_el] = flags

In [56]:
text_bigrams_test_dict[('laissez', 'faire')]

[0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,


In [58]:
len(text_bigrams_test_dict[('laissez', 'faire')])

13469

In [59]:
text_bigrams_test_dict['is_fake'] = all_news_dict['is_fake'][int(math.ceil(len(text_tokens)*0.7)):int(len(text_tokens))]

In [60]:
text_bigrams_test_df = pd.DataFrame.from_dict(text_bigrams_test_dict)

In [62]:
text_bigrams_test_df.head()

Unnamed: 0,"(//t.co/ltdtbehhgh, pic.twitter.com/t2s8ufif5o)","(666b, 314a)","(6a7a, 4d6c)","(84b4, f787)","(86f7, a737)","(circumstances.oleg, erovinkin)","(darz, aab)","(doxycycline, hyclate)","(gwede, mantashe)","(helter, skelter)",...,"(jeremiah, wright)","(lopez, obrador)","(omar, mostefa)","(itfurther, resolved)","(sir, edmund)","(sun, zhengcai)","(emoluments, clause)","(inconvenient, sequel)","(fortified, demilitarized)",is_fake
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [65]:
#### Storing in CSV
text_bigrams_test_df.to_csv("test_df.csv",index = False)

In [66]:
from sklearn.model_selection import train_test_split, GridSearchCV, cross_val_score, ShuffleSplit
from sklearn import metrics
from sklearn.naive_bayes import GaussianNB, BernoulliNB, MultinomialNB

In [68]:
X_train = text_bigrams_train_df.drop('is_fake',axis = 1)
y_train = text_bigrams_train_df['is_fake']
X_test = text_bigrams_test_df.drop('is_fake',axis = 1)
y_test = text_bigrams_test_df['is_fake']

In [71]:
print(X_train.shape, y_train.shape, X_test.shape, y_test.shape)

(31428, 2000) (31428,) (13469, 2000) (13469,)


### 1. Training Gaussian Naive Bayes model and evaluating the model performance

In [75]:
gnb = GaussianNB()
gnb.fit(X_train, y_train)

GaussianNB(priors=None, var_smoothing=1e-09)

#### Evaluating the performance of Naive Bayes model

In [76]:
gnb_pred = gnb.predict(X_test)
print(f"Accuracy: {round(metrics.accuracy_score(y_test, gnb_pred)*100, 2)}%")

Accuracy: 58.25%


### 2. Training Logistic Regression model

In [86]:
from sklearn.linear_model import LinearRegression, LogisticRegression
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler

In [87]:
logr_pipe = make_pipeline(StandardScaler(), LogisticRegression(solver='lbfgs'))
logr_pipe.fit(X_train, y_train)



Pipeline(memory=None,
         steps=[('standardscaler',
                 StandardScaler(copy=True, with_mean=True, with_std=True)),
                ('logisticregression',
                 LogisticRegression(C=1.0, class_weight=None, dual=False,
                                    fit_intercept=True, intercept_scaling=1,
                                    l1_ratio=None, max_iter=100,
                                    multi_class='warn', n_jobs=None,
                                    penalty='l2', random_state=None,
                                    solver='lbfgs', tol=0.0001, verbose=0,
                                    warm_start=False))],
         verbose=False)

#### Evaluating the performance of Logistic Regression model

In [88]:
metrics.accuracy_score(y_test, logr_pipe.predict(X_test))

0.6662706956715421

### 3. Training Random Forest Classifier Model

In [80]:
from sklearn.ensemble import RandomForestClassifier

model = RandomForestClassifier(n_estimators=1000, max_features=5, random_state=16)
results = cross_val_score(model, X_train, y_train)
print(f"Accuracy: {round(results.mean()*100, 2)}%")                                                                            



Accuracy: 67.97%


In [82]:
model.fit(X_train, y_train)

RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
                       max_depth=None, max_features=5, max_leaf_nodes=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, n_estimators=1000,
                       n_jobs=None, oob_score=False, random_state=16, verbose=0,
                       warm_start=False)

#### Evaluating the performance of Random Forest model

In [83]:
metrics.accuracy_score(y_test, model.predict(X_test))

0.6702056574355928

### 4. Training Gradient Boosted Trees model

In [90]:
from sklearn.ensemble import GradientBoostingClassifier as gbm
model_gbm = gbm(n_estimators=1000, random_state=16)
results = cross_val_score(model_gbm, X_train, y_train, cv=3)
print(f"Accuracy for GBM: {round(results.mean()*100, 2)}%")

Accuracy for GBM: 65.73%


In [91]:
model_gbm.fit(X_train, y_train)

GradientBoostingClassifier(criterion='friedman_mse', init=None,
                           learning_rate=0.1, loss='deviance', max_depth=3,
                           max_features=None, max_leaf_nodes=None,
                           min_impurity_decrease=0.0, min_impurity_split=None,
                           min_samples_leaf=1, min_samples_split=2,
                           min_weight_fraction_leaf=0.0, n_estimators=1000,
                           n_iter_no_change=None, presort='auto',
                           random_state=16, subsample=1.0, tol=0.0001,
                           validation_fraction=0.1, verbose=0,
                           warm_start=False)

#### Evaluating the performance of Gradient Boosted Trees baseline model

In [99]:
metrics.accuracy_score(y_test, model_gbm.predict(X_test))

0.654837033187319

#### Performing Grid Search on Gradient Boosted Trees model

In [100]:
from sklearn.model_selection import GridSearchCV
import numpy as np
param_grid = {'learning_rate': np.arange(0.05, 0.1, 0.15),
              'n_estimators': range(600,900,1200),
              'max_depth': range(2,3,5)}
clf = GridSearchCV(gbm(), param_grid, cv = 5)
clf.fit(X_train, y_train)

GridSearchCV(cv=5, error_score='raise-deprecating',
             estimator=GradientBoostingClassifier(criterion='friedman_mse',
                                                  init=None, learning_rate=0.1,
                                                  loss='deviance', max_depth=3,
                                                  max_features=None,
                                                  max_leaf_nodes=None,
                                                  min_impurity_decrease=0.0,
                                                  min_impurity_split=None,
                                                  min_samples_leaf=1,
                                                  min_samples_split=2,
                                                  min_weight_fraction_leaf=0.0,
                                                  n_estimators=100,
                                                  n_iter_no_change=None,
                                                  presort=

#### Accuracy of tuned model is lower than baseline model with default parameters

In [101]:
metrics.accuracy_score(y_test, clf.best_estimator_.predict(X_test))

0.6155616601083971