In [1]:
import pandas as pd
import numpy as np

In [2]:
df = pd.read_csv("data.csv")

In [3]:
# Looking at the shape of the dataframe

df.shape

(4009, 4)

In [4]:
df.head(3)

Unnamed: 0,URLs,Headline,Body,Label
0,http://www.bbc.com/news/world-us-canada-414191...,Four ways Bob Corker skewered Donald Trump,Image copyright Getty Images\nOn Sunday mornin...,1
1,https://www.reuters.com/article/us-filmfestiva...,Linklater's war veteran comedy speaks to moder...,"LONDON (Reuters) - “Last Flag Flying”, a comed...",1
2,https://www.nytimes.com/2017/10/09/us/politics...,Trump’s Fight With Corker Jeopardizes His Legi...,The feud broke into public view last week when...,1


In [5]:
# Checking for balance in dataset lables

print(df['Label'].value_counts())

"""We can observe that the dataset is roughly balanced."""

0    2137
1    1872
Name: Label, dtype: int64


'We can observe that the dataset is roughly balanced.'

In [6]:
# Checking for missing values in the dataset

print(df.isna().sum())

"""We can observe that the feature "body" has 21 missing values. We will drop these values to continue to the next step"""

URLs         0
Headline     0
Body        21
Label        0
dtype: int64


'We can observe that the feature "body" has 21 missing values. We will drop these values to continue to the next step'

In [7]:
# Dropping the rows with missing values

df.dropna(inplace=True)

In [8]:
# Dividing the dataset into traina and test, so as to avoid data leakage

from sklearn.model_selection import train_test_split

X = df.drop('Label',axis=1)
y = df['Label']

In [9]:
train_X,test_X,train_y,test_y = train_test_split(X,y,test_size=0.25,random_state=123)

In [10]:
train_y = pd.DataFrame(train_y,columns=["Label"])

In [11]:
train = pd.concat([train_X,train_y],axis=1)

In [12]:
train

Unnamed: 0,URLs,Headline,Body,Label
239,http://beforeitsnews.com/u-s-politics/2017/10/...,"After Alleged 'Moron' Remark, Tillerson Stroke...",Warning Something Big Is About to Happen in Am...,0
421,http://beforeitsnews.com/sports/2017/10/101-tr...,"10/1 TRS-PNC Era: Wildcard Win, TRS Closes, Fr...","No Getting Around it, The War Is Coming! Trump...",0
711,http://www.cnn.com/2017/10/06/entertainment/ho...,Hollywood mostly silent on Weinstein allegations,(CNN) More than a day after The New York Times...,1
2539,https://www.nytimes.com/2017/10/10/world/middl...,Egyptian Activist Receives a Top Human Rights ...,"Mr. Zaree, whom the government has barred from...",1
847,https://www.nytimes.com/2017/10/10/arts/televi...,Harvey Weinstein Draws Sharp Attacks From Late...,"Photo\nWelcome to Best of Late Night, a rundow...",1
...,...,...,...,...
1131,http://beforeitsnews.com/sports/2017/09/kids-f...,Kids & Fitness....Get the Kids Off the Couch &...,Kids & Fitness….Get the Kids Off the Couch & H...,0
1357,http://beforeitsnews.com/sports/2017/09/10-imp...,10 Important Stories From 09/27/17 Box Scores:...,10 Important Stories From 09/27/17 Box Scores:...,0
3474,https://www.nytimes.com/2017/10/11/technology/...,The Frightful Five Want to Rule Entertainment....,Why start with the culture industries? The Fiv...,1
3457,https://www.nytimes.com/2017/10/10/world/middl...,Palestinian Rivals Talk Unity but Show Divisions,Hamas says its maintenance of control over its...,1


In [13]:
# Printing the head of training data

train.head(3)

Unnamed: 0,URLs,Headline,Body,Label
239,http://beforeitsnews.com/u-s-politics/2017/10/...,"After Alleged 'Moron' Remark, Tillerson Stroke...",Warning Something Big Is About to Happen in Am...,0
421,http://beforeitsnews.com/sports/2017/10/101-tr...,"10/1 TRS-PNC Era: Wildcard Win, TRS Closes, Fr...","No Getting Around it, The War Is Coming! Trump...",0
711,http://www.cnn.com/2017/10/06/entertainment/ho...,Hollywood mostly silent on Weinstein allegations,(CNN) More than a day after The New York Times...,1


In [14]:
# Re-arranging the index of training data

train.index = range(len(train))

In [15]:
train.head(3)

Unnamed: 0,URLs,Headline,Body,Label
0,http://beforeitsnews.com/u-s-politics/2017/10/...,"After Alleged 'Moron' Remark, Tillerson Stroke...",Warning Something Big Is About to Happen in Am...,0
1,http://beforeitsnews.com/sports/2017/10/101-tr...,"10/1 TRS-PNC Era: Wildcard Win, TRS Closes, Fr...","No Getting Around it, The War Is Coming! Trump...",0
2,http://www.cnn.com/2017/10/06/entertainment/ho...,Hollywood mostly silent on Weinstein allegations,(CNN) More than a day after The New York Times...,1


In [16]:
# Checking for the balance in labels 

print(train['Label'].value_counts())

"""We can observe that a rough balance is maintained in the label"""

0    1584
1    1407
Name: Label, dtype: int64


'We can observe that a rough balance is maintained in the label'

In [17]:
# Importing library required for text processing

import nltk 

import string

from nltk.corpus import stopwords

from nltk.stem import WordNetLemmatizer

In [18]:
# Defining a function for text pre-processing of the data

def text_preprocessing(df,list_name):
    
    # Merging all the feature into one sentence and appending it into the list
    for sent in range(0,len(df)):
        list_name.append(" ".join(str(sentence) for sentence in df.iloc[sent,:3]))
    
    # Converting all the letters in the sentence to lower caps
    for num in range(len(list_name)):
        list_name[num] = list_name[num].lower()
    
    #Splitting the sentences into words by using word tokenize and removing the stopwords simaltaneously applying
    #lemmatizer to words
    for y in range(0,len(df)):
        collection = nltk.word_tokenize(list_name[y])
        collections = [lemmatizer.lemmatize(sentence) for sentence in collection if sentence not in set(stopwords.words("english"))]
        list_name[y] = " ".join(collections)
    
    #Elminating punctuations from sentences as they do not add value. 
    for ind in range(len(df)):
        for x in list_name[ind]:
            if x in string.punctuation:
                list_name[ind] = list_name[ind].replace(x,"")

In [19]:
training_data = []

In [20]:
# Creating an object for WordNetLemmatizer

lemmatizer = WordNetLemmatizer()

In [21]:
# Passing the training data to function text_preprocessing


text_preprocessing(train,training_data)

In [22]:
training_data[0]



In [23]:
train.head(1)

Unnamed: 0,URLs,Headline,Body,Label
0,http://beforeitsnews.com/u-s-politics/2017/10/...,"After Alleged 'Moron' Remark, Tillerson Stroke...",Warning Something Big Is About to Happen in Am...,0


In [24]:
from sklearn.feature_extraction.text import TfidfVectorizer

In [25]:
vectorizer = TfidfVectorizer(ngram_range=(1,3))

In [26]:
vectorized_train = vectorizer.fit_transform(training_data)

In [27]:
# Displaying Tfidf vectors in the form of a dataframe

pd.DataFrame(vectorized_train.toarray(),columns=vectorizer.get_feature_names())

Unnamed: 0,00,00 1h,00 1h started,00 2h,00 2h put,00 draw,00 draw peru,00 draw yemen,00 dublin,00 dublin march,...,əˈnänəməs adjective person,ʺevery,ʺevery one,ʺevery one friend,ʺhe,ʺhe missed,ʺhe missed knew,殆wwreverbnationcomcarlthornton4,殆wwreverbnationcomcarlthornton4 aboutme,殆wwreverbnationcomcarlthornton4 aboutme http
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2986,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2987,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2988,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2989,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [28]:
from sklearn.ensemble import RandomForestClassifier

In [29]:
model = RandomForestClassifier()

In [30]:
model.fit(vectorized_train,train_y)

  model.fit(vectorized_train,train_y)


RandomForestClassifier()

In [31]:
test_y_ =  pd.DataFrame(test_y,columns=["Label"])

In [32]:
test = pd.concat([test_X,test_y_],axis=1)

In [33]:
test.head()

Unnamed: 0,URLs,Headline,Body,Label
2881,http://beforeitsnews.com/sports/2017/10/can-ra...,Can Ravens End Losing Skid?,Can Ravens End Losing Skid?\n% of readers thin...,0
1683,https://www.reuters.com/article/us-japan-kobes...,Kobe Steel unit found to be falsifying data: N...,FILE PHOTO: A man walks past the signboard of ...,1
903,http://beforeitsnews.com/sports/2017/10/2017-f...,2017 Fantasy Football Quarterback Rankings - W...,2017 Fantasy Football Quarterback Rankings – W...,0
1740,http://beforeitsnews.com/sports/2017/10/ravens...,Ravens @ Raiders - Friday/Game Status,An Embattled Pharmaceutical Company That Sells...,0
72,http://beforeitsnews.com/entertainment/2017/09...,Some Tips for Selecting The Best Flower Bouque...,Some Tips for Selecting The Best Flower Bouque...,0


In [34]:
# Applying preprocessing function for test data

testing_data= []

text_preprocessing(test,testing_data)

In [35]:
# Saving the predictions for test data into a variable

pred = model.predict(vectorizer.transform(testing_data))

In [36]:
#Importing accuracy score evalutaion metric from sklearn

from sklearn.metrics import accuracy_score

In [37]:
score = accuracy_score(test_y,pred)

In [38]:
# Final prediction score of the model 

score

0.9578736208625878

In [39]:
# Importing pickle and pushing the model into a pickle file

import pickle

In [40]:
file = open("model.pkl","wb")
pickle.dump(model,file)

In [41]:
vector = open("vectorizer.pkl","wb")
pickle.dump(vectorizer,vector)

In [42]:
pip freeze > requirements.txt

Note: you may need to restart the kernel to use updated packages.
