In [22]:
import numpy as np
import pandas as pd
import re

# this is going to be used for stopwords removal 
from nltk.corpus import stopwords

# stemming of words
from nltk.stem.porter import PorterStemmer

# to vectorize the words in the sentences 
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split

# to perform the classification 
from sklearn.linear_model import LogisticRegression 
from sklearn.metrics import accuracy_score

In [10]:
 data = pd.read_csv("train.csv")

In [30]:
data.head(5)

Unnamed: 0,id,title,author,text,label
0,0,House Dem Aide: We Didn’t Even See Comey’s Let...,Darrell Lucus,House Dem Aide: We Didn’t Even See Comey’s Let...,1
1,1,"FLYNN: Hillary Clinton, Big Woman on Campus - ...",Daniel J. Flynn,Ever get the feeling your life circles the rou...,0
2,2,Why the Truth Might Get You Fired,Consortiumnews.com,"Why the Truth Might Get You Fired October 29, ...",1
3,3,15 Civilians Killed In Single US Airstrike Hav...,Jessica Purkiss,Videos 15 Civilians Killed In Single US Airstr...,1
4,4,Iranian woman jailed for fictional unpublished...,Howard Portnoy,Print \nAn Iranian woman has been sentenced to...,1


In [33]:
# below is the data from the 2nd column where the text and its corresponding label is given 
print(data['text'][1])
print('##########')
print(data['label'][1])

Ever get the feeling your life circles the roundabout rather than heads in a straight line toward the intended destination? [Hillary Clinton remains the big woman on campus in leafy, liberal Wellesley, Massachusetts. Everywhere else votes her most likely to don her inauguration dress for the remainder of her days the way Miss Havisham forever wore that wedding dress.  Speaking of Great Expectations, Hillary Rodham overflowed with them 48 years ago when she first addressed a Wellesley graduating class. The president of the college informed those gathered in 1969 that the students needed “no debate so far as I could ascertain as to who their spokesman was to be” (kind of the like the Democratic primaries in 2016 minus the   terms unknown then even at a Seven Sisters school). “I am very glad that Miss Adams made it clear that what I am speaking for today is all of us —  the 400 of us,” Miss Rodham told her classmates. After appointing herself Edger Bergen to the Charlie McCarthys and Mort

In [24]:
import nltk
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/abhishekkumar/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [21]:
#print(stopwords.words('english'))

['i', 'me', 'my', 'myself', 'we', 'our', 'ours', 'ourselves', 'you', "you're", "you've", "you'll", "you'd", 'your', 'yours', 'yourself', 'yourselves', 'he', 'him', 'his', 'himself', 'she', "she's", 'her', 'hers', 'herself', 'it', "it's", 'its', 'itself', 'they', 'them', 'their', 'theirs', 'themselves', 'what', 'which', 'who', 'whom', 'this', 'that', "that'll", 'these', 'those', 'am', 'is', 'are', 'was', 'were', 'be', 'been', 'being', 'have', 'has', 'had', 'having', 'do', 'does', 'did', 'doing', 'a', 'an', 'the', 'and', 'but', 'if', 'or', 'because', 'as', 'until', 'while', 'of', 'at', 'by', 'for', 'with', 'about', 'against', 'between', 'into', 'through', 'during', 'before', 'after', 'above', 'below', 'to', 'from', 'up', 'down', 'in', 'out', 'on', 'off', 'over', 'under', 'again', 'further', 'then', 'once', 'here', 'there', 'when', 'where', 'why', 'how', 'all', 'any', 'both', 'each', 'few', 'more', 'most', 'other', 'some', 'such', 'no', 'nor', 'not', 'only', 'own', 'same', 'so', 'than', '

In [40]:
# this the null values that we have in the data attributes.
data.isnull().sum()

id           0
title      558
author    1957
text        39
label        0
dtype: int64

In [41]:
# we will be replacing the null values with empty strings
data = data.fillna('')

In [43]:
data.isnull().sum()

id        0
title     0
author    0
text      0
label     0
dtype: int64

In [46]:
# as the text column contains a large text therefore it may turn out to be expensive.
# hence we will be using title and author name for the classification of the news.

data['content'] = data['title']+' '+data['author']

In [56]:
 X = data.drop(columns = 'label' , axis=1)

In [57]:

y = data.drop(columns=['id','title','author','text','content'],axis = 1)

In [58]:
X.head()

Unnamed: 0,id,title,author,text,content
0,0,House Dem Aide: We Didn’t Even See Comey’s Let...,Darrell Lucus,House Dem Aide: We Didn’t Even See Comey’s Let...,House Dem Aide: We Didn’t Even See Comey’s Let...
1,1,"FLYNN: Hillary Clinton, Big Woman on Campus - ...",Daniel J. Flynn,Ever get the feeling your life circles the rou...,"FLYNN: Hillary Clinton, Big Woman on Campus - ..."
2,2,Why the Truth Might Get You Fired,Consortiumnews.com,"Why the Truth Might Get You Fired October 29, ...",Why the Truth Might Get You Fired Consortiumne...
3,3,15 Civilians Killed In Single US Airstrike Hav...,Jessica Purkiss,Videos 15 Civilians Killed In Single US Airstr...,15 Civilians Killed In Single US Airstrike Hav...
4,4,Iranian woman jailed for fictional unpublished...,Howard Portnoy,Print \nAn Iranian woman has been sentenced to...,Iranian woman jailed for fictional unpublished...


In [59]:
y.head()

Unnamed: 0,label
0,1
1,0
2,1
3,1
4,1


In [69]:
# now we will create a function to perform stemming with stop words removal and removal of special characters.
stemmer = PorterStemmer()
def stemming(content):
    
    # this removes everything that is not a word (a-z and A-Z) and replace it with ' '
    stemmed_content = re.sub('[^a-zA-Z]',' ',content)
    
    stemmed_content = stemmed_content.lower()
    stemmed_content = stemmed_content.split()
    
    # applying the stemmer to stem the words 
    stemmed_content = [stemmer.stem(word) for word in stemmed_content if not word in stopwords.words('english')]
    
    #Appending the result to the data 
    stemmed_content = ' '.join(stemmed_content)
    return stemmed_content
    
    

In [70]:
 data.shape
    

(20800, 6)

In [71]:
data.head(5)

Unnamed: 0,id,title,author,text,label,content
0,0,House Dem Aide: We Didn’t Even See Comey’s Let...,Darrell Lucus,House Dem Aide: We Didn’t Even See Comey’s Let...,1,House Dem Aide: We Didn’t Even See Comey’s Let...
1,1,"FLYNN: Hillary Clinton, Big Woman on Campus - ...",Daniel J. Flynn,Ever get the feeling your life circles the rou...,0,"FLYNN: Hillary Clinton, Big Woman on Campus - ..."
2,2,Why the Truth Might Get You Fired,Consortiumnews.com,"Why the Truth Might Get You Fired October 29, ...",1,Why the Truth Might Get You Fired Consortiumne...
3,3,15 Civilians Killed In Single US Airstrike Hav...,Jessica Purkiss,Videos 15 Civilians Killed In Single US Airstr...,1,15 Civilians Killed In Single US Airstrike Hav...
4,4,Iranian woman jailed for fictional unpublished...,Howard Portnoy,Print \nAn Iranian woman has been sentenced to...,1,Iranian woman jailed for fictional unpublished...


In [74]:
# here we are applying the stemming function to each attribute of the table
data['content'] = data['content'].apply(stemming)

In [75]:
data.head(5)

Unnamed: 0,id,title,author,text,label,content
0,0,House Dem Aide: We Didn’t Even See Comey’s Let...,Darrell Lucus,House Dem Aide: We Didn’t Even See Comey’s Let...,1,hou dem aid even see comey letter jason chaffe...
1,1,"FLYNN: Hillary Clinton, Big Woman on Campus - ...",Daniel J. Flynn,Ever get the feeling your life circles the rou...,0,flynn hillari clinton big woman campu breitbar...
2,2,Why the Truth Might Get You Fired,Consortiumnews.com,"Why the Truth Might Get You Fired October 29, ...",1,truth might get fire consortiumnew com
3,3,15 Civilians Killed In Single US Airstrike Hav...,Jessica Purkiss,Videos 15 Civilians Killed In Single US Airstr...,1,civilian kill singl us airstrik identifi jessi...
4,4,Iranian woman jailed for fictional unpublished...,Howard Portnoy,Print \nAn Iranian woman has been sentenced to...,1,iranian woman jail fiction unpublish stori wom...


In [76]:
X= data['content'].values
Y= data['label'].values

In [80]:
print(X.shape)


(20800,)


In [81]:
print(Y.shape)

(20800,)


In [87]:
for i in range(len(X)):
    print(X[i],Y[i])


hou dem aid even see comey letter jason chaffetz tweet darrel lucu 1
flynn hillari clinton big woman campu breitbart daniel j flynn 0
truth might get fire consortiumnew com 1
civilian kill singl us airstrik identifi jessica purkiss 1
iranian woman jail fiction unpublish stori woman stone death adulteri howard portnoy 1
jacki mason hollywood would love trump bomb north korea lack tran bathroom exclu video breitbart daniel nussbaum 0
life life luxuri elton john favorit shark pictur stare long transcontin flight 1
beno hamon win french socialist parti presidenti nomin new york time alissa j rubin 0
excerpt draft script donald trump q ampa black church pastor new york time 0
back channel plan ukrain russia courtesi trump associ new york time megan twohey scott shane 0
obama organ action partner soro link indivi disrupt trump agenda aaron klein 0
bbc comedi sketch real housew isi cau outrag chri tomlinson 0
russian research discov secret nazi militari base treasur hunter arctic photo amando

In [113]:
# now we will convert the textual data to numerical data using the tfidf vectorizer
vectorizer = TfidfVectorizer()

vectorizer.fit(X)

X = vectorizer.transform(X)

AttributeError: 'csr_matrix' object has no attribute 'lower'

In [129]:
print(X.shape)
print(Y.shape)

(20800, 16984)
(20800,)


In [130]:
# now we train our model after splitting the data 

X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size = 0.2, stratify=Y, random_state=2)



In [131]:
model = LogisticRegression()
print(y_train.shape)
print(X_train.shape)

(16640,)
(16640, 16984)


In [132]:
# fitting the model to out data.
model.fit(X_train,y_train)

In [137]:
# predicting the outcomes
model_prediction = model.predict(X_test)

In [138]:
for i in range(len(model_prediction)):
    if(model_prediction[i] == Y_test[i]):
        print(1)

1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1


In [139]:
accuracy_value = accuracy_score(model_prediction,y_test)

In [141]:
print(accuracy_value*100)

97.90865384615385
