In [None]:
import pandas as pd
import numpy as np
import re
import nltk
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer
from nltk.stem import WordNetLemmatizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score

In [None]:
# Fetch the NLTK resources used below: the stopword list and the WordNet data
# needed by the lemmatizer, then show the English stopwords that will be removed.
for corpus_name in ('stopwords', 'wordnet'):
    nltk.download(corpus_name)
print(stopwords.words('english'))

['a', 'about', 'above', 'after', 'again', 'against', 'ain', 'all', 'am', 'an', 'and', 'any', 'are', 'aren', "aren't", 'as', 'at', 'be', 'because', 'been', 'before', 'being', 'below', 'between', 'both', 'but', 'by', 'can', 'couldn', "couldn't", 'd', 'did', 'didn', "didn't", 'do', 'does', 'doesn', "doesn't", 'doing', 'don', "don't", 'down', 'during', 'each', 'few', 'for', 'from', 'further', 'had', 'hadn', "hadn't", 'has', 'hasn', "hasn't", 'have', 'haven', "haven't", 'having', 'he', "he'd", "he'll", 'her', 'here', 'hers', 'herself', "he's", 'him', 'himself', 'his', 'how', 'i', "i'd", 'if', "i'll", "i'm", 'in', 'into', 'is', 'isn', "isn't", 'it', "it'd", "it'll", "it's", 'its', 'itself', "i've", 'just', 'll', 'm', 'ma', 'me', 'mightn', "mightn't", 'more', 'most', 'mustn', "mustn't", 'my', 'myself', 'needn', "needn't", 'no', 'nor', 'not', 'now', 'o', 'of', 'off', 'on', 'once', 'only', 'or', 'other', 'our', 'ours', 'ourselves', 'out', 'over', 'own', 're', 's', 'same', 'shan', "shan't", 'she

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...


The dataset was too large for me to preprocess completely with pandas, so I used Excel functions to manually remove all empty and NaN rows and to fix the value constraints in each column. The preprocessed data is loaded below.


In [None]:
# Load the manually pre-processed dataset into a DataFrame.
# NOTE(review): '/content/...' looks like a Colab-specific absolute path — adjust it when running elsewhere.
train_dataset = pd.read_csv('/content/train excel.csv')

In [None]:
# Sanity check: (rows, columns) of the loaded frame
train_dataset.shape

(18208, 5)

In [None]:
# Preview the first rows to confirm the columns were read as expected
train_dataset.head()


Unnamed: 0,id,title,author,text,label
0,0,House Dem Aide: We Didn’t Even See Comey’s Let...,Darrell Lucus,House Dem Aide: We Didn’t Even See Comey’s Let...,1
1,1,"FLYNN: Hillary Clinton, Big Woman on Campus - ...",Daniel J. Flynn,Ever get the feeling your life circles the rou...,0
2,2,Why the Truth Might Get You Fired,Consortiumnews.com,"Why the Truth Might Get You Fired October 29, ...",1
3,3,15 Civilians Killed In Single US Airstrike Hav...,Jessica Purkiss,Videos 15 Civilians Killed In Single US Airstr...,1
4,4,Iranian woman jailed for fictional unpublished...,Howard Portnoy,Print \nAn Iranian woman has been sentenced to...,1


In [None]:
# Count missing values per column; the text columns must contain none,
# because they are concatenated and regex-cleaned as strings below
train_dataset.isnull().sum()

Unnamed: 0,0
id,0
title,0
author,0
text,0
label,0


In [None]:
# Merge the title, author and text columns into one 'content' string per article.
# fillna('') stops a single missing field from turning the whole concatenation into NaN
# (str + NaN -> NaN), which would later fail in the regex-based cleaning.
# When no values are missing the result is identical to plain concatenation.
train_dataset['content'] = (
    train_dataset['title'].fillna('') + ' '
    + train_dataset['author'].fillna('') + ' '
    + train_dataset['text'].fillna('')
)

In [None]:
# Inspect the merged 'content' column (pandas truncates the display)
print(train_dataset['content'])

0        House Dem Aide: We Didn’t Even See Comey’s Let...
1        FLYNN: Hillary Clinton, Big Woman on Campus - ...
2        Why the Truth Might Get You Fired Consortiumne...
3        15 Civilians Killed In Single US Airstrike Hav...
4        Iranian woman jailed for fictional unpublished...
                               ...                        
18203    Rapper T.I.: Trump a ’Poster Child For White S...
18204    N.F.L. Playoffs: Schedule, Matchups and Odds -...
18205    Macy’s Is Said to Receive Takeover Approach by...
18206    NATO, Russia To Hold Parallel Exercises In Bal...
18207    What Keeps the F-35 Alive David Swanson   Davi...
Name: content, Length: 18208, dtype: object


In [None]:
# Separate the label column from the remaining columns.
# Both names are rebound further down from the cleaned 'content' column before modelling.
y = train_dataset['label']
x = train_dataset.drop(columns=['label'])

In [None]:
# Single shared WordNet lemmatizer instance, used by lemmatizing() below
lematizer = WordNetLemmatizer()

In [None]:
def lemmatizing(content):
    """Normalize one document for vectorization.

    Every non-letter character is replaced by a space, the text is lowercased
    and split on whitespace, English stopwords are dropped, and the remaining
    words are lemmatized with the notebook-level ``lematizer`` and joined back
    together with single spaces.

    Args:
        content: Raw document text (str).

    Returns:
        str: The cleaned, space-separated tokens.
    """
    # Build the stopword lookup once and cache it on the function object.
    # Previously stopwords.words('english') was called, and the resulting list
    # scanned linearly, for every single word of every document.
    stop_words = getattr(lemmatizing, '_stop_words', None)
    if stop_words is None:
        stop_words = frozenset(stopwords.words('english'))
        lemmatizing._stop_words = stop_words

    # Strip non-letters, lowercase, tokenize on whitespace
    words = re.sub('[^a-zA-Z]', ' ', content).lower().split()

    # Drop stopwords, lemmatize the rest, and join back into a single string
    return ' '.join(lematizer.lemmatize(word) for word in words if word not in stop_words)

In [None]:
# Clean every document with lemmatizing(); this overwrites 'content' in place,
# so the raw merged text is no longer available in the frame afterwards
train_dataset['content'] = train_dataset['content'].apply(lemmatizing)

In [None]:
# Inspect 'content' after stopword removal and lemmatization
print(train_dataset['content'])

0        house dem aide even see comey letter jason cha...
1        flynn hillary clinton big woman campus breitba...
2        truth might get fired consortiumnews com truth...
3        civilian killed single u airstrike identified ...
4        iranian woman jailed fictional unpublished sto...
                               ...                        
18203    rapper trump poster child white supremacy jero...
18204    n f l playoff schedule matchup odds new york t...
18205    macy said receive takeover approach hudson bay...
18206    nato russia hold parallel exercise balkan alex...
18207    keep f alive david swanson david swanson autho...
Name: content, Length: 18208, dtype: object


In [None]:
# Pull the cleaned text (features) and the labels out of the frame via .values
x, y = (train_dataset[column].values for column in ('content', 'label'))

In [None]:
# Inspect the cleaned documents that will be fed to the vectorizer
print (x)

['house dem aide even see comey letter jason chaffetz tweeted darrell lucus house dem aide even see comey letter jason chaffetz tweeted darrell lucus october subscribe jason chaffetz stump american fork utah image courtesy michael jolley available creative common license apology keith olbermann doubt worst person world week fbi director james comey according house democratic aide look like also know second worst person well turn comey sent infamous letter announcing fbi looking email may related hillary clinton email server ranking democrat relevant committee hear comey found via tweet one republican committee chairman know comey notified republican chairman democratic ranking member house intelligence judiciary oversight committee agency reviewing email recently discovered order see contained classified information long letter went oversight committee chairman jason chaffetz set political world ablaze tweet fbi dir informed fbi learned existence email appear pertinent investigation ca

In [None]:
# One cleaned document per row of the dataset
x.shape

(18208,)

In [None]:
# Inspect the label array
print(y)

[1 0 1 ... 0 1 1]


In [None]:
# Must match the number of documents in x
y.shape

(18208,)

In [None]:
# Convert the cleaned text into TF-IDF features.
# fit_transform learns the vocabulary and builds the matrix in one pass instead of
# tokenizing the whole corpus twice (fit, then transform). Reading from the DataFrame
# column (the same data x was extracted from) keeps this cell safe to re-run after x
# has been rebound to the sparse matrix.
# NOTE(review): the vectorizer is fitted on all rows before the train/test split, so the
# IDF weights see the held-out documents — consider fitting on the training split only.
vectorizer = TfidfVectorizer()
x = vectorizer.fit_transform(train_dataset['content'].values)

In [None]:
# Inspect the TF-IDF sparse matrix (coordinates and weights)
print (x)

<Compressed Sparse Row sparse matrix of dtype 'float64'
	with 4895990 stored elements and shape (18208, 127043)>
  Coords	Values
  (0, 337)	0.04350669584952562
  (0, 598)	0.03501282059439111
  (0, 772)	0.03835184693663968
  (0, 1083)	0.02424733787994068
  (0, 1149)	0.017126013123730084
  (0, 1620)	0.02421007957830971
  (0, 2003)	0.01768472599497188
  (0, 2306)	0.13619893351938941
  (0, 3476)	0.04579559364978506
  (0, 3495)	0.018344094224462596
  (0, 3869)	0.011502472767929815
  (0, 4283)	0.04177478284928586
  (0, 4735)	0.029326416092614983
  (0, 4860)	0.02016922700502441
  (0, 4919)	0.026272891197791477
  (0, 5213)	0.017531069635464802
  (0, 5394)	0.031300501481100154
  (0, 5449)	0.04563139868477495
  (0, 5459)	0.021289625477937342
  (0, 7359)	0.019989680093978245
  (0, 7985)	0.020682278074052814
  (0, 9710)	0.024315007149550073
  (0, 10248)	0.015466430667164178
  (0, 12143)	0.04776383810942951
  (0, 13721)	0.028880478559500163
  :	:
  (18207, 122794)	0.01831621949650066
  (18207, 1228

In [None]:
# Split the dataset into training and test data: 20% held out for testing,
# stratified on the label, with a fixed random_state so the split is reproducible
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size = 0.2, stratify = y, random_state = 2)

In [None]:
#Training the logistic regression model

In [None]:
# Logistic regression classifier created with the library's default settings
model = LogisticRegression()

In [None]:
# Fit on the training split only; the test split is kept for evaluation
model.fit(x_train, y_train)

In [None]:
# Training accuracy.
# accuracy_score expects (y_true, y_pred). The score is the same either way, but
# keeping the documented order avoids silent mistakes if the metric is later swapped
# for an asymmetric one such as precision or recall.
train_predict = model.predict(x_train)
train_accuracy = accuracy_score(y_train, train_predict)
print(train_accuracy)

0.9785802553892626


In [None]:
# Test accuracy on the held-out split.
# Arguments follow accuracy_score's documented (y_true, y_pred) order; the value is
# unchanged, but the call stays correct if an asymmetric metric is substituted.
test_predict = model.predict(x_test)
test_accuracy = accuracy_score(y_test, test_predict)
print(test_accuracy)

0.9634816035145525
