In [1]:
import pandas as pd
import numpy as np
import re
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score,classification_report,confusion_matrix

In [2]:
import nltk
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\Adithya\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [3]:
df = pd.read_csv('data/train.csv')

In [4]:
df.head()

Unnamed: 0,id,title,author,text,label
0,0,House Dem Aide: We Didn’t Even See Comey’s Let...,Darrell Lucus,House Dem Aide: We Didn’t Even See Comey’s Let...,1
1,1,"FLYNN: Hillary Clinton, Big Woman on Campus - ...",Daniel J. Flynn,Ever get the feeling your life circles the rou...,0
2,2,Why the Truth Might Get You Fired,Consortiumnews.com,"Why the Truth Might Get You Fired October 29, ...",1
3,3,15 Civilians Killed In Single US Airstrike Hav...,Jessica Purkiss,Videos 15 Civilians Killed In Single US Airstr...,1
4,4,Iranian woman jailed for fictional unpublished...,Howard Portnoy,Print \nAn Iranian woman has been sentenced to...,1


In [5]:
#Checking for Null values
df.isnull().sum()

id           0
title      558
author    1957
text        39
label        0
dtype: int64

In [6]:
#We will replace the null values with blanks

df.fillna('',inplace = True)

In [7]:
df.isnull().sum()

id        0
title     0
author    0
text      0
label     0
dtype: int64

In [8]:
# We will do our analysis on author text and label so i was dropping the id and title
df.drop(['id','title'],axis = 1,inplace = True)

In [9]:
# We will combine author and text and make them a single column

df['content'] = df['author']+ ' '+ df['text']

In [10]:
df.head()

Unnamed: 0,author,text,label,content
0,Darrell Lucus,House Dem Aide: We Didn’t Even See Comey’s Let...,1,Darrell Lucus House Dem Aide: We Didn’t Even S...
1,Daniel J. Flynn,Ever get the feeling your life circles the rou...,0,Daniel J. Flynn Ever get the feeling your life...
2,Consortiumnews.com,"Why the Truth Might Get You Fired October 29, ...",1,Consortiumnews.com Why the Truth Might Get You...
3,Jessica Purkiss,Videos 15 Civilians Killed In Single US Airstr...,1,Jessica Purkiss Videos 15 Civilians Killed In ...
4,Howard Portnoy,Print \nAn Iranian woman has been sentenced to...,1,Howard Portnoy Print \nAn Iranian woman has be...


In [11]:
# Dropping the author and text column
df.drop(['author','text'],axis = 1,inplace = True)

In [12]:
df.head()

Unnamed: 0,label,content
0,1,Darrell Lucus House Dem Aide: We Didn’t Even S...
1,0,Daniel J. Flynn Ever get the feeling your life...
2,1,Consortiumnews.com Why the Truth Might Get You...
3,1,Jessica Purkiss Videos 15 Civilians Killed In ...
4,1,Howard Portnoy Print \nAn Iranian woman has be...


In [13]:
df.label.value_counts()

1    10413
0    10387
Name: label, dtype: int64

In [14]:
# separating the data & label
X = df.content
Y = df.label

In [15]:
print(X)
print(Y)

0        Darrell Lucus House Dem Aide: We Didn’t Even S...
1        Daniel J. Flynn Ever get the feeling your life...
2        Consortiumnews.com Why the Truth Might Get You...
3        Jessica Purkiss Videos 15 Civilians Killed In ...
4        Howard Portnoy Print \nAn Iranian woman has be...
                               ...                        
20795    Jerome Hudson Rapper T. I. unloaded on black c...
20796    Benjamin Hoffman When the Green Bay Packers lo...
20797    Michael J. de la Merced and Rachel Abrams The ...
20798    Alex Ansary NATO, Russia To Hold Parallel Exer...
20799    David Swanson   David Swanson is an author, ac...
Name: content, Length: 20800, dtype: object
0        1
1        0
2        1
3        1
4        1
        ..
20795    0
20796    0
20797    0
20798    1
20799    1
Name: label, Length: 20800, dtype: int64


Stemming:

Stemming is the process of reducing a word to its Root word

example:
actor, actress, acting --> act

In [16]:
port_stem = PorterStemmer()

In [17]:
def stemming(content):
    stemmed_content = re.sub('[^a-zA-Z]',' ',content)
    stemmed_content = stemmed_content.lower()
    stemmed_content = stemmed_content.split()
    stemmed_content = [port_stem.stem(word) for word in stemmed_content if not word in stopwords.words('english')]
    stemmed_content = ' '.join(stemmed_content)
    return stemmed_content

In [None]:
df['content'] = df['content'].apply(stemming)

In [34]:
print(df['content'])

0        darrel lucu hous dem aid even see comey letter...
1        daniel j flynn ever get feel life circl rounda...
2        consortiumnew com truth might get fire octob t...
3        jessica purkiss video civilian kill singl us a...
4        howard portnoy print iranian woman sentenc six...
                               ...                        
20795    jerom hudson rapper unload black celebr met do...
20796    benjamin hoffman green bay packer lost washing...
20797    michael j de la merc rachel abram maci today g...
20798    alex ansari nato russia hold parallel exercis ...
20799    david swanson david swanson author activist jo...
Name: content, Length: 20800, dtype: object


In [114]:
X = df.content.values
Y = df.label.values

In [115]:
print(X)

['darrel lucu hous dem aid even see comey letter jason chaffetz tweet darrel lucu octob subscrib jason chaffetz stump american fork utah imag courtesi michael jolley avail creativ common licens apolog keith olbermann doubt worst person world week fbi director jame comey accord hous democrat aid look like also know second worst person well turn comey sent infam letter announc fbi look email may relat hillari clinton email server rank democrat relev committe hear comey found via tweet one republican committe chairmen know comey notifi republican chairmen democrat rank member hous intellig judiciari oversight committe agenc review email recent discov order see contain classifi inform long letter went oversight committe chairman jason chaffetz set polit world ablaz tweet fbi dir inform fbi learn exist email appear pertin investig case reopen jason chaffetz jasoninthehous octob cours know case comey actual say review email light unrel case know anthoni weiner sext teenag appar littl thing f

In [116]:
Y.shape

(20800,)

In [117]:
# converting the textual data to numerical data
vectorizer= TfidfVectorizer(max_features = 5000)
vectorizer.fit(X)

X = vectorizer.transform(X)



Splitting the dataset to training & test data

In [118]:
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size = 0.25, stratify=Y, random_state=2)

# Training model using Naive bayes classifier

In [119]:


from sklearn.naive_bayes import MultinomialNB
Fake_detect_model = MultinomialNB().fit(X_train, Y_train)

In [120]:
y_pred=Fake_detect_model.predict(X_test)

In [121]:
print(classification_report(y_pred,Y_test))

              precision    recall  f1-score   support

           0       0.93      0.88      0.90      2741
           1       0.87      0.93      0.90      2459

    accuracy                           0.90      5200
   macro avg       0.90      0.90      0.90      5200
weighted avg       0.90      0.90      0.90      5200



In [122]:
print(confusion_matrix(y_pred,Y_test))

[[2414  327]
 [ 183 2276]]


In [123]:
print(accuracy_score(y_pred,Y_test))

0.9019230769230769


# Training model using Logistic Regression

In [124]:
from sklearn.linear_model import LogisticRegression

lr_model = LogisticRegression()
lr_model.fit(X_train,Y_train)

y1_pred = lr_model.predict(X_test)

In [125]:
print(classification_report(y1_pred,Y_test))

              precision    recall  f1-score   support

           0       0.94      0.95      0.94      2564
           1       0.95      0.94      0.95      2636

    accuracy                           0.94      5200
   macro avg       0.94      0.94      0.94      5200
weighted avg       0.94      0.94      0.94      5200



In [126]:
print(confusion_matrix(y1_pred,Y_test))

[[2437  127]
 [ 160 2476]]


In [127]:
print(accuracy_score(y1_pred,Y_test))

0.9448076923076923


# Training model using Random Forest

In [132]:
from sklearn.ensemble import RandomForestClassifier

rf = RandomForestClassifier(n_estimators=100)

rf.fit(X_train,Y_train)
y_pred2 = rf.predict(X_test)


In [133]:
print(classification_report(y_pred2,Y_test))

              precision    recall  f1-score   support

           0       0.96      0.94      0.95      2676
           1       0.93      0.96      0.95      2524

    accuracy                           0.95      5200
   macro avg       0.95      0.95      0.95      5200
weighted avg       0.95      0.95      0.95      5200



In [134]:
print(confusion_matrix(y1_pred,Y_test))

[[2437  127]
 [ 160 2476]]


In [135]:
print(accuracy_score(y1_pred,Y_test))

0.9448076923076923


# Passive Aggressive Classifier Algorithm

In [143]:
from sklearn.linear_model import PassiveAggressiveClassifier
linear_clf = PassiveAggressiveClassifier(max_iter=100)

In [144]:
linear_clf.fit(X_train, Y_train)
pred = linear_clf.predict(X_test)



In [145]:
print(classification_report(pred,Y_test))

              precision    recall  f1-score   support

           0       0.95      0.95      0.95      2601
           1       0.95      0.95      0.95      2599

    accuracy                           0.95      5200
   macro avg       0.95      0.95      0.95      5200
weighted avg       0.95      0.95      0.95      5200



In [146]:
print(confusion_matrix(pred,Y_test))

[[2469  132]
 [ 128 2471]]


In [147]:
print(accuracy_score(pred,Y_test))

0.95


In [150]:
import pickle
pickle.dump(linear_clf,open('/content/drive/MyDrive/Colab Notebooks/Fake News Classifier/fake_classifier.pkl','wb'))

In [7]:
import pickle
loaded_model = pickle.load(open('fake_classifier.pkl', 'rb'))



In [9]:
sentence = 'If at first you donâ€™t succeed, try a different sport. Tim Tebow, who was a Heisman   quarterback at the University of Florida but was unable to hold an N. F. L. job, is pursuing a career in Major League Baseball. He will hold a workout for M. L. B. teams this month, his agents told ESPN and other news outlets. â€œThis may sound like a publicity stunt, but nothing could be further from the truth,â€ said Brodie Van Wagenen,   of CAA Baseball, part of the sports agency CAA Sports, in the statement. â€œI have seen Timâ€™s workouts, and people inside and outside the industry  â€”   scouts, executives, players and fans  â€”   will be impressed by his talent. â€ Itâ€™s been over a decade since Tebow, 28, has played baseball full time, which means a comeback would be no easy task. But the former major league catcher Chad Moeller, who said in the statement that he had been training Tebow in Arizona, said he was â€œbeyond impressed with Timâ€™s athleticism and swing. â€ â€œI see bat speed and power and real baseball talent,â€ Moeller said. â€œI truly believe Tim has the skill set and potential to achieve his goal of playing in the major leagues and based on what I have seen over the past two months, it could happen relatively quickly. â€ Or, take it from Gary Sheffield, the former   outfielder. News of Tebowâ€™s attempted comeback in baseball was greeted with skepticism on Twitter. As a junior at Nease High in Ponte Vedra, Fla. Tebow drew the attention of major league scouts, batting . 494 with four home runs as a left fielder. But he ditched the bat and glove in favor of pigskin, leading Florida to two national championships, in 2007 and 2009. Two former scouts for the Los Angeles Angels told WEEI, a Boston radio station, that Tebow had been under consideration as a high school junior. â€œâ€™x80â€™x9cWe wanted to draft him, â€™x80â€™x9cbut he never sent back his information card,â€ said one of the scouts, Tom Kotchman, referring to a questionnaire the team had sent him. â€œHe had a strong arm and had a lot of power,â€ said the other scout, Stephen Hargett. â€œIf he would have been there his senior year he definitely would have had a good chance to be drafted. â€ â€œIt was just easy for him,â€ Hargett added. â€œYou thought, If this guy dedicated everything to baseball like he did to football how good could he be?â€ Tebowâ€™s high school baseball coach, Greg Mullins, told The Sporting News in 2013 that he believed Tebow could have made the major leagues. â€œHe was the leader of the team with his passion, his fire and his energy,â€ Mullins said. â€œHe loved to play baseball, too. He just had a bigger fire for football. â€ Tebow wouldnâ€™t be the first athlete to switch from the N. F. L. to M. L. B. Bo Jackson had one   season as a Kansas City Royal, and Deion Sanders played several years for the Atlanta Braves with mixed success. Though Michael Jordan tried to cross over to baseball from basketball as a    in 1994, he did not fare as well playing one year for a Chicago White Sox minor league team. As a football player, Tebow was unable to match his college success in the pros. The Denver Broncos drafted him in the first round of the 2010 N. F. L. Draft, and he quickly developed a reputation for clutch performances, including a memorable   pass against the Pittsburgh Steelers in the 2011 Wild Card round. But his stats and his passing form werenâ€™t pretty, and he spent just two years in Denver before moving to the Jets in 2012, where he spent his last season on an N. F. L. roster. He was cut during preseason from the New England Patriots in 2013 and from the Philadelphia Eagles in 2015.'

In [10]:
port_stem = PorterStemmer()

In [11]:
def stemming(content):
    stemmed_content = re.sub('[^a-zA-Z]',' ',content)
    stemmed_content = stemmed_content.lower()
    stemmed_content = stemmed_content.split()
    stemmed_content = [port_stem.stem(word) for word in stemmed_content if not word in stopwords.words('english')]
    stemmed_content = ' '.join(stemmed_content)
    return stemmed_content

In [12]:
sentence = stemming(sentence)

In [13]:
tfid_transformer = pickle.load(open('transformer.pkl','rb'))
sentence = stemming(sentence)



In [14]:
sentence

'first succeed tri differ sport tim tebow heisman quarterback univ florida unabl hold n f l job pursu career major leagu baseb hold workout l b team month agent told espn news outlet may sound like public stunt noth could truth said brodi van wagenen caa baseb part sport agenc caa sport statement seen tim workout peopl insid outsid industri scout execut player fan impress talent decad sinc tebow play baseb full time mean comeback would easi task former major leagu catcher chad moeller said statement train tebow arizona said beyond impress tim athlet swing see bat speed power real baseb talent moeller said truli believ tim skill set potenti achiev goal play major leagu base seen past two month could happen rel quickli take gari sheffield former outfield news tebow attempt comeback baseb greet skeptic twitter junior nea high pont vedra fla tebow drew attent major leagu scout bat four home run left fielder ditch bat glove favor pigskin lead florida two nation championship two former scout

In [15]:
sentence = tfid_transformer.transform([sentence])

In [16]:
sentence

<1x5000 sparse matrix of type '<class 'numpy.float64'>'
	with 183 stored elements in Compressed Sparse Row format>

In [17]:
loaded_model.predict(sentence)

array([0], dtype=int64)