# CS 451 Project Preliminary
## FactOrFake : Fake News Detection Using Machine Learning Algorithms 

### 1. Data Collection 

#### Data Sets Used:
1. [FakeNewsNet](https://www.kaggle.com/mdepak/fakenewsnet)
2. [PHEME Dataset for Rumour Detection](https://figshare.com/articles/dataset/PHEME_dataset_for_Rumour_Detection_and_Veracity_Classification/63920787)

In [1]:
import pandas as pd
import itertools
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import accuracy_score, confusion_matrix

In [2]:
df_poltifake = pd.read_csv("Datasets\FakeNewsNet\dataset\politifact_fake.csv")
df_poltifake.head()

Unnamed: 0,id,news_url,title,tweet_ids
0,politifact15014,speedtalk.com/forum/viewtopic.php?t=51650,BREAKING: First NFL Team Declares Bankruptcy O...,937349434668498944\t937379378006282240\t937380...
1,politifact15156,politics2020.info/index.php/2018/03/13/court-o...,Court Orders Obama To Pay $400 Million In Rest...,972666281441878016\t972678396575559680\t972827...
2,politifact14745,www.nscdscamps.org/blog/category/parenting/467...,UPDATE: Second Roy Moore Accuser Works For Mic...,929405740732870656\t929439450400264192\t929439...
3,politifact14355,https://howafrica.com/oscar-pistorius-attempts...,Oscar Pistorius Attempts To Commit Suicide,886941526458347521\t887011300278194176\t887023...
4,politifact15371,http://washingtonsources.org/trump-votes-for-d...,Trump Votes For Death Penalty For Being Gay,915205698212040704\t915242076681506816\t915249...


In [3]:
df_poltifake.shape

(432, 4)

In [4]:
df_poltifake['news_url'].nunique()

428

In [5]:
df_poltifake['target'] = 0

In [6]:
df_poltifake.head()

Unnamed: 0,id,news_url,title,tweet_ids,target
0,politifact15014,speedtalk.com/forum/viewtopic.php?t=51650,BREAKING: First NFL Team Declares Bankruptcy O...,937349434668498944\t937379378006282240\t937380...,0
1,politifact15156,politics2020.info/index.php/2018/03/13/court-o...,Court Orders Obama To Pay $400 Million In Rest...,972666281441878016\t972678396575559680\t972827...,0
2,politifact14745,www.nscdscamps.org/blog/category/parenting/467...,UPDATE: Second Roy Moore Accuser Works For Mic...,929405740732870656\t929439450400264192\t929439...,0
3,politifact14355,https://howafrica.com/oscar-pistorius-attempts...,Oscar Pistorius Attempts To Commit Suicide,886941526458347521\t887011300278194176\t887023...,0
4,politifact15371,http://washingtonsources.org/trump-votes-for-d...,Trump Votes For Death Penalty For Being Gay,915205698212040704\t915242076681506816\t915249...,0


In [7]:
df_poltifake.drop(df_poltifake.columns[[0,3]],axis=1,inplace=True)

In [8]:
df_poltifake.head()

Unnamed: 0,news_url,title,target
0,speedtalk.com/forum/viewtopic.php?t=51650,BREAKING: First NFL Team Declares Bankruptcy O...,0
1,politics2020.info/index.php/2018/03/13/court-o...,Court Orders Obama To Pay $400 Million In Rest...,0
2,www.nscdscamps.org/blog/category/parenting/467...,UPDATE: Second Roy Moore Accuser Works For Mic...,0
3,https://howafrica.com/oscar-pistorius-attempts...,Oscar Pistorius Attempts To Commit Suicide,0
4,http://washingtonsources.org/trump-votes-for-d...,Trump Votes For Death Penalty For Being Gay,0


In [9]:
df_poltifake.dropna()
df_poltifake.shape

(432, 3)

In [10]:
df = pd.read_csv("Datasets\FakeNewsNet\dataset\politifact_real.csv")
df.dropna(inplace=True)
df['target'] = 1
df.drop(df.columns[[0,3]],axis=1,inplace=True)
df.head()

Unnamed: 0,news_url,title,target
0,http://www.nfib-sbet.org/,National Federation of Independent Business,1
1,http://www.cq.com/doc/newsmakertranscripts-494...,comments in Fayetteville NC,1
4,https://web.archive.org/web/20070820164107/htt...,"Budget of the United States Government, FY 2008",1
5,http://www.politifact.com/truth-o-meter/statem...,Donald Trump exaggerates when he says China ha...,1
6,https://www.law.cornell.edu/constitution/amend...,25th Amendment,1


In [11]:
df.shape

(373, 3)

In [12]:
df_combined = pd.concat([df_poltifake,df])
df_combined.shape

(805, 3)

In [13]:
df = pd.read_csv("Datasets\FakeNewsNet\dataset\gossipcop_real.csv")
df.head()

Unnamed: 0,id,news_url,title,tweet_ids
0,gossipcop-882573,https://www.brides.com/story/teen-mom-jenelle-...,Teen Mom Star Jenelle Evans' Wedding Dress Is ...,912371411146149888\t912371528343408641\t912372...
1,gossipcop-875924,https://www.dailymail.co.uk/tvshowbiz/article-...,Kylie Jenner refusing to discuss Tyga on Life ...,901989917546426369\t901989992074969089\t901990...
2,gossipcop-894416,https://en.wikipedia.org/wiki/Quinn_Perkins,Quinn Perkins,931263637246881792\t931265332022579201\t931265...
3,gossipcop-857248,https://www.refinery29.com/en-us/2018/03/19192...,I Tried Kim Kardashian's Butt Workout & Am For...,868114761723936769\t868122567910936576\t868128...
4,gossipcop-884684,https://www.cnn.com/2017/10/04/entertainment/c...,Celine Dion donates concert proceeds to Vegas ...,915528047004209152\t915529285171122176\t915530...


In [14]:
df.dropna(inplace=True)
df['target'] = 1
df.drop(df.columns[[0,3]],axis=1,inplace=True)
df.head()

Unnamed: 0,news_url,title,target
0,https://www.brides.com/story/teen-mom-jenelle-...,Teen Mom Star Jenelle Evans' Wedding Dress Is ...,1
1,https://www.dailymail.co.uk/tvshowbiz/article-...,Kylie Jenner refusing to discuss Tyga on Life ...,1
2,https://en.wikipedia.org/wiki/Quinn_Perkins,Quinn Perkins,1
3,https://www.refinery29.com/en-us/2018/03/19192...,I Tried Kim Kardashian's Butt Workout & Am For...,1
4,https://www.cnn.com/2017/10/04/entertainment/c...,Celine Dion donates concert proceeds to Vegas ...,1


In [15]:
df.shape

(15747, 3)

In [16]:
df_combined = pd.concat([df_combined,df])
df_combined.shape

(16552, 3)

In [17]:
df = pd.read_csv("Datasets\FakeNewsNet\dataset\gossipcop_real.csv")
df['target'] = 1
df.drop(df.columns[[0,3]],axis=1,inplace=True)
df.dropna(inplace=True)
df.head()

Unnamed: 0,news_url,title,target
0,https://www.brides.com/story/teen-mom-jenelle-...,Teen Mom Star Jenelle Evans' Wedding Dress Is ...,1
1,https://www.dailymail.co.uk/tvshowbiz/article-...,Kylie Jenner refusing to discuss Tyga on Life ...,1
2,https://en.wikipedia.org/wiki/Quinn_Perkins,Quinn Perkins,1
3,https://www.refinery29.com/en-us/2018/03/19192...,I Tried Kim Kardashian's Butt Workout & Am For...,1
4,https://www.cnn.com/2017/10/04/entertainment/c...,Celine Dion donates concert proceeds to Vegas ...,1


In [18]:
df_combined = pd.concat([df_combined,df])
df_fakenewsnet = df_combined
df_fakenewsnet.shape

(33356, 3)

In [19]:
df_fakenewsnet.head()

Unnamed: 0,news_url,title,target
0,speedtalk.com/forum/viewtopic.php?t=51650,BREAKING: First NFL Team Declares Bankruptcy O...,0
1,politics2020.info/index.php/2018/03/13/court-o...,Court Orders Obama To Pay $400 Million In Rest...,0
2,www.nscdscamps.org/blog/category/parenting/467...,UPDATE: Second Roy Moore Accuser Works For Mic...,0
3,https://howafrica.com/oscar-pistorius-attempts...,Oscar Pistorius Attempts To Commit Suicide,0
4,http://washingtonsources.org/trump-votes-for-d...,Trump Votes For Death Penalty For Being Gay,0


In [19]:
df.to_csv("combined_dataset.csv")
# df_combined=pd.read_csv("combined_dataset.csv")
# df_combined.shape

In [23]:
# Firstly, fill all the null spaces with a space
# train = df_fakenewsnet.fillna(' ')
train['total'] = train['news_url']+train['title']
train.head()

Unnamed: 0,news_url,title,target,total
0,speedtalk.com/forum/viewtopic.php?t=51650,BREAKING: First NFL Team Declares Bankruptcy O...,0,speedtalk.com/forum/viewtopic.php?t=51650BREAK...
1,politics2020.info/index.php/2018/03/13/court-o...,Court Orders Obama To Pay $400 Million In Rest...,0,politics2020.info/index.php/2018/03/13/court-o...
2,www.nscdscamps.org/blog/category/parenting/467...,UPDATE: Second Roy Moore Accuser Works For Mic...,0,www.nscdscamps.org/blog/category/parenting/467...
3,https://howafrica.com/oscar-pistorius-attempts...,Oscar Pistorius Attempts To Commit Suicide,0,https://howafrica.com/oscar-pistorius-attempts...
4,http://washingtonsources.org/trump-votes-for-d...,Trump Votes For Death Penalty For Being Gay,0,http://washingtonsources.org/trump-votes-for-d...


In [25]:
import nltk
import re
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer

In [26]:
stop_words = stopwords.words('english')

In [27]:
lemmatizer = WordNetLemmatizer()

In [28]:
# nltk.download('wordnet')

In [29]:
for index, row in train.iterrows():
    filter_sentence = ''
    sentence = row['total']
    # Cleaning the sentence with regex
    sentence = re.sub(r'[^\w\s]', '', sentence)
    # Tokenization
    words = nltk.word_tokenize(sentence)
    # Stopwords removal
    words = [w for w in words if not w in stop_words]
    # Lemmatization
    for word in words:
        filter_sentence = filter_sentence  + ' ' +str(lemmatizer.lemmatize(word)).lower()
    train.loc[index, 'total'] = filter_sentence
train = train[['total', 'target']]

In [26]:
train.head(50)

Unnamed: 0,total,target
0,teen mom star jenelle evans ' wedding dress i...,0
1,kylie jenner refusing discus tyga life kylie,0
2,quinn perkins,0
3,i tried kim kardashian 's butt workout & am f...,0
4,celine dion donates concert proceeds vegas sh...,0
5,"chris evans , millie bobby brown , snoop dogg...",0
6,handmaid 's tale renewed season 3,0
7,a complete timeline selena gomez justin biebe...,0
8,when will ‘ claws ’ season 2 be on hulu ?,0
9,critics ' choice awards - critics ' choice aw...,0
