In [1]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import patsy
from collections import Counter
from textblob import TextBlob

from scipy import stats
import statsmodels.api as sm
from scipy.stats import ttest_ind
from matplotlib import rcParams

# Load both corpora. `DataFrame.from_csv` was deprecated and later removed
# from pandas; `read_csv(..., index_col=0, parse_dates=True)` is its documented
# equivalent (first CSV column becomes the index, index values parsed as dates
# where possible).
fake_df = pd.read_csv("fake.csv", index_col=0, parse_dates=True)
real_df = pd.read_csv("../uci-news-aggregator.csv", index_col=0, parse_dates=True)

# NOTE: despite the "num_rows" names, these hold the full (rows, columns)
# shape tuples of each frame.
fake_num_rows = fake_df.shape
print(fake_num_rows)

real_num_rows = real_df.shape
print(real_num_rows)

# df.head(100)
# print(df.dtypes)

(12999, 19)
(422419, 7)


In [2]:
# Trim the real-news frame to the same number of rows as the fake-news frame
# (12999 right after loading) instead of hard-coding that count.
real_df = real_df.head(len(fake_df))
print(real_df)

                                                   TITLE  \
ID                                                         
1      Fed official says weak data caused by weather,...   
2      Fed's Charles Plosser sees high bar for change...   
3      US open: Stocks fall after Fed official hints ...   
4      Fed risks falling 'behind the curve', Charles ...   
5      Fed's Plosser: Nasty Weather Has Curbed Job Gr...   
6      Plosser: Fed May Have to Accelerate Tapering Pace   
7              Fed's Plosser: Taper pace may be too slow   
8      Fed's Plosser expects US unemployment to fall ...   
9      US jobs growth last month hit by weather:Fed P...   
10     ECB unlikely to end sterilisation of SMP purch...   
11     ECB unlikely to end sterilization of SMP purch...   
12                 EU's half-baked bank union could work   
13          Europe reaches crunch point on banking union   
14     ECB FOCUS-Stronger euro drowns out ECB's messa...   
15            EU aims for deal on tackli

In [3]:
# Tally how many articles carry each value of the 'type' label.
type_column = fake_df.loc[:, 'type']
counts_by_type = type_column.value_counts()
print(counts_by_type)

bs            11492
bias            443
conspiracy      430
hate            246
satire          146
state           121
junksci         102
fake             19
Name: type, dtype: int64


In [4]:
# Frequency of each distinct main image URL.
# NOTE: a later cell rebinds the name `counts_by_url` to the site_url tally.
counts_by_url = fake_df.loc[:, 'main_img_url'].value_counts()
# print(counts_by_url)

In [5]:
# print(fake_df['spam_score'])

In [6]:
# Frequency of each distinct site URL (overwrites the earlier image-URL tally
# stored under the same name).
counts_by_url = fake_df.loc[:, 'site_url'].value_counts()
# print(counts_by_url)

In [7]:
# Frequency of each distinct spam_score value.
counts_of_spam = fake_df.loc[:, 'spam_score'].value_counts()
# print(counts_of_spam)

In [8]:
# Plain Python list of the fake-news frame's column labels.
col_names = list(fake_df.columns)
# print(col_names)

In [9]:
# Frequency of each distinct replies_count value.
counts_by_replies = fake_df.loc[:, 'replies_count'].value_counts()
# print(counts_by_replies)

In [10]:
# Number of articles attributed to each author.
counts_by_author = fake_df.loc[:, 'author'].value_counts()
# print(counts_by_author)

In [11]:
# Frequency of each distinct domain_rank value.
counts_by_domain_rank = fake_df.loc[:, 'domain_rank'].value_counts()
# print(counts_by_domain_rank)

In [12]:
# PROJECT IDEA(S)
# - take ~10,000 known fake articles
# - take ~10,000 known real articles
# - combine them and set aside ~25% as a holdout set: do not use it to fit the
#   model, only to verify the model afterwards
# - feature extraction:
#   - there can be ~10 different metrics for exclamation marks alone,
#     e.g. the total number of exclamation marks per title or per article text
#   - look at the number of key words: "outrageous", "strong words"
#   - Q: how strong is the strongest word?
#   - unique word count / word frequency
# - columns: fake / not fake, trustworthiness of source, strength of the
#   strongest word found in a given article, ...


In [13]:
# Give every row a sequential, 1-based "id" and use it as the index (the
# previous index is discarded), then keep the 19 columns below in this order.
ordered_columns = [
    'site_url', 'domain_rank', 'author', 'published', 'title',
    'thread_title', 'text', 'ord_in_thread', 'crawled', 'country',
    'language', 'spam_score', 'main_img_url', 'replies_count',
    'participants_count', 'likes', 'comments', 'shares', 'type',
]
sequential_ids = range(1, len(fake_df) + 1)
fake_df = fake_df.assign(id=sequential_ids).set_index('id')[ordered_columns]
print(fake_df.shape)

(12999, 19)


In [14]:
def count_total_exclams(string):
    """Return how many '!' characters occur in ``string``.

    Args:
        string: The text to scan; must provide a ``count`` method (e.g. ``str``).

    Returns:
        The number of exclamation marks found (0 when there are none).
    """
    return string.count('!')

In [15]:
def exclam_ratio(string):
    """Ratio of '!' to the other sentence terminators ('.' and '?') in ``string``.

    Args:
        string: The text to scan.

    Returns:
        ``count('!') / (count('.') + count('?'))``. When the text contains no
        periods or question marks the raw exclamation-mark count is returned
        instead, so the division is never by zero.
    """
    exclams = string.count('!')
    other_terminators = string.count('.') + string.count('?')
    return exclams / other_terminators if other_terminators else exclams

In [16]:
# Drop rows whose thread_title is missing. `.copy()` makes the filtered frame
# independent so adding a column below does not write into a slice.
fake_df = fake_df[fake_df['thread_title'].notnull()].copy()
print(len(fake_df))

# Renumber the ids so they are again consecutive, starting at 1.
fake_df.index = pd.RangeIndex(1, len(fake_df) + 1, name='id')

# Count the exclamation marks in every thread title in one pass. This replaces
# the discarded `assign(...)` call and the per-row `set_value` loop
# (`DataFrame.set_value` no longer exists in current pandas).
fake_df['total_exclam_in_title'] = fake_df['thread_title'].apply(count_total_exclams)

counts_by_title_exclams = fake_df.total_exclam_in_title.value_counts()
print(counts_by_title_exclams)

12987
0.0    12308
1.0      581
2.0       62
3.0       26
4.0        6
6.0        2
9.0        1
7.0        1
Name: total_exclam_in_title, dtype: int64


In [17]:
# Drop rows whose article text is missing. `.copy()` makes the filtered frame
# independent so adding a column below does not write into a slice.
fake_df = fake_df[fake_df['text'].notnull()].copy()
print(len(fake_df))

# Renumber the ids so they are again consecutive, starting at 1.
fake_df.index = pd.RangeIndex(1, len(fake_df) + 1, name='id')

# Count the exclamation marks in every article text in one pass. This replaces
# the discarded `assign(...)` call and the per-row `set_value` loop
# (`DataFrame.set_value` no longer exists in current pandas).
fake_df['total_exclam_in_text'] = fake_df['text'].apply(count_total_exclams)

counts_by_text_exclams = fake_df.total_exclam_in_text.value_counts()
# print(counts_by_text_exclams)

12941


In [18]:
# Compute, for every article text, the ratio of exclamation marks to the other
# sentence-terminating punctuation ('.' and '?') and store it in the column
# "ratio_exclams_in_text". One vectorised call replaces the discarded
# `assign(...)` and the per-row `set_value` loop (`DataFrame.set_value` no
# longer exists in current pandas).
fake_df['ratio_exclams_in_text'] = fake_df['text'].apply(exclam_ratio)

counts_ratio_exclams = fake_df.ratio_exclams_in_text.value_counts()
print(counts_ratio_exclams)

0.000000    9263
1.000000     103
0.333333      89
0.166667      78
0.250000      77
0.076923      72
0.111111      70
0.125000      67
0.142857      67
0.083333      66
0.066667      57
0.058824      57
0.055556      57
0.200000      57
0.100000      52
0.090909      52
0.071429      51
0.045455      49
0.062500      48
0.043478      44
0.047619      43
0.500000      43
0.050000      41
0.052632      41
0.031250      37
0.150000      37
0.037037      35
0.038462      35
0.033333      32
0.222222      30
            ... 
0.037594       1
0.016260       1
0.049505       1
0.146341       1
0.056180       1
0.046414       1
0.048193       1
0.065574       1
0.181452       1
0.047170       1
0.041420       1
0.021429       1
0.205128       1
0.034843       1
0.013423       1
0.071942       1
0.084507       1
0.056133       1
0.065789       1
0.019011       1
0.192000       1
0.006250       1
0.260870       1
0.086662       1
0.003521       1
0.030000       1
0.020548       1
0.013850      

In [19]:
# create series of total exclamation counts in each row's title
# for index, row in df.iterrows():
#     count = count_total_exclamation(row.title)
#     print(count)
#     df.loc[:,'total_crime'] = df.apply(get_total_crime, axis=1)
#     df.loc[index, row.total_exclam_in_title] = count


# df.loc[:, 'total_exclam_in_title'] = df.apply(count_total_exclams, axis=1)    
# count_title_exclams = df['total_exclam_in_title'].value_counts()
# print(count_title_exclams)

In [20]:
# Build an independent frame holding only thread_title and site_url, with every
# newline ("\n") and tab ("\t") inside those strings replaced by a single space.
# (Carriage returns, "\r", are left untouched, exactly as before.)
# `DataFrame.replace(..., regex=True)` substitutes inside each string cell and
# returns a new frame, so it replaces both per-row `set_value` loops
# (`DataFrame.set_value` no longer exists in current pandas).
sub_fake_df = fake_df[['thread_title', 'site_url']].replace(r'[\n\t]', ' ', regex=True)

print(sub_fake_df)

# turn all tabs into spaces
# x = "The bananas are yellow and green"
# x = x.split(" ")
# print(x)
# x = "+".join(x)
# print(x)

                                            thread_title             site_url
id                                                                           
1      Muslims BUSTED: They Stole Millions In Gov’t B...  100percentfedup.com
2      Re: Why Did Attorney General Loretta Lynch Ple...  100percentfedup.com
3      BREAKING: Weiner Cooperating With FBI On Hilla...  100percentfedup.com
4      PIN DROP SPEECH BY FATHER OF DAUGHTER Kidnappe...  100percentfedup.com
5      FANTASTIC! TRUMP'S 7 POINT PLAN To Reform Heal...  100percentfedup.com
6      Hillary Goes Absolutely Berserk On Protester A...  100percentfedup.com
7      BREAKING! NYPD Ready To Make Arrests In Weiner...  100percentfedup.com
8      WOW! WHISTLEBLOWER TELLS CHILLING STORY Of Mas...  100percentfedup.com
9      BREAKING: CLINTON CLEARED...Was This A Coordin...  100percentfedup.com
10     EVIL HILLARY SUPPORTERS Yell "F*ck Trump"…Burn...  100percentfedup.com
11     YIKES! HILLARY GOES OFF THE RAILS…Pulls A Howa...  100per

In [28]:
# Keep only TITLE and URL of the real articles, limited to as many rows as the
# cleaned fake-news frame has (12941 in the recorded run) so both classes are
# the same size; the count is derived rather than hard-coded.
# Every newline ("\n") and tab ("\t") inside the strings becomes a single
# space. `DataFrame.replace(..., regex=True)` returns a new frame and replaces
# both per-row `set_value` loops (`DataFrame.set_value` no longer exists in
# current pandas).
sub_real_df = (
    real_df[['TITLE', 'URL']]
    .head(len(sub_fake_df))
    .replace(r'[\n\t]', ' ', regex=True)
)

# Renumber the ids so they are consecutive, starting at 1.
sub_real_df.index = pd.RangeIndex(1, len(sub_real_df) + 1, name='id')

In [33]:
# Label the rows: TARGET is 1 for fake articles and 0 for real ones.
sub_fake_df['TARGET'] = 1
print(sub_fake_df)
# Align the fake frame's column names with the real frame's.
sub_fake_df = sub_fake_df.rename(columns={'thread_title': 'TITLE', 'site_url': 'URL'})

sub_real_df['TARGET'] = 0
print(sub_real_df)

# Stack the fake rows on top of the real rows. `pd.concat` replaces
# `DataFrame.append`, which has been removed from current pandas.
# NOTE: each frame keeps its own 'id' index, so id values repeat in the result.
combined_df = pd.concat([sub_fake_df, sub_real_df])
print(combined_df)

                                                   TITLE                  URL  \
id                                                                              
1      Muslims BUSTED: They Stole Millions In Gov’t B...  100percentfedup.com   
2      Re: Why Did Attorney General Loretta Lynch Ple...  100percentfedup.com   
3      BREAKING: Weiner Cooperating With FBI On Hilla...  100percentfedup.com   
4      PIN DROP SPEECH BY FATHER OF DAUGHTER Kidnappe...  100percentfedup.com   
5      FANTASTIC! TRUMP'S 7 POINT PLAN To Reform Heal...  100percentfedup.com   
6      Hillary Goes Absolutely Berserk On Protester A...  100percentfedup.com   
7      BREAKING! NYPD Ready To Make Arrests In Weiner...  100percentfedup.com   
8      WOW! WHISTLEBLOWER TELLS CHILLING STORY Of Mas...  100percentfedup.com   
9      BREAKING: CLINTON CLEARED...Was This A Coordin...  100percentfedup.com   
10     EVIL HILLARY SUPPORTERS Yell "F*ck Trump"…Burn...  100percentfedup.com   
11     YIKES! HILLARY GOES O

In [34]:
# Persist the combined, labelled dataset (index included) as a CSV file.
output_path = "cleaned_combined_dataset.csv"
combined_df.to_csv(output_path)