# W207 Group Project: Real vs Fake News 
#### Members: Akiko Iwamizu, Allison Fox, Jason Yang, Rohin Chabra

## Import & Clean Data

In [31]:
# Load libraries.
import zipfile
from pathlib import Path

import numpy as np
import pandas as pd

from sklearn.model_selection import train_test_split

In [57]:
# Show the current working directory; the relative "input/..." paths used
# by the cells below are resolved against it.
!pwd

/Users/akikoiwamizu/Documents/Documents - Akiko’s iMac 27"/GitHub/w207-project


In [15]:
# Read in fake news data file.
# Both the archive and the member stream are opened through context managers
# so the underlying file handles are closed as soon as the CSV is parsed.
with zipfile.ZipFile("input/fake-and-real-news-dataset/Fake.csv.zip") as fake_zip, \
        fake_zip.open("Fake.csv") as fake_csv:
    fake = pd.read_csv(fake_csv)

# Preview the first rows of the loaded frame.
fake.head()

Unnamed: 0,title,text,subject,date
0,Donald Trump Sends Out Embarrassing New Year’...,Donald Trump just couldn t wish all Americans ...,News,"December 31, 2017"
1,Drunk Bragging Trump Staffer Started Russian ...,House Intelligence Committee Chairman Devin Nu...,News,"December 31, 2017"
2,Sheriff David Clarke Becomes An Internet Joke...,"On Friday, it was revealed that former Milwauk...",News,"December 30, 2017"
3,Trump Is So Obsessed He Even Has Obama’s Name...,"On Christmas day, Donald Trump announced that ...",News,"December 29, 2017"
4,Pope Francis Just Called Out Donald Trump Dur...,Pope Francis used his annual Christmas Day mes...,News,"December 25, 2017"


In [16]:
# Read in true news data file.
# Both the archive and the member stream are opened through context managers
# so the underlying file handles are closed as soon as the CSV is parsed.
with zipfile.ZipFile("input/fake-and-real-news-dataset/True.csv.zip") as true_zip, \
        true_zip.open("True.csv") as true_csv:
    true = pd.read_csv(true_csv)

# Preview the first rows of the loaded frame.
true.head()

Unnamed: 0,title,text,subject,date
0,"As U.S. budget fight looms, Republicans flip t...",WASHINGTON (Reuters) - The head of a conservat...,politicsNews,"December 31, 2017"
1,U.S. military to accept transgender recruits o...,WASHINGTON (Reuters) - Transgender people will...,politicsNews,"December 29, 2017"
2,Senior U.S. Republican senator: 'Let Mr. Muell...,WASHINGTON (Reuters) - The special counsel inv...,politicsNews,"December 31, 2017"
3,FBI Russia probe helped by Australian diplomat...,WASHINGTON (Reuters) - Trump campaign adviser ...,politicsNews,"December 30, 2017"
4,Trump wants Postal Service to charge 'much mor...,SEATTLE/WASHINGTON (Reuters) - President Donal...,politicsNews,"December 29, 2017"


In [87]:
# Tag every fake-news row with a string label and its binary encoding (0).
fake = fake.assign(label="false", label_bin=0)
fake.shape

(23481, 6)

In [88]:
# Tag every true-news row with a string label and its binary encoding (1).
true = true.assign(label="true", label_bin=1)
true.shape

(21417, 6)

In [89]:
# Stack the fake and true frames (fake rows first) into a single frame,
# discarding the per-file indices in favour of a fresh 0..n-1 index.
data = [fake, true]
df = pd.concat(objs=data, axis=0, ignore_index=True)
df.shape

(44898, 6)

In [90]:
# Seed NumPy's global random number generator so that any NumPy-based
# randomness is reproducible from run to run.
np.random.seed(seed=0)

In [91]:
# Print a summary of the combined frame: index range, per-column non-null
# counts and dtypes, and approximate memory usage.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 44898 entries, 0 to 44897
Data columns (total 6 columns):
 #   Column     Non-Null Count  Dtype 
---  ------     --------------  ----- 
 0   title      44898 non-null  object
 1   text       44898 non-null  object
 2   subject    44898 non-null  object
 3   date       44898 non-null  object
 4   label      44898 non-null  object
 5   label_bin  44898 non-null  int64 
dtypes: int64(1), object(5)
memory usage: 2.1+ MB


In [92]:
# Count missing (NaN) values per column to confirm every column is populated.
missing_per_column = df.isna().sum()
print(f"Empty records:\n{missing_per_column}")

Empty records:
title        0
text         0
subject      0
date         0
label        0
label_bin    0
dtype: int64


## Split & Export Data

In [93]:
# Split data sets into train (70%), test (15%), & dev (15%).
# Use random_state param to reproduce shuffle outcomes.
train, holdout = train_test_split(df, random_state=0, train_size=0.7, test_size=0.3, shuffle=True)

# Halve the 30% hold-out into two DISJOINT sets. Both halves are sliced from
# the untouched `holdout` frame: slicing dev out of an already-truncated
# `test` would make dev an exact copy of test and silently discard the other
# half of the hold-out rows.
num_test = len(holdout) // 2
dev = holdout.iloc[:num_test]
test = holdout.iloc[num_test:]

# Sanity checks: every row lands in exactly one split, and dev/test share no rows.
assert len(train) + len(dev) + len(test) == len(df), "rows lost or duplicated by the split"
assert dev.index.intersection(test.index).empty, "dev and test overlap"

print("Train data shape:", train.shape)
print("Test data shape:", test.shape)
print("Dev data shape:", dev.shape)

Train data shape: (31428, 6)
Test data shape: (6735, 6)
Dev data shape: (6735, 6)


In [95]:
# Export train, test, & dev data sets to dir.
# Each Pandas DataFrame is written as a plain (uncompressed) CSV file,
# without the index column.
# to_csv does not create missing directories, so make sure the target exists.
Path("input/processed-datasets").mkdir(parents=True, exist_ok=True)
train.to_csv("input/processed-datasets/train.csv", index=False)
test.to_csv("input/processed-datasets/test.csv", index=False)
dev.to_csv("input/processed-datasets/dev.csv", index=False)

In [76]:
# List the export directory (long format, human-readable sizes, oldest
# first) to check which files it contains.
!ls -lrth input/processed-datasets/

total 0


## Exploratory Data Analysis: Test Data

In [63]:
# Display the first five rows of the test split.
test.iloc[:5]

Unnamed: 0,title,text,subject,date,label,label_bin
26295,Senate may vote on revised healthcare bill nex...,WASHINGTON (Reuters) - U.S. Senate Republicans...,politicsNews,"July 11, 2017",true,1
19917,LOL! Leftist CA Congresswoman On Tonight’s Deb...,The Democrats are in full panic-mode over Croo...,left-news,"Sep 26, 2016",fake,0
2307,Trump Feels The Fury Of Twitter After Compari...,After Donald Trump s Saturday morning tweetsto...,News,"March 4, 2017",fake,0
15936,Bye Bye Cowboys! Crowd Boos As Owner Jerry Jon...,The Dallas Cowboys tried to have it both ways ...,Government News,"Sep 25, 2017",fake,0
31925,Clinton leads Trump by six points in latest Re...,NEW YORK (Reuters) - Democratic presidential n...,politicsNews,"August 16, 2016",true,1


In [64]:
# Collect the feature column names: every column of the test split except
# the two label columns, in their original order.
label_columns = ("label", "label_bin")
features = list(filter(lambda column: column not in label_columns, test.columns))
print(features)

['title', 'text', 'subject', 'date']
