# World News NLP Project: Notebook 1
## Data Imports, Cleaning & EDA
#### Adam Zucker

---

## Problem

Statement

---

## Notebook Contents

Imports, cleaning, EDA, basic background...

---

## Data

Data Dict?

---

## Imports

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

import spacy
from spacy import displacy

from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from nltk.tokenize import RegexpTokenizer
from nltk.sentiment.vader import SentimentIntensityAnalyzer

from geotext import GeoText

import time
from datetime import datetime

In [20]:
# Reading in data
df = pd.read_csv('../data/world_news_posts.csv')

In [3]:
df.head()

Unnamed: 0,time_created,date_created,up_votes,down_votes,title,over_18,author,category
0,1201232046,2008-01-25,3,0,Scores killed in Pakistan clashes,False,polar,worldnews
1,1201232075,2008-01-25,2,0,Japan resumes refuelling mission,False,polar,worldnews
2,1201232523,2008-01-25,3,0,US presses Egypt on Gaza border,False,polar,worldnews
3,1201233290,2008-01-25,1,0,Jump-start economy: Give health care to all,False,fadi420,worldnews
4,1201274720,2008-01-25,4,0,Council of Europe bashes EU&UN terror blacklist,False,mhermans,worldnews


---

## EDA

In [4]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 509236 entries, 0 to 509235
Data columns (total 8 columns):
 #   Column        Non-Null Count   Dtype 
---  ------        --------------   ----- 
 0   time_created  509236 non-null  int64 
 1   date_created  509236 non-null  object
 2   up_votes      509236 non-null  int64 
 3   down_votes    509236 non-null  int64 
 4   title         509236 non-null  object
 5   over_18       509236 non-null  bool  
 6   author        509236 non-null  object
 7   category      509236 non-null  object
dtypes: bool(1), int64(3), object(4)
memory usage: 27.7+ MB


In [5]:
# Checking for nulls in the dataframe - none detected
df.isnull().sum()

time_created    0
date_created    0
up_votes        0
down_votes      0
title           0
over_18         0
author          0
category        0
dtype: int64

In [21]:
# Defining a function to concisely process this dataframe and others in the same format
def process_data(df):
    
    # Redefining the 'time_created' column to hold datetime, converted from unix timestamp format
    df['time_created'] = [datetime.fromtimestamp(ts) for ts in df['time_created']]
    # Dropping 'date_created' because of redundancy
    df.drop(columns='date_created', inplace=True)
    
    # Creating a feature to hold the post length in characters
    df['post_length_chars'] = df['title'].apply(len)
    
    # Generating features for total number of posts by author and total number of upvotes by author
    df['author_upvotes'] = 
    df['author_posts'] = 
    
    # Generating a feature to hold day of the week and dummifying
    df['weekday'] = df['time_created'].dt.day_name()
    day_dummies = pd.get_dummies(df['weekday'], drop_first=True)
    df = pd.concat([df, day_dummies], axis=1)
    df.drop('weekday', inplace=True)
    
    # Dropping 'category' feature if only one category is present
    if len(df['category'].unique()) == 1:
        df.drop(columns='category', inplace=True)
    # Similarly dropping down votes if there are none reported
    if sum(df['down_votes']) == 0:
        df.drop(columns='down_votes', inplace=True)
    
    # Binarizing 'over_18' feature
    df['over_18'] = df['over_18'].map({False:0, True:1})
    
    # Reordering columns for clarity
    # df = df[['author', 'title', 'up_votes', 'over_18', 'post_length_chars' 'time_created', 
    #          'weekday', 'Saturday', 'Sunday', 'Monday', 'Tuesday', 'Wednesday', 'Thursday']]
    
    return df

In [22]:
process_data(df)

Unnamed: 0,time_created,up_votes,title,over_18,author,post_length_chars,weekday,Monday,Saturday,Sunday,Thursday,Tuesday,Wednesday
0,2008-01-24 22:34:06,3,Scores killed in Pakistan clashes,0,polar,33,Thursday,0,0,0,1,0,0
1,2008-01-24 22:34:35,2,Japan resumes refuelling mission,0,polar,32,Thursday,0,0,0,1,0,0
2,2008-01-24 22:42:03,3,US presses Egypt on Gaza border,0,polar,31,Thursday,0,0,0,1,0,0
3,2008-01-24 22:54:50,1,Jump-start economy: Give health care to all,0,fadi420,44,Thursday,0,0,0,1,0,0
4,2008-01-25 10:25:20,4,Council of Europe bashes EU&UN terror blacklist,0,mhermans,47,Friday,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...
509231,2016-11-22 07:12:44,5,Heil Trump : Donald Trump s alt-right white...,0,nonamenoglory,88,Tuesday,0,0,0,0,1,0
509232,2016-11-22 07:12:52,1,There are people speculating that this could b...,0,SummerRay,67,Tuesday,0,0,0,0,1,0
509233,2016-11-22 07:17:36,1,Professor receives Arab Researchers Award,0,AUSharjah,41,Tuesday,0,0,0,0,1,0
509234,2016-11-22 07:19:17,1,Nigel Farage attacks response to Trump ambassa...,0,smilyflower,55,Tuesday,0,0,0,0,1,0


In [6]:
# Converting 'date_created' to datetime
df['date_created'] = pd.to_datetime(df['date_created'])

In [7]:
df.dtypes

time_created             int64
date_created    datetime64[ns]
up_votes                 int64
down_votes               int64
title                   object
over_18                   bool
author                  object
category                object
dtype: object

In [8]:
# The data spans 3223 days, from 1/25/08 to 11/22/16
print(f"Number of days represented in dataframe: {len(df['date_created'].unique())}")
print(f"Data date range is from {min(df['date_created'])} to {max(df['date_created'])}")

Number of days represented in dataframe: 3223
Data date range is from 2008-01-25 00:00:00 to 2016-11-22 00:00:00


In [9]:
# All posts are classified as 'worldnews' - with just a single class represented, this feature becomes unnecessary
df['category'].value_counts()

worldnews    509236
Name: category, dtype: int64

In [10]:
# Dropping 'category' feature
df.drop(columns='category', inplace=True)

---

In [11]:
# Summary stats for upvotes
df['up_votes'].describe()

count    509236.000000
mean        112.236283
std         541.694675
min           0.000000
25%           1.000000
50%           5.000000
75%          16.000000
max       21253.000000
Name: up_votes, dtype: float64

In [12]:
# Looking at titles of most upvoted posts
df['up_votes'].groupby(df['title']).sum().sort_values(ascending=False)[0:50].to_frame()

Unnamed: 0_level_0,up_votes
title,Unnamed: 1_level_1
"A biotech startup has managed to 3-D print fake rhino horns that carry the same genetic fingerprint as the actual horn. The company plans to flood Chinese rhino horn market at one-eighth of the price of the original, undercutting the price poachers can get and forcing them out eventually.",21253
"Twitter has forced 30 websites that archive politician s deleted tweets to shut down, removing an effective tool to keep politicians honest",13435
"2.6 terabyte leak of Panamanian shell company data reveals how a global industry led by major banks, legal firms, and asset management companies secretly manages the estates of politicians, Fifa officials, fraudsters and drug smugglers, celebrities and professional athletes.",13244
"The police officer who leaked the footage of the surfers paradise police brutality, where the victims blood was washed away by officers, has been criminally charged for bringing it to the publics view. Officers who did the bashing get nothing.",12333
Paris shooting survivor suing French media for giving away his location while he hid from shooters,11288
Hundreds of thousands of leaked emails reveal massively widespread corruption in global oil industry,11108
Brazil s Supreme Court has banned corporate contributions to political campaigns and parties,10922
"ISIS beheads 81-year-old pioneer archaeologist and foremost scholar on ancient Syria. Held captive for 1 month, he refused to tell ISIS the location of the treasures of Palmyra unto death.",10515
"Feeding cows seaweed could slash global greenhouse gas emissions, researchers say: They discovered adding a small amount of dried seaweed to a cow s diet can reduce the amount of methane a cow produces by up to 99 per cent.",10394
Brazilian radio host famous for exposing corruption in his city murdered while broadcasting live on the air by two gunmen.,10377


In [13]:
df.sort_values('up_votes', ascending=False)[0:50]

Unnamed: 0,time_created,date_created,up_votes,down_votes,title,over_18,author
377200,1434818471,2015-06-20,21253,0,A biotech startup has managed to 3-D print fak...,False,KRISHNA53
391415,1440421079,2015-08-24,13435,0,Twitter has forced 30 websites that archive po...,False,joeyoungblood
450818,1459706506,2016-04-03,13244,0,2.6 terabyte leak of Panamanian shell company ...,False,mister_geaux
391318,1440367768,2015-08-23,12333,0,The police officer who leaked the footage of t...,False,navysealassulter
390252,1439939168,2015-08-18,11288,0,Paris shooting survivor suing French media for...,False,seapiglet
449809,1459336773,2016-03-30,11108,0,Hundreds of thousands of leaked emails reveal ...,False,Xiroth
397215,1442535288,2015-09-18,10922,0,Brazil s Supreme Court has banned corporate co...,False,DoremusJessup
390494,1440030633,2015-08-20,10515,0,ISIS beheads 81-year-old pioneer archaeologist...,False,DawgsOnTopUGA
500786,1476881235,2016-10-19,10394,0,Feeding cows seaweed could slash global greenh...,False,mvea
388230,1438963135,2015-08-07,10377,0,Brazilian radio host famous for exposing corru...,False,fiffers


In [14]:
df['down_votes'].value_counts()

0    509236
Name: down_votes, dtype: int64

In [15]:
# No downvotes in dataframe, so this feature can be dropped
df.drop(columns='down_votes', inplace=True)

In [16]:
df.head()

Unnamed: 0,time_created,date_created,up_votes,title,over_18,author
0,1201232046,2008-01-25,3,Scores killed in Pakistan clashes,False,polar
1,1201232075,2008-01-25,2,Japan resumes refuelling mission,False,polar
2,1201232523,2008-01-25,3,US presses Egypt on Gaza border,False,polar
3,1201233290,2008-01-25,1,Jump-start economy: Give health care to all,False,fadi420
4,1201274720,2008-01-25,4,Council of Europe bashes EU&UN terror blacklist,False,mhermans


---

In [17]:
print(f"Number of unique authors: {len(df['author'].unique())}")
print('-----')
print(f"Top 20 contributors by post count: \n{df['author'].value_counts()[0:20]}")
print('-----')
print(f"Top 20 contributors by upvotes: \n{df['up_votes'].groupby(df['author']).sum().sort_values(ascending=False)[0:20]}")

Number of unique authors: 85838
-----
Top 20 contributors by post count: 
davidreiss666         8897
anutensil             5730
DoremusJessup         5037
maxwellhill           4023
igeldard              4013
readerseven           3170
twolf1                2923
madam1                2658
nimobo                2564
madazzahatter         2503
ionised               2493
NinjaDiscoJesus       2448
bridgesfreezefirst    2405
SolInvictus           2181
Libertatea            2108
vigorous              2077
galt1776              1897
DougBolivar           1770
bob21doh              1698
trot-trot             1649
Name: author, dtype: int64
-----
Top 20 contributors by upvotes: 
author
maxwellhill         1985416
anutensil           1531544
Libertatea           832102
DoremusJessup        584380
Wagamaga             580121
NinjaDiscoJesus      492582
madazzahatter        428966
madam1               390541
davidreiss666        338306
kulkke               333311
pnewell              297270
nimob

---

In [18]:
# Looking at distribution of 'over_18' posts by number and percentage
print(df['over_18'].value_counts())
print(df['over_18'].value_counts(normalize=True))

False    508916
True        320
Name: over_18, dtype: int64
False    0.999372
True     0.000628
Name: over_18, dtype: float64


In [19]:
# Checking title content of some of the posts classified as "over_18"
df[df['over_18'] == True]

Unnamed: 0,time_created,date_created,up_votes,title,over_18,author
1885,1206381438,2008-03-24,189,Pics from the Tibetan protests - more graphic ...,True,pressed
6721,1211138718,2008-05-18,5,"MI5 linked to Max Mosley’s Nazi-style, sadomas...",True,alllie
8414,1212694925,2008-06-05,0,Tabloid Horrifies Germany: Poland s Yellow Pre...,True,stesch
12163,1216672016,2008-07-21,0,Love Parade Dortmund: Techno Festival Breaks R...,True,stesch
12699,1217381380,2008-07-30,5,IDF kills young Palestinian boy. Potentially N...,True,cup
...,...,...,...,...,...,...
503776,1477889966,2016-10-31,4,Latest Italian Earthquake Devastates Medieval ...,True,pixelinthe
508067,1479400229,2016-11-17,12,ISIS Release Video Showing Melbourne As A Poss...,True,halacska
508176,1479434681,2016-11-18,0,Animal welfare activists have released footage...,True,NinjaDiscoJesus
508376,1479492875,2016-11-18,6,Jungle Justice : Public lynching of a street ...,True,avivi_


In [20]:
nsfw = df[df['over_18'] == True]
nsfw.sort_values(by='up_votes', ascending=False)[0:25]

Unnamed: 0,time_created,date_created,up_votes,title,over_18,author
500590,1476806936,2016-10-18,7941,"Judge presiding over El Chapo s case shot, k...",True,IsleCook
494536,1474805114,2016-09-25,6322,[NSFL] Australian child molester Peter Scully ...,True,ExWhySaid
428689,1452167289,2016-01-07,5878,Armed suspect shot dead after trying to storm ...,True,rawmas02
462067,1463480226,2016-05-17,5617,Syria Army killed over 200 ISIS militants in 3...,True,orangeflower2015
303900,1409942733,2014-09-05,5507,Man escapes ISIS execution,True,brothamo
461255,1463150094,2016-05-13,4839,ISIS massacre 14 Real Madrid fans at supporter...,True,PeterG92
376435,1434501068,2015-06-17,4209,The fight is on to stop an annual Chinese even...,True,ShakoWasAngry
269963,1398120460,2014-04-21,3831,China: “Violent Government Thugs” Beaten To De...,True,helpmesleep666
431221,1453095585,2016-01-18,3823,ISIS commits largest massacre since Syrian con...,True,AllenDono
246618,1390489347,2014-01-23,3738,Video of riot police stripping detained protes...,True,_skylark


---

## Feature Engineering

In [26]:
df['post_length'] = ''
c=0
for t in df['title']:
    df['post_length'][c] = len(t)
    c+=1

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['post_length'][c] = len(t)


KeyboardInterrupt: 

In [34]:
df['post_length_chars'] = df['title'].apply(len)

In [35]:
df

Unnamed: 0,time_created,date_created,up_votes,title,over_18,author,post_length,post_length_chars
0,1201232046,2008-01-25,3,Scores killed in Pakistan clashes,False,polar,33,33
1,1201232075,2008-01-25,2,Japan resumes refuelling mission,False,polar,32,32
2,1201232523,2008-01-25,3,US presses Egypt on Gaza border,False,polar,31,31
3,1201233290,2008-01-25,1,Jump-start economy: Give health care to all,False,fadi420,44,44
4,1201274720,2008-01-25,4,Council of Europe bashes EU&UN terror blacklist,False,mhermans,47,47
...,...,...,...,...,...,...,...,...
509231,1479816764,2016-11-22,5,Heil Trump : Donald Trump s alt-right white...,False,nonamenoglory,,88
509232,1479816772,2016-11-22,1,There are people speculating that this could b...,False,SummerRay,,67
509233,1479817056,2016-11-22,1,Professor receives Arab Researchers Award,False,AUSharjah,,41
509234,1479817157,2016-11-22,1,Nigel Farage attacks response to Trump ambassa...,False,smilyflower,,55


---

## Data Visualizations