# World News NLP Project
## Scratchpad
#### Adam Zucker

---

## Data

- __*world_news_posts.csv*:__ Supplied dataframe with roughly 500,000 titles of posts on a "world news" message board, including data for the date, time, and author of the post, along with user interaction.

---

## Imports

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

import spacy
from spacy import displacy

from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from nltk.tokenize import RegexpTokenizer
from nltk.sentiment.vader import SentimentIntensityAnalyzer

import time
from datetime import datetime

In [None]:
# Reading in data
df = pd.read_csv('../data/world_news_posts.csv')

In [None]:
df.head()

---

## EDA

In [None]:
df.info()

In [None]:
# Checking for nulls in the dataframe - none detected
df.isnull().sum()

In [None]:
# The data spans 3223 days, from 1/25/08 to 11/22/16
print(f"Number of days represented in dataframe: {len(df['date_created'].unique())}")
print(f"Data date range is from {min(df['date_created'])} to {max(df['date_created'])}")

In [None]:
# Defining a function to concisely process this dataframe and others in the same format
def process_data(df):
    
    # Redefining the 'time_created' column to hold datetime, converted from unix timestamp format
    df['time_created'] = [datetime.fromtimestamp(ts) for ts in df['time_created']]
    # Dropping 'date_created' because of redundancy
    df.drop(columns='date_created', inplace=True)
    
    # Creating a feature to hold the post length in characters and words
    df['post_length_chars'] = df['title'].apply(len)
    df['post_length_tokens'] = df['title'].str.split().apply(len)
    
#     # Generating features to hold total author posts and total author upvotes alongside each post
#     df['author_posts'] = df['author'].groupby(df['author']).transform('count')
#     df['author_upvotes'] = [df['up_votes'].groupby(df['author']).sum() for a in df['author']]
    
    # Generating a feature to hold day of the week and dummifying
    df['weekday'] = df['time_created'].dt.day_name()
    day_dummies = pd.get_dummies(df['weekday'], drop_first=True)
    df = pd.concat([df, day_dummies], axis=1)
    df.drop(columns='weekday', inplace=True)
    
    # Dropping 'category' feature if only one category is present
    if len(df['category'].unique()) == 1:
        df.drop(columns='category', inplace=True)
    # Similarly dropping down votes if there are none reported
    if sum(df['down_votes']) == 0:
        df.drop(columns='down_votes', inplace=True)
    
    # Binarizing 'over_18' feature
    df['over_18'] = df['over_18'].map({False:0, True:1})
    

    
    return df

In [None]:
# df['up_votes'].groupby(df['author']).sum()

In [None]:
df = process_data(df)

In [None]:
df.head(3)

In [None]:
# # Converting 'date_created' to datetime
# df['date_created'] = pd.to_datetime(df['date_created'])

In [None]:
df.dtypes

In [None]:
# # All posts are classified as 'worldnews' - with just a single class represented, this feature becomes unnecessary
# df['category'].value_counts()

In [None]:
# # Dropping 'category' feature
# df.drop(columns='category', inplace=True)

---

In [None]:
# Summary stats for upvotes
df['up_votes'].describe()

In [None]:
# Looking at titles of most upvoted posts
df['up_votes'].groupby(df['title']).sum().sort_values(ascending=False)[0:10].to_frame()

In [None]:
df.sort_values('up_votes', ascending=False)[0:10]

In [None]:
df.head()

---

In [None]:
print(f"Number of unique authors: {len(df['author'].unique())}")
print('-----')
print(f"Top 20 contributors by post count: \n{df['author'].value_counts()[0:20]}")
print('-----')
print(f"Top 20 contributors by upvotes: \n{df['up_votes'].groupby(df['author']).sum().sort_values(ascending=False)[0:20]}")

---

In [None]:
# Looking at distribution of 'over_18' posts by number and percentage
print(df['over_18'].value_counts())
print(df['over_18'].value_counts(normalize=True))

In [None]:
# Checking title content of some of the posts classified as "over_18"
df[df['over_18'] == True]

In [None]:
nsfw = df[df['over_18'] == True]
nsfw.sort_values(by='up_votes', ascending=False)[0:10]

---

## Feature Engineering

---

## Data Visualizations

---
---
## NLP

In [None]:
nlp = spacy.load('en_core_web_sm')

In [None]:
test_title = df['title'][111111]

In [None]:
df['title'][111111]

In [None]:
# From https://spacy.io/ demo code
doc = nlp(test_title)

print([noun_phrases.text for noun_phrases in doc.noun_chunks])
print('-----')
print([token.lemma_ for token in doc if token.pos_ == "VERB"])
print('-----')
for entity in doc.ents:
    print(entity.text, entity.label_)

In [None]:
df.head(1)

In [None]:
df.index

In [None]:
range(len(df.index))

In [None]:
print((range(len(df.index)))[-1])

In [None]:
df.tail(1)

---

In [None]:
# # Creating columns of empty lists to hold NLP output

# df['noun_phrases'] = df.apply(lambda value: [], axis=1)
# df['verbs'] = df.apply(lambda value: [], axis=1)
# df['entities'] = df.apply(lambda value: [], axis=1)
# df['entity_labels'] = df.apply(lambda value: [], axis=1)

**NOTE:** The `lambda` function above is necessary since I can't directly assign an empty list as a value to fill the new columns.

In [None]:
# # Instantiating spacy NLP
# nlp = spacy.load('en_core_web_sm')

# # Defining a new function to segment post titles into component pieces and insert into original dataframe
# def title_deconstruct(df):
#     for i in range(len(df.index)):
#         title = df['title'][i]
#         doc = nlp(title)
#         df['noun_chunks'][i] = [noun_chunk.text for noun_chunk in doc.noun_chunks]
#         df['verbs'][i] = [verb.lemma_ for verb in doc if verb.pos_ == "VERB"]
#         df['entities'][i] = [entity.text for entity in doc.ents]
#         df['entity_labels'][i] = [entity.label_ for entity in doc.ents]
#     return df

In [None]:
# title_deconstruct(df)

---

In [None]:
# # Initializing a new, empty dataframe to hold nlp data
# nlp_df = pd.DataFrame(data=None, index=range(len(df.index)), columns=['noun_chunks', 'verbs', 'entities', 'entity_labels'])

In [None]:
# nlp_df.head(3)

In [None]:
# # Instantiating spacy NLP
# nlp = spacy.load('en_core_web_sm')

# # Defining a new function to segment post titles into component pieces and insert into original dataframe
# def title_deconstruct(df):
#     for i in range(len(df)):
#         title = df['title'][i]
#         doc = nlp(title)
#         nlp_df['noun_chunks'][i] = [noun_chunk.text for noun_chunk in doc.noun_chunks]
#         nlp_df['verbs'][i] = [verb.lemma_ for verb in doc if verb.pos_ == "VERB"]
#         nlp_df['entities'][i] = [entity.text for entity in doc.ents]
#         nlp_df['entity_labels'][i] = [entity.label_ for entity in doc.ents]
#     return nlp_df

In [None]:
# nlp_df = title_deconstruct(df)

In [None]:
# nlp_df

In [None]:
# pd.concat([df, nlp_df], axis=1)

---

In [None]:
# # Instantiating spacy NLP
# nlp = spacy.load('en_core_web_sm')

# # Defining a new function to segment post titles into component pieces and insert into original dataframe
# def title_deconstruct(df):
#     for i in range(10):
#         title = df['title'][i]
#         doc = nlp(title)
#         nouns = [noun_chunk.text for noun_chunk in doc.noun_chunks]
#         verbs = [verb.lemma_ for verb in doc if verb.pos_ == "VERB"]
#         entities = [entity.text for entity in doc.ents]
#         ent_labels = [entity.label_ for entity in doc.ents]
#         df['noun_chunks'][i].append(nouns) 
#         df['verbs'][i].append(verbs) 
#         df['entities'][i].append(entities)
#         df['entity_labels'][i].append(ent_labels)
#     return df

In [None]:
# title_deconstruct(df)

In [None]:
df.isnull().sum()

---
---
### This one works!

**BELOW:** This seems to be the best iteration of the function, but is still computationally inefficient.

In [None]:
# Instantiating spacy NLP
nlp = spacy.load('en_core_web_sm')

# Defining a new function to segment post titles into component pieces and insert into original dataframe
def title_deconstruct(df):
    for i in range(len(df)):
        title = df['title'][i]
        doc = nlp(title)
        df.at[i, 'noun_phrases'] = [noun_chunk.text for noun_chunk in doc.noun_chunks]
        df.at[i, 'verbs'] = [verb.lemma_ for verb in doc if verb.pos_ == "VERB"]
        df.at[i, 'entities'] = [entity.text for entity in doc.ents]
        df.at[i, 'entity_labels'] = [entity.label_ for entity in doc.ents]
    return df

In [None]:
# df = title_deconstruct(df)

In [None]:
# df

---

### TEST

Combining it all into a single function

In [2]:
# Reading in data
df = pd.read_csv('../data/world_news_posts.csv')

In [3]:
df.head(3)

Unnamed: 0,time_created,date_created,up_votes,down_votes,title,over_18,author,category
0,1201232046,2008-01-25,3,0,Scores killed in Pakistan clashes,False,polar,worldnews
1,1201232075,2008-01-25,2,0,Japan resumes refuelling mission,False,polar,worldnews
2,1201232523,2008-01-25,3,0,US presses Egypt on Gaza border,False,polar,worldnews


In [4]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 509236 entries, 0 to 509235
Data columns (total 8 columns):
 #   Column        Non-Null Count   Dtype 
---  ------        --------------   ----- 
 0   time_created  509236 non-null  int64 
 1   date_created  509236 non-null  object
 2   up_votes      509236 non-null  int64 
 3   down_votes    509236 non-null  int64 
 4   title         509236 non-null  object
 5   over_18       509236 non-null  bool  
 6   author        509236 non-null  object
 7   category      509236 non-null  object
dtypes: bool(1), int64(3), object(4)
memory usage: 27.7+ MB


**BELOW:** The `process_data` function defined here will generate and populate the existing dataframe with a number of new features, as well as drop unnecessary features.

TO DO
* Fix the upvotes by author feature
* Thoroughly verify data integrity
* Make sure the code is clear and flexible, eg, add loops to account for nulls in source data
* Is there a way to make it more efficient?

In [5]:
# Creating columns of empty lists to hold NLP output

df['noun_phrases'] = df.apply(lambda value: [], axis=1)
df['verbs'] = df.apply(lambda value: [], axis=1)
df['entities'] = df.apply(lambda value: [], axis=1)
df['entity_labels'] = df.apply(lambda value: [], axis=1)



# Defining a function to concisely process this dataframe and others in the same format
def process_data(df):
    
    # Redefining the 'time_created' column to hold datetime, converted from unix timestamp format
    df['time_created'] = [datetime.fromtimestamp(ts) for ts in df['time_created']]
    # Dropping 'date_created' because of redundancy
    df.drop(columns='date_created', inplace=True)
    
    # Dropping 'category' feature if only one category is present
    if len(df['category'].unique()) == 1:
        df.drop(columns='category', inplace=True)
    # Similarly dropping down votes if there are none reported
    if sum(df['down_votes']) == 0:
        df.drop(columns='down_votes', inplace=True)
    
    # Binarizing 'over_18' feature
    df['over_18'] = df['over_18'].map({False:0, True:1})
    
    # Creating a feature to hold the post length in characters and words
    df['post_length_chars'] = df['title'].apply(len)
    df['post_length_tokens'] = df['title'].str.split().apply(len)
    
    # Generating features to hold total author posts and total author upvotes alongside each post
    df['author_posts'] = df['author'].groupby(df['author']).transform('count')
#     df['author_upvotes'] = [df['up_votes'].groupby(df['author']).sum() for a in df['author']]
    
    # Generating a feature to hold day of the week and dummifying
    df['weekday'] = df['time_created'].dt.day_name()
    day_dummies = pd.get_dummies(df['weekday'], drop_first=True)
    df = pd.concat([df, day_dummies], axis=1)
    df.drop(columns='weekday', inplace=True)
    
    # Try insertin NLP feature generation here, instead of outside the function **********
    
    # Instantiating spacy NLP
    nlp = spacy.load('en_core_web_sm')

    # Incorporating the loop from 'title_deconstruct' function to segment post titles into component pieces and insert into original dataframe
    for i in range(len(df)):
        title = df['title'][i]
        doc = nlp(title)
        df.at[i, 'noun_phrases'] = [noun_chunk.text for noun_chunk in doc.noun_chunks]
        df.at[i, 'verbs'] = [verb.lemma_ for verb in doc if verb.pos_ == "VERB"]
        df.at[i, 'entities'] = [entity.text for entity in doc.ents]
        df.at[i, 'entity_labels'] = [entity.label_ for entity in doc.ents]
    
    
    return df

In [6]:
df = process_data(df)

In [7]:
df

Unnamed: 0,time_created,up_votes,title,over_18,author,noun_phrases,verbs,entities,entity_labels,post_length_chars,post_length_tokens,author_posts,Monday,Saturday,Sunday,Thursday,Tuesday,Wednesday
0,2008-01-24 22:34:06,3,Scores killed in Pakistan clashes,0,polar,"[Scores, Pakistan clashes]",[kill],[Pakistan],[GPE],33,5,50,0,0,0,1,0,0
1,2008-01-24 22:34:35,2,Japan resumes refuelling mission,0,polar,"[Japan, refuelling mission]",[resume],[Japan],[GPE],32,4,50,0,0,0,1,0,0
2,2008-01-24 22:42:03,3,US presses Egypt on Gaza border,0,polar,"[US, Egypt, Gaza border]",[press],"[US, Egypt, Gaza]","[GPE, GPE, GPE]",31,6,50,0,0,0,1,0,0
3,2008-01-24 22:54:50,1,Jump-start economy: Give health care to all,0,fadi420,"[Jump-start economy, health care]",[give],[],[],44,7,2,0,0,0,1,0,0
4,2008-01-25 10:25:20,4,Council of Europe bashes EU&UN terror blacklist,0,mhermans,"[Council, Europe, EU&UN]",[bash],"[Council of Europe, EU&UN]","[ORG, ORG]",47,7,1,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
509231,2016-11-22 07:12:44,5,Heil Trump : Donald Trump s alt-right white...,0,nonamenoglory,"[ Heil Trump, Donald Trump, alt-right white n...","[s, invoke]","[Heil Trump, Donald Trump, Nazi]","[PERSON, PERSON, NORP]",88,13,5,0,0,0,0,1,0
509232,2016-11-22 07:12:52,1,There are people speculating that this could b...,0,SummerRay,"[people, Madeleine McCann]","[speculate, be]",[Madeleine McCann],[PERSON],67,10,1,0,0,0,0,1,0
509233,2016-11-22 07:17:36,1,Professor receives Arab Researchers Award,0,AUSharjah,"[Professor, Arab Researchers Award]",[receive],[Arab],[NORP],41,5,3,0,0,0,0,1,0
509234,2016-11-22 07:19:17,1,Nigel Farage attacks response to Trump ambassa...,0,smilyflower,"[Nigel Farage, response, Trump ambassador tweet]",[attack],"[Nigel Farage, Trump]","[PERSON, ORG]",55,8,52,0,0,0,0,1,0


In [8]:
df.isnull().sum()

time_created          0
up_votes              0
title                 0
over_18               0
author                0
noun_phrases          0
verbs                 0
entities              0
entity_labels         0
post_length_chars     0
post_length_tokens    0
author_posts          0
Monday                0
Saturday              0
Sunday                0
Thursday              0
Tuesday               0
Wednesday             0
dtype: int64

In [9]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 509236 entries, 0 to 509235
Data columns (total 18 columns):
 #   Column              Non-Null Count   Dtype         
---  ------              --------------   -----         
 0   time_created        509236 non-null  datetime64[ns]
 1   up_votes            509236 non-null  int64         
 2   title               509236 non-null  object        
 3   over_18             509236 non-null  int64         
 4   author              509236 non-null  object        
 5   noun_phrases        509236 non-null  object        
 6   verbs               509236 non-null  object        
 7   entities            509236 non-null  object        
 8   entity_labels       509236 non-null  object        
 9   post_length_chars   509236 non-null  int64         
 10  post_length_tokens  509236 non-null  int64         
 11  author_posts        509236 non-null  int64         
 12  Monday              509236 non-null  uint8         
 13  Saturday            509236 no

In [10]:
df.columns

Index(['time_created', 'up_votes', 'title', 'over_18', 'author',
       'noun_phrases', 'verbs', 'entities', 'entity_labels',
       'post_length_chars', 'post_length_tokens', 'author_posts', 'Monday',
       'Saturday', 'Sunday', 'Thursday', 'Tuesday', 'Wednesday'],
      dtype='object')

In [None]:
# Reorder columns as last step in function?