# Data Cleaning

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import dask.dataframe as dd

import os
import pickle
from tqdm import tqdm

# Connect tqdm to pandas
tqdm.pandas()

pd.set_option('display.max_rows', 200)
pd.set_option('display.max_columns', None)

After scraping the files from the NYT articles and comments API, I was left with pickle files for every comment and article for each month. Together, the size of the article files were about 19GB, while the comment pickle files were about 3GB. This was initially an issue as it was slightly too big to process with, so I experimented with Dask and SQL before ultimately I settled on reducing the size of my datasets by dropping unnecessary columns and text.

I had several objectives in this notebook:
1. Calculate number of comments for each article by joining each comment and article file on `ArticleID`.
2. Create a classification category `is_popular` by sorting comments above and below a certain number.
3. Create a train and test dataset for classification.
4. Reduce file size by dropping unnecessary columns.
5. Avoid copyright issues by dropping article text and only keeping the headline and abstract.


### Summary of Changes 

<b>Comments:</b> 
1. Dropped `status`, `commentTitle`, `userURL`, `picURL`, `userLocation`, `userDisplayName`, `trusted`, `isAnonymous`, `updateDate` `permID` and `parentUserDisplayName` to slightly reduce file size and avoid the risk of accidentally doxxing NYT commenters.

<b>Articles:</b>
1. Dropped `print_section` & `print_page` as we are only concerned with online articles.
2. Dropped most `headline.*` features as they contain a large number of null values, which make them unuseful as predictors for modelling.
3. Dropped `text` & `html` to avoid copyright issues.
4. Dropped `uri` as it seems to be the same as article ID
5. Dropped `byline.organization` as it has a high amount of null values.
6. Dropped `byline.person` as `byline.original` is more informative
7. Changed `keywords` data structure from nested dictionary to list of keywords per article
8. Renamed and re-organized columns to make them a bit more user-friendly.

In [2]:
com_folder_path = os.path.normpath(r'C:\Users\benja\OneDrive\Desktop\General Assembly\Capstone\Datasets\comments')
art_folder_path = os.path.normpath(r'C:\Users\benja\OneDrive\Desktop\General Assembly\Capstone\Datasets\articles')

# Obtaining dates in file -- e.g. 2020-01, 2020-02, 2020-03 
com_file_list = [file[0:7] for file in os.listdir(com_folder_path)]
art_file_list = [file[0:7] for file in os.listdir(art_folder_path)]

# Checking that dates for both sets of files match up
if com_file_list != art_file_list:
    print('Error - check date')

#### Comments

In [28]:
comment_list = []

for i in tqdm(com_file_list):
    with open(f'./datasets/comments/{i}-comments.pickle', 'rb') as c:
        comments_df = pickle.load(c).reset_index()
        comments_df = comments_df.drop
        comment_list.append(comments_df)

100%|██████████████████████████████████████████████████████████████████████████████████| 12/12 [00:12<00:00,  1.04s/it]


In [31]:
new_comments_df = pd.concat(comment_list).reset_index(drop=True)

In [33]:
new_comments_df.drop(columns=['status', 'commentTitle', 'userURL', 'picURL', 'userLocation', 
                          'userDisplayName', 'trusted', 'isAnonymous', 'updateDate', 'permID',
                          'parentUserDisplayName'], errors='ignore')

Unnamed: 0,commentID,commentSequence,userID,userTitle,commentBody,createDate,approveDate,recommendations,replyCount,editorsSelection,parentID,depth,commentType,recommendedFlag,articleID
0,104387472,104387472,60215558,,Here is something I think is fraudulent that v...,2020-01-01 01:05:46,2020-01-01 01:05:47,7,5,False,,1,comment,0,nyt://article/69a7090b-9f36-569e-b5ab-b0ba5bb3...
1,104387873,104387873,65691034,,@magicisnotreal I have used my VA loan option...,2020-01-01 01:52:25,2020-01-01 01:52:26,17,0,False,104387472.0,2,userReply,0,nyt://article/69a7090b-9f36-569e-b5ab-b0ba5bb3...
2,104387976,104387976,65110053,,@magi\n\nWhy would someone take out a VA loan ...,2020-01-01 02:06:05,2020-01-01 02:06:06,8,0,False,104387472.0,2,userReply,0,nyt://article/69a7090b-9f36-569e-b5ab-b0ba5bb3...
3,104390628,104390628,60215558,,@JD\nOut here in the Alabama of the PNW they w...,2020-01-01 14:38:50,2020-01-01 14:38:52,1,0,False,104387873.0,2,userReply,0,nyt://article/69a7090b-9f36-569e-b5ab-b0ba5bb3...
4,104391463,104391463,65691034,,@magicisnotreal just a guess but I doubt that...,2020-01-01 16:23:14,2020-01-01 16:23:15,1,0,False,104390628.0,2,userReply,0,nyt://article/69a7090b-9f36-569e-b5ab-b0ba5bb3...
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4986456,110852726,110852726,13499330,,"A country with a vast intelligence apparatus, ...",2021-01-01 22:37:00,2021-01-01 22:37:02,3,0,False,,1,comment,0,nyt://article/12048b2b-62e3-5bed-8c77-483a4299...
4986457,110852733,110852733,89151844,,"I hate to burst anyone's bubble, but once you ...",2021-01-01 22:39:04,2021-01-01 22:39:05,2,0,False,,1,comment,0,nyt://article/12048b2b-62e3-5bed-8c77-483a4299...
4986458,110852904,110852904,4736419,,Why does the U.S. Government put so much data ...,2021-01-01 23:02:14,2021-01-01 23:02:15,8,0,False,,1,comment,0,nyt://article/12048b2b-62e3-5bed-8c77-483a4299...
4986459,110847985,110847985,80310536,,"Another comment said (and I paraphrase): ""Assu...",2021-01-01 15:52:17,2021-01-02 00:16:57,5,0,False,,1,comment,0,nyt://article/12048b2b-62e3-5bed-8c77-483a4299...


In [34]:
comments_df.to_csv('./datasets/NYT_2020_comments_final.csv', index=False)

#### Articles

In [3]:
# Merge articles into single dataframe
df_list = []

for i in tqdm(art_file_list):
    with open(f'./datasets/articles/{i}-articles.pickle', 'rb') as a, \
    open(f'./datasets/comments/{i}-comments.pickle', 'rb') as c:
        
        # Load pickle as dataframe and reset index
        articles_df = pickle.load(a).reset_index()
        comments_df = pickle.load(c).reset_index()
        
         # Drop redundant columns
        articles_df.drop(columns=['print_section', 'print_page', 'headline.content_kicker',
                                  'headline.print_headline', 'headline.name', 'headline.seo', 'headline.sub', 
                                  'text', 'html', 'uri', 'byline.person', 'byline.organization', 'source', 'multimedia',
                                 'snippet', 'document_type', 'headline.kicker', 'lead_paragraph'], 
                                    inplace=True, errors='ignore')
       
        # Access number of comments in df
        comment_series = comments_df['articleID'].value_counts().to_frame().reset_index()
        comment_series.columns=['_id', 'n_comments']
        
        # Create new df only for articles with comments
        new_df = articles_df.merge(comment_series, on=['_id'])
        df_list.append(new_df)

100%|██████████████████████████████████████████████████████████████████████████████████| 12/12 [00:39<00:00,  3.28s/it]


In [4]:
new_article_df = pd.concat(df_list).reset_index(drop=True)

In [5]:
# Pickling to preserve data structure
new_article_df.to_pickle(f'./datasets/main.pickle')

#### Setting up Keywords

In [6]:
# Keywords are contained in dictionaries
new_article_df['keywords']

0        [{'name': 'subject', 'value': 'Veterans', 'ran...
1        [{'name': 'subject', 'value': 'Crossword Puzzl...
2        [{'name': 'subject', 'value': 'Meteors and Met...
3        [{'name': 'subject', 'value': 'Space and Astro...
4        [{'name': 'subject', 'value': 'Space and Astro...
                               ...                        
16782    [{'name': 'subject', 'value': 'Chronic Conditi...
16783    [{'name': 'organizations', 'value': 'San Diego...
16784    [{'name': 'subject', 'value': 'Banking and Fin...
16785    [{'name': 'subject', 'value': 'Wines', 'rank':...
16786    [{'name': 'organizations', 'value': 'Microsoft...
Name: keywords, Length: 16787, dtype: object

In [7]:
key_df = pd.DataFrame(new_article_df['keywords'].values.tolist(), index=new_article_df.index)

In [8]:
keywords = key_df.applymap(lambda x: x.get('value', np.nan) \
                        if isinstance(x, dict) else np.nan)
keywords.head(3)

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,32,33,34,35,36,37,38,39,40,41,42,43,44,45,46,47,48,49,50,51,52,53,54,55,56,57,58,59,60,61,62,63,64,65,66
0,Veterans,For-Profit Schools,Financial Aid (Education),Frauds and Swindling,Colleges and Universities,Veterans Affairs Department,Federal Trade Commission,University of Phoenix,Career Education Corporation,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
1,Crossword Puzzles,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
2,Meteors and Meteorites,Space and Astronomy,Earth,Solar System,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,


In [9]:
keyword_list = keywords.stack().reset_index()[0].value_counts()

In [10]:
keyword_list

Coronavirus (2019-nCoV)                  5380
Trump, Donald J                          2830
United States Politics and Government    2751
Presidential Election of 2020            2071
Biden, Joseph R Jr                       1141
                                         ... 
ACTH (Adrenocorticotropic Hormone)          1
Hopper (Mobile App)                         1
Tersigni, Nicole                            1
Samos (Greece)                              1
Lowry, Erin                                 1
Name: 0, Length: 19234, dtype: int64

In [11]:
def get_keywords(row):
    key_list = []
    for _, i in enumerate(row):
        if i is np.nan:
            return key_list
        else:
            key_list.append(i)

In [12]:
new_article_df['keywords'] = keywords.apply(get_keywords, axis=1)

#### Splitting Data by Month

In [13]:
new_article_df['month'] = new_article_df['pub_date'].apply(lambda x: x.month)

In [14]:
test = new_article_df[new_article_df['month'] >= 10]
train = new_article_df[new_article_df['month'] < 10]

In [15]:
train['is_popular'] = train['n_comments'].apply(lambda x: 1 if x > 90 else 0)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  train['is_popular'] = train['n_comments'].apply(lambda x: 1 if x > 90 else 0)


In [16]:
test['is_popular'] = test['n_comments'].apply(lambda x: 1 if x > 90 else 0)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  test['is_popular'] = test['n_comments'].apply(lambda x: 1 if x > 90 else 0)


In [17]:
train['is_popular'].value_counts(normalize=True)

0    0.503752
1    0.496248
Name: is_popular, dtype: float64

In [18]:
train = train.drop(columns=['month', 'byline.original', 'web_url', 'lead_paragraph'], errors='ignore')
test = test.drop(columns=['month', 'n_comments', 'byline.original', 'web_url', 'lead_paragraph'], errors='ignore')

#### Renaming Columns

In [19]:
train.columns = ['uniqueID', 'abstract', 'keywords', 'pub_date', 'newsdesk', 'section', 'material', 'word_count', 'headline',
                'subsection', 'n_comments', 'is_popular']

In [20]:
train = train[['newsdesk', 'section', 'subsection', 'material', 'headline', 'abstract', 'keywords', 'word_count', 'pub_date', 
               'is_popular', 'n_comments', 'uniqueID']]

In [21]:
train.head(3)

Unnamed: 0,newsdesk,section,subsection,material,headline,abstract,keywords,word_count,pub_date,is_popular,n_comments,uniqueID
0,Editorial,Opinion,,Editorial,Protect Veterans From Fraud,Congress could do much more to protect America...,"[Veterans, For-Profit Schools, Financial Aid (...",680,2020-01-01 00:18:54+00:00,1,186,nyt://article/69a7090b-9f36-569e-b5ab-b0ba5bb3...
1,Games,Crosswords & Games,,News,‘It’s Green and Slimy’,Christina Iverson and Jeff Chen ring in the Ne...,[Crossword Puzzles],931,2020-01-01 03:00:10+00:00,1,257,nyt://article/9edddb54-0aa3-5835-a833-d311a76f...
2,Science,Science,,News,Meteor Showers in 2020 That Will Light Up Nigh...,"All year long, Earth passes through streams of...","[Meteors and Meteorites, Space and Astronomy, ...",1057,2020-01-01 05:00:08+00:00,0,6,nyt://article/04bc90f0-b20b-511c-b5bb-3ce13194...


In [22]:
test.columns

Index(['_id', 'abstract', 'keywords', 'pub_date', 'news_desk', 'section_name',
       'type_of_material', 'word_count', 'headline.main', 'subsection_name',
       'is_popular'],
      dtype='object')

In [23]:
test.columns = ['uniqueID', 'abstract', 'keywords', 'pub_date', 'newsdesk', 'section', 'material', 'word_count', 'headline',
                'subsection', 'is_popular']

In [24]:
test = test[['newsdesk', 'section', 'subsection', 'material', 'headline', 'abstract', 'keywords',
             'word_count', 'pub_date', 'is_popular', 'uniqueID']]

In [25]:
test.head(5)

Unnamed: 0,newsdesk,section,subsection,material,headline,abstract,keywords,word_count,pub_date,is_popular,uniqueID
12792,OpEd,Opinion,,Op-Ed,Anyone Else Want to See Trump ‘Shut Up’?,Our president as a terrible toddler.,"[Presidential Election of 2020, Biden, Joseph ...",925,2020-10-01 00:05:51+00:00,1,nyt://article/e467c2ae-2df3-5836-a6ca-b23d0d33...
12793,OpEd,Opinion,,Op-Ed,Trump Calls on Extremists to ‘Stand By’,"Instead of condemning violent groups, the pres...","[Presidential Election of 2020, United States ...",902,2020-10-01 00:43:28+00:00,1,nyt://article/9a7ef9e0-1334-56b2-a7f1-288c4887...
12794,OpEd,Opinion,,Op-Ed,"Can Mike Espy Make History, Again?",If the Democratic Party claims to value Black ...,"[Black People, Blacks, Presidential Election o...",1412,2020-10-01 00:45:17+00:00,1,nyt://article/4bb2b763-0088-5e10-b204-19e404f7...
12795,Games,Crosswords & Games,,News,In Which Rikishi Wear Mawashi,Adam Fromm is on the line.,[Crossword Puzzles],849,2020-10-01 02:00:05+00:00,1,nyt://article/0d96205f-edb8-5f1f-8c44-1ddf6ed5...
12796,Sports,Sports,Pro Football,News,N.F.L. Week 4 Predictions: Our Picks Against t...,Tom Brady and the Buccaneers are building mome...,"[Football, New England Patriots, Kansas City C...",2690,2020-10-01 04:01:16+00:00,0,nyt://article/afc8295b-3c22-5a5f-9539-3f77b7b8...


In [26]:
train.to_pickle('./datasets/train.pickle')

In [27]:
test.to_pickle('./datasets/test.pickle')