In [1]:
import numpy as np
import pandas as pd
import os
import re
import collections
import time
from nltk.corpus import stopwords
# Cache stopword set so access is faster
stop_words = set(stopwords.words('english'))
stop_words.add("'s")

In [2]:
big_regex = re.compile(r'\b%s\b' % r'\b|\b'.join(map(re.escape, stop_words)))

In [3]:
wiki = pd.read_excel('../../Data/Other/wiki_bigrams.xlsx')
wiki['Info1'] = wiki['Info1'].apply(lambda info: str(info))

In [4]:
# wiki['Name'] = wiki['Info1'].apply(lambda text: str(str(text).split(';')[0].lower().strip()))
# wiki['Info1'] = wiki['Info1'].apply(lambda text: ';'.join(str(text).split(';')[1:]))
# wiki = wiki.drop_duplicates(subset=None, keep='first', inplace=False)

In [5]:
meta_data = '../../Data/MetaData'

**Iterate through Directories and Match Corresponding Files**

In [22]:
files = []
for file in os.listdir(meta_data):
    if file != '.DS_Store' and file != '.ipynb_checkpoints' and file != 'README.txt':
        files.append(os.path.join(meta_data, file))

In [23]:
def remove_stopwords(name):
    return big_regex.sub('', name).strip()

def remove_punc(text):
    punctuation = '''(){};'"\,<>/@#%^&*_~'''
    return ';'.join(word.translate(str.maketrans('', '', punctuation)).strip() for word in text.split(';'))

In [24]:
dataframes = []
start = time.time()
for file in files:
    df = pd.read_csv(file)
    df.columns = ['Title', 'Orgs', 'Persons', 'Other']
    df['Persons'] = df['Persons'].apply(lambda per: re.sub('_', ' ', str(per)))
    df['Persons'] = df['Persons'].apply(lambda per: ';'.join(remove_stopwords(name) for name in str(per).split(';')))
    df['Persons'] = df['Persons'].apply(lambda per: remove_punc(per))
    df['Persons Frequency'] = df['Persons'].apply(lambda names: 
                                              collections.Counter([name for name in names.split(';') if len(name.split()) > 1]))
    df['Persons Frequency'] = df['Persons Frequency'].apply(lambda d: 
                                                            ', '.join(['{}: {}'.format(k, v) for k,v in d.items()]))
    # Added 02/22/2022
    df['Persons Frequency'] = df['Persons Frequency'].apply(
        lambda list_of_names: ', '.join(['{}: {}'.format(re.sub(r'\w*\d\w*', '', name.split(': ')[0].strip()), name.split(': ')[1]) for name in str(list_of_names).split(', ') if str(name).split(': ') != ['nan']]))  

    del df['Persons']
    del df['Orgs']
    del df['Other']
    dataframes.append((file, df))
end = time.time()
print('Time to Format Files: {} seconds'.format(end-start))

Time to Format Files: 82.36394596099854 seconds


In [26]:
for entry in dataframes:
    start = time.time()
    entry[1]['Bigrams of Interest'] = None
    for i in range(len(entry[1].index)):
        boi = []
        for name in entry[1].loc[i, 'Persons Frequency'].split(', '):
            # person_ind = wiki.loc[wiki['Name'] == name.split(': ')[0].lower()].index[0]
            if len(wiki.loc[wiki['Name'] == name.split(': ')[0].lower()]) > 0 and len(str(wiki.loc[wiki.loc[wiki['Name'] == name.split(': ')[0].lower()].index[0], 'Info1']).split(';')) > 1:  # If name is in wiki_bigrams 
                boi.append(name)
        if len(boi) > 0:
            entry[1].loc[i, 'Bigrams of Interest'] = ', '.join(boi)
        else:
            entry[1].loc[i, 'Bigrams of Interest'] = 'No Hits'
    end = time.time()
    print('Time Taken to Merge Wiki for {}: {} seconds'.format(entry[0], end-start))

Time Taken to Merge Wiki for ../Data/MetaData/Metadata.NERS.FOXNEWS.2012.20210920.csv: 364.1233208179474 seconds
Time Taken to Merge Wiki for ../Data/MetaData/Metadata.NERS.Bloomberg.2013.20210920.csv: 15.108757972717285 seconds
Time Taken to Merge Wiki for ../Data/MetaData/Metadata.NERS.Bloomberg.2014.20210920.csv: 190.24271726608276 seconds
Time Taken to Merge Wiki for ../Data/MetaData/Metadata.NERS.FOXNEWS.2015.20210920.csv: 335.5486238002777 seconds
Time Taken to Merge Wiki for ../Data/MetaData/Metadata.NERS.CNN.2012.20210920.csv: 343.12055015563965 seconds
Time Taken to Merge Wiki for ../Data/MetaData/Metadata.NERS.CNN.2015.20210920.csv: 385.0451169013977 seconds
Time Taken to Merge Wiki for ../Data/MetaData/Metadata.NERS.MSNBC.2019.20210920.csv: 358.2010838985443 seconds
Time Taken to Merge Wiki for ../Data/MetaData/Metadata.NERS.FBC.2015.20210920.csv: 172.6181240081787 seconds
Time Taken to Merge Wiki for ../Data/MetaData/Metadata.NERS.FBC.2012.20210920.csv: 40.202451944351196 s

In [27]:
for entry in dataframes:
    entry[1].to_csv('../../OutputPersons/PersonsBreakdown/{}.csv'.format('_'.join(entry[0].split('.')[4:6])), index=False)

In [28]:
attributes = ['Male', 'Female', 'Politician', 'American', 'Foreign', 'Republican', 'Democratic', 'Chief Executive Officer', 
              'Chief Financial Officer', 'Hedge Fund Manager', 'Investor', 'Billionaire', 'Lawyer', 'Television Reporters', 'Television Hosts']

for entry in dataframes:
    start = time.time()
    entry[1]['Count of Unique Indivuals'] = None
    entry[1]['Count by Mention of Individuals'] = None
    for i in range(len(entry[1].index)):
        unique, total = {}, {}
        for att in attributes:  # Initialize entry dictionary
            unique[att] = 0
            total[att] = 0
        if entry[1].loc[i, 'Bigrams of Interest'] != 'No Hits':
            for name in entry[1].loc[i, 'Bigrams of Interest'].split(', '):
                person_index = wiki.loc[wiki['Name'] == name.split(': ')[0].lower()].index[0] # Grab index of person in wiki_bigrams
                for att in attributes: # Find out who they are
                    if att == 'Female':
                        if att.lower() in wiki.loc[person_index, 'Info1'].lower() or 'women' in wiki.loc[person_index, 'Info1'].lower():
                            unique[att] += 1
                            total[att] += 1*int(name.split(': ')[1])
                    elif att == 'Male':
                        if 'female' not in wiki.loc[person_index, 'Info1'].lower() and 'women' not in wiki.loc[person_index, 'Info1'].lower():
                            unique['Male'] += 1
                            total['Male'] += 1*int(name.split(': ')[1])
                    elif att == 'Foreign':
                        if 'american' not in wiki.loc[person_index, 'Info1'].lower():
                            unique['Foreign'] += 1
                            total['Foreign'] += 1*int(name.split(': ')[1])
                    else:
                        if att.lower() in wiki.loc[person_index, 'Info1'].lower():
                            unique[att] += 1
                            total[att] += 1*int(name.split(': ')[1])
            entry[1].loc[i, 'Count of Unique Indivuals'] = ', '.join(['{}: {}'.format(k, v) for k, v in unique.items()])
            entry[1].loc[i, 'Count by Mention of Individuals'] = ', '.join(['{}: {}'.format(k, v) for k, v in total.items()])
        else: 
            entry[1].loc[i, 'Count of Unique Indivuals'] = 'No Hits'
            entry[1].loc[i, 'Count by Mention of Individuals'] = 'No Hits'
    entry[1].to_csv('../../OutputPersons/PersonsBreakdown/{}.csv'.format('_'.join(entry[0].split('.')[4:6])), index=False)
    # entry[1].to_csv(entry[0], index=False)
    end = time.time()
    print('Time: ', end-start, 'seconds for ', entry[0])

Time:  118.95132207870483 seconds for  ../Data/MetaData/Metadata.NERS.FOXNEWS.2012.20210920.csv
Time:  4.959038972854614 seconds for  ../Data/MetaData/Metadata.NERS.Bloomberg.2013.20210920.csv
Time:  58.912769079208374 seconds for  ../Data/MetaData/Metadata.NERS.Bloomberg.2014.20210920.csv
Time:  112.88255906105042 seconds for  ../Data/MetaData/Metadata.NERS.FOXNEWS.2015.20210920.csv
Time:  102.45041799545288 seconds for  ../Data/MetaData/Metadata.NERS.CNN.2012.20210920.csv
Time:  119.15330505371094 seconds for  ../Data/MetaData/Metadata.NERS.CNN.2015.20210920.csv
Time:  119.52055811882019 seconds for  ../Data/MetaData/Metadata.NERS.MSNBC.2019.20210920.csv
Time:  56.22607207298279 seconds for  ../Data/MetaData/Metadata.NERS.FBC.2015.20210920.csv
Time:  12.428621768951416 seconds for  ../Data/MetaData/Metadata.NERS.FBC.2012.20210920.csv
Time:  39.997377157211304 seconds for  ../Data/MetaData/Metadata.NERS.CNBC.2013.20210920.csv
Time:  36.231377840042114 seconds for  ../Data/MetaData/Met