<h1>Table of Contents<span class="tocSkip"></span></h1>
<div class="toc"><ul class="toc-item"><li><span><a href="#|-Preliminaries" data-toc-modified-id="|-Preliminaries-1"><span class="toc-item-num">1&nbsp;&nbsp;</span>| Preliminaries</a></span></li><li><span><a href="#|-Cleaning" data-toc-modified-id="|-Cleaning-2"><span class="toc-item-num">2&nbsp;&nbsp;</span>| Cleaning</a></span></li><li><span><a href="#|-Named-Entity-Recognition-w/-spaCy" data-toc-modified-id="|-Named-Entity-Recognition-w/-spaCy-3"><span class="toc-item-num">3&nbsp;&nbsp;</span>| Named Entity Recognition w/ spaCy</a></span><ul class="toc-item"><li><span><a href="#Split-first,-middle-and-last-name-in-different-cols" data-toc-modified-id="Split-first,-middle-and-last-name-in-different-cols-3.1"><span class="toc-item-num">3.1&nbsp;&nbsp;</span>Split first, middle and last name in different cols</a></span></li><li><span><a href="#|-Enrich-first-name-if-only-last-name-is-mentioned" data-toc-modified-id="|-Enrich-first-name-if-only-last-name-is-mentioned-3.2"><span class="toc-item-num">3.2&nbsp;&nbsp;</span>| Enrich first name if only last name is mentioned</a></span></li></ul></li><li><span><a href="#|-Gender-Guesser" data-toc-modified-id="|-Gender-Guesser-4"><span class="toc-item-num">4&nbsp;&nbsp;</span>| Gender Guesser</a></span></li><li><span><a href="#|-Group-per-article-and-get-share-of-gender-per-article" data-toc-modified-id="|-Group-per-article-and-get-share-of-gender-per-article-5"><span class="toc-item-num">5&nbsp;&nbsp;</span>| Group per article and get share of gender per article</a></span></li><li><span><a href="#|-Wikidata-Query" data-toc-modified-id="|-Wikidata-Query-6"><span class="toc-item-num">6&nbsp;&nbsp;</span>| Wikidata Query</a></span></li></ul></div>

# | Preliminaries

In [1]:
import os
import json
import pandas as pd
import numpy as np
import regex as re
from pandarallel import pandarallel
import pathlib

#from bs4 import BeautifulSoup as bs

# SpaCy 
import spacy
import en_core_web_trf

nlp = spacy.load("en_core_web_trf")

from spacy.matcher import PhraseMatcher
from spacy.tokens import Span

import warnings
warnings.filterwarnings('ignore') # (action='once')

In [2]:
pandarallel.initialize(progress_bar=True)
#pandarallel.initialize()

INFO: Pandarallel will run on 8 workers.
INFO: Pandarallel will use standard multiprocessing data transfer (pipe) to transfer data between the main process and workers.

https://nalepae.github.io/pandarallel/troubleshooting/


In [3]:
def splitFile(TRECfile, quantity, total_lines=729000):
    """
    Split the big TREC Washington Post JSON-lines file into smaller files.

    TREC_Washington_Post_collection.v4.jl has 728626 lines and is about 15 GB,
    which is too large to read into one DataFrame on most PCs. The small files
    are written to a subfolder 'small' next to the big file. If that folder
    already contains files, nothing is split and the existing files are listed
    instead.

    args:
    TRECfile = complete path + filename of the big .jl file
    quantity = how many small files you want to create
    total_lines = assumed number of lines in the big file; together with
                  `quantity` it determines the number of lines per small file

    returns:
    list of paths of the small files (sorted by name when they already exist),
    or None if TRECfile does not exist
    """
    # Directory of the big file. os.path.dirname understands the separators of
    # the current platform, unlike a manual split on '\\'.
    TRECpath = os.path.dirname(TRECfile)

    # Creating a subfolder 'small' for the new files
    TRECpathSmall = os.path.join(TRECpath, 'small')
    if not os.path.exists(TRECpathSmall):
        os.makedirs(TRECpathSmall)
        print(f'{TRECpathSmall} wurde erstellt.')

    # If the folder 'small' already has data, skip the split and list it.
    existing_names = sorted(os.listdir(TRECpathSmall))
    if existing_names:
        return [os.path.join(TRECpathSmall, name) for name in existing_names]

    # At least one line per file, so the modulo below can never divide by zero.
    lines_per_file = max(1, int(total_lines / quantity))

    smallfiles = []
    smallfile = None
    try:
        # Explicit UTF-8 so the result does not depend on the platform's
        # default encoding.
        with open(TRECfile, encoding='utf-8') as bigfile:
            for lineno, line in enumerate(bigfile):
                if lineno % lines_per_file == 0:
                    # Start the next small file.
                    if smallfile:
                        smallfile.close()
                    small_filename = f'TREC_Washington_Post_small_{len(smallfiles) + 1}.jl'
                    small_filename = os.path.join(TRECpathSmall, small_filename)
                    print(f'{small_filename} wird erstellt')
                    smallfiles.append(small_filename)
                    smallfile = open(small_filename, "w", encoding='utf-8')
                smallfile.write(line)
    except FileNotFoundError:
        # Only a missing input file is reported this way; any other error
        # propagates instead of being mislabelled as "not found".
        print('TREC_Washington_Post_collection.v4.jl nicht gefunden')
        return None
    finally:
        # Close the last (or partially written) small file in every case.
        if smallfile:
            smallfile.close()
    return smallfiles

# PATH TO YOUR TREC_Washington_Post_collection JSON
# (machine-specific absolute path; adjust it before running the notebook)
myTRECfile = 'C:\\Users\\Felix\\github\\privat\\dis25_abgabe\\data\\wpdata\\data\\TREC_Washington_Post_collection.v4.jl'
# Split the collection into 30 parts, or list the parts if the 'small' folder
# already contains files.
WpDataSmall = splitFile(myTRECfile, 30)

WpDataSmall

['C:\\Users\\Felix\\github\\privat\\dis25_abgabe\\data\\wpdata\\data\\small\\TREC_Washington_Post1.pickle',
 'C:\\Users\\Felix\\github\\privat\\dis25_abgabe\\data\\wpdata\\data\\small\\TREC_Washington_Post2.pickle',
 'C:\\Users\\Felix\\github\\privat\\dis25_abgabe\\data\\wpdata\\data\\small\\TREC_Washington_Post3.pickle',
 'C:\\Users\\Felix\\github\\privat\\dis25_abgabe\\data\\wpdata\\data\\small\\TREC_Washington_Post4.pickle']

In [4]:
def SmallFilesToPickle(listOfSmallFiles):
    """
    Convert the small JSON-lines files to pickled DataFrames.

    The file type is decided from the suffix of the FIRST entry only:
    - 'jl'     : every file is read, rows with missing values are dropped,
                 the columns 'contents', 'type' and 'source' are removed if
                 present, the frame is pickled next to the source file and the
                 source file is deleted afterwards.
    - 'pickle' : the list is returned unchanged.
    - other    : None is returned.

    args:
    listOfSmallFiles = list of file paths (as returned by splitFile)

    returns:
    list of pickle paths, the unchanged input for an empty/None input or a
    list of pickles, or None for an unsupported file type
    """
    # Nothing to convert (also covers splitFile having returned None).
    if not listOfSmallFiles:
        return listOfSmallFiles

    fileending = pathlib.Path(listOfSmallFiles[0]).suffix[1:]
    if fileending == 'pickle':
        return listOfSmallFiles
    if fileending != 'jl':
        return None

    listOfSmallFilesPickle = []
    for filename in listOfSmallFiles:
        df = pd.read_json(filename, lines=True)

        # Drop empty rows
        df = df.dropna()

        # delete duplicated and unnecessary columns (missing ones are ignored)
        df = df.drop(columns=['contents', 'type', 'source'], errors='ignore')

        fileNameNew = os.path.splitext(filename)[0] + '.pickle'
        # Write the pickle BEFORE deleting the source, so a failed write
        # cannot lose the data.
        df.to_pickle(fileNameNew)
        os.remove(filename)
        listOfSmallFilesPickle.append(fileNameNew)
        del df
    return listOfSmallFilesPickle

# Convert the split .jl files to pickles; a list that already consists of
# pickles is passed through unchanged.
WpDataSmall = SmallFilesToPickle(WpDataSmall)

WpDataSmall

['C:\\Users\\Felix\\github\\privat\\dis25_abgabe\\data\\wpdata\\data\\small\\TREC_Washington_Post1.pickle',
 'C:\\Users\\Felix\\github\\privat\\dis25_abgabe\\data\\wpdata\\data\\small\\TREC_Washington_Post2.pickle',
 'C:\\Users\\Felix\\github\\privat\\dis25_abgabe\\data\\wpdata\\data\\small\\TREC_Washington_Post3.pickle',
 'C:\\Users\\Felix\\github\\privat\\dis25_abgabe\\data\\wpdata\\data\\small\\TREC_Washington_Post4.pickle']

In [5]:
# ----------------- DELETE THIS CELL IF ALL FILES SHOULD BE USED  ---------------

# Keep only the first small file, so the rest of the notebook runs on a subset.
del WpDataSmall[1:]
WpDataSmall

# -------------------------------------------------------------------------------

['C:\\Users\\Felix\\github\\privat\\dis25_abgabe\\data\\wpdata\\data\\small\\TREC_Washington_Post1.pickle']

In [6]:
def createDfFromSmallFiles(listOfSmallFiles):
    """
    Load all pickled small files and stack them into one DataFrame.

    The file type is checked on the first entry only. If it is not a
    '.pickle' file, nothing is loaded and None is returned. The original row
    indices of the individual frames are kept as they are.

    args:
    listOfSmallFiles = list of paths to pickled DataFrames

    returns:
    the concatenated DataFrame, or None for non-pickle input
    """
    first_suffix = pathlib.Path(listOfSmallFiles[0]).suffix
    if first_suffix[1:] != 'pickle':
        return None

    # One frame per file, concatenated in list order.
    frames = [pd.read_pickle(path) for path in listOfSmallFiles]
    return pd.concat(frames)

df = createDfFromSmallFiles(WpDataSmall)

# Drop empty rows
df = df.dropna()

# Randomize the rows and reset to a new running index. The fixed seed makes
# the shuffle, and everything derived from the row order, reproducible.
df = df.sample(frac=1, random_state=42).reset_index(drop=True)

In [7]:
#%%time

'''with open('.../Downloads/WashingtonPost.v4/data/TREC_Washington_Post_collection.v4.jl') as json_file:      
    data = json_file.readlines()
    # this line below may take at least 8-10 minutes of processing for 4-5 million rows. It converts all strings in list to actual json objects. 
    data = list(map(json.loads, data)) 

df = pd.DataFrame(data)'''


'''filename = ".../Downloads/WashingtonPost.v4/data/TREC_Washington_Post_collection.v4.jl"
n = sum(1 for line in open(filename)) - 1 #number of records in file (excludes header)
s = 10000 #desired sample size
skip = sorted(random.sample(range(1,n+1),n-s)) #the 0-indexed header will not be included in the skip list
df = pd.read_json(filename)'''

#df = pd.read_json(".../Downloads/WashingtonPost.v4/data/TREC_Washington_Post_collection.v4.jl", lines=True,nrows=10000)

df

Unnamed: 0,id,article_url,title,author,published_date,type,source,content
0,89a053c4278d9a2f0f33c04dbb063f50,https://www.washingtonpost.com/news/morning-mi...,"Ex-con accused of pretending to be a cop, and ...",Abby Ohlheiser,1413471516000,blog,The Washington Post,"[{'content': 'Morning Mix', 'mime': 'text/plai..."
1,73918e68bbfb57a73fb49228844749ec,https://www.washingtonpost.com/news/dc-sports-...,Stoglin: Maryland would be a top 25 team under...,Dan Steinberg,1397049034000,blog,The Washington Post,"[{'content': 'D.C. Sports Bog', 'mime': 'text/..."
2,6d345b8c44756fecc7c9c686f46497d6,https://www.washingtonpost.com/news/comic-riff...,SMALL PRESS EXPO: Here are your nominees for t...,Michael Cavna,1408381196000,blog,The Washington Post,"[{'content': 'Comic Riffs', 'mime': 'text/plai..."
3,b08ccbe3b2500c74a4a483c847338797,https://www.washingtonpost.com/news/capital-we...,PM Update: Scattered storms overnight; warm wi...,Angela Fritz,1435697890000,blog,The Washington Post,"[{'content': 'Capital Weather Gang', 'mime': '..."
4,6a1b7fb010c266c49059bebe93f47e05,https://www.washingtonpost.com/news/early-lead...,College softball players prove winning at life...,Marissa Payne,1399129205000,blog,The Washington Post,"[{'content': 'Early Lead', 'mime': 'text/plain..."
...,...,...,...,...,...,...,...,...
193187,81e4a998f29267fc704e0b02b91968d1,https://www.washingtonpost.com/news/dc-sports-...,Chris Webber tells Robert Griffin III world’s ...,Dan Steinberg,1399920787000,blog,The Washington Post,"[{'content': 'D.C. Sports Bog', 'mime': 'text/..."
193188,0f01dc61c03c3070ad6aabdfe59da8cc,https://www.washingtonpost.com/blogs/compost/w...,"Dennis Rodman’s weird, terrible, un-Wonderful ...",Alexandra Petri,1379437363000,blog,The Washington Post,"[{'content': 'ComPost', 'mime': 'text/plain', ..."
193189,a260f536f1e41dbcf208ca7ebfd762b3,https://www.washingtonpost.com/news/post-polit...,Cleveland and Dallas are finalists to host 201...,Sean Sullivan,1403721565000,blog,The Washington Post,"[{'content': 'Post Politics', 'mime': 'text/pl..."
193190,46ac96ce-482c-11e1-bfd9-c630ec256905,https://www.washingtonpost.com/sports/highscho...,"Boys’ basketball: At 5-9, Ashe plays big for F...",James Wagner,1327609413000,article,The Washington Post,"[{'content': 'AllMetSports', 'mime': 'text/pla..."


In [8]:
def dateconvert(x):
    """
    Convert a row's epoch-millisecond 'published_date' to a 'YYYY-MM-DD' string.

    args:
    x = row object exposing a `published_date` attribute (milliseconds since
        the Unix epoch)

    returns:
    the date as 'YYYY-MM-DD' string, or the unchanged `published_date` value
    if it cannot be converted (e.g. missing, not numeric, out of range)
    """
    # Imported inside the function as in the original cell; presumably so the
    # function is self-contained when shipped to pandarallel workers - TODO confirm.
    import datetime
    try:
        # NOTE: fromtimestamp() without a tz argument converts to the local
        # time zone of the machine, so dates close to midnight depend on it.
        ts = datetime.datetime.fromtimestamp(x.published_date / 1000)
        # year-month-day without time
        return ts.strftime('%Y-%m-%d')
    except (TypeError, ValueError, OverflowError, OSError):
        # Best effort: keep the raw value. Unlike a bare `except`, this does
        # not swallow KeyboardInterrupt/SystemExit.
        return x.published_date

# Convert the epoch-millisecond timestamp in 'published_date' into a plain
# 'YYYY-MM-DD' date string (row-wise, in parallel)
df['published_date'] = df.parallel_apply(dateconvert, axis=1)

VBox(children=(HBox(children=(IntProgress(value=0, description='0.00%', max=24149), Label(value='0 / 24149')))…

In [9]:
def createText(x):
    """
    Build the plain-text body of one article from its 'content' JSON.

    Only entries whose 'subtype' is 'paragraph' are used. Their 'content'
    strings are concatenated (each followed by one space), markup tags are
    removed and non-breaking spaces are replaced by normal spaces.

    args:
    x = row object exposing a `content` attribute (list of content records)

    returns:
    the article text as a string, or None if the content cannot be processed
    """
    try:
        # Local imports kept from the original cell; presumably so the function
        # is self-contained inside pandarallel workers - TODO confirm.
        import re
        import pandas as pd

        dfcontent = pd.json_normalize(x.content, max_level=1)
        dfcontent = dfcontent[dfcontent.subtype == 'paragraph']
        #dfcontent = dfcontent[dfcontent.mime == 'text/html']

        # Each paragraph is followed by a single space.
        contentText = ''.join(paragraph + ' ' for paragraph in dfcontent['content'])

        # Strip tags and non-breaking spaces once on the joined text instead
        # of re-scanning the whole accumulated text after every paragraph.
        contentText = re.sub('<[^<]+?>', '', contentText)
        return contentText.replace(u'\xa0', u' ')
    except Exception:
        # Best effort: an article with malformed content yields None.
        return None

# Build the plain-text article body from the 'content' JSON (row-wise, in parallel)
df['text'] = df.parallel_apply(createText, axis=1)

# Once the content JSON has been converted to text, the column could be
# dropped (kept disabled):
# !!! df = df.drop(columns='content')

VBox(children=(HBox(children=(IntProgress(value=0, description='0.00%', max=24149), Label(value='0 / 24149')))…

# | Cleaning

In [10]:
df

Unnamed: 0,id,article_url,title,author,published_date,type,source,content,text
0,89a053c4278d9a2f0f33c04dbb063f50,https://www.washingtonpost.com/news/morning-mi...,"Ex-con accused of pretending to be a cop, and ...",Abby Ohlheiser,2014-10-16,blog,The Washington Post,"[{'content': 'Morning Mix', 'mime': 'text/plai...","When Roberto Eddy Santos left prison, he chang..."
1,73918e68bbfb57a73fb49228844749ec,https://www.washingtonpost.com/news/dc-sports-...,Stoglin: Maryland would be a top 25 team under...,Dan Steinberg,2014-04-09,blog,The Washington Post,"[{'content': 'D.C. Sports Bog', 'mime': 'text/...",As news came out Monday afternoon that three h...
2,6d345b8c44756fecc7c9c686f46497d6,https://www.washingtonpost.com/news/comic-riff...,SMALL PRESS EXPO: Here are your nominees for t...,Michael Cavna,2014-08-18,blog,The Washington Post,"[{'content': 'Comic Riffs', 'mime': 'text/plai...","Kim Deitch, Nick Drnaso, Sophie Goldstein and ..."
3,b08ccbe3b2500c74a4a483c847338797,https://www.washingtonpost.com/news/capital-we...,PM Update: Scattered storms overnight; warm wi...,Angela Fritz,2015-06-30,blog,The Washington Post,"[{'content': 'Capital Weather Gang', 'mime': '...","(post originally published at 4:58 p.m., updat..."
4,6a1b7fb010c266c49059bebe93f47e05,https://www.washingtonpost.com/news/early-lead...,College softball players prove winning at life...,Marissa Payne,2014-05-03,blog,The Washington Post,"[{'content': 'Early Lead', 'mime': 'text/plain...",It’s not every day that your rivals feel like ...
...,...,...,...,...,...,...,...,...,...
193187,81e4a998f29267fc704e0b02b91968d1,https://www.washingtonpost.com/news/dc-sports-...,Chris Webber tells Robert Griffin III world’s ...,Dan Steinberg,2014-05-12,blog,The Washington Post,"[{'content': 'D.C. Sports Bog', 'mime': 'text/...","Possibly for the last time this postseason, Ch..."
193188,0f01dc61c03c3070ad6aabdfe59da8cc,https://www.washingtonpost.com/blogs/compost/w...,"Dennis Rodman’s weird, terrible, un-Wonderful ...",Alexandra Petri,2013-09-17,blog,The Washington Post,"[{'content': 'ComPost', 'mime': 'text/plain', ...",When confronted with Dennis Rodman’s new North...
193189,a260f536f1e41dbcf208ca7ebfd762b3,https://www.washingtonpost.com/news/post-polit...,Cleveland and Dallas are finalists to host 201...,Sean Sullivan,2014-06-25,blog,The Washington Post,"[{'content': 'Post Politics', 'mime': 'text/pl...",And then there were two. The Republican Nation...
193190,46ac96ce-482c-11e1-bfd9-c630ec256905,https://www.washingtonpost.com/sports/highscho...,"Boys’ basketball: At 5-9, Ashe plays big for F...",James Wagner,2012-01-26,article,The Washington Post,"[{'content': 'AllMetSports', 'mime': 'text/pla...","At first glance, Friendship Collegiate’s slend..."


In [None]:
with pd.option_context('display.max_colwidth', None):
  display(df[:2])

In [None]:
def cleaning_for_NER(df):
    """
    Prepare the article frame for named entity recognition.

    Works on a copy, so the frame passed in is not modified:
    - rows with a duplicated 'id' are dropped (first occurrence is kept),
    - 'content' is converted to its string representation and every literal
      "{'content': '" marker in it is replaced by a single space.

    args:
    df = DataFrame with at least the columns 'id' and 'content'

    returns:
    the cleaned copy of the DataFrame
    """
    df = df.drop_duplicates(subset=['id']).copy()

    # Raw string: "\{" is an invalid escape sequence in a normal string literal.
    df["content"] = df["content"].apply(lambda x: re.sub(r"\{'content': '", " ", str(x)))

    return df

In [None]:
# Save the cleaned data (de-duplicated ids, 'content' as string) back into df
df = cleaning_for_NER(df)

In [None]:
#df.drop(columns = ['contents'], inplace = True)
df

In [None]:
# Replace missing or empty titles with the placeholder 'no title' (no rows are
# dropped). Only the 'title' column is touched: a frame-wide replace of ""
# would also write 'no title' into other columns such as 'author' or 'text'.
# Assigning the result back avoids the chained in-place replace on a column,
# and Series.isna() avoids np.NaN, which no longer exists in NumPy 2.
df["title"] = df["title"].mask(df["title"].isna() | (df["title"] == ""), 'no title')

#df.dropna(subset = ["title"], inplace=True)

In [None]:
# Concatenate "title" and "content" of every row into "merged_total_text"
# (title first, no separator in between):
df['merged_total_text'] = [
    title + content for title, content in zip(df['title'], df['content'])
]

In [None]:
print(df.head(2))

# | Named Entity Recognition w/ spaCy

In [None]:
df_1 = df.copy()

In [None]:
# Run spaCy NER over every article and collect one
# [article id, entity text, entity label] record per recognised entity.
# NOTE(review): the pipeline is fed the 'content' column (the stringified
# content JSON), not 'text' or 'merged_total_text' - confirm this is intended.
# Same import form as the later enrichment cell, so the name `tqdm` refers to
# the same object no matter which cell ran last.
from tqdm import tqdm
parsed_articles = []
# `article_id` instead of `id`, so the builtin id() is not shadowed for the
# rest of the notebook.
for article, article_id in tqdm(zip(df_1['content'], df_1['id']), total=len(df_1)):
    parsed_articles.extend([article_id, str(entity), entity.label_] for entity in nlp(article).ents)

In [None]:
# Build the entity table from the collected records, using the final column
# names right away (the entity text goes into 'entity').
entity_columns = ['id', 'entity', 'entity_type']
df_2 = pd.DataFrame(parsed_articles, columns=entity_columns)

In [None]:
# merge back to initial df: one row per (article, entity) pair.
# pd.merge defaults to an inner join, so articles without any recognised
# entity do not appear in the result.
df = (pd.merge(df_1, df_2, on='id'))

In [None]:
df.to_pickle(r"Washington_Post_NER_all_10K.pkl")

In [None]:
df_ner = df.copy()

In [None]:
# Reload the NER result from the same relative path it was written to above,
# instead of a user-specific absolute Downloads path.
df = pd.read_pickle("Washington_Post_NER_all_10K.pkl")

In [None]:
# NER 'PERSON' STARTS HERE
# Keep only entities labelled PERSON. The explicit .copy() makes the result an
# independent frame, so the column assignments in the following cells do not
# operate on a slice of the full entity table.
df = df.loc[df['entity_type'] == 'PERSON'].copy()

In [None]:
# Remove brackets, quotes, digits, commas, '<' and '/' from the entity strings.
# regex=True is required: since pandas 2.0 str.replace treats the pattern as a
# literal string by default, which would make this line a silent no-op.
df["entity"] = df['entity'].str.replace(r'[\[\]"\'\d,</]', '', regex=True)


In [None]:
# Replace stage names / initialisms (Sting, Bono, Cher etc.) with the full name
# according to Wikipedia.
ENTITY_ALIASES = {
    'JFK': 'John Fitzgerald Kennedy',
    'FDR': 'Franklin Delano Roosevelt',
    'Sting': 'Gordon Matthew Sumner',
    'Bono': 'Paul David Hewson',
    'Cher': 'Cherilyn Sarkisian',
    'Madonna': 'Madonna Louise Ciccone',
    'Adele': 'Adele Laurie Adkins',
    'Eminem': 'Marshall Bruce Mathers',
    'Beyonce': 'Beyoncé Knowles-Carter',
    'Blaine Friedlander': 'Blaine P. Friedlander Jr.',
}
# Series.replace with a dict matches whole cell values only. Names that merely
# contain an alias (e.g. "Chertoff" contains "Cher") are left alone, and
# re-running the cell does not expand "Madonna" or "Adele" a second time.
df['entity'] = df['entity'].replace(ENTITY_ALIASES)

In [None]:
df[:40]

In [None]:
# Remove rows whose entity is exactly the article's author: only persons
# mentioned within the article are wanted.
df = df[df['author'] != df['entity']]
# To do: name variants are not caught by this exact comparison
# (e.g. "Blaine P. Friedlander Jr." vs. "Blaine Friedlander")

## Split first, middle and last name in different cols

In [None]:
# Write first, middle and last name in different cols
def split_name(df, var):
    """
    Split the names in column `var` into first, middle and last name.

    The name is split on whitespace:
    - no token (empty or missing value): all three parts are ''
    - 1 token : last name only
    - 2 tokens: first and last name
    - 3+ tokens: first token is the first name, last token is the last name,
      everything in between (joined by single spaces) is the middle name

    args:
    df  = DataFrame containing the name column
    var = name of the column holding the full names

    returns:
    DataFrame with the columns 'first_name', 'middle_name', 'last_name' and
    the same index as `df`
    """
    columns = ['first_name', 'middle_name', 'last_name']
    result = []

    for value in df[var]:
        # Tokenise each name on its own. Taking the last element of a row that
        # was padded to the longest name yields the padding, not the last token.
        tokens = value.split() if isinstance(value, str) else []
        info = dict.fromkeys(columns, '')

        if len(tokens) == 1:
            info['last_name'] = tokens[0]
        elif len(tokens) == 2:
            info['first_name'], info['last_name'] = tokens
        elif len(tokens) >= 3:
            info['first_name'] = tokens[0]
            info['last_name'] = tokens[-1]
            info['middle_name'] = ' '.join(tokens[1:-1])
        result.append(info)
    return pd.DataFrame(result, columns=columns, index=df.index)

# Split every entity into its name parts and attach them as new columns;
# split_name returns a frame on df's index, so the join is row-aligned.
df_names = split_name(df, 'entity')
df = df.join(df_names)

## | Enrich first name if only last name is mentioned

In [None]:
# Per article id: list of the distinct entity strings found in that article
names = df.groupby('id')['entity'].apply(lambda x: list(np.unique(x)))
# To do: if only one token ("Obama") drop or don't write to list

In [None]:
# function for enriching first name if only last name is mentioned
from tqdm import tqdm
tqdm.pandas()
def enrich_firstname(row):
    """
    Placeholder for filling in a missing first name from other mentions.

    Currently the function only reads 'entity', 'first_name' and 'last_name'
    from the row and returns the row unchanged; the enrichment logic is just
    sketched in the comments below.
    """
    
    entity = row['entity']
    firstname = row['first_name']
    lastname = row['last_name']
    
    
    # Sketch of the intended logic (not implemented):
    #if len(firstname) == 0:
        # look in list per id for key and write value?
    # elif 
    # do not overwrite existing first names
    #else:
    #    row['first_name'] = 'unknown'
    return row

# NOTE(review): `df_test` is not assigned anywhere in the visible notebook, so
# this line raises NameError on a fresh kernel - presumably `df` was meant;
# confirm before running.
df_test = df_test.progress_apply(enrich_firstname, axis=1)


# | Gender Guesser

In [None]:
# Requires the gender_guesser package (!pip install gender_guesser)
import gender_guesser.detector as gender
gd = gender.Detector()
# Look up each first name in its capitalized form (first letter upper case,
# the rest lower case) and store the detector's answer per row.
capitalized_first_names = df['first_name'].apply(str.capitalize)
df['gender_guesser'] = capitalized_first_names.map(gd.get_gender)

> __unknown__ (name not found), __andy__ (androgynous), __male__, __female__, __mostly_male__, or __mostly_female__. The difference between andy and unknown is that the former is found to have the same probability to be male as to be female, while the latter means that the name wasn’t found in the database.

In [None]:
df.gender_guesser.value_counts()

In [None]:
df.loc[df['gender_guesser'] == 'unknown'][:40]

In [None]:
# df.first_name.value_counts()[:40]

In [None]:
# to do:
# clean the names
# add another library for gender identification
# write a function: if only the last name is mentioned, look in the list / other rows of the same article to see whether the name was already mentioned, then fill in the first name
# or
# for Obama, Santorum, Gingrich, etc., fill in the first name from a hand-made list or via Wikidata
# # dominique rodgers cromartie etc. (famous people)  -> Wikidata? 

# | Group per article and get share of gender per article

In [None]:
# next

# | Wikidata Query

In [None]:
# oder das https://stackoverflow.com/questions/51419785/extract-data-from-wikidata-in-python

import requests

# SPARQL query for the Wikidata item that the English Wikipedia article
# "Angela_Merkel" is about, together with its gender (property P21).
# NOTE(review): ?occupation and ?bdayLabel are selected but never bound in the
# WHERE clause, and the wdt:/wikibase:/bd: prefixes are not declared in the
# query itself - confirm the endpoint predefines them and extend the WHERE
# clause if occupation / birthday are actually needed.
sparql_query = """
        prefix schema: <http://schema.org/>
        SELECT ?item ?occupation ?genderLabel ?bdayLabel
        WHERE {
            <https://en.wikipedia.org/wiki/Angela_Merkel> schema:about ?item .
            ?item wdt:P21 ?gender .
            SERVICE wikibase:label { bd:serviceParam wikibase:language "en" }
        }
    """

In [None]:
url = 'https://query.wikidata.org/sparql'

# sleep(2)
# Send the SPARQL query to the Wikidata endpoint and ask for a JSON response.
# NOTE(review): the following cell repeats this same request.
r = requests.get(url, params={'format': 'json', 'query': sparql_query})

In [None]:
url = 'https://query.wikidata.org/sparql'

# Explicit timeout: without one, requests waits indefinitely for the server.
r = requests.get(url, params={'format': 'json', 'query': sparql_query}, timeout=30)
# Fail with a clear HTTP error (e.g. rate limiting, malformed query) instead of
# a confusing JSON decoding error on an error page.
r.raise_for_status()
data = r.json()

print(data['results']['bindings'])