## Function:
This notebook is used to add the attributes of the attributer to the dataframe of the quotebank to facilitate the use of subsequent data analysis.

## Input File:
quotes-[time]-[press]-all.json.bz2: The file of the quotes with selected time, meida and topic.

speaker_attributes.parquet: The file with person's attribute represented by encoded labels.

wikidata_labels_descriptions_quotebank.csv.bz2: The file with meaningful labels corresponding with the encoded lables.

## Output File:
[press]_[time]_with_people.csv: The file with both quotes information and attributers' information.


## Code:

In [2]:
#read the original quote file
import pandas as pd
import numpy as np

file_path = ".\DATA\quotes-2018-nytimes-all.json.bz2"

df = pd.read_json(file_path, compression = 'bz2', lines=True)
df.head()

Unnamed: 0,quoteID,quotation,speaker,qids,date,numOccurrences,probas,urls,phase,category
0,2018-06-24-002690,And it takes courage because youre going to ge...,Aaron Boone,"[Q4661862, Q4661865]",2018-06-24 03:24:12,2,"[[Aaron Boone, 0.7611], [None, 0.2389]]",[https://www.nytimes.com/2018/06/23/sports/bas...,E,sports
1,2018-12-19-006360,And we need new rules,Kevin Parker,"[Q11738820, Q20985626, Q20985628, Q6397199, Q6...",2018-12-19 00:24:14,1,"[[Kevin Parker, 0.8207], [None, 0.1793]]",[https://www.nytimes.com/2018/12/18/nyregion/k...,E,uncategorized
2,2018-10-15-007043,Angela Merkel has governed for an extraordinar...,Wolfgang Schäuble,[Q16019],2018-10-15 23:53:42,1,"[[Wolfgang Schäuble, 0.9016], [None, 0.0878], ...",[https://www.nytimes.com/2018/10/15/world/euro...,E,politics
3,2018-03-29-009637,Arent insurance companies not allowed to go ba...,Li Yan,"[Q1255395, Q16216647, Q27469859, Q2754794, Q45...",2018-03-29 00:00:00,2,"[[Li Yan, 0.8483], [None, 0.1516]]",[http://www.businesstimes.com.sg/banking-finan...,E,politics
4,2018-01-23-009447,as intricate as the storytelling is seamless,Manohla Dargis,[Q441327],2018-01-23 06:00:20,2,"[[Manohla Dargis, 0.9017], [None, 0.065], [Jak...",[https://www.nytimes.com/2018/01/23/arts/telev...,E,arts


In [3]:
#read the meaningful label file
file_path = ".\DATA\Attributers\wikidata_labels_descriptions_quotebank.csv.bz2"

df_labels = pd.read_csv(file_path, compression = 'bz2', index_col = 'QID')
df_labels.head()

Unnamed: 0_level_0,Label,Description
QID,Unnamed: 1_level_1,Unnamed: 2_level_1
Q31,Belgium,country in western Europe
Q45,Portugal,country in southwestern Europe
Q75,Internet,global system of connected computer networks
Q148,People's Republic of China,sovereign state in East Asia
Q155,Brazil,country in South America


In [4]:
#read the people's information file
file_path = ".\DATA\Attributers\speaker_attributes.parquet"

df_people = pd.read_parquet(file_path)
df_people.head()

Unnamed: 0,aliases,date_of_birth,nationality,gender,lastrevid,ethnic_group,US_congress_bio_ID,occupation,party,academic_degree,id,label,candidacy,type,religion
0,"[Washington, President Washington, G. Washingt...",[+1732-02-22T00:00:00Z],"[Q161885, Q30]",[Q6581097],1395141751,,W000178,"[Q82955, Q189290, Q131512, Q1734662, Q294126, ...",[Q327591],,Q23,George Washington,"[Q698073, Q697949]",item,[Q682443]
1,"[Douglas Noel Adams, Douglas Noël Adams, Dougl...",[+1952-03-11T00:00:00Z],[Q145],[Q6581097],1395737157,[Q7994501],,"[Q214917, Q28389, Q6625963, Q4853732, Q1884422...",,,Q42,Douglas Adams,,item,
2,"[Paul Marie Ghislain Otlet, Paul Marie Otlet]",[+1868-08-23T00:00:00Z],[Q31],[Q6581097],1380367296,,,"[Q36180, Q40348, Q182436, Q1265807, Q205375, Q...",,,Q1868,Paul Otlet,,item,
3,"[George Walker Bush, Bush Jr., Dubya, GWB, Bus...",[+1946-07-06T00:00:00Z],[Q30],[Q6581097],1395142029,,,"[Q82955, Q15982858, Q18814623, Q1028181, Q1408...",[Q29468],,Q207,George W. Bush,"[Q327959, Q464075, Q3586276, Q4450587]",item,"[Q329646, Q682443, Q33203]"
4,"[Velázquez, Diego Rodríguez de Silva y Velázqu...",[+1599-06-06T00:00:00Z],[Q29],[Q6581097],1391704596,,,[Q1028181],,,Q297,Diego Velázquez,,item,


In [None]:
#append 4 colums to the original dataframe which contain people's information
df['Gender'] = 'None'
df['Nationality'] = 'None'
df['Occupation'] = 'None'
df['Party'] = 'None'
for i, row in df.iterrows():
    if i%100 == 0:
        print(i)
    if(row['speaker'] != 'None'):
        res_gender = []
        res_nationality = []
        res_occupation = []
        res_party = []
        for j, qid in enumerate(row['qids']):
            temp_gender = []
            temp_nationality = []
            temp_occupation = []
            temp_party = []
            person = df_people[df_people['id'] == qid]
            if person.shape[0] == 0:
                temp_gender.append('Not Found People')
                temp_nationality.append('Not Found People')
                temp_occupation.append('Not Found People')
                temp_party.append('Not Found People')
            else:
                person = person.iloc[0]
                if isinstance(person['gender'], np.ndarray):
                    for j, label in enumerate(person['gender']):
                        if label in df_labels.index:
                            temp_gender.append(df_labels.loc[label]['Label'])
                        else:
                            temp_gender.append(label+" not in label table")
                else:
                    temp_gender.append('No data')
                
                if isinstance(person['nationality'], np.ndarray):
                    for j, label in enumerate(person['nationality']):
                        if label in df_labels.index:
                            temp_nationality.append(df_labels.loc[label]['Label'])
                        else:
                            temp_nationality.append(label+" not in label table")
                else:
                    temp_nationality.append('No data')
                
                if isinstance(person['occupation'], np.ndarray):
                    for j, label in enumerate(person['occupation']):
                        if label in df_labels.index:
                            temp_occupation.append(df_labels.loc[label]['Label'])
                        else:
                            temp_occupation.append(label+" not in label table")
                else:
                    temp_occupation.append('No data')
                
                if isinstance(person['party'], np.ndarray):
                    for j, label in enumerate(person['party']):
                        if label in df_labels.index:
                            if(label == 'Q29468' or label == 'Q29552'):
                                temp_party.append(df_labels.loc[label]['Label'] + '**')
                            else:
                                temp_party.append(df_labels.loc[label]['Label'])
                        else:
                            temp_party.append(label+" not in label table")
                else:
                    temp_party.append('No data')

            res_gender.append(temp_gender)
            res_nationality.append(temp_nationality)
            res_occupation.append(temp_occupation)
            res_party.append(temp_party)
        
        row['Gender'] = res_gender
        row['Nationality'] = res_nationality
        row['Occupation'] = res_occupation
        row['Party'] = res_party

        df.iloc[i] = row

In [39]:
'''
all appended columns are list type data:
List: Contains the information of all qids corresponding to the speaker
sublist: Contains the information of one particular qid (each qid may have multiple values on one attribute, such as occupation and nationality)

When no speaker is found, the value is 'Not Found People'
When a particular attribute is not found for one speaker, the value is 'No data'
When can't find a clear text for the encoded attribute, the value is 'Qxxxxx not in label table'
'''
df.head()

Unnamed: 0,quoteID,quotation,speaker,qids,date,numOccurrences,probas,urls,phase,category,Gender,Nationality,Occupation,Party
0,2018-06-24-002690,And it takes courage because youre going to ge...,Aaron Boone,"[Q4661862, Q4661865]",2018-06-24 03:24:12,2,"[[Aaron Boone, 0.7611], [None, 0.2389]]",[https://www.nytimes.com/2018/06/23/sports/bas...,E,sports,"[[male], [male]]","[[United States of America], [United States of...","[[American football player], [baseball player]]","[[No data], [No data]]"
1,2018-12-19-006360,And we need new rules,Kevin Parker,"[Q11738820, Q20985626, Q20985628, Q6397199, Q6...",2018-12-19 00:24:14,1,"[[Kevin Parker, 0.8207], [None, 0.1793]]",[https://www.nytimes.com/2018/12/18/nyregion/k...,E,uncategorized,"[[male], [male], [male], [male], [male], [male...","[[No data], [No data], [No data], [United Stat...","[[lawyer, judge], [Australian rules football p...","[[No data], [No data], [No data], [Republican ..."
2,2018-10-15-007043,Angela Merkel has governed for an extraordinar...,Wolfgang Schäuble,[Q16019],2018-10-15 23:53:42,1,"[[Wolfgang Schäuble, 0.9016], [None, 0.0878], ...",[https://www.nytimes.com/2018/10/15/world/euro...,E,politics,[[male]],[[Germany]],"[[jurist, politician, lawyer]]",[[Christian Democratic Union]]
3,2018-03-29-009637,Arent insurance companies not allowed to go ba...,Li Yan,"[Q1255395, Q16216647, Q27469859, Q2754794, Q45...",2018-03-29 00:00:00,2,"[[Li Yan, 0.8483], [None, 0.1516]]",[http://www.businesstimes.com.sg/banking-finan...,E,politics,,,,
4,2018-01-23-009447,as intricate as the storytelling is seamless,Manohla Dargis,[Q441327],2018-01-23 06:00:20,2,"[[Manohla Dargis, 0.9017], [None, 0.065], [Jak...",[https://www.nytimes.com/2018/01/23/arts/telev...,E,arts,,,,


In [None]:
#Save the result to a csv file for future use
df.to_csv("nytime_2018_with_people.csv", index_label="index_label")