# GPT Data Cleaning

In [29]:
import numpy as np
import scipy.stats as stats
import seaborn as sns
import matplotlib.pyplot as plt
import pandas as pd
import requests
from bs4 import BeautifulSoup

import pprintpp
# Pretty-printer with a 4-space indent, available for ad-hoc inspection of nested objects.
pp = pprintpp.PrettyPrinter(indent=4)

# Apply the 'ggplot' style sheet to any matplotlib figure drawn in this notebook.
plt.style.use('ggplot')

# Render figures inline in the notebook, using the 'retina' figure format.
%config InlineBackend.figure_format = 'retina'
%matplotlib inline

In [30]:
# Location of the raw GPT answers, one row per member, relative to this notebook.
# NOTE(review): unpickling can execute arbitrary code, so only load pickles this
# project generated itself.
raw_members_path = "../Data-Acquisition/members_gpt.pkl"

df = pd.read_pickle(raw_members_path)
df.head()

Unnamed: 0,id,full_title,went_private_school,is_from,ethnicity,is_part_of_lgbtq,said_something_racist,said_something_sexist
0,172,Rt Hon Diane Abbott MP,Rt Hon Diane Abbott MP attended Harrow County ...,"Diane Abbott MP was born in Paddington, London...",Diane Abbott MP is of Jamaican heritage.,"No, Rt Hon Diane Abbott MP is not part of the ...",There is no evidence that Rt Hon Diane Abbott ...,"No, there is no evidence that Rt Hon Diane Abb..."
1,3305,The Lord Aberconway,"The Lord Aberconway attended Eton College, a p...","The Lord Aberconway was born in London, Englan...",Lord Aberconway was a British aristocrat of En...,"No, the Lord Aberconway is not part of the LGB...","No, there is no evidence that Lord Aberconway ...",The Lord Aberconway has not been known to say ...
2,3469,His Grace the Duke of Abercorn,His Grace the Duke of Abercorn attended Eton C...,His Grace the Duke of Abercorn was born in Lon...,The Duke of Abercorn is of Scottish ancestry.,"No, His Grace the Duke of Abercorn does not ap...","No, the Duke of Abercorn has never been known ...","No, His Grace the Duke of Abercorn has not sai..."
3,3468,The Rt Hon. the Lord Aberdare KBE DL,The Rt Hon. the Lord Aberdare KBE DL attended ...,The Rt Hon. the Lord Aberdare KBE DL was born ...,Lord Aberdare was of Welsh descent.,"No, The Rt Hon. the Lord Aberdare KBE DL is no...",The Rt Hon. the Lord Aberdare KBE DL has no re...,There is no evidence that The Rt Hon. the Lord...
4,3898,The Lord Aberdare,"The Lord Aberdare attended Rugby School, a pre...","The Lord Aberdare was born in Merthyr Tydfil, ...","The Lord Aberdare, also known as David Charles...","No, The Lord Aberdare is not part of the LGBTQ...","No, there is no evidence that The Lord Aberdar...",There is no record of Lord Aberdare making any...


## Went to private school?

In [31]:
## Turn the free-text GPT answer in `went_private_school` into "1" / "0" / "NA".
#
# Each keyword is searched for with Series.str.contains (a regular-expression
# search by default, so "." matches any character). A matching row has its whole
# answer replaced by the group's label, which means a row labelled by an earlier
# group is no longer touched by a later one.

## Private -> "1"
private_keywords = [
    "is a private",
    "was a private",
    "attended a private",
    ", a private",
    "the private",
    "private boarding school",
    "is a co-educational private school",
    ", a co-educational private",
    "was a co-educational private",
    "is a coeducational private",
    ", a coeducational private",
    "is considered a private school",
    "is an independent",
    ", an independent",
    "the independent",
    ", independent",
    ", a co-educational independent",
    ", a coeducational independent",
    "is a co-educational independent",
    "is a coeducational independent",
    "independent boarding school",
    "is a selective independent",
    ", a selective independent",
    ", a selective",
    "which is a selective",
    ", a selective private",
    "fee-paying independent",
    ", a leading independent school",
    ", a British independent school",
    ", a girls' independent school",
    ", an all-girls private",
    "the all-girls private",
    ", all-girls private",
    "a girls-only selective independent school",
    "all-boys private school",
    ", a boys' independent school",
    ", an all-boys private",
    ", a boys' private school",
    ", a boarding and day independent school",
    "a Roman Catholic private",
    "a Roman Catholic independent",
    "a Catholic private",
    "is a Catholic independent",
    "a Church of England private",
    "a Church of England independent",
    ", a prestigious private",
    ", a prestigious independent",
    "is a prestigious private",
    "the prestigious private",
    "the most prestigious private",
    "at home by private tutors",
]

## Public (state-funded) -> "0"
public_keywords = [
    "not a private school",
    ", a non-private",
    "is a non-private",
    "was a non-private",
    "did not attend a private school",
    "is not considered to be private",
    "is not considered a private",
    "was a state",
    "is a state",
    ", a state",
    "is a state-funded",
    "was a state-funded",
    ", a state-run school",
    "the state-run",
    "state secondary school",
    "attended a state school",
    "is a public",
    "was a public",
    ", a public",
    ", a comprehensive public school",
    "is a comprehensive public school",
    ", comprehensive public school",
    ", a prestigious public school",
    "a non-fee paying",
    "a non-fee-paying",
    ", non-fee paying",
    ", non-fee-paying",
    "both of which are public schools",
    "rather than a private school.",
    "is a comprehensive, public school",
]

## Unknown -> "NA"
unknown_keywords = [
    "is not possible",
    "not enough information",
    "information is not available",
    "there is no record",
    "no way to answer",
    "no answer",
    "cannot answer",
    "Unfortunately",
    "did not attend",
    "did not go to a secondary school",
    "is not known to have attended",
    "does not appear to have attended a secondary school",
]

# Groups are applied in this order: private, then public, then unknown.
school_rules = (
    ("1", private_keywords),
    ("0", public_keywords),
    ("NA", unknown_keywords),
)

for school_label, school_keywords in school_rules:
    for keyword in school_keywords:
        matches = df['went_private_school'].str.contains(keyword)
        df.loc[matches, 'went_private_school'] = school_label

# Share of members per label.
df['went_private_school'].value_counts(normalize=True)

1     0.541234
0     0.426708
NA    0.032058
Name: went_private_school, dtype: float64

In [32]:
# Recode the "NA" placeholder as "2" (information not available).
# The code is written as the string "2", matching the "1"/"0" string labels already
# in the column. Assigning the integer 2 would leave a mixed str/int object column:
# `.str` methods return NaN for the non-string rows, so re-running the keyword
# matching on this column would fail.
df.loc[df['went_private_school'] == "NA", 'went_private_school'] = "2"
df['went_private_school'].value_counts()

1    2448
0    1930
2     145
Name: went_private_school, dtype: int64

## Is from?

In [33]:
### Turn the free-text GPT answer in `is_from` into a region label.
#
# How the rules behave:
# - Series.str.contains does a regular-expression search by default, so a "." in a
#   keyword matches any character.
# - A matching row has its whole answer overwritten with the label. None of the labels
#   used here contains a later keyword, so the first rule that matches a row wins.

### UK nations (other than England) and Ireland
# NOTE(review): "Wales" and "Scotland" are tested before the Abroad list, so a birthplace
# such as "New South Wales" would be labelled Wales. Confirm no such rows exist.

df.loc[df['is_from'].str.contains("Northern Ireland"), 'is_from'] = "Northern Ireland"
df.loc[df['is_from'].str.contains(", Ireland."), 'is_from'] = "Republic of Ireland"
df.loc[df['is_from'].str.contains("Wales"), 'is_from'] = "Wales"
df.loc[df['is_from'].str.contains("Scotland"), 'is_from'] = "Scotland"

### Abroad

abroad_keywords = ["Germany",
                   "Egypt",
                   "United States",
                   "USA",
                   "Iraq",
                   "Uganda",
                   "Kenya",
                   "India",
                   "Canada",
                   "Austria",
                   "Greece",
                   "France",
                   "Guyana",
                   "New Zealand",
                   "Hong Kong",
                   "Italy",
                   "Yemen",
                   "Bangladesh",
                   "South Africa",
                   "Poland",
                   "West Indies",
                   "Jamaica",
                   "Singapore",
                   "Alabama",
                   "island in the English Channel"]

for keyword in abroad_keywords:
    df.loc[df['is_from'].str.contains(keyword), 'is_from'] = "Abroad"

### South of England

england_south_keywords = ["is located in the South of England",
                          "is in the South of England.",
                          "England in the South of England.",
                          ", in the South of England.",
                          "in the East of England in the South",
                          "England, in the South.",
                          ", England in the South",
                          ", which is part of the South of England",
                          ", which is in the South.",
                          ", South of England.",
                          "in the South West of England.",
                          "in the South-West of England.",
                          "in the South West",
                          "in the South of England",
                          "in the south of England.",
                          "considered to be part of the South.",
                          "is considered part of the South of England",
                          "London, England",
                          "East Anglia",
                          "Norwich",
                          "Essex",
                          "Ipswich",
                          "Cambridge",
                          "Ilford",
                          "Suffolk"]

for keyword in england_south_keywords:
    df.loc[df['is_from'].str.contains(keyword), 'is_from'] = "England, South"

### North of England

england_north_keywords = ["England in the North",
                          "is in the North of England",
                          "England, in the North",
                          "which is located in the North.",
                          "which is in the North",
                          "is located in the North of England",
                          ", located in the North.",
                          ", in the North East of England",
                          ", in the North West of England",
                          "is located in the North West of England.",
                          "of the North West of England",
                          ", which is in North England",
                          "in the North of England."]

for keyword in england_north_keywords:
    df.loc[df['is_from'].str.contains(keyword), 'is_from'] = "England, North"

### Midlands
# The first two entries used to be fused into one never-matching string because the
# comma between them was missing (implicit string-literal concatenation).
# NOTE(review): "is located in the East of England" is labelled Midlands here, while the
# South list claims "in the East of England in the South" and East Anglia. Confirm intended.

england_midlands_keywords = ["Midlands of England",
                             ", which is in the Midlands",
                             ", which is located in the Midlands",
                             ", England in the Midlands",
                             "in the East Midlands.",
                             "which is located in the Midlands.",
                             "This is in the Midlands",
                             "city is in the Midlands",
                             "West Midlands, England",
                             ", in the Midlands.",
                             "is located in the East of England",
                             ", located in the Midlands",
                             "is located in the West Midlands",
                             "It is in the Midlands",
                             "It is located in the Midlands",
                             "considered part of the Midlands.",
                             ", which is in the West Midlands",
                             "which is in the Midlands",
                             "in the West Midlands",
                             ", which is considered to be in the Midlands",
                             "in the Midlands.",
                             ", West Midlands",
                             "and the Midlands region"]

for keyword in england_midlands_keywords:
    df.loc[df['is_from'].str.contains(keyword), 'is_from'] = "England, Midlands"

### Unknown birthplace

unknown_keywords = ["birthplace is not specified",
                    "city of birth is not specified",
                    "place of birth is unknown",
                    "birthplace is unknown",
                    "not possible to answer"]

for keyword in unknown_keywords:
    df.loc[df['is_from'].str.contains(keyword), 'is_from'] = "NA"

# Any answer that matched no rule still holds its raw text and shows up here as its own value.
df.is_from.value_counts()

England, South         2368
England, North          903
England, Midlands       485
Scotland                407
Wales                   154
Northern Ireland        112
Abroad                   63
Republic of Ireland      24
NA                        7
Name: is_from, dtype: int64

In [34]:
# Rewrite the multi-word region labels as single CamelCase tokens (no spaces or commas).
# Rows are selected by substring match on the old label, in the order listed.
compact_region_labels = {
    "England, South": "EnglandSouth",
    "England, North": "EnglandNorth",
    "England, Midlands": "EnglandMidlands",
    "Northern Ireland": "NorthernIreland",
    "Republic of Ireland": "RepublicOfIreland",
}

for spaced_label, compact_label in compact_region_labels.items():
    has_spaced_label = df['is_from'].str.contains(spaced_label)
    df.loc[has_spaced_label, 'is_from'] = compact_label

df['is_from'].value_counts()

EnglandSouth         2368
EnglandNorth          903
EnglandMidlands       485
Scotland              407
Wales                 154
NorthernIreland       112
Abroad                 63
RepublicOfIreland      24
NA                      7
Name: is_from, dtype: int64

## Ethnicity

### The agreed list of ethnic groups taken from the 2021 Census:

**Asian or Asian British**
Indian
Pakistani
Bangladeshi
Chinese
Any other Asian background

**Black, Black British, Caribbean or African**
Caribbean
African
Any other Black, Black British, or Caribbean background

**Mixed or multiple ethnic groups**
White and Black Caribbean
White and Black African
White and Asian
Any other Mixed or multiple ethnic background

**White**
English, Welsh, Scottish, Northern Irish or British
Irish
Gypsy or Irish Traveller
Roma
Any other White background

**Other ethnic group**
Arab
Any other ethnic group

In [35]:
# Turn the free-text GPT answer in `ethnicity` into one of: Mixed, Black, Asian, Other, White, NA.
#
# Groups are applied in the order below. A matching row has its whole answer overwritten
# with the group label, so a row labelled by an earlier group is not re-tested against the
# original sentence by later groups. Except for the White group (regex=False), keywords are
# searched as regular expressions, so a "." matches any character.

# Mixed or multiple ethnic groups
multiple_keywords = ["of mixed heritage",
                     "of mixed ethnicity",
                     "mixed-race",
                     "mixed race",
                     "mixed ancestry",
                     "of English and Jewish heritage",
                     "French and Algerian descent",
                     "Anglo-Indian",
                     "British and Nigerian",
                     "British and Jamaican",
                     "British and Caribbean",
                     "British and Jewish",
                     "Grenadian and British",
                     "British and Burmese",
                     "British and Japanese",
                     "Filipino and British",
                     "Iranian and British",
                     "Egyptian and British",
                     "Cree and Scottish",
                     "mixed English and Jewish",
                     "Irish and Filipino"]

for keyword in multiple_keywords:
    df.loc[df['ethnicity'].str.contains(keyword), 'ethnicity'] = "Mixed"

# Black, Black British, Caribbean or African
black_keywords = ["Jamaican heritage",
                  "Jamaican origin",
                  "Jamaican descent",
                  "African-American",
                  "African American",
                  "British-Ghanaian",
                  "African Caribbean",
                  "Nigerian",
                  "Ghanaian",
                  "Afro-Caribbean",
                  "of African descent",
                  "British Caribbean",
                  "Caribbean and African",
                  "British-Jamaican",
                  "Jamaican and Grenadian",
                  "African-Jamaican",
                  "African-Caribbean",
                  "of Caribbean heritage",
                  "British/Jamaican",
                  "Zimbabwean",
                  "of Caribbean descent",
                  "Ugandan",
                  "British-African"]

for keyword in black_keywords:
    df.loc[df['ethnicity'].str.contains(keyword), 'ethnicity'] = "Black"

# Asian or Asian British
asian_keywords = ["Iraqi",
                  "Chinese",
                  "Indian",
                  "British Pakistani",
                  "Bangladeshi",
                  "Pakistani",
                  "Iranian",
                  "Turkish-Cypriot",
                  "Sri Lanka",
                  "Korean",
                  "Punjabi",
                  "Asian"]

for keyword in asian_keywords:
    df.loc[df['ethnicity'].str.contains(keyword), 'ethnicity'] = "Asian"

# Other ethnic group
other_keywords = ["is Jewish",
                  "was Jewish",
                  "Jewish descent",
                  "Jewish ancestry",
                  "of Jewish heritage",
                  "Jewish British",
                  "of Jewish ethnicity",
                  "of British Jewish heritage",
                  "British Jewish",
                  "Hungarian-Jewish",
                  "British-Jewish",
                  "a British Jew",
                  "to a Jewish family",
                  "Native American",
                  "Colombian and Venezuelan",
                  "of Māori descent",
                  "Hispanic",
                  "Puerto Rican",
                  "Nicaragua"]

for keyword in other_keywords:
    df.loc[df['ethnicity'].str.contains(keyword), 'ethnicity'] = "Other"

# White (matched literally: regex=False)
white_keywords = ["white",
                  "White",
                  "Caucasian",
                  "British",
                  "English",
                  "Scottish",
                  "Welsh",
                  "Irish",
                  "Ireland",
                  "American",
                  "Anglo-Irish",
                  "Catholic",
                  "Italian",
                  "Englishman",
                  "Anglo-Saxon",
                  "European",
                  "Greek",
                  "Spanish",
                  "Austrian",
                  "Dutch",
                  "Polish",
                  "German",
                  "Russian",
                  "French",
                  "Cypriot",
                  "Belgian",
                  "Swedish",
                  "Hungarian",
                  "Estonian",
                  "French",
                  "Australian",
                  "New Zealand",
                  "South African",
                  "Canadian"]

for keyword in white_keywords:
    df.loc[df['ethnicity'].str.contains(keyword, regex=False), 'ethnicity'] = "White"

# Unknown / not stated.
# The entries "ethnicity is not publicly known" and "does not have a publicly disclosed
# ethnicity" used to be fused into one never-matching string because the comma between
# them was missing (implicit string-literal concatenation).
# NOTE(review): this group runs after the White group, so any answer containing e.g.
# "British" is already labelled White by the time these keywords are tested. In particular
# "a British character" can never match here. Confirm the ordering is intended.
unknown_keywords = ["not enough information",
                    "without more information",
                    "does not have an identified ethnicity.",
                    "ethnicity is unknown",
                    "does not have a specific ethnicity.",
                    "ethnicity is not specified.",
                    "ethnicity is not known.",
                    "ethnicity is not publicly known",
                    "does not have a publicly disclosed ethnicity",
                    "does not appear to have a specified ethnicity",
                    "does not have an ethnicity",
                    "has no known ethnicity",
                    "does not have a specific reported ethnicity",
                    "no definitive answer",
                    "does not disclose",
                    "not publicly known",
                    "it is not possible",
                    "has no ethnicity",
                    "does not have a publicly known ethnicity",
                    "does not have a",
                    "does not publicly disclose",
                    "does not appear to have",
                    "has not disclosed",
                    "no information available",
                    "is not publicly specified",
                    "is a fictitious character",
                    "is not publicly available",
                    "no record",
                    "no specific ethnicity",
                    "has not specified",
                    "no known ethnic background",
                    "not explicitly stated",
                    "further information",
                    "impossible to determine",
                    "no such person",
                    "does not list",
                    "more specific information",
                    "not possible",
                    "not a real person",
                    "a fictional character",
                    "not associated with a specific ethnicity",
                    "does not publicly discuss",
                    "does not exist",
                    "not known",
                    "is unknown",
                    "no known",
                    "not publicly disclosed",
                    "a British character"]

for keyword in unknown_keywords:
    df.loc[df['ethnicity'].str.contains(keyword), 'ethnicity'] = "NA"

# Any answer that matched no rule still holds its raw text and shows up here as its own value.
df.ethnicity.value_counts()

White    3384
NA        469
Black     239
Mixed     145
Asian     143
Other     143
Name: ethnicity, dtype: int64

In [36]:
# Expand the short working labels into census-style group names.
# Rows are selected by substring match on the short label, in the order listed.
# NOTE(review): the "Mixed" replacement contains a space and a lowercase "groups",
# unlike the other CamelCase labels. It is kept exactly as-is because the saved
# dataset carries this value.
census_group_names = {
    "Black": "BlackBlackBritishCaribbeanOrAfrican",
    "Mixed": "MixedOrMultipleEthnic groups",
    "Other": "OtherEthnicGroup",
    "Asian": "AsianOrAsianBritish",
}

for short_label, census_name in census_group_names.items():
    has_short_label = df['ethnicity'].str.contains(short_label)
    df.loc[has_short_label, 'ethnicity'] = census_name

df['ethnicity'].value_counts()

White                                  3384
NA                                      469
BlackBlackBritishCaribbeanOrAfrican     239
MixedOrMultipleEthnic groups            145
AsianOrAsianBritish                     143
OtherEthnicGroup                        143
Name: ethnicity, dtype: int64

## LGBTQ?

In [37]:
# Turn the free-text GPT answer in `is_part_of_lgbtq` into "1" (yes) or "0" (no).
# "Yes, " is applied first. "No" is then searched as a substring anywhere in the
# remaining answers (it is not anchored to the start), so any answer containing
# the characters "No" becomes "0".
yes_keywords = ["Yes, "]
no_keywords = ["No"]

lgbtq_rules = (
    ("1", yes_keywords),
    ("0", no_keywords),
)

for lgbtq_label, lgbtq_keywords in lgbtq_rules:
    for keyword in lgbtq_keywords:
        matches = df['is_part_of_lgbtq'].str.contains(keyword)
        df.loc[matches, 'is_part_of_lgbtq'] = lgbtq_label

# Share of members per label.
df['is_part_of_lgbtq'].value_counts(normalize=True)

0    0.998673
1    0.001327
Name: is_part_of_lgbtq, dtype: float64

## Racist?

In [38]:
# Turn the free-text GPT answer in `said_something_racist` into "yes" / "controversial" / "no".
#
# Keyword lists are declared first, then applied in a fixed order. A matching row has
# its whole answer overwritten with the label, so the order of the steps decides which
# label an answer receives. Keywords are searched with Series.str.contains, which is a
# regular-expression search by default ("." matches any character).
racist_col = 'said_something_racist'

yes_keywords = [
    "has been accused of",
    "a series of offensive comments",
]

controversial_keywords = [
    "controversial",
    ", been accused of racism",
    "criticized",
    "has, however,",
]

no_keywords = [
    "has never been reported as", "has not been known to", "has never been known to",
    "here is no record", "has not been recorded", "has never been recorded",
    "no evidence", "not known", "has not been reported",
    "has not publicly said", "has never made any public", "has not been publicly reported",
    "has not been publicly known to", "has never said anything",
    "has never been publicly associated with", "has never been publicly accused",
    "has not been publicly accused of", "has never been accused",
    "has not been accused of", "has never made any public comments",
    "has not been recorded as", "has never been reported",
    "never made any public statements", "has not said anything",
    "has not been documented", "has not made any public statements",
    "not possible to answer this question", "has not been publicly linked to",
    "no available evidence", "was never known to", "has never publicly",
    "no known record", "no indication", "no public record",
    "has not made any", "has made no", "has not been involved",
    "has not publicly", "have not said anything publicly",
    "has not been linked to any", "has never been linked to any",
    "has not been associated", "has not been accused",
    "no way to answer", "has no record of", "has not been directly reported",
    "is not clear", "it is unlikely", "has never been publicly reported",
    "unable to find", "has not said or written", "has not been publicly",
    "has not been widely reported", "never said anything publicly",
    "No.",
]

# Step 1: answers that open with "Yes".
df.loc[df[racist_col].str.startswith("Yes"), racist_col] = "yes"

# Steps 2-3: accusation phrases, then phrases marking a merely controversial record.
for racist_label, racist_keywords in (("yes", yes_keywords),
                                      ("controversial", controversial_keywords)):
    for keyword in racist_keywords:
        df.loc[df[racist_col].str.contains(keyword), racist_col] = racist_label

# Step 4: answers that open with "No" and were not labelled above.
df.loc[df[racist_col].str.startswith("No"), racist_col] = "no"

# Step 5: remaining denial phrasings.
for keyword in no_keywords:
    df.loc[df[racist_col].str.contains(keyword), racist_col] = "no"

df[racist_col].value_counts()


no               4476
controversial      30
yes                17
Name: said_something_racist, dtype: int64

In [39]:
# Replace the word labels with the codes from the data dictionary:
# "2" = controversial, "1" = yes, "0" = no. Rows are selected by substring match,
# in the order listed.
racist_codes = {
    "controversial": "2",
    "yes": "1",
    "no": "0",
}

for word_label, code in racist_codes.items():
    has_word_label = df['said_something_racist'].str.contains(word_label)
    df.loc[has_word_label, 'said_something_racist'] = code

df['said_something_racist'].value_counts()

0    4476
2      30
1      17
Name: said_something_racist, dtype: int64

## Sexist?

In [40]:
# Turn the free-text GPT answer in `said_something_sexist` into "yes" / "controversial" / "no".
#
# Steps run in the order below. A matching row has its whole answer overwritten with
# the label, so the order decides which label an answer receives. Keywords are searched
# with Series.str.contains, a regular-expression search by default.

# Step 1: answers that open with "Yes".
df.loc[df['said_something_sexist'].str.startswith("Yes"), 'said_something_sexist'] = "yes"

# Step 2: accusation phrases.
# The last two entries used to be fused into one never-matching string because the
# comma between them was missing (implicit string-literal concatenation).
yes_keywords = ["was accused of holding sexist opinions",
                "has been accused of making sexist comments",
                "was accused of sexism",
                "has been accused of sexism",
                "accused him of being sexist",
                "accused her of being sexist",
                "has made several remarks",
                "has made several comments",
                "making offensive comments",
                "referred to a female"]

for keyword in yes_keywords:
    df.loc[df['said_something_sexist'].str.contains(keyword), 'said_something_sexist'] = "yes"

# Step 3: criticism or contested voting record, but no outright accusation.
controversial_keywords = ["has been criticized",
                          "have been criticized",
                          "has been criticised",
                          "have been criticised",
                          "controversial comment",
                          "controversial statement",
                          "controversial remark",
                          "voted against",
                          "voting against",
                          "opposed a change"]

for keyword in controversial_keywords:
    df.loc[df['said_something_sexist'].str.contains(keyword), 'said_something_sexist'] = "controversial"

# Step 4: answers that open with "No" and were not labelled above.
df.loc[df['said_something_sexist'].str.startswith("No"), 'said_something_sexist'] = "no"

# Step 5: remaining denial phrasings, plus phrases describing the member as a supporter
# of gender equality.
# NOTE(review): 'o record' looks like a truncated keyword. As written it matches any
# text containing "o record". Confirm this is intended.
no_keywords = ['any evidence',
               'any record',
               'not make any',
               'not made any',
               'does not appear',
               'ever saying',
               'has never publicly',
               'has never said',
               'has not been',
               'has not been documented',
               'has not been linked to',
               'has not been publicly',
               'has not made any',
               'has not said',
               'has not said or expressed',
               'held any opinions',
               'held any sexist opinions',
               'is not possible',
               'never been',
               'never been accused of',
               'never been known to',
               'never expressed any',
               'never made any',
               'never publicly expressed',
               'never publicly said',
               'never said or expressed',
               'no evidence',
               'no indication',
               'no information',
               'no known history',
               'no public record',
               'not said anything sexist',
               'no record',
               'no report',
               'not appear to have',
               'not been accused of',
               'not been known to',
               'not been publicly associated',
               'not been recorded',
               'not been reported',
               'not expressed any',
               'not known',
               'not made any public statements',
               'not publicly',
               'not said anything publicly',
               'not widely known',
               'not, to our knowledge',
               'o record',
               'or held any known sexist opinions',
               'or held sexist opinions',
               'or hold any sexist opinions',
               'progressive',
               'promoting diversity and gender equality',
               'promoting gender equality',
               'strong supporter',
               'support for women',
               'advocate',
               'believer in gender equality',
               'champion',
               'fictional character',
               'long record of advocating']

for keyword in no_keywords:
    df.loc[df['said_something_sexist'].str.contains(keyword), 'said_something_sexist'] = "no"

# Any answer that matched no rule still holds its raw text and shows up here as its own value.
df.said_something_sexist.value_counts()

no               4453
yes                39
controversial      31
Name: said_something_sexist, dtype: int64

In [45]:
# Replace the word labels with the codes from the data dictionary:
# "2" = controversial, "1" = yes, "0" = no. Rows are selected by substring match,
# in the order listed.
sexist_codes = {
    "controversial": "2",
    "yes": "1",
    "no": "0",
}

for word_label, code in sexist_codes.items():
    has_word_label = df['said_something_sexist'].str.contains(word_label)
    df.loc[has_word_label, 'said_something_sexist'] = code

df['said_something_sexist'].value_counts()

0    4453
1      39
2      31
Name: said_something_sexist, dtype: int64

## Saving the dataset

In [42]:
# Preview the first rows of the frame after all columns have been recoded.
df.head()

Unnamed: 0,id,full_title,went_private_school,is_from,ethnicity,is_part_of_lgbtq,said_something_racist,said_something_sexist
0,172,Rt Hon Diane Abbott MP,0,EnglandSouth,BlackBlackBritishCaribbeanOrAfrican,0,0,0
1,3305,The Lord Aberconway,1,EnglandSouth,White,0,0,0
2,3469,His Grace the Duke of Abercorn,1,EnglandSouth,White,0,0,0
3,3468,The Rt Hon. the Lord Aberdare KBE DL,1,Wales,White,0,0,0
4,3898,The Lord Aberdare,1,Wales,White,0,0,0


In [44]:
# Show column dtypes and non-null counts for the recoded frame.
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 4523 entries, 0 to 4762
Data columns (total 8 columns):
 #   Column                 Non-Null Count  Dtype 
---  ------                 --------------  ----- 
 0   id                     4523 non-null   int64 
 1   full_title             4523 non-null   object
 2   went_private_school    4523 non-null   object
 3   is_from                4523 non-null   object
 4   ethnicity              4523 non-null   object
 5   is_part_of_lgbtq       4523 non-null   object
 6   said_something_racist  4523 non-null   object
 7   said_something_sexist  4523 non-null   object
dtypes: int64(1), object(7)
memory usage: 318.0+ KB


# Data Dictionary

- **id** - member's id as featured on the government API
- **full_title** - member's full title as featured on the government API
- **went_private_school** - whether the member went to private school (0: no, 1: yes, 2: information not available from GPT)
- **is_from** - where the member was born (EnglandSouth, EnglandNorth, EnglandMidlands, Scotland, Wales, NorthernIreland, RepublicOfIreland, Abroad, or NA when unknown)
- **ethnicity** - the member's ethnicity, grouped into the top-level categories of the 2021 Census (NA when unknown)
- **is_part_of_lgbtq** - whether the member is part of the LGBTQ+ community (0: no, 1: yes)
- **said_something_racist** - whether a member has been publicly accused of saying something racist (0: no, 1: yes, 2: not definitively racist, but controversial)
- **said_something_sexist** - whether a member has been publicly accused of saying something sexist (0: no, 1: yes, 2: not definitively sexist, but controversial)

In [46]:
# Write the recoded frame to a pickle file in the current working directory.
df.to_pickle("cleaned_members_gpt.pkl")