# The Bachelor & Race
* **Filename**: clean_data.ipynb
* **Author**: Angelina Li
* **Date**: 08/22/2018
* **Description**: Clean data for use in other notebooks
* **Input**: 538 data, wikipedia data, karenx data
* **Output**: Person-level (leads + contestants) dataset with season-level data, flags for their results in the game and a flag for identified POC.

### Sections
1. [Reformat 538 Bachelorette data](#reformat)
2. [Merge with Karenx's Race data](#merge-race)
3. [Create Master Dataset](#master-dataset)

In [1]:
import re
import requests
import pandas as pd
import os

from bs4 import BeautifulSoup

In [2]:
# name key directories

input_dir = "../input"
intermed_dir = "../intermediate"
output_dir = "../output"
for file_dir in [input_dir, intermed_dir, output_dir]:
    if not os.path.exists(file_dir):
        os.makedirs(file_dir)

<a id="reformat"></a>
### Reformat 538 Bachelorette Data
* In order to get race data (even via visual inspection), we need a cleaned, candidate level dataset of bachelor/ette contestants & lead roles.
* **Objective: Get unique list of contestant names and ids**

In [3]:
# import in 538 data
path_538 = "../input/538/bachelorette.csv"
df_538 = pd.read_csv(path_538)

print(df_538.columns)
df_538.head()

Index(['SHOW', 'SEASON', 'CONTESTANT', 'ELIMINATION-1', 'ELIMINATION-2',
       'ELIMINATION-3', 'ELIMINATION-4', 'ELIMINATION-5', 'ELIMINATION-6',
       'ELIMINATION-7', 'ELIMINATION-8', 'ELIMINATION-9', 'ELIMINATION-10',
       'DATES-1', 'DATES-2', 'DATES-3', 'DATES-4', 'DATES-5', 'DATES-6',
       'DATES-7', 'DATES-8', 'DATES-9', 'DATES-10'],
      dtype='object')


Unnamed: 0,SHOW,SEASON,CONTESTANT,ELIMINATION-1,ELIMINATION-2,ELIMINATION-3,ELIMINATION-4,ELIMINATION-5,ELIMINATION-6,ELIMINATION-7,...,DATES-1,DATES-2,DATES-3,DATES-4,DATES-5,DATES-6,DATES-7,DATES-8,DATES-9,DATES-10
0,SHOW,SEASON,ID,1,2,3,4,5,6,7,...,1.0,2,3,4,5,6,7,8,9,10
1,Bachelorette,13,13_BRYAN_A,R1,,,R,R,,R,...,,,D6,D13,D1,D7,D1,D1,D1,D1
2,Bachelorette,13,13_PETER_K,,R,,,,R,R,...,,D1,D6,D13,D9,D7,D1,D1,D1,D1
3,Bachelorette,13,13_ERIC_B,,,R,,,R,R,...,,D10,D8,D13,D9,D1,D3,D1,D1,
4,Bachelorette,13,13_DEAN_U,,R,,R,,,R,...,,D8,D8,D1,D9,D7,D1,D1,,


In [4]:
# 1. drop non data rows; clean columns
def clean_column(name):
    name = name.lower()
    changemap = {"elimination-": "e", "dates-": "d"}
    for curr, new in changemap.items():
        name = name.replace(curr, new)
    return name

df_538 = df_538[~(df_538["CONTESTANT"] == "ID") & 
                    ~(df_538["SEASON"] == "SEASON")] \
                   .rename(columns={"CONTESTANT": "cid"})
df_538.columns = map(clean_column, df_538.columns)

print(df_538.columns)
df_538.head()

Index(['show', 'season', 'cid', 'e1', 'e2', 'e3', 'e4', 'e5', 'e6', 'e7', 'e8',
       'e9', 'e10', 'd1', 'd2', 'd3', 'd4', 'd5', 'd6', 'd7', 'd8', 'd9',
       'd10'],
      dtype='object')


Unnamed: 0,show,season,cid,e1,e2,e3,e4,e5,e6,e7,...,d1,d2,d3,d4,d5,d6,d7,d8,d9,d10
1,Bachelorette,13,13_BRYAN_A,R1,,,R,R,,R,...,,,D6,D13,D1,D7,D1,D1,D1,D1
2,Bachelorette,13,13_PETER_K,,R,,,,R,R,...,,D1,D6,D13,D9,D7,D1,D1,D1,D1
3,Bachelorette,13,13_ERIC_B,,,R,,,R,R,...,,D10,D8,D13,D9,D1,D3,D1,D1,
4,Bachelorette,13,13_DEAN_U,,R,,R,,,R,...,,D8,D8,D1,D9,D7,D1,D1,,
5,Bachelorette,13,13_ADAM_G,,,,,,,ED,...,,D10,D8,D13,D9,D7,D3,,,


In [5]:
# We also have another dataset of data for Bachelor seasons 22 and 14 - Let's import it in.
path_new_seasons = os.path.join(input_dir, "new_seasons.xlsx")
df_new_seasons = pd.read_excel(path_new_seasons)
df_new_seasons.head()

Unnamed: 0,show,season,name,e1,e2,e3,e4,e5,e6,e7,...,d1,d2,d3,d4,d5,d6,d7,d8,d9,d10
0,Bachelorette,14,Garrett Yrigoyen,R1,,,R,,,R,...,,D10,D10,D1,D9,D7,D1,D1,D1,D1
1,Bachelorette,14,Blake Horstmann,,R,,,R,,R,...,,D1,D10,D13,D9,D7,D1,D1,D1,D1
2,Bachelorette,14,Jason Tartick,,,,,,R,R,...,,,D6,D13,D9,D1,D3,D1,D1,
3,Bachelorette,14,Colton Underwood,,,R,,R,R,R,...,,D10,D6,D13,D1,D7,D1,D1,,
4,Bachelorette,14,Wills Reid,,R,,R,,,ED,...,,D10,D6,D1,D9,D7,D3,,,


In [6]:
# Now we can clean this up and add it in
def get_cid(row):
    season = row["season"]
    name = row["name"].split()
    f_name = name[:-1] # list
    l_init = name[-1][0]
    return "_".join([str(season)] + f_name + [l_init]).upper()

df_new_seasons["cid"] = df_new_seasons.apply(get_cid, axis=1)
df_538 = pd.concat([df_538, df_new_seasons])
df_538.tail()

Unnamed: 0,cid,d1,d10,d2,d3,d4,d5,d6,d7,d8,...,e3,e4,e5,e6,e7,e8,e9,name,season,show
52,22_BRITTANE_J,,,,,,,,,,...,,,,,,,,Brittane Johnson,22,Bachelor
53,22_JESSICA_C,,,,,,,,,,...,,,,,,,,Jessica Carroll,22,Bachelor
54,22_LAUREN_J,,,,,,,,,,...,,,,,,,,Lauren Jarreau,22,Bachelor
55,22_NYSHA_N,,,,,,,,,,...,,,,,,,,Nysha Norris,22,Bachelor
56,22_OLIVIA_G,,,,,,,,,,...,,,,,,,,Olivia Goethals,22,Bachelor


In [7]:
# it seems like there are several duplicated values
df_538["dup"] = df_538[["cid", "season", "show"]].duplicated()
dup_cid = df_538[df_538.cid.isin(df_538[df_538["dup"]].cid)]
dup_cid

Unnamed: 0,cid,d1,d10,d2,d3,d4,d5,d6,d7,d8,...,e4,e5,e6,e7,e8,e9,name,season,show,dup
172,07_RYAN_M,,,D1,,D10,D8,D6,,,...,,,EQ,,,,,7,Bachelorette,False
181,07_RYAN_M,,,,,,,,,,...,,,,,,,,7,Bachelorette,True
847,03_TINA_X,,,D5,D5,D3,D1,D1,,,...,,,E,,,,,3,Bachelor,False
854,03_TINA_X,,,D5,D5,,,,,,...,,,,,,,,3,Bachelor,True
856,03_AMY_X,,,D5,,,,,,,...,,,,,,,,3,Bachelor,False
861,03_AMY_X,,,D5,,,,,,,...,,,,,,,,3,Bachelor,True
882,02_ERIN_X,,,D5,,,,,,,...,,,,,,,,2,Bachelor,False
890,02_ERIN_X,,,,,,,,,,...,,,,,,,,2,Bachelor,True


From manual inspection and reading the wikipedia pages associated with these seasons, it seems like these are each different people with the same name and last initial. For our purposes, we can assume each are unique people, and assign each duplicate a new, unique CID

In [8]:
# 3. extract contestant name; clean errors
def clean_cid_errors(cid):
    changemap = {
        "06_ROBERT_M": "06_ROBERTO_M",
        "17_SLEMA_A": "17_SELMA_A",
        "09_JUAN_G": "09_JUAN_PABLO_G"
    }
    return changemap.get(cid, cid)
    
def get_new_cid(row):
    cid = row["cid"]
    new_cid = cid + "2" if row["dup"] else cid  # dedup cids
    show_stub = "BE_" if row["show"] == "Bachelorette" else "BA_"
    return show_stub + new_cid

get_name = lambda cid: " ".join([x.capitalize() for x in cid.split("_")[1:]])
get_fname = lambda name: " ".join(name.split()[:-1])

df_538.cid = df_538.cid.map(clean_cid_errors)
df_538["name"] = df_538.cid.map(get_name)
df_538["f_name"] = df_538.name.map(get_fname)
df_538.cid = df_538.apply(get_new_cid, axis=1)

df_538 = df_538.drop(["dup"], axis=1)

df_538.head()

Unnamed: 0,cid,d1,d10,d2,d3,d4,d5,d6,d7,d8,...,e4,e5,e6,e7,e8,e9,name,season,show,f_name
1,BE_13_BRYAN_A,,D1,,D6,D13,D1,D7,D1,D1,...,R,R,,R,,,Bryan A,13,Bachelorette,Bryan
2,BE_13_PETER_K,,D1,D1,D6,D13,D9,D7,D1,D1,...,,,R,R,,,Peter K,13,Bachelorette,Peter
3,BE_13_ERIC_B,,,D10,D8,D13,D9,D1,D3,D1,...,,,R,R,,E,Eric B,13,Bachelorette,Eric
4,BE_13_DEAN_U,,,D8,D8,D1,D9,D7,D1,D1,...,R,,,R,E,,Dean U,13,Bachelorette,Dean
5,BE_13_ADAM_G,,,D10,D8,D13,D9,D7,D3,,...,,,,ED,,,Adam G,13,Bachelorette,Adam


<a id="merge-race"></a>
### Merge in Race Data & Extract Wikipedia Data

* [karenx](http://www.karenx.com/blog/minorities-on-the-bachelor-when-do-they-get-eliminated/)'s fantastic blogpost lists candidates based on their first name, season year and lead. We want to match this data up to the data we already have from 538.
* **Objective: Merge karenx's data with 538 df using wikipedia data**

In [9]:
# import in karenx data
path_kx = os.path.join(input_dir, "race", "karenx_data.csv")
df_kx = pd.read_csv(path_kx, sep=", ")

df_kx.head()

  This is separate from the ipykernel package so we can avoid doing imports until


Unnamed: 0,f_name,year,lead
0,Julie,2009,Jason Mesnick
1,Greg,2009,Jillian Harris
2,Channy,2010,Jake Pavelka
3,Roberto,2010,Ali Fedotowsky
4,Dianna,2012,Ben Flajnik


In [10]:
# clean karenx data and correct mistakes
def clean_fname_errors(f_name):
    changemap = {
        "Kupa": "Kupah",
        "Jo-Jo": "Jojo"
    }
    return changemap.get(f_name, f_name)

df_kx["poc_flag"] = True
df_kx.f_name = df_kx.f_name.map(clean_fname_errors)
df_kx.head()

Unnamed: 0,f_name,year,lead,poc_flag
0,Julie,2009,Jason Mesnick,True
1,Greg,2009,Jillian Harris,True
2,Channy,2010,Jake Pavelka,True
3,Roberto,2010,Ali Fedotowsky,True
4,Dianna,2012,Ben Flajnik,True


In [11]:
# grab season, year, show, lead data from wikipedia
def get_page_soup(url):
    try:
        resp = requests.get(url)
        page_text = resp.text
        return BeautifulSoup(page_text, "html.parser")
    except requests.exceptions.RequestException as e:
        print("Couldn't find soup object for url", url)

def get_seasons(url):
    soup = get_page_soup(url)
    return soup.find(id="Seasons").find_parent("h2") \
               .find_next_sibling("table", class_="wikitable")

def get_season_data(url):
    """ returns headers and data associated with seasons data for soup obj """
    seasons = get_seasons(url)
    header_row = seasons.find("tr")
    headers = [header.text.strip() for header in header_row.find_all("th")]
    data_rows = header_row.find_next_siblings("tr")
    return headers, data_rows

def get_data_df(url):
    headers, data_rows = get_season_data(url)
    all_data = []
    for row in data_rows:
        data = [val.text.strip() for val in row.find_all("td")]
        if not data[3].isnumeric():
            num_contest = all_data[-1].get("Number of contestants")
            data = data[:3] + [num_contest] + data[3:]
        data_dict = dict(zip(headers, data))
        all_data.append(data_dict)
    return pd.DataFrame(all_data)

wiki_ba = "https://en.wikipedia.org/wiki/The_Bachelor_(U.S._TV_series)"
wiki_be = "https://en.wikipedia.org/wiki/The_Bachelorette"

df_ba = get_data_df(wiki_ba)
df_be = get_data_df(wiki_be)

df_ba.head(n=3)

Unnamed: 0,#,Bachelor,Number of contestants,Original Run,Proposal,Relationship notes,Runner(s)-up,Still together,Winner
0,1,Alex Michel,25,"March 25–April 25, 2002",No,"Michel did not propose to Marsh, but instead t...",Trista Rehn,No,Amanda Marsh
1,2,Aaron Buerge,25,"September 25–November 20, 2002",Yes,Buerge and Eksterowicz broke up after several ...,Brooke Smith,No,Helene Eksterowicz
2,3,Andrew Firestone,25,"March 24–May 21, 2003",Yes,Schefft and Firestone broke up after several m...,Kirsten Buschbacher,No,Jen Schefft


In [12]:
df_be.head(n=3)

Unnamed: 0,#,Bachelorette,Number of contestants,Original run,Proposal,Relationship,Runner-up,Still together,Winner
0,1,Trista Rehn,25,"January 8–February 19, 2003",Yes,"Rehn and Sutter were married on December 6, 20...",Charlie Maher,Yes,Ryan Sutter
1,2,Meredith Phillips,25,"January 14–February 26, 2004",Yes,Phillips and McKee were engaged at the end of ...,Matthew Hickl,No,Ian Mckee
2,3,Jen Schefft,25,"January 10–February 28, 2005",Yes[a],"During the first live final rose ceremony, Sch...",John Paul Merritt,No,Jerry Ferris


In [13]:
# clean and concatenate datasets
wikiframes = [df_ba, df_be]
for df in wikiframes:
    df["show"] = "Bachelorette" if "Bachelorette" in df.columns else "Bachelor"
    df.columns = ["season", "lead", "num_contestants", "original_run", 
                  "proposal", "notes", "runner_up", "still_together", 
                  "winner", "show"]
    
    clean_fn = lambda x: re.sub("\[\d+\]", "", x) # remove footnotes
    get_year = lambda dt: int(clean_fn(dt.split()[-1]))
    clean_lead = lambda lead: re.sub("[^a-zA-Z0-9\s]+", "", clean_fn(lead))
    
    df["year"] = df["original_run"].map(get_year)
    df.lead = df.lead.map(clean_lead)

df_wiki = pd.concat(wikiframes)
df_wiki.head(n=3)

Unnamed: 0,season,lead,num_contestants,original_run,proposal,notes,runner_up,still_together,winner,show,year
0,1,Alex Michel,25,"March 25–April 25, 2002",No,"Michel did not propose to Marsh, but instead t...",Trista Rehn,No,Amanda Marsh,Bachelor,2002
1,2,Aaron Buerge,25,"September 25–November 20, 2002",Yes,Buerge and Eksterowicz broke up after several ...,Brooke Smith,No,Helene Eksterowicz,Bachelor,2002
2,3,Andrew Firestone,25,"March 24–May 21, 2003",Yes,Schefft and Firestone broke up after several m...,Kirsten Buschbacher,No,Jen Schefft,Bachelor,2003


In [14]:
# the full wiki dataset is too bulky - select specific factors to look at
df_wiki_cols = ["show", "season", "year", "lead", "num_contestants"]
df_wiki_flags = df_wiki.copy()[df_wiki_cols]
df_wiki_flags.head()

Unnamed: 0,show,season,year,lead,num_contestants
0,Bachelor,1,2002,Alex Michel,25
1,Bachelor,2,2002,Aaron Buerge,25
2,Bachelor,3,2003,Andrew Firestone,25
3,Bachelor,4,2003,Bob Guiney,25
4,Bachelor,5,2004,Jesse Palmer,25


In [15]:
# reassign types to merge
for df in [df_kx, df_wiki_flags]:
    df.year = df.year.astype(int)
    df.lead = df.lead.astype(str)
    print(df.lead.unique())

['Jason Mesnick' 'Jillian Harris' 'Jake Pavelka' 'Ali Fedotowsky'
 'Ben Flajnik' 'Emily Maynard' 'Sean Lowe' 'Desiree Hartsock'
 'Juan Pablo Galavis' 'Andi Dorfman' 'Chris Soules' 'Kaitlyn Bristowe'
 'Ben Higgins']
['Alex Michel' 'Aaron Buerge' 'Andrew Firestone' 'Bob Guiney'
 'Jesse Palmer' 'Byron Velvick' 'Charlie OConnell' 'Travis Lane Stork'
 'Lorenzo Borghese' 'Andrew Baldwin' 'Brad Womack' 'Matt Grant'
 'Jason Mesnick' 'Jake Pavelka' 'Ben Flajnik' 'Sean Lowe'
 'Juan Pablo Galavis' 'Chris Soules' 'Ben Higgins' 'Nick Viall'
 'Arie Luyendyk Jr' 'Trista Rehn' 'Meredith Phillips' 'Jen Schefft'
 'DeAnna Pappas' 'Jillian Harris' 'Ali Fedotowsky' 'Ashley Hebert'
 'Emily Maynard' 'Desiree Hartsock' 'Andi Dorfman' 'Kaitlyn Bristowe'
 'Joelle JoJo Fletcher' 'Rachel Lindsay' 'Becca Kufrin']


In [16]:
# merge on Karen x data
df_kx_merged = pd.merge(df_kx, df_wiki_flags, how="left", 
                        on=["year", "lead"])
df_kx_merged.head()

Unnamed: 0,f_name,year,lead,poc_flag,show,season,num_contestants
0,Julie,2009,Jason Mesnick,True,Bachelor,13,25
1,Greg,2009,Jillian Harris,True,Bachelorette,5,30
2,Channy,2010,Jake Pavelka,True,Bachelor,14,25
3,Roberto,2010,Ali Fedotowsky,True,Bachelorette,6,25
4,Dianna,2012,Ben Flajnik,True,Bachelor,16,25


In [17]:
# standardize types to merge
for df in [df_kx_merged, df_538]:
    df.season = df.season.astype(int)

In [18]:
# merge karen x data with 538 data
df_kx_538 = pd.merge(df_538, df_kx_merged, how="outer", 
                     on=["f_name", "season", "show"])

df_kx_538.head()

Unnamed: 0,cid,d1,d10,d2,d3,d4,d5,d6,d7,d8,...,e8,e9,name,season,show,f_name,year,lead,poc_flag,num_contestants
0,BE_13_BRYAN_A,,D1,,D6,D13,D1,D7,D1,D1,...,,,Bryan A,13,Bachelorette,Bryan,,,,
1,BE_13_PETER_K,,D1,D1,D6,D13,D9,D7,D1,D1,...,,,Peter K,13,Bachelorette,Peter,,,,
2,BE_13_ERIC_B,,,D10,D8,D13,D9,D1,D3,D1,...,,E,Eric B,13,Bachelorette,Eric,,,,
3,BE_13_DEAN_U,,,D8,D8,D1,D9,D7,D1,D1,...,E,,Dean U,13,Bachelorette,Dean,,,,
4,BE_13_ADAM_G,,,D10,D8,D13,D9,D7,D3,,...,,,Adam G,13,Bachelorette,Adam,,,,


In [19]:
# check if the merge happened properly
df_kx_missing = df_kx_538[df_kx_538["cid"].isnull()]
df_kx_missing[["show", "season", "year", "f_name", "lead"]]

Unnamed: 0,show,season,year,f_name,lead


<a id="master-dataset"></a>
### Create person level dataset w/ all flags

* Now we have some incomplete race data merged in, we want to populate the rest of the data with the wikipedia flags we have access to.
* Then, we can save this as an intermediate dataset for later use.
* **Objective: Create a person-level dataset (contestants and leads) with all flags available **

In [20]:
# get a look at our data
print(df_kx_538.columns)
df_kx_538.head()

Index(['cid', 'd1', 'd10', 'd2', 'd3', 'd4', 'd5', 'd6', 'd7', 'd8', 'd9',
       'e1', 'e10', 'e2', 'e3', 'e4', 'e5', 'e6', 'e7', 'e8', 'e9', 'name',
       'season', 'show', 'f_name', 'year', 'lead', 'poc_flag',
       'num_contestants'],
      dtype='object')


Unnamed: 0,cid,d1,d10,d2,d3,d4,d5,d6,d7,d8,...,e8,e9,name,season,show,f_name,year,lead,poc_flag,num_contestants
0,BE_13_BRYAN_A,,D1,,D6,D13,D1,D7,D1,D1,...,,,Bryan A,13,Bachelorette,Bryan,,,,
1,BE_13_PETER_K,,D1,D1,D6,D13,D9,D7,D1,D1,...,,,Peter K,13,Bachelorette,Peter,,,,
2,BE_13_ERIC_B,,,D10,D8,D13,D9,D1,D3,D1,...,,E,Eric B,13,Bachelorette,Eric,,,,
3,BE_13_DEAN_U,,,D8,D8,D1,D9,D7,D1,D1,...,E,,Dean U,13,Bachelorette,Dean,,,,
4,BE_13_ADAM_G,,,D10,D8,D13,D9,D7,D3,,...,,,Adam G,13,Bachelorette,Adam,,,,


In [21]:
# use wiki data to fill in holes
df_wiki_merge = df_wiki_flags.copy()
for df in [df_kx_538, df_wiki_merge]:
    df.season = df.season.astype(int)

df_wiki_merge.columns = map(lambda col: col + "_new", df_wiki_flags.columns)
df_all_flags = pd.merge(df_kx_538, df_wiki_merge, how="inner", 
                        left_on=["season", "show"],
                        right_on=["season_new", "show_new"])

df_all_flags.head()

Unnamed: 0,cid,d1,d10,d2,d3,d4,d5,d6,d7,d8,...,f_name,year,lead,poc_flag,num_contestants,show_new,season_new,year_new,lead_new,num_contestants_new
0,BE_13_BRYAN_A,,D1,,D6,D13,D1,D7,D1,D1,...,Bryan,,,,,Bachelorette,13,2017,Rachel Lindsay,31
1,BE_13_PETER_K,,D1,D1,D6,D13,D9,D7,D1,D1,...,Peter,,,,,Bachelorette,13,2017,Rachel Lindsay,31
2,BE_13_ERIC_B,,,D10,D8,D13,D9,D1,D3,D1,...,Eric,,,,,Bachelorette,13,2017,Rachel Lindsay,31
3,BE_13_DEAN_U,,,D8,D8,D1,D9,D7,D1,D1,...,Dean,,,,,Bachelorette,13,2017,Rachel Lindsay,31
4,BE_13_ADAM_G,,,D10,D8,D13,D9,D7,D3,,...,Adam,,,,,Bachelorette,13,2017,Rachel Lindsay,31


In [22]:
# for relevant cols, replace missing data
def replace_wiki_values(row):
    replace_cols = ["year", "lead", "num_contestants"]
    new_row = []
    for col in row.index:
        new_var = row[col + "_new"] if col in replace_cols else row[col]
        new_row.append(new_var)
    return new_row

df_all_flags = df_all_flags.apply(replace_wiki_values, axis=1) \
                           .drop(labels=df_wiki_merge.columns, axis=1) \
                           .drop(labels=["f_name"], axis=1)
        
print(df_all_flags.columns)
df_all_flags.head()

Index(['cid', 'd1', 'd10', 'd2', 'd3', 'd4', 'd5', 'd6', 'd7', 'd8', 'd9',
       'e1', 'e10', 'e2', 'e3', 'e4', 'e5', 'e6', 'e7', 'e8', 'e9', 'name',
       'season', 'show', 'year', 'lead', 'poc_flag', 'num_contestants'],
      dtype='object')


Unnamed: 0,cid,d1,d10,d2,d3,d4,d5,d6,d7,d8,...,e7,e8,e9,name,season,show,year,lead,poc_flag,num_contestants
0,BE_13_BRYAN_A,,D1,,D6,D13,D1,D7,D1,D1,...,R,,,Bryan A,13,Bachelorette,2017,Rachel Lindsay,,31
1,BE_13_PETER_K,,D1,D1,D6,D13,D9,D7,D1,D1,...,R,,,Peter K,13,Bachelorette,2017,Rachel Lindsay,,31
2,BE_13_ERIC_B,,,D10,D8,D13,D9,D1,D3,D1,...,R,,E,Eric B,13,Bachelorette,2017,Rachel Lindsay,,31
3,BE_13_DEAN_U,,,D8,D8,D1,D9,D7,D1,D1,...,R,E,,Dean U,13,Bachelorette,2017,Rachel Lindsay,,31
4,BE_13_ADAM_G,,,D10,D8,D13,D9,D7,D3,,...,ED,,,Adam G,13,Bachelorette,2017,Rachel Lindsay,,31


In [23]:
# create a person-level dataset for lead candidates from wiki data
def get_cid(row):
    show_stub = "BE" if row["show"] == "Bachelorette" else "BA"
    season = row["season"]
    name = row["name"].split()
    f_name = name[:-1] # list
    l_init = name[-1][0]
    return "_".join([show_stub, season] + f_name + [l_init, "L"]).upper()

df_leads = df_wiki_flags.copy()
df_leads["name"] = df_leads["lead"]
df_leads["cid"] = df_leads.apply(get_cid, axis=1)

df_leads.head()

Unnamed: 0,show,season,year,lead,num_contestants,name,cid
0,Bachelor,1,2002,Alex Michel,25,Alex Michel,BA_1_ALEX_M_L
1,Bachelor,2,2002,Aaron Buerge,25,Aaron Buerge,BA_2_AARON_B_L
2,Bachelor,3,2003,Andrew Firestone,25,Andrew Firestone,BA_3_ANDREW_F_L
3,Bachelor,4,2003,Bob Guiney,25,Bob Guiney,BA_4_BOB_G_L
4,Bachelor,5,2004,Jesse Palmer,25,Jesse Palmer,BA_5_JESSE_P_L


In [24]:
# stack this on top of existing dataset
df_leads["lead_flag"] = True
df_all_flags["lead_flag"] = False
df_master = pd.concat([df_leads, df_all_flags])
df_master.head()

Unnamed: 0,cid,d1,d10,d2,d3,d4,d5,d6,d7,d8,...,e8,e9,lead,lead_flag,name,num_contestants,poc_flag,season,show,year
0,BA_1_ALEX_M_L,,,,,,,,,,...,,,Alex Michel,True,Alex Michel,25,,1,Bachelor,2002
1,BA_2_AARON_B_L,,,,,,,,,,...,,,Aaron Buerge,True,Aaron Buerge,25,,2,Bachelor,2002
2,BA_3_ANDREW_F_L,,,,,,,,,,...,,,Andrew Firestone,True,Andrew Firestone,25,,3,Bachelor,2003
3,BA_4_BOB_G_L,,,,,,,,,,...,,,Bob Guiney,True,Bob Guiney,25,,4,Bachelor,2003
4,BA_5_JESSE_P_L,,,,,,,,,,...,,,Jesse Palmer,True,Jesse Palmer,25,,5,Bachelor,2004


In [25]:
# verify uniqueness of cid
num_uniq_cid = len(df_master.cid.unique())
num_rows = len(df_master.index)
print("num unique cid:", num_uniq_cid, "| num rows:", num_rows)

num unique cid: 980 | num rows: 980


In [26]:
df_master = df_master.set_index("cid")
df_master.season = df_master.season.map(int)
df_master = df_master.sort_values(by=["show", "season", "lead_flag"], ascending=[True, True, False], axis=0)
df_master.head()

Unnamed: 0_level_0,d1,d10,d2,d3,d4,d5,d6,d7,d8,d9,...,e8,e9,lead,lead_flag,name,num_contestants,poc_flag,season,show,year
cid,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
BA_1_ALEX_M_L,,,,,,,,,,,...,,,Alex Michel,True,Alex Michel,25,,1,Bachelor,2002
BA_01_AMANDA_M,,,D5,D1,D1,D1,D1,,,,...,,,Alex Michel,False,Amanda M,25,,1,Bachelor,2002
BA_01_TRISTA_R,,,D5,D1,D1,D1,D1,,,,...,,,Alex Michel,False,Trista R,25,,1,Bachelor,2002
BA_01_SHANNON_O,,,D5,D1,D1,D1,,,,,...,,,Alex Michel,False,Shannon O,25,,1,Bachelor,2002
BA_01_KIM_X,,,D5,D4,D1,,,,,,...,,,Alex Michel,False,Kim X,25,,1,Bachelor,2002


In [27]:
# save it all for later use!
master_output = os.path.join(intermed_dir, "master_dataset.csv")
df_master.to_csv(master_output)