# The Bachelor & Race: Clean Data (Pt. 3)
* **Filename**: clean_weeks_data.ipynb
* **Author**: Angelina Li
* **Date**: 09/07/2018
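* **Description**: Update master_flags_dataset with the number of weeks each season ran, how many weeks each contestant lasted (as a count and as a share of the season), and flags for season winners and first impression rose recipients.
* **Input**: master_flags_dataset.csv (read from ../intermediate)
* **Output**: Updated master_flags_dataset.csv (written to the parent directory, ..)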

In [1]:
import re
import pandas as pd
import os

In [2]:
# name key directories (relative paths, so they resolve against the current
# working directory)

# input_dir is not referenced in the cells shown in this notebook
input_dir = "../input"
# directory that master_flags_dataset.csv is read from
intermed_dir = "../intermediate"

In [3]:
# import the master dataset
# (read from the intermediate directory with pandas' default parsing options)
master_path = os.path.join(intermed_dir, "master_flags_dataset.csv")
df = pd.read_csv(master_path)
# list every column name, then preview the first five rows
print(df.columns)
df.head()

Index(['cid', 'd1', 'd10', 'd2', 'd3', 'd4', 'd5', 'd6', 'd7', 'd8', 'd9',
       'datetime', 'e1', 'e10', 'e2', 'e3', 'e4', 'e5', 'e6', 'e7', 'e8', 'e9',
       'lead', 'lead_flag', 'name', 'num_contestants', 'poc', 'season', 'show',
       'year', 'white', 'afam', 'amin', 'hisp', 'asn_paci', 'oth', 'mult',
       'race_data_flag', 'season_comp_flag', 'lead_poc'],
      dtype='object')


Unnamed: 0,cid,d1,d10,d2,d3,d4,d5,d6,d7,d8,...,white,afam,amin,hisp,asn_paci,oth,mult,race_data_flag,season_comp_flag,lead_poc
0,BA_1_ALEX_M_L,,,,,,,,,,...,1.0,,,,,,,True,True,False
1,BA_01_AMANDA_M,,,D5,D1,D1,D1,D1,,,...,1.0,,,,,,,True,True,False
2,BA_01_TRISTA_R,,,D5,D1,D1,D1,D1,,,...,1.0,,,,,,,True,True,False
3,BA_01_SHANNON_O,,,D5,D1,D1,D1,,,,...,1.0,,,,,,,True,True,False
4,BA_01_KIM_X,,,D5,D4,D1,,,,,...,1.0,,,,,,,True,True,False


In [4]:
# preview the weekly e1..e10 columns in week order; these are the columns the
# later cells scan to work out season length, weeks lasted and winners
df[["e{}".format(week) for week in range(1, 11)]].head()

Unnamed: 0,e1,e2,e3,e4,e5,e6,e7,e8,e9,e10
0,,,,,,,,,,
1,,,,,,W,,,,
2,,,,,,E,,,,
3,,,,,E,,,,,
4,,,,E,,,,,,


In [5]:
# create a var equalling the total number of weeks per season

def get_season_num_weeks(show, data=None, max_weeks=10):
    """Return the number of weeks each season of `show` ran.

    A season's length is the latest week whose ``e<week>`` column holds at
    least one non-null value among that season's rows.

    Parameters
    ----------
    show : str
        Value of the ``show`` column to select, e.g. "Bachelor".
    data : pandas.DataFrame, optional
        Table with ``show``, ``season`` and ``e1`` .. ``e<max_weeks>``
        columns. Defaults to the notebook-level ``df``.
    max_weeks : int, optional
        Highest week column to inspect (default 10).

    Returns
    -------
    dict
        Maps ``(show, season)`` to the season's length in weeks. A season
        with no recorded week values maps to 1.
    """
    if data is None:
        data = df

    show_df = data[data["show"] == show]
    season_num_weeks = {}
    for season in show_df["season"].unique():
        season_df = show_df[show_df["season"] == season]
        # Fall back to week 1 when no week column has any data at all.
        show_end = 1
        for week in range(max_weeks, 0, -1):
            # Test for actual data instead of counting distinct values: a
            # week in which every row shares one value still took place.
            if season_df["e{}".format(week)].notna().any():
                show_end = week
                break
        season_num_weeks[(show, season)] = show_end
    return season_num_weeks

# build a single (show, season) -> number-of-weeks lookup covering both shows
season_nums = get_season_num_weeks("Bachelorette")
season_nums.update(get_season_num_weeks("Bachelor"))
print(season_nums)

{('Bachelorette', 1): 6, ('Bachelorette', 2): 7, ('Bachelorette', 3): 8, ('Bachelorette', 4): 8, ('Bachelorette', 5): 10, ('Bachelorette', 6): 10, ('Bachelorette', 7): 10, ('Bachelorette', 8): 10, ('Bachelorette', 9): 10, ('Bachelorette', 10): 10, ('Bachelorette', 11): 9, ('Bachelorette', 12): 10, ('Bachelorette', 13): 10, ('Bachelorette', 14): 10, ('Bachelor', 1): 6, ('Bachelor', 2): 7, ('Bachelor', 3): 7, ('Bachelor', 4): 7, ('Bachelor', 5): 7, ('Bachelor', 6): 8, ('Bachelor', 7): 7, ('Bachelor', 8): 7, ('Bachelor', 9): 7, ('Bachelor', 10): 8, ('Bachelor', 11): 8, ('Bachelor', 12): 8, ('Bachelor', 13): 8, ('Bachelor', 14): 8, ('Bachelor', 15): 10, ('Bachelor', 16): 10, ('Bachelor', 17): 10, ('Bachelor', 18): 10, ('Bachelor', 19): 10, ('Bachelor', 20): 10, ('Bachelor', 21): 10, ('Bachelor', 22): 10}


In [6]:
def get_num_weeks(row):
    """Look up how many weeks the row's season lasted.

    The (show, season) pair taken from `row` is used as the key into the
    notebook-level `season_nums` dictionary. None is returned when the pair
    has no entry.
    """
    season_key = (row["show"], row["season"])
    return season_nums.get(season_key)

# attach each row's season length; with axis=1, get_num_weeks is called once per row
df["season_num_weeks"] = df.apply(get_num_weeks, axis=1)
df.head()

Unnamed: 0,cid,d1,d10,d2,d3,d4,d5,d6,d7,d8,...,afam,amin,hisp,asn_paci,oth,mult,race_data_flag,season_comp_flag,lead_poc,season_num_weeks
0,BA_1_ALEX_M_L,,,,,,,,,,...,,,,,,,True,True,False,6
1,BA_01_AMANDA_M,,,D5,D1,D1,D1,D1,,,...,,,,,,,True,True,False,6
2,BA_01_TRISTA_R,,,D5,D1,D1,D1,D1,,,...,,,,,,,True,True,False,6
3,BA_01_SHANNON_O,,,D5,D1,D1,D1,,,,...,,,,,,,True,True,False,6
4,BA_01_KIM_X,,,D5,D4,D1,,,,,...,,,,,,,True,True,False,6


In [7]:
# sanity check: list the distinct season lengths that were assigned
df.season_num_weeks.unique()

array([ 6,  7,  8, 10,  9])

In [8]:
# get number of weeks that each contestant spent on the show
def get_cont_num_weeks(row):
    """Return the last week in which a contestant has a recorded value.

    Parameters
    ----------
    row : pandas.Series or mapping
        Must provide ``lead_flag``, ``season_num_weeks`` and the weekly
        ``e1`` .. ``e<season_num_weeks>`` entries.

    Returns
    -------
    int or None
        The highest week number (1-based), no later than the season's final
        week, whose ``e<week>`` entry is non-null. None for leads, for rows
        with no season length, and for rows with no recorded week.
    """
    if row["lead_flag"]:
        return None

    season_length = row["season_num_weeks"]
    if pd.isnull(season_length):
        return None

    # Walk backwards from the season's final week so that the first hit is
    # the contestant's last week. Entries are fetched by column label: an
    # integer key on a label-indexed Series is a deprecated positional
    # fallback in pandas. int() also accepts a float-typed season length.
    for week in range(int(season_length), 0, -1):
        if not pd.isnull(row["e{}".format(week)]):
            return week
    return None

# compute weeks lasted for every row; get_cont_num_weeks returns None for
# leads, which shows up as NaN in the resulting column
df["num_weeks"] = df.apply(get_cont_num_weeks, axis=1)
df.head()

Unnamed: 0,cid,d1,d10,d2,d3,d4,d5,d6,d7,d8,...,amin,hisp,asn_paci,oth,mult,race_data_flag,season_comp_flag,lead_poc,season_num_weeks,num_weeks
0,BA_1_ALEX_M_L,,,,,,,,,,...,,,,,,True,True,False,6,
1,BA_01_AMANDA_M,,,D5,D1,D1,D1,D1,,,...,,,,,,True,True,False,6,6.0
2,BA_01_TRISTA_R,,,D5,D1,D1,D1,D1,,,...,,,,,,True,True,False,6,6.0
3,BA_01_SHANNON_O,,,D5,D1,D1,D1,,,,...,,,,,,True,True,False,6,5.0
4,BA_01_KIM_X,,,D5,D4,D1,,,,,...,,,,,,True,True,False,6,4.0


In [9]:
# sanity check: list the distinct week counts (NaN comes from rows without a value)
df["num_weeks"].unique()

array([ nan,   6.,   5.,   4.,   3.,   2.,   1.,   7.,   8.,  10.,   9.])

In [10]:
def get_perc_weeks(row):
    """Return the share of the season that the row's contestant lasted.

    This is `num_weeks` divided by `season_num_weeks`, both read from `row`.
    """
    weeks_lasted = row["num_weeks"]
    season_length = row["season_num_weeks"]
    return weeks_lasted / season_length

# share of the season each contestant lasted (num_weeks / season_num_weeks);
# rows with a missing num_weeks come out as NaN
df["perc_weeks"] = df.apply(get_perc_weeks, axis=1)
df.head()

Unnamed: 0,cid,d1,d10,d2,d3,d4,d5,d6,d7,d8,...,hisp,asn_paci,oth,mult,race_data_flag,season_comp_flag,lead_poc,season_num_weeks,num_weeks,perc_weeks
0,BA_1_ALEX_M_L,,,,,,,,,,...,,,,,True,True,False,6,,
1,BA_01_AMANDA_M,,,D5,D1,D1,D1,D1,,,...,,,,,True,True,False,6,6.0,1.0
2,BA_01_TRISTA_R,,,D5,D1,D1,D1,D1,,,...,,,,,True,True,False,6,6.0,1.0
3,BA_01_SHANNON_O,,,D5,D1,D1,D1,,,,...,,,,,True,True,False,6,5.0,0.833333
4,BA_01_KIM_X,,,D5,D4,D1,,,,,...,,,,,True,True,False,6,4.0,0.666667


In [11]:
def is_winner(row):
    """Tell whether a contestant's weekly record contains a "W" entry.

    Leads are skipped and get None. For every other row the e1..e10 values
    are checked: True if any of them equals "W", otherwise False.
    """
    if row["lead_flag"]:
        return None
    week_cols = ["e{}".format(num) for num in range(1, 11)]
    entries = row[week_cols]
    return any(entry == "W" for entry in entries)

# flag winners row by row; is_winner returns nothing for leads, so their
# winner_flag is left empty
df["winner_flag"] = df.apply(is_winner, axis=1)
df.head()

Unnamed: 0,cid,d1,d10,d2,d3,d4,d5,d6,d7,d8,...,asn_paci,oth,mult,race_data_flag,season_comp_flag,lead_poc,season_num_weeks,num_weeks,perc_weeks,winner_flag
0,BA_1_ALEX_M_L,,,,,,,,,,...,,,,True,True,False,6,,,
1,BA_01_AMANDA_M,,,D5,D1,D1,D1,D1,,,...,,,,True,True,False,6,6.0,1.0,True
2,BA_01_TRISTA_R,,,D5,D1,D1,D1,D1,,,...,,,,True,True,False,6,6.0,1.0,False
3,BA_01_SHANNON_O,,,D5,D1,D1,D1,,,,...,,,,True,True,False,6,5.0,0.833333,False
4,BA_01_KIM_X,,,D5,D4,D1,,,,,...,,,,True,True,False,6,4.0,0.666667,False


In [12]:
# create flag for first impression rose
# (True where the week 1 entry is exactly "R1", False everywhere else)
df["fimp_rose"] = df["e1"] == "R1"
# preview a few of the flagged contestants
df[df["fimp_rose"]][["cid", "name", "show", "season", "year", "e1", "poc"]].head()

Unnamed: 0,cid,name,show,season,year,e1,poc
110,BA_05_TRISH_S,Trish S,Bachelor,5,2004,R1,False
159,BA_07_Sarah W.,Sarah w.,Bachelor,7,2005,R1,False
166,BA_07_Kerry,Kerry,Bachelor,7,2005,R1,False
212,BA_09_LISA_B,Lisa B,Bachelor,9,2006,R1,False
244,BA_10_STEPHANIE_T,Stephanie T,Bachelor,10,2007,R1,False


In [13]:
# confirm the newly derived columns are present at the end of the column list
df.columns

Index(['cid', 'd1', 'd10', 'd2', 'd3', 'd4', 'd5', 'd6', 'd7', 'd8', 'd9',
       'datetime', 'e1', 'e10', 'e2', 'e3', 'e4', 'e5', 'e6', 'e7', 'e8', 'e9',
       'lead', 'lead_flag', 'name', 'num_contestants', 'poc', 'season', 'show',
       'year', 'white', 'afam', 'amin', 'hisp', 'asn_paci', 'oth', 'mult',
       'race_data_flag', 'season_comp_flag', 'lead_poc', 'season_num_weeks',
       'num_weeks', 'perc_weeks', 'winner_flag', 'fimp_rose'],
      dtype='object')

In [14]:
# drop seasons we don't have race data for
# NOTE(review): the row filter is on season_comp_flag rather than
# race_data_flag -- confirm that this is the intended flag.
# Rows where season_comp_flag is True are kept, and the table is narrowed to
# the listed columns in the same step.
df = df.loc[
    df["season_comp_flag"] == True,
    [
        "cid", "name", "lead", "lead_flag", "num_contestants",
        "poc", "season", "show", "year", "white", "afam", "amin",
        "hisp", "asn_paci", "oth", "mult",
        "num_weeks", "season_num_weeks", "perc_weeks",
        "winner_flag", "lead_poc", "fimp_rose",
    ],
]
df.head()

Unnamed: 0,cid,name,lead,lead_flag,num_contestants,poc,season,show,year,white,...,hisp,asn_paci,oth,mult,num_weeks,season_num_weeks,perc_weeks,winner_flag,lead_poc,fimp_rose
0,BA_1_ALEX_M_L,Alex Michel,Alex Michel,True,25,False,1,Bachelor,2002,1.0,...,,,,,,6,,,False,False
1,BA_01_AMANDA_M,Amanda M,Alex Michel,False,25,False,1,Bachelor,2002,1.0,...,,,,,6.0,6,1.0,True,False,False
2,BA_01_TRISTA_R,Trista R,Alex Michel,False,25,False,1,Bachelor,2002,1.0,...,,,,,6.0,6,1.0,False,False,False
3,BA_01_SHANNON_O,Shannon O,Alex Michel,False,25,False,1,Bachelor,2002,1.0,...,,,,,5.0,6,0.833333,False,False,False
4,BA_01_KIM_X,Kim X,Alex Michel,False,25,False,1,Bachelor,2002,1.0,...,,,,,4.0,6,0.666667,False,False,False


In [15]:
# write the cleaned table to master_flags_dataset.csv in the parent directory
# NOTE(review): the input was read from the intermediate directory, so this
# does not overwrite the file that was loaded -- confirm this location is intended.
df_path = os.path.join("..", "master_flags_dataset.csv")
# NOTE(review): to_csv is called with its defaults, which also write the row
# index as an unnamed first column -- confirm downstream readers expect that.
df.to_csv(df_path)