# Data preprocessing
Load data, choose year or range of year, and standardize responses. The output will be used for weighted and feature engineering.

In [23]:
# import packages
import re
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

In [24]:
# load data
data = pd.read_csv('../data/raw/Somerville_Happiness_Survey_Responses.csv')

In [25]:
# Choose a year or range of years to look at
print("Total responses: {}".format(len(data)))
print("Responses per year: {}".format(data.groupby('Year').count()[['Combined_ID']]))

Total responses: 8886
Responses per year:       Combined_ID
Year             
2011         6167
2013          193
2015          185
2017          845
2019         1496


Year counts are very inconsistent. A deeper look shows that in 2011 only a few questions were asked. That basically leaves us with 2019 for data analyis.

In [26]:
# strip down to 2019 only
data_2019 = data[data['Year']==2019]

Handle nulls. Eliminate any questions that were not asked in 2019. Replace null placeholders with np.nan.

In [27]:
# drop null columns. These are questions that were not asked in this yeaer
data_2019 = data_2019.dropna(axis=1)

# replace any 999.0 / 990 / '999' with null (declined to answer)
data_2019 = data_2019.replace(999.0, np.nan)
data_2019 = data_2019.replace(999, np.nan)
data_2019 = data_2019.replace('999', np.nan)

# rename columns for easier usage
col_map = {'Combined_ID': 'id',
           'Year': 'year',
           'How.happy.do.you.feel.right.now': 'q01_happy',
           'How.satisfied.are.you.with.your.life.in.general': 'q02_satisfied_general',
           'How.satisfied.are.you.with.Somerville.as.a.place.to.live': 'q03_satisfied_somerville',
           'How.satisfied.are.you.with.your.neighborhood': 'q04_satisfied_neighborhood',
           'Do.you.feel.the.City.is.headed.in.the.right.direction.or.is.it.on.the.wrong.track': 'q05_city_direction',
           'How.would.you.rate.the.following..The.availability.of.information.about.city.services': 'q06a_city_services',
           'How.would.you.rate.the.following..The.cost.of.housing': 'q06b_cost_housing',
           'How.would.you.rate.the.following..The.overall.quality.of.public.schools': 'q06c_quality_schools',
           'How.would.you.rate.the.following..Your.trust.in.the.local.police': 'q06d_trust_police',
           'How.would.you.rate.the.following..The.maintenance.of.streets.and.sidewalks': 'q06e_sidewalks',
           'How.would.you.rate.the.following..The.availability.of.social.community.events': 'q06f_events',
           'How.safe.do.you.feel.crossing.a.busy.street.in.Somerville': 'q07_safe_crossing_street',
           'How.convenient.is.it.for.you.to.get.where.you.want.to.go': 'q08_convenient',
           'How.safe.do.you.feel.walking.in.your.neighborhood.at.night': 'q09_safe_at_night',
           'How.satisfied.are.you.with.the.appearance.of.parks.and.squares.in.your.neighborhood': 'q10_parks',
           'How.satisfied.are.you.with.the.beauty.or.physical.setting.of.your.neighborhood': 'q11_beauty',
           'How.satisfied.are.you.with.the.condition.of.your.housing': 'q12_housing_condition',
           'What.is.your.gender': 'd01_gender',
           'Age': 'd02_age',
           'Language': 'd03_language',
          'What.is.your.race.or.ethnicity': 'd04_race',
          'Do.you.have.children.age.18.or.younger.who.live.with.you': 'd05_num_children',
          'Describe.your.housing.status.in.Somerville': 'd06_housing_status',
          'Do.you.plan.to.move.away.from.Somerville.in.the.next.two.years': 'd07_plan_to_move',
          'What.is.your.annual.household.income': 'd08_hhi',
          'Are.you.a.student': 'd09_is_student',
           'How.long.have.you.lived.here': 'd10_how_long_lived_here',
          'Ward': 'ward',
          'Do.you.plan.to.move.away.from.Somerville.in.the.next.two.years.yes.why': 'move_why',
          'What.is.your.primary.mode.of.transportation': 'd11_transportation_mode',
          'Which.of.the.following.have.you.used.in.the.past.month.to.get.around': 'd12_transportation_month',
          }

# map column names 
data_2019.columns = data_2019.columns.map(col_map)

## Variable transformations
1. Categorize ambigious or unique answers
2. Map HHI, race, ethnicity, gender, age to hooks for population balancing
3. Break out compound responses into indicators.

We will retain the un-transformed variables for later feature engineering.

In [28]:
# Gender

def map_gender(v):
    """Define gender categories. There is an open entry option, which we're mapping to non-binary."""
    if v not in ('Female', 'Male', 'No Answer'):
        return 'Nonbinary'
    else:
        return v
    
# map gender
data_2019['d01_gender'] = data_2019['d01_gender'].fillna('No Answer')    
data_2019['d01_gender'] = data_2019['d01_gender'].apply(lambda x: map_gender(x))

In [29]:
# Years lived in Somerville. This one is complicated since it was not a coerced data type.

def format_years(v):
    """Map response to 'how long have you lived here?'"""

    # just a number, assume it's years
    p = '^([\.\d]+)$'
    r = re.findall(p, v)
    
    if len(r) > 0:
        return int(float(r[0]))
    
    # "3.5 years"
    p = '^([\.\d]+) years$'
    r = re.findall(p, v)
    
    if len(r) > 0:
        return int(float(r[0]))

    # "30+ years"
    p = '^([\.\d]+)\+ years$'
    r = re.findall(p, v)
    
    if len(r) > 0:
        return int(float(r[0]))
    
    # "9 months"
    p = '^([\.\d]+) months$'
    r = re.findall(p, v)
    
    if len(r) > 0:
        return int(float(r[0]) / 12)
    
    # "3 weeks"
    p = '^([\.\d]+) weeks$'
    r = re.findall(p, v)
    
    if len(r) > 0:
        return 0
    
    # "6-8 years"
    p = '^([\.\d]+)-([\.\d]+) years$'
    r = re.findall(p, v)
    
    if len(r) > 0:
        return int(float(r[0][1]))
    
    # "1 year, 9 months"
    p = '^([\.\d]+) year, ([\.\d]+) months$'
    r = re.findall(p, v)
    
    if len(r) > 0:
        return int(float(r[0][0]))
    
    # "3 days"
    p = '^([\.\d]+) days$'
    r = re.findall(p, v)
    
    if len(r) > 0:
        return 0
    
    if v == '1 year':
        return 1
    
    if v == '1 month':
        return 0
    
    years_map = {'no_answer': np.nan,
             '1 month, two years before from 2010-2012': 2,
             '4  months': 0,
             '5 years + 10 years before': 15,
             '`17 years': 17,
             'less than 1 year': 0,
             '11years': 11,
             '3 generations': np.nan,
             '1 year thist ime; 6 years total': 6,
             '1 year 2 months': 1,
             'too long': np.nan,
             '50 +': 50,
             ' years': np.nan,
             '16+': 16,
             '14 years; 11 years now': 14,
             '1 year 3 months': 1,
             'less than 10 months': 0,
             '30+': 30,
             '8.5 yesrs': 8.5,
             'life long resident': 50,
             '1 year this time; 6 years total': 6
            }
    
    if v in years_map.keys():
        return years_map[v]

# Make sure everything is a string
data_2019['d10_how_long_lived_here'] = data_2019['d10_how_long_lived_here'].astype(str)

# For some reason '-' was coded as '999'. We need to undo that.
data_2019['d10_how_long_lived_here'] = data_2019['d10_how_long_lived_here'].str.replace("999", '-')

# Lowercase
data_2019['d10_how_long_lived_here'] = data_2019['d10_how_long_lived_here'].str.lower()

# remove qualifiers
data_2019['d10_how_long_lived_here'] = data_2019['d10_how_long_lived_here'].apply(
    lambda x: x[7:] if x.startswith("almost ") else x)
data_2019['d10_how_long_lived_here'] = data_2019['d10_how_long_lived_here'].apply(
    lambda x: x[7:] if x.startswith("about ") else x)
data_2019['d10_how_long_lived_here'] = data_2019['d10_how_long_lived_here'].apply(
    lambda x: x[7:] if x.startswith("over ") else x)

# apply formatting function
data_2019['d10_how_long_lived_here'] = data_2019['d10_how_long_lived_here'].apply(lambda x: format_years(x))

# We retain the original column for later.

In [33]:
# Housing status

def format_housing_status(v):
    """Map housing status response"""
    
    if v == 'Rent':
        return v
    
    elif v == 'Own':
        return v
    
    else:
        return 'Other'

# Map housing status. We just want Rent/Own/Other
data_2019['d06_housing_status'] = data_2019['d06_housing_status'].apply(lambda x: format_housing_status(x))

In [34]:
# Is the city moving in the right direction?

city_direction_map = {'Right direction': 'right',
                      'Wrong track': 'wrong',
                      'Not sure': 'unsure',}

data_2019['q05_city_direction'] = data_2019['q05_city_direction'].map(city_direction_map)
data_2019['q05_city_direction'].fillna('no_answer', inplace=True)


In [35]:
# Language. This is tricky because there's a free response. I mapped most to "other"

data_2019['d03_language'] = data_2019['d03_language'].str.lower()

data_2019['d03_english'] = pd.to_numeric(data_2019['d03_language'].str.contains('english') * 1)
data_2019['d03_spanish'] = pd.to_numeric(data_2019['d03_language'].str.contains('spanish') * 1)
data_2019['d03_portuguese'] = pd.to_numeric(data_2019['d03_language'].str.contains('portuguese') * 1)

searchfor = ['mandarin', 'chinese', 'contonese']
data_2019['d03_chinese'] = pd.to_numeric(data_2019['d03_language'].str.contains('|'.join(searchfor)) * 1)

searchfor = ['arabic', 'greek', 'french', 'punjabi', 'amharic', 
             'gujrati', 'nepali', 'tigrinya', 'polish', 'filipino',
             'italian', 'japanese', 'russian', 'haitian creole', 'kreole', 'romanian',
             'slovac', 'vietnnamese', 'esperanto', 'hebrew', 'bulgarian', 'latin',
            'persian', 'romanian']
data_2019['d03_other'] = pd.to_numeric(data_2019['d03_language'].str.contains('|'.join(searchfor)) * 1)

data_2019.drop('d03_language', axis=1, inplace=True)

In [46]:
# Create an enthicity column with Hispanic / not Hispanic

searchfor = ['hispanic', 'puerto rican']
data_2019['d04_eth_hispanic'] = pd.to_numeric(data_2019['d04_race'].str.contains('|'.join(searchfor)) * 1)
    
# ethnicity hook for population balancing
data_2019['eth_hooks'] = data_2019.apply(
    lambda row: 'eth_hispanic' if row['d04_eth_hispanic'] >= 1 else 'eth_not_hispanic', axis=1)


    
# Race is also used in balancing hooks. People can have multiple responses and we want to keep track of "more than one race".
# So as an intermediate step we break out indicators for each race in (white, aa, asian, other)

# format string
data_2019['d04_race'] = data_2019['d04_race'].str.lower()


# Break out white, black, asian
data_2019['d04_race_white'] = pd.to_numeric(data_2019['d04_race'].str.contains('white') * 1)
data_2019['d04_race_aa'] = pd.to_numeric(data_2019['d04_race'].str.contains('black') * 1)
data_2019['d04_race_asian'] = pd.to_numeric(data_2019['d04_race'].str.contains('asian') * 1)

# Map everything else to other
searchfor = ['jewish', 'american indian', 'portuguese', 'cape verdean', 
             'middle eastern', 'east indian', 'biracial', 'arab', 'brazilian']
data_2019['d04_race_other'] = pd.to_numeric(data_2019['d04_race'].str.contains('|'.join(searchfor)) * 1)

# We need to define a column that maps to population hooks.
def get_race_hooks(row):
    race_cols = ['d04_race_aa', 'd04_race_asian', 'd04_race_white', 'd04_race_other']
    if sum(row[race_cols]) > 1:
        return 'race_two_or_more'
    if row['d04_race_aa'] > 0:
        return 'race_aa'
    if row['d04_race_asian'] > 0:
        return 'race_asian'
    if row['d04_race_other'] > 0:
        return 'race_other'
    if row['d04_race_white'] > 0:
        return 'race_white'
    else:
        return 'No Answer'
    
data_2019['race_hooks'] = data_2019.apply(lambda row: get_race_hooks(row), axis=1)


# we retain the original column.


In [47]:
# Household income. Map to buckets for population balancing.

hhi_map = {'Less than $10,000': 1,
           '$10,000 to $24,999': 2,
           '$25,000 to $49,999': 3,
           '$50,000 to 74,999': 4,
           '$75,000 to $99,999': 5,
           '$100,000 to $149,999': 6,
           '$150,000 to 200,000': 7,
           '$200,000 or more': 8}

data_2019['d08_hhi_buckets'] = data_2019['d08_hhi'].map(hhi_map)

# We retain the original column


In [None]:
# Convert to binary datatype for children, plan to move, is student

data_2019['d05_num_children'] = pd.to_numeric(data_2019['d05_num_children'].map({'Yes': 1, 'No': 0}))
data_2019['d07_plan_to_move'] = pd.to_numeric(data_2019['d07_plan_to_move'].map({'Yes': 1, 'No': 0}))
data_2019['d09_is_student'] = pd.to_numeric(data_2019['d09_is_student'].map({'Yes': 1, 'No': 0}))

In [48]:
# Age also gets mapped to buckets for population balancing, but we want to retain the original column as well

def map_age(v):
    """Define age buckets"""
    if v == 17:
        return "17 Years"
    elif v <= 24:
        return "18 to 24 Years"
    elif v <= 34:
        return "25 to 34 Years"
    elif v <= 44:
        return "35 to 44 Years"
    elif v <= 54:
        return "45 to 54 Years"
    elif v <= 64:
        return "55 to 64 Years"
    elif v <= 74:
        return "65 to 74 Years"
    else:
        return "75 Years & Over"
    


# map age since we need it for balancing
data_2019['d02_age'] = pd.to_numeric(data_2019['d02_age'])
data_2019['d02_age'] = data_2019['d02_age'].apply(lambda x: map_age(x))
data_2019['d02_age'] = data_2019['d02_age'].fillna('No Answer')    

In [None]:
# Transportation.
# These can have compound answers that we need to break out into indicators.

# the transportation questions return a list of vehicles. Break into indicator columns.
data_2019['d11_car'] = pd.to_numeric(data_2019['d11_transportation_mode'].str.contains('Car') * 1)
data_2019['d11_walk'] = pd.to_numeric(data_2019['d11_transportation_mode'].str.contains('Walk') * 1)
data_2019['d11_bike'] = pd.to_numeric(data_2019['d11_transportation_mode'].str.contains('Bike') * 1)
data_2019['d11_public'] = pd.to_numeric(data_2019['d11_transportation_mode'].str.contains('Public') * 1)

data_2019.drop('d11_transportation_mode', axis=1, inplace=True)

data_2019['d12_car'] = pd.to_numeric(data_2019['d12_transportation_month'].str.contains('Car') * 1)
data_2019['d12_walk'] = pd.to_numeric(data_2019['d12_transportation_month'].str.contains('Walk') * 1)
data_2019['d12_bike'] = pd.to_numeric(data_2019['d12_transportation_month'].str.contains('Bike') * 1)
data_2019['d12_public'] = pd.to_numeric(data_2019['d12_transportation_month'].str.contains('Public') * 1)

data_2019.drop('d12_transportation_month', axis=1, inplace=True)



In [197]:
# export data

data_2019.set_index('id', inplace=True)

data_2019.to_csv('../data/processed/data_2019_preprocessed.csv', index=False)

### Population balancing

For generalized survey balancing we need to generate a vector X with one row per person, and binary indicators for each group of interest

In [230]:
# start by getting the basic demo info
demo_cols = ['gender', 'd08_hhi', 'race', 'eth', 'age']
demo_data = data_2019[demo_cols]

demo_data.rename(columns={'d08_hhi': 'hhi'}, inplace=True)
# impute all columns. If someone didn't answer, randomly draw from the options at the sampled freqencies.


# demo_data['race_eth'] = demo_data.apply(lambda row: "({0}, {1})".format(row['race'], row['eth']), axis=1)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  demo_data.rename(columns={'d08_hhi': 'hhi'}, inplace=True)


In [231]:
demo_data['gender'] = demo_data['gender'].replace('No Answer', np.nan)
demo_data['race'] = demo_data['race'].replace('No Answer', np.nan)
demo_data['eth'] = demo_data['eth'].replace('No Answer', np.nan)
demo_data['hhi'] = demo_data['hhi'].replace('No Answer', np.nan)
demo_data['age'] = demo_data['age'].replace('No Answer', np.nan)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  demo_data['gender'] = demo_data['gender'].replace('No Answer', np.nan)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  demo_data['race'] = demo_data['race'].replace('No Answer', np.nan)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  demo_data['eth'] = demo_data['eth'].replace('No Answer', np.nan)
A 

In [232]:

def impute_by_sampled_frequency(df, col):
    """Modifies the df in place"""
    s = df[col].value_counts(normalize=True)
    missing = df[col].isnull()
    df.loc[missing, col] = np.random.choice(s.index, size=len(df[missing]),p=s.values)

    return 

In [233]:
# for any missing value, substitute according to the sample frequencies
impute_by_sampled_frequency(demo_data, 'gender')
impute_by_sampled_frequency(demo_data, 'hhi')
impute_by_sampled_frequency(demo_data, 'race')
impute_by_sampled_frequency(demo_data, 'eth')
impute_by_sampled_frequency(demo_data, 'age')

In [234]:
# define race / ethnicity combinations
demo_data['race_eth'] = demo_data.apply(lambda row: "({0}, {1})".format(row['race'], row['eth']), axis=1)
demo_data.drop(['race', 'eth'], axis=1, inplace=True)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  demo_data['race_eth'] = demo_data.apply(lambda row: "({0}, {1})".format(row['race'], row['eth']), axis=1)
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  demo_data.drop(['race', 'eth'], axis=1, inplace=True)


In [235]:
demo_data.head()

Unnamed: 0_level_0,gender,hhi,age,race_eth
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
7391,Female,3.0,65 to 74 Years,"(race_white, eth_not_hispanic)"
7392,Female,2.0,55 to 64 Years,"(race_white, eth_not_hispanic)"
7393,Male,4.0,65 to 74 Years,"(race_white, eth_not_hispanic)"
7394,Male,5.0,55 to 64 Years,"(race_white, eth_not_hispanic)"
7395,Female,5.0,55 to 64 Years,"(race_asian, eth_not_hispanic)"


In [236]:
# rename values to match the population counts
demo_data['gender'] = demo_data['gender'].apply(lambda x: "gender_{}".format(x))
demo_data['hhi'] = demo_data['hhi'].apply(lambda x: "hhi_{}".format(str(int(x))))

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  demo_data['gender'] = demo_data['gender'].apply(lambda x: "gender_{}".format(x))
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  demo_data['hhi'] = demo_data['hhi'].apply(lambda x: "hhi_{}".format(str(int(x))))


Now we can get the output matrix

In [243]:
respondant_matrix = pd.get_dummies(demo_data, prefix="", prefix_sep="")

We still have to 

In [144]:
population_shares = pd.read_csv('../data/processed/population_weights.csv')

In [147]:
data_2019 = data_2019.merge(population_shares, how='left', on=['gender', 'race', 'eth', 'age'])
# make sure we've joined all of the columns
data_2019[data_2019['population_share'].isnull()][cols]

In [153]:
# get survey shares
survey_shares = data_2019.groupby(['gender', 'race', 'eth', 'age']).count()[['q01_happy']] / len(data_2019)
survey_shares.rename(columns={'q01_happy': 'survey_share'}, inplace=True)


In [154]:
data_2019 = data_2019.merge(survey_shares, how='left', on=['gender', 'race', 'eth', 'age'])
data_2019[data_2019['survey_share'].isnull()][cols]

In [156]:
data_2019['weight'] = data_2019['population_share'] / data_2019['survey_share']

This doesn't work for people who didn't answer!

In [170]:
cols = ['age', 'gender', 'eth','race', 'population_share', 'survey_share', 'weight']
data_2019.sort_values(by='weight').tail(10)[cols]


Unnamed: 0,age,gender,eth,race,population_share,survey_share,weight
1388,18 to 24 Years,Male,eth_not_hispanic,No Answer,0.063132,0.000661,95.455691
1216,18 to 24 Years,Female,eth_not_hispanic,No Answer,0.063132,0.000661,95.455691
1028,55 to 64 Years,No Answer,eth_not_hispanic,race_white,0.065927,0.000661,99.681656
125,45 to 54 Years,No Answer,eth_not_hispanic,race_white,0.067426,0.000661,101.947474
246,55 to 64 Years,No Answer,eth_not_hispanic,No Answer,0.084717,0.000661,128.092454
284,18 to 24 Years,No Answer,eth_not_hispanic,race_white,0.109177,0.000661,165.074887
290,35 to 44 Years,No Answer,eth_not_hispanic,race_white,0.1104,0.000661,166.924827
879,25 to 34 Years,No Answer,eth_not_hispanic,race_white,0.255077,0.001323,192.838329
496,25 to 34 Years,No Answer,eth_not_hispanic,race_white,0.255077,0.001323,192.838329
1270,35 to 44 Years,No Answer,eth_not_hispanic,No Answer,0.141866,0.000661,214.500959


In [108]:
population_shares[(population_shares['race']=='race_white') & (population_shares['eth']=='eth_not_hispanic')]

Unnamed: 0,gender,race,eth,age,population_share
0,Male,race_white,eth_not_hispanic,18 to 24 Years,0.049129
1,Female,race_white,eth_not_hispanic,18 to 24 Years,0.049129
2,Nonbinary,race_white,eth_not_hispanic,18 to 24 Years,0.010918
3,Male,race_white,eth_not_hispanic,25 to 34 Years,0.114785
4,Female,race_white,eth_not_hispanic,25 to 34 Years,0.114785
5,Nonbinary,race_white,eth_not_hispanic,25 to 34 Years,0.025508
6,Male,race_white,eth_not_hispanic,35 to 44 Years,0.04968
7,Female,race_white,eth_not_hispanic,35 to 44 Years,0.04968
8,Nonbinary,race_white,eth_not_hispanic,35 to 44 Years,0.01104
9,Male,race_white,eth_not_hispanic,45 to 54 Years,0.030342


In [102]:
cols = ['gender', 'race', 'eth', 'age', 'population_share']
test[cols]

Unnamed: 0,gender,race,eth,age,population_share
0,Female,race_white,eth_not_hispanic,65 to 74 Years,0.017806
1,Female,race_white,eth_not_hispanic,56 to 64 Years,
2,Male,race_white,eth_not_hispanic,65 to 74 Years,0.017806
3,Male,race_white,eth_not_hispanic,56 to 64 Years,
4,Female,race_asian,eth_not_hispanic,56 to 64 Years,
...,...,...,...,...,...
1503,Female,race_white,eth_not_hispanic,45 to 55 Years,
1504,Female,race_white,eth_not_hispanic,25 to 34 Years,0.114785
1505,Female,race_white,eth_not_hispanic,25 to 34 Years,0.114785
1506,Female,race_two_or_more,eth_not_hispanic,25 to 34 Years,0.004404


In [None]:
# create hooks for population balancing.


In [8]:
# create variable file
variable_list = pd.DataFrame({'var': data_2019.columns, 'is_feature': None, 'is_target': None})


In [9]:
variable_list.to_csv('../data/processed/data_2019_vars.csv', index=False)

In [10]:
variable_list

Unnamed: 0,var,is_feature,is_target
0,q01_happy,,
1,q02_satisfied_general,,
2,q03_satisfied_somerville,,
3,q04_satisfied_neighborhood,,
4,q06a_city_services,,
5,q06b_cost_housing,,
6,q06c_quality_schools,,
7,q06d_trust_police,,
8,q06e_sidewalks,,
9,q06f_events,,
