# Building a Balanced Data Set

In [60]:
import pandas as pd
import numpy as np
import os

## Step 1: Inspect the Data


In [None]:
# Locate the census CSV in the "data" folder under the current working directory and load it.
# pandas reads the file's unnamed first column as a regular column called "Unnamed: 0".
data_dir = os.path.join(os.getcwd(), "data")
filename = os.path.join(data_dir, "censusData.csv")
df = pd.read_csv(filename)

In [62]:
# Preview the first five rows of the full data set.
df.head(n=5)

Unnamed: 0,age,workclass,fnlwgt,education,education-num,marital-status,occupation,relationship,race,sex_selfID,capital-gain,capital-loss,hours-per-week,native-country,income
0,36,State-gov,112074,Doctorate,16,Never-married,Prof-specialty,Not-in-family,White,Non-Female,0,0,45,United-States,<=50K
1,35,Private,32528,HS-grad,9,Married-civ-spouse,Handlers-cleaners,Husband,White,Non-Female,0,0,45,United-States,<=50K
2,21,Private,270043,Some-college,10,Never-married,Other-service,Own-child,White,Female,0,0,16,United-States,<=50K
3,45,Private,168837,Some-college,10,Married-civ-spouse,Adm-clerical,Wife,White,Female,0,0,24,Canada,>50K
4,39,Private,297449,Bachelors,13,Married-civ-spouse,Prof-specialty,Husband,White,Non-Female,0,0,40,United-States,>50K


In [63]:
# Dimensions of the full data set as (number of rows, number of columns).
df.shape

(7000, 15)

## Step 2: Random Sampling of the Data
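Randomly sample 30% of the rows of the data. The cell below uses `DataFrame.sample`, which has the same effect as drawing row labels without replacement with `np.random.choice` and selecting those rows with `loc`.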

In [64]:
percentage = 0.3
num_rows = df.shape[0]

# Fixed seed so that "Restart Kernel -> Run All" draws the same sample on every run.
seed = 42

# round() before int() so that floating-point artifacts (e.g. 0.29 * 100 == 28.999...)
# do not silently drop a row from the requested sample size.
sample_size = int(round(percentage * num_rows))

# Draw `sample_size` distinct rows. The sampled rows keep their original index labels,
# which lets later cells exclude them from `df` by label.
df_subset = df.sample(n=sample_size, random_state=seed)



## Step 3: Verifying Imbalance
Is our sample *balanced* with respect to (self-reported) sex? In order to answer that, first we'd like to know how many categories exist for the 'sex_selfID' values in our data.

In [65]:
# Distinct labels that occur in the 'sex_selfID' column of the full data set.
sex_column = df.loc[:, 'sex_selfID']
unique_ssID = sex_column.unique()
unique_ssID

array(['Non-Female', 'Female'], dtype=object)

### Calculating the Proportion of Each Class
How many 'Female' examples are in our data sample?

The code cells below use `value_counts()` to count how many rows of `df_subset` fall into each `sex_selfID` class, and then divide the `Female` count by the total number of rows in `df_subset`. Run the code to display the results. Note that the sample is not balanced with respect to self-reported sex (assuming that we want balance for the two classes).

In [66]:
# Number of sampled rows in each 'sex_selfID' class.
sex_in_sample = df_subset.loc[:, 'sex_selfID']
counts = sex_in_sample.value_counts()
counts

Unnamed: 0_level_0,count
sex_selfID,Unnamed: 1_level_1
Non-Female,1426
Female,674


In [67]:
# Share of the sample labelled 'Female': its count divided by the total number of sampled rows.
female_count = counts.loc['Female']
female_count / counts.sum()

np.float64(0.32095238095238093)

In [68]:
# Number of sampled rows in every (sex_selfID, income) combination.
group_keys = ['sex_selfID', 'income']
df_subset.groupby(by=group_keys).size()

Unnamed: 0_level_0,Unnamed: 1_level_0,0
sex_selfID,income,Unnamed: 2_level_1
Female,<=50K,602
Female,>50K,72
Non-Female,<=50K,996
Non-Female,>50K,430


### Addressing imbalance: upsampling the underrepresented group.

In [69]:
# Count the sample once per (sex_selfID, income) group and look the four cells up by label.
# Label-based lookup does not depend on the sort order of the income labels, and a group
# that is absent from the sample counts as 0 instead of breaking tuple unpacking.
group_sizes = df_subset.groupby(['sex_selfID', 'income']).size()

low_income_nonfemale = group_sizes.get(('Non-Female', '<=50K'), 0)
high_income_nonfemale = group_sizes.get(('Non-Female', '>50K'), 0)
low_income_female = group_sizes.get(('Female', '<=50K'), 0)
high_income_female = group_sizes.get(('Female', '>50K'), 0)

# Without low-income Non-Female rows the reference ratio below is undefined.
if low_income_nonfemale == 0:
    raise ValueError("No low-income Non-Female rows in the sample; cannot compute the reference ratio.")

# Reference ratio of high-income to low-income rows, taken from the Non-Female group.
class_balance_nonfemale = high_income_nonfemale / low_income_nonfemale

# Extra (Female, >50K) rows needed for the Female group to reach the same ratio.
# Clamped at 0: a negative value would mean the Female group already meets or exceeds
# the reference ratio, and a negative sample size cannot be drawn.
add_sample_size = max(0, int(class_balance_nonfemale * low_income_female - high_income_female))
add_sample_size # we need this many more points in (Female)&(>50K) group for balance

187

In [70]:
# Subset the original data: exclude entries that are already in our sample.
# df_subset is used here by its index labels, so rows are dropped from df by label.
df_never_sampled = df.drop(labels=df_subset.index, axis=0)

# Filter that subset to include only the type of examples that we want to upsample: Females, higher income
condition = (df_never_sampled['income']=='>50K') & (df_never_sampled['sex_selfID']=='Female')
df_never_sampled_target = df_never_sampled[condition]

# Sample from the resulting set.
# The size is capped at the number of available rows (sampling is without replacement)
# and floored at 0, because a negative size makes the random draw raise a ValueError.
size = max(0, min(add_sample_size, df_never_sampled_target.shape[0]))

# Seeded generator so that re-running the notebook selects the same extra rows.
rng = np.random.default_rng(42)
indices = rng.choice(df_never_sampled_target.index, size=size, replace=False)

# Append the selected examples to our original sample.
# ignore_index=True renumbers the combined frame 0..n-1, discarding the original row labels.
rows = df_never_sampled_target.loc[indices]
df_balanced_subset = pd.concat([df_subset, rows], ignore_index=True)
df_balanced_subset.head()

Unnamed: 0,age,workclass,fnlwgt,education,education-num,marital-status,occupation,relationship,race,sex_selfID,capital-gain,capital-loss,hours-per-week,native-country,income
0,31,Private,173858,Prof-school,15,Married-civ-spouse,Tech-support,Husband,Asian-Pac-Islander,Non-Female,0,0,40,India,<=50K
1,32,Private,107843,HS-grad,9,Married-civ-spouse,Machine-op-inspct,Husband,White,Non-Female,5178,0,50,United-States,>50K
2,52,Local-gov,278522,Some-college,10,Married-civ-spouse,Transport-moving,Husband,Black,Non-Female,0,0,40,United-States,<=50K
3,46,Private,197332,Masters,14,Married-civ-spouse,Prof-specialty,Husband,White,Non-Female,7688,0,46,United-States,>50K
4,51,Private,189511,Assoc-voc,11,Married-civ-spouse,Other-service,Husband,White,Non-Female,0,0,50,Germany,>50K


In [71]:
# Re-count the (sex_selfID, income) combinations after appending the extra rows.
balanced_keys = ['sex_selfID', 'income']
df_balanced_subset.groupby(by=balanced_keys).size()

Unnamed: 0_level_0,Unnamed: 1_level_0,0
sex_selfID,income,Unnamed: 2_level_1
Female,<=50K,602
Female,>50K,253
Non-Female,<=50K,996
Non-Female,>50K,430


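The resulting balance is not perfect, but it is better than before! In the run shown above, the ratio of high-income to low-income examples among `Female` rows rose from 72/602 ≈ 0.12 to 253/602 ≈ 0.42, close to the `Non-Female` ratio of 430/996 ≈ 0.43. It falls slightly short because only 181 of the 187 additional examples requested were available among the rows that had not already been sampled.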