# Applying Filters

In [2]:
import pandas as pd
import numpy as np
import os

## Step 1: Inspect the Data


In [3]:
# Build the CSV path relative to the current working directory, then load it.
data_dir = os.path.join(os.getcwd(), "data")
filename = os.path.join(data_dir, "censusData.csv")
df = pd.read_csv(filename)

In [4]:
# Preview the first five rows of the loaded frame.
df.head(n=5)

Unnamed: 0,age,workclass,fnlwgt,education,education-num,marital-status,occupation,relationship,race,sex_selfID,capital-gain,capital-loss,hours-per-week,native-country,income
0,36,State-gov,112074,Doctorate,16,Never-married,Prof-specialty,Not-in-family,White,Non-Female,0,0,45,United-States,<=50K
1,35,Private,32528,HS-grad,9,Married-civ-spouse,Handlers-cleaners,Husband,White,Non-Female,0,0,45,United-States,<=50K
2,21,Private,270043,Some-college,10,Never-married,Other-service,Own-child,White,Female,0,0,16,United-States,<=50K
3,45,Private,168837,Some-college,10,Married-civ-spouse,Adm-clerical,Wife,White,Female,0,0,24,Canada,>50K
4,39,Private,297449,Bachelors,13,Married-civ-spouse,Prof-specialty,Husband,White,Non-Female,0,0,40,United-States,>50K


In [5]:
# Dimensions of the loaded frame, displayed as (rows, columns).
n_rows, n_cols = df.shape
(n_rows, n_cols)

(7000, 15)

## Step 2: Random Sampling of the Data
Draw a random sample of the rows in two ways: with NumPy's random `choice` combined with `loc`, and with the pandas `DataFrame.sample` method.

In [6]:
# Draw a random 30% sample of the rows in two ways (numpy, then pandas).
# Both draws are seeded so that Restart & Run All reproduces the same rows.
percentage = 0.3
num_rows = df.shape[0]
sample_size = int(percentage * num_rows)
seed = 42  # arbitrary fixed value; change it to draw a different sample

# sampling using numpy: choose row labels without replacement, then select
# those rows by label with `loc`
rng = np.random.default_rng(seed)
indices = rng.choice(df.index, size=sample_size, replace=False)
df_subset_numpy = df.loc[indices]

# sampling using pandas: `sample` picks the rows directly; `random_state`
# makes the draw repeatable
df_subset = df.sample(n=sample_size, random_state=seed)
df_subset.head()

Unnamed: 0,age,workclass,fnlwgt,education,education-num,marital-status,occupation,relationship,race,sex_selfID,capital-gain,capital-loss,hours-per-week,native-country,income
1021,29,Private,124680,Bachelors,13,Never-married,Sales,Not-in-family,White,Female,13550,0,35,United-States,>50K
4834,31,Private,265706,HS-grad,9,Married-civ-spouse,Craft-repair,Husband,White,Non-Female,7298,0,40,United-States,>50K
2259,20,,24395,Some-college,10,Never-married,,Unmarried,White,Female,0,0,20,United-States,<=50K
2520,43,Private,40024,11th,7,Never-married,Transport-moving,Not-in-family,White,Non-Female,0,0,42,United-States,<=50K
507,40,Private,193882,HS-grad,9,Married-civ-spouse,Exec-managerial,Husband,White,Non-Female,0,0,45,United-States,>50K


## Step 3: Filter a DataFrame by Column Values

In [7]:
# Boolean mask: True for every row whose workclass is exactly 'Private'.
condition = df['workclass'].eq('Private')
condition

0       False
1        True
2        True
3        True
4        True
        ...  
6995     True
6996     True
6997    False
6998     True
6999     True
Name: workclass, Length: 7000, dtype: bool

In [8]:
# Keep only the rows flagged True by the mask, then preview the result.
df_private = df.loc[condition]
df_private.head()

Unnamed: 0,age,workclass,fnlwgt,education,education-num,marital-status,occupation,relationship,race,sex_selfID,capital-gain,capital-loss,hours-per-week,native-country,income
1,35,Private,32528,HS-grad,9,Married-civ-spouse,Handlers-cleaners,Husband,White,Non-Female,0,0,45,United-States,<=50K
2,21,Private,270043,Some-college,10,Never-married,Other-service,Own-child,White,Female,0,0,16,United-States,<=50K
3,45,Private,168837,Some-college,10,Married-civ-spouse,Adm-clerical,Wife,White,Female,0,0,24,Canada,>50K
4,39,Private,297449,Bachelors,13,Married-civ-spouse,Prof-specialty,Husband,White,Non-Female,0,0,40,United-States,>50K
5,27,Private,233421,Some-college,10,Never-married,Adm-clerical,Own-child,White,Non-Female,0,0,20,United-States,<=50K


## Step 4: Data Analysis Using Filtering

In [9]:
# Mean age over the rows whose sex_selfID equals 'Female'.
condition = df['sex_selfID'].eq('Female')
df.loc[condition, 'age'].mean()

36.764213309828115

In [10]:
# Count the rows that are both 'Local-gov' workclass and above 40 hours-per-week.
condition1 = df['workclass'].eq('Local-gov')
condition2 = df['hours-per-week'].gt(40)
condition = condition1 & condition2  # element-wise AND of the two masks

df_local = df.loc[condition]
rows = len(df_local)
rows

126