# Cleaning the Household data
Before further analysis, the data was cleaned and PII (Personally Identifiable Information) was removed or obscured.

In [1]:
# imports
import pandas as pd

In [2]:
# Raw household export (tab-separated text file).
households = pd.read_csv(
    'data_raw_NOGIT/180613_households_district3_all_data.txt',
    sep='\t',
)
# City-area lookup file; it is joined onto the households by 'Precinct' further down.
cityareas = pd.read_csv('data_raw_NOGIT/cityareas_precinct.csv')

For the Household data the value_counts() for each column were reviewed and the following was done to clean and anonymize the data:

| Original Data Column | Description of action | output column(s) | Type |
|:---:|:---|:---:|:---:|
| 'Household_Id' | Rows of the table were randomly shuffled, the index reset, and the new index used as the new UID. | 'Hid' | Num |
| 'FullAddress' | Shown to be the concatenation of 'HouseNumber', 'Street', 'StreetType', 'BuildingNumber' & 'ApartmentNumber' in all but 4 cases, where the apartment numbers appeared to be missing or to include typos; then dropped as PII. | | |
| 'HouseNumber' | Dropped as PII. | | |
| 'HouseNumberSuffix' | Dropped empty. | | |
| 'StreetPrefix' |  Dropped empty. | | |
| 'Street' | Used to clean 'StreetType' and then Dropped as PII. | | |
| 'StreetType' | ‘CMN’ ‘GREEN’ => ‘GRN’ and two cross streets => ‘UKN’. | 'StreetType' | Cat |
| 'BuildingNumber' | Only 3 entries; dropped. | | |
| 'ApartmentNumber' | Converted to a True/False field. | 'isApt' | Bool |
| 'City' | Dropped, all entries are the same. | | |
| 'State' | 6 missing rows, dropped as all should be the same. | | |
| 'Zip' | Cleaned all to 5 digit numerical zip code entries. | 'Zip' | Num |
| 'Precinct' | Converted to number and kept. | 'Precinct' | Num |
|  | The Precinct was also used to create a 'CityArea' column | 'CityArea' | Cat |
| 'PrecinctSub' | Converted to number and kept. | 'PrecinctSub' | Num |
| 'District' | Kept as is in case we need to add in other district data. | 'District' | Num |


In [3]:
# `h` is the working frame. The incoming 'CityArea' column is removed here so
# that a fresh one can be added cleanly later on.
h = households.drop('CityArea', axis=1)
print('Total number of households in data: {}'.format(h.shape[0]))

Total number of households in data: 6930


### Cleaning StreetType
Significant data is missing from this column. I used 'Street' and 'FullAddress' to identify the missing categories 'GRN' and 'UKN' and add them. I was also able to identify that COMMON and COMMONS had not been mapped correctly to 'CMN', and fixed that.

In [4]:
def get_full_address(row):
    """Rebuild a household's full address from its component columns.

    The parts are joined with single spaces in the order HouseNumber, Street,
    StreetType, BuildingNumber, ApartmentNumber; missing parts are skipped.
    When a building number is present it is written as an integer and the
    apartment number follows it bare; otherwise the apartment number is
    prefixed with '# '.

    Parameters
    ----------
    row : pandas.Series
        One household row providing HouseNumber, Street, StreetType,
        BuildingNumber and ApartmentNumber.

    Returns
    -------
    pandas.Series
        A copy of ``row`` with the rebuilt address stored under
        'cc_full_add'. The input row itself is left untouched.
    """
    has_building = pd.notna(row.BuildingNumber)
    has_apartment = pd.notna(row.ApartmentNumber)

    # BuildingNumber can arrive as a float (e.g. 3.0); show it as '3'.
    building = str(int(row.BuildingNumber)) if has_building else None

    if not has_apartment:
        apartment = None
    elif has_building:
        apartment = str(row.ApartmentNumber)
    else:
        apartment = '# {}'.format(row.ApartmentNumber)

    parts = [row.HouseNumber, row.Street, row.StreetType, building, apartment]

    # pd.notna treats both NaN and None as missing, so a None component is
    # skipped instead of breaking the join.
    result = row.copy()  # do not mutate the object handed over by DataFrame.apply
    result['cc_full_add'] = ' '.join(str(part) for part in parts if pd.notna(part))
    return result

In [5]:
# Row-wise pass: each household gains a rebuilt 'cc_full_add' value.
h = h.apply(get_full_address, axis='columns')

In [6]:
# Households whose supplied FullAddress differs from the rebuilt 'cc_full_add'.
# NOTE: the table shown by this cell contains raw addresses and original
# household ids (PII) - clear the cell output before sharing the notebook.
mismatch = h['FullAddress'] != h['cc_full_add']
add_issues = h.loc[mismatch, ['Household_Id', 'FullAddress', 'cc_full_add', 'BuildingNumber']]
print('All but {} address\'s follow the pattern.'.format(len(add_issues)))
add_issues

All but 4 address's follow the pattern.


Unnamed: 0,Household_Id,FullAddress,cc_full_add,BuildingNumber
1467,HH-17611,39590 WAINWRIGHT COMMON,39590 WAINWRIGHT COMMON # T,
4458,HH-20607,3740 BOSWELL TER,3740 BOSWELL TER # 3940,
5790,HH-21939,39600 FREMONT BLVD # 147,39600 FREMONT BLVD # 14-7,
5887,HH-22036,39800 FREMONT BLVD # 337,39800 FREMONT BLVD # 3307,


In [7]:
# Various views that helped understand the missing StreetType data.
# The Street names with null 'StreetType'
#h[h.StreetType.isnull()].loc[:,['Street', 'StreetType']].groupby('Street').count()
# Most are 'COMMONS'; in fact, every FullAddress that has 'COMMON' in its name has StreetType = null
#h.loc[h.FullAddress.str.contains('COMMON'),['Street','StreetType']].groupby('Street').count()
# Addresses that have 'CMN' in StreetType
#h.loc[h.StreetType.str.contains('CMN').fillna(0) == True,['FullAddress', 'Street','StreetType']]

In [8]:
# Start from the raw StreetType, then overwrite it for every FullAddress that
# contains one of the markers below. Rules are applied in order, so a later
# rule wins when an address matches more than one marker.
h['StreetTypeClean'] = h['StreetType']
street_type_markers = [
    (' COMMON', 'CMN'),  # substring match, so ' COMMONS' is covered too
    # NOTE(review): ' GREEN' is also a substring match and would hit street
    # names that merely start with GREEN (e.g. GREENWOOD) - confirm against the data.
    (' GREEN', 'GRN'),
    ('/', 'UKN'),        # cross-street partial addresses
]
for marker, street_type in street_type_markers:
    # na=False: rows without a FullAddress never match.
    h.loc[h.FullAddress.str.contains(marker, na=False), 'StreetTypeClean'] = street_type

In [9]:
# check code
review_cols = ['FullAddress', 'Street', 'StreetType', 'StreetTypeClean']
touched = (h.FullAddress.str.contains('COMMON', na=False)
           | h.FullAddress.str.contains(' GREEN', na=False)
           | h.FullAddress.str.contains('/', na=False))
# Not the last expression of the cell, so it is not rendered; kept under a
# name so the affected rows can be inspected when needed.
touched_rows = h.loc[touched, review_cols]

# Non-null counts per column, summed over all streets.
t = h[review_cols].groupby('Street').count()
t.sum()

FullAddress        6930
StreetType         6097
StreetTypeClean    6930
dtype: int64

### Adding and cleaning CityArea
Using the Precinct information and a City Area look up table I was able to add a City Area to each HouseHold.

In [10]:
# Attach the lookup columns to every household by 'Precinct'. The left join
# keeps households whose precinct is absent from the lookup (their lookup
# columns come back null). validate='many_to_one' makes pandas raise if the
# lookup holds the same Precinct more than once, which would otherwise
# silently duplicate household rows.
h = pd.merge(h, cityareas, on='Precinct', how='left', validate='many_to_one')

In [11]:
# Non-null count of the merged column shows how many households got a CityArea.
h.loc[:, ['CityArea']].info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 6930 entries, 0 to 6929
Data columns (total 1 columns):
CityArea    6815 non-null object
dtypes: object(1)
memory usage: 108.3+ KB


The initial lookup table was missing two precincts; the cell below identifies them.

In [12]:
# Number of households per precinct that received no CityArea from the lookup.
lacks_city_area = h['CityArea'].isnull()
h.loc[lacks_city_area, ['Household_Id', 'Precinct']].groupby('Precinct').count()

Unnamed: 0_level_0,Household_Id
Precinct,Unnamed: 1_level_1
831730,111
835050,4


I identified the streets affected by the missing precincts:

In [13]:
PrecinctWithMissingCityArea = [831730, 835050]
in_missing_precinct = h['Precinct'].isin(PrecinctWithMissingCityArea)
# Nulls are replaced with 'missing' so those rows survive the groupby
# (groupby discards null keys).
(h.loc[in_missing_precinct, ['FullAddress', 'Street', 'CityArea', 'Precinct']]
   .fillna('missing')
   .groupby(['Precinct', 'CityArea', 'Street'])
   .count())

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,FullAddress
Precinct,CityArea,Street,Unnamed: 3_level_1
831730,missing,APPLETREE,9
831730,missing,CHERRY BLOSSOM,9
831730,missing,FERNWOOD,6
831730,missing,FREESIA,9
831730,missing,FRUITWOOD,12
831730,missing,JASMINE,14
831730,missing,LAURUS,11
831730,missing,PEACHTREE,11
831730,missing,RIVERWALK,15
831730,missing,ROSETREE,12


The 831730 streets are all grouped in one fairly recent development around the Riverwalk road. Using a map to see the other nearby streets, I was able to identify that the best `CityArea` for this `Precinct` is `Downtown / BART`.

In [14]:
# Streets looked up to see which CityArea their households carry; a single
# alternation pattern selects any Street containing one of the names.
nearby_streets = ['RIVERWALK', 'TEMPLE', 'BISHOP', 'ARLENE', 'GREENWOOD']
on_nearby_street = h.Street.str.contains('|'.join(nearby_streets), na=False)
(h.loc[on_nearby_street, ['FullAddress', 'Street', 'CityArea', 'Precinct']]
   .fillna('missing')
   .groupby(['Precinct', 'CityArea', 'Street'])
   .count())

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,FullAddress
Precinct,CityArea,Street,Unnamed: 3_level_1
831730,missing,RIVERWALK,15
835040,Downtown / BART,BISHOP,55
835410,Downtown / BART,TEMPLE,11


Chauntry Common is another development where most of the houses are in one precinct and a few on the edge are in another, together with the corner house with address on Peralta Blvd. The best `CityArea` for this precinct is `Centerville`.

Note: Peralta is a very long road with residents voting in 5 different precincts, Chauntry Common is near the 3000 block of Peralta. 

In [15]:
# Chauntry households plus the Peralta households with a house number above 3000.
on_chauntry = h.Street.str.contains('CHAUNTRY')
on_upper_peralta = h.Street.str.contains('PERALTA') & (h.HouseNumber > 3000)
(h.loc[on_chauntry | on_upper_peralta, ['HouseNumber', 'Street', 'CityArea', 'Precinct']]
   .fillna('missing')
   .sort_values('HouseNumber')
   .groupby(['Precinct', 'CityArea', 'Street'])
   .count())

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,HouseNumber
Precinct,CityArea,Street,Unnamed: 3_level_1
835010,Centerville,CHAUNTRY COMMON,16
835010,Centerville,PERALTA,2
835050,missing,CHAUNTRY COMMON,3
835050,missing,PERALTA,1


In [16]:
# Begin with the looked-up CityArea, then fill in the precincts that the
# lookup table did not cover.
h['CityAreaClean'] = h['CityArea']
precinct_city_area = {
    831730: 'Downtown / BART',
    835050: 'Centerville',
}
for precinct, city_area in precinct_city_area.items():
    h.loc[h.Precinct == precinct, 'CityAreaClean'] = city_area

# validate
patched = h['Precinct'].isin(PrecinctWithMissingCityArea)
(h.loc[patched, ['FullAddress', 'Street', 'CityAreaClean', 'Precinct']]
   .groupby(['Precinct', 'CityAreaClean', 'Street'])
   .count())

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,FullAddress
Precinct,CityAreaClean,Street,Unnamed: 3_level_1
831730,Downtown / BART,APPLETREE,9
831730,Downtown / BART,CHERRY BLOSSOM,9
831730,Downtown / BART,FERNWOOD,6
831730,Downtown / BART,FREESIA,9
831730,Downtown / BART,FRUITWOOD,12
831730,Downtown / BART,JASMINE,14
831730,Downtown / BART,LAURUS,11
831730,Downtown / BART,PEACHTREE,11
831730,Downtown / BART,RIVERWALK,15
831730,Downtown / BART,ROSETREE,12


### Cleaning the Zip data
Some zips had been entered with varying levels of accuracy. I cleaned them all to a consistent 5 digit zip code

In [17]:
# Keep only the first five characters of the zip rendered as text.
zip_text = h['Zip'].astype(str)
h['ZipClean'] = zip_text.str.slice(0, 5)
h['ZipClean'].value_counts()

94538    3835
94536    3095
Name: ZipClean, dtype: int64

### Creating an isApt field
A True/False column denoting if the household has an Apt number.

In [18]:
# True wherever an apartment number is recorded for the household.
h['isApt'] = ~h['ApartmentNumber'].isnull()
h['isApt'].value_counts()

False    4922
True     2008
Name: isApt, dtype: int64

### Shuffling the data to create new ID

In [19]:
# Shuffle the household rows, renumber them 0..n-1, and expose that fresh
# numbering as the 'Hid' column.
# NOTE(review): the shuffle is unseeded, so every run yields a different Hid
# assignment. This looks deliberate for anonymisation (a fixed seed published
# in the notebook would make the permutation recomputable) - confirm before
# adding a random_state.
h = (h.sample(frac=1)
      .reset_index(drop=True)
      .rename_axis('Hid')
      .reset_index())

In [20]:
# Private mapping from the new anonymous 'Hid' back to the original
# 'Household_Id'. An explicit copy makes it an independent frame, so later
# in-place changes to it can neither warn about nor touch `h`.
hid_lookup = h[['Hid', 'Household_Id']].copy()

### Creating the output clean files

In [21]:
# Publishable frame: only the anonymised / cleaned columns, with the
# '*Clean' columns renamed back to their plain names. `rename` returns a new
# frame, so `clean` is independent of `h` and the dtype change below does not
# operate on a slice of `h`.
clean = (
    h.loc[:, ['Hid', 'StreetTypeClean', 'ZipClean', 'Precinct', 'PrecinctSub',
              'District', 'CityAreaClean', 'isApt']]
    .rename(columns={'CityAreaClean': 'CityArea', 'ZipClean': 'Zip',
                     'StreetTypeClean': 'StreetType'})
)
# Every object-dtype column is stored as a categorical.
text_columns = clean.select_dtypes(['object']).columns
clean = clean.astype({column: 'category' for column in text_columns})
clean.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 6930 entries, 0 to 6929
Data columns (total 8 columns):
Hid            6930 non-null int64
StreetType     6930 non-null category
Zip            6930 non-null category
Precinct       6930 non-null int64
PrecinctSub    6930 non-null int64
District       6930 non-null int64
CityArea       6930 non-null category
isApt          6930 non-null bool
dtypes: bool(1), category(3), int64(4)
memory usage: 244.7 KB


In [22]:
# Use the anonymous id as the index of both output frames.
clean = clean.set_index('Hid')
hid_lookup = hid_lookup.set_index('Hid')

In [23]:
# Output file names are prefixed with today's date as YYYYMMDD.
date = pd.Timestamp("today").strftime("%Y%m%d")
# NOTE(review): the lookup file maps Hid back to the original Household_Id and
# must stay out of version control. Its name uses 'NO_GIT' while the raw data
# folder uses 'NOGIT' - confirm the ignore rules match both spellings.
outputs = [
    (clean, 'data_clean/{}_households_district3.csv'),
    (hid_lookup, 'data_clean/{}_households_lookup_NO_GIT.csv'),
]
for frame, path_template in outputs:
    frame.to_csv(path_template.format(date))