In [17]:
# pandas for tabular data; allow up to 500 rows / 500 columns when a frame is displayed.
import pandas as pd
pd.set_option('display.max_rows', 500)
pd.set_option('display.max_columns', 500)

# matplotlib for plotting, using the `notebook` backend selected by the IPython magic.
import matplotlib.pyplot as plt
%matplotlib notebook

# pathlib for building the project/data file paths used by later cells.
from pathlib import Path

In [2]:
# Root directory of the project on the machine this notebook was run on.
# NOTE(review): absolute, machine-specific path -- adjust when running elsewhere.
PROJECT_PATH = Path('/home/ubuntu/flatworld/')
# Directory under the project root that the data files are read from.
DATA_DIR = PROJECT_PATH.joinpath('data')

In [38]:
# Read the whole metadata description file into a single string.
metadata_desc = (DATA_DIR / 'census_income_metadata.txt').read_text()
# The text is '|'-delimited: discard whatever precedes the first '|' and trim the
# surrounding whitespace from every remaining piece.
metadata = [piece.strip() for piece in metadata_desc.split('|')[1:]]
metadata

['This data was extracted from the census bureau database found at',
 'http://www.census.gov/ftp/pub/DES/www/welcome.html',
 'Donor: Terran Lane and Ronny Kohavi',
 'Data Mining and Visualization',
 'Silicon Graphics.',
 'e-mail: terran@ecn.purdue.edu, ronnyk@sgi.com for questions.',
 '',
 'The data was split into train/test in approximately 2/3, 1/3',
 "proportions using MineSet's MIndUtil mineset-to-mlc.",
 '',
 'Prediction task is to determine the income level for the person',
 'represented by the record.  Incomes have been binned at the $50K',
 'level to present a binary classification problem, much like the',
 'original UCI/ADULT database.  The goal field of this data, however,',
 'was drawn from the "total person income" field rather than the',
 '"adjusted gross income" and may, therefore, behave differently than the',
 'orginal ADULT goal field.',
 '',
 'More information detailing the meaning of the attributes can be',
 'found in http://www.bls.census.gov/cps/cpsmain.htm',
 'To 

In [39]:
# Keep only the metadata entries that describe an attribute. Judging by the parsing
# below, each looks roughly like '<distinct> ... attribute #<num> (<name>) <type>'.
col_info = [entry for entry in metadata if 'attribute #' in entry]

# Distinct-value count: the leading space-delimited token (kept as a string).
col_distinct = [col.split(' ')[0] for col in col_info]
# Attribute number: the full whitespace-delimited token that follows '#'. Taking the
# whole token (rather than a fixed two-character slice) keeps numbers of any width
# intact and does not rely on int() tolerating trailing whitespace.
col_nums = [int(col.split('#')[1].split()[0]) for col in col_info]
# Attribute type: everything after the first ')' and the character following it.
col_types = [col[(col.find(')') + 2):] for col in col_info]
# Attribute name: the text between the first '(' and the first ')'.
col_names = [col[col.find('(') + 1:col.find(')')] for col in col_info]
# NOTE: Pulling column names from these lines turned out to be a bit unfortunate: the
#  entry at position 24 of each csv row is actually the instance weight (see lines
#  24-68 of the metadata text file). These attribute lines skip over it, so
#  "attribute #" does not correspond to the column location in the csv. It seems like
#  this should be pointed out more explicitly somewhere in the metadata file.

col_info_df = pd.DataFrame(
    list(zip(col_names, col_types, col_distinct)),
    columns=['col_names', 'col_type', 'col_distinct'],
    index=col_nums,
)
col_info_df.head()

Unnamed: 0,col_names,col_type,col_distinct
0,age,continuous,91
1,class of worker,nominal,9
2,detailed industry recode,nominal,52
3,detailed occupation recode,nominal,47
4,education,nominal,17


In [49]:
# Positional indices of the 42 csv columns, minus position 24: that column is not
# covered by the attribute names, so it gets no entry in the name map.
idxs = [pos for pos in range(42) if pos != 24]

# The final csv column is the prediction target. Append its name only once so that
# re-running this cell does not keep growing `col_names`.
if not col_names or col_names[-1] != 'target':
    col_names.append('target')

# Map csv column position -> human-readable column name.
col_name_map = dict(zip(idxs, col_names))
col_name_map

{0: 'age',
 1: 'class of worker',
 2: 'detailed industry recode',
 3: 'detailed occupation recode',
 4: 'education',
 5: 'wage per hour',
 6: 'enroll in edu inst last wk',
 7: 'marital stat',
 8: 'major industry code',
 9: 'major occupation code',
 10: 'race',
 11: 'hispanic origin',
 12: 'sex',
 13: 'member of a labor union',
 14: 'reason for unemployment',
 15: 'full or part time employment stat',
 16: 'capital gains',
 17: 'capital losses',
 18: 'dividends from stocks',
 19: 'tax filer stat',
 20: 'region of previous residence',
 21: 'state of previous residence',
 22: 'detailed household and family stat',
 23: 'detailed household summary in household',
 25: 'migration code-change in msa',
 26: 'migration code-change in reg',
 27: 'migration code-move within reg',
 28: 'live in this house 1 year ago',
 29: 'migration prev res in sunbelt',
 30: 'num persons worked for employer',
 31: 'family members under 18',
 32: 'country of birth father',
 33: 'country of birth mother',
 34: 'coun

In [50]:
# Load the headerless training csv; columns are initially labelled by integer position.
census_df = pd.read_csv(DATA_DIR / 'census_income_learn.csv', header=None)
# Column 24 is split off into its own Series and removed from the frame.
instance_weights = census_df.pop(24)
# Swap the remaining positional labels for the readable names in `col_name_map`.
census_df = census_df.rename(columns=col_name_map)
census_df

Unnamed: 0,age,class of worker,detailed industry recode,detailed occupation recode,education,wage per hour,enroll in edu inst last wk,marital stat,major industry code,major occupation code,race,hispanic origin,sex,member of a labor union,reason for unemployment,full or part time employment stat,capital gains,capital losses,dividends from stocks,tax filer stat,region of previous residence,state of previous residence,detailed household and family stat,detailed household summary in household,migration code-change in msa,migration code-change in reg,migration code-move within reg,live in this house 1 year ago,migration prev res in sunbelt,num persons worked for employer,family members under 18,country of birth father,country of birth mother,country of birth self,citizenship,own business or self employed,fill inc questionnaire for veteran's admin,veterans benefits,weeks worked in year,year,target
0,73,Not in universe,0,0,High school graduate,0,Not in universe,Widowed,Not in universe or children,Not in universe,White,All other,Female,Not in universe,Not in universe,Not in labor force,0,0,0,Nonfiler,Not in universe,Not in universe,Other Rel 18+ ever marr not in subfamily,Other relative of householder,?,?,?,Not in universe under 1 year old,?,0,Not in universe,United-States,United-States,United-States,Native- Born in the United States,0,Not in universe,2,0,95,- 50000.
1,58,Self-employed-not incorporated,4,34,Some college but no degree,0,Not in universe,Divorced,Construction,Precision production craft & repair,White,All other,Male,Not in universe,Not in universe,Children or Armed Forces,0,0,0,Head of household,South,Arkansas,Householder,Householder,MSA to MSA,Same county,Same county,No,Yes,1,Not in universe,United-States,United-States,United-States,Native- Born in the United States,0,Not in universe,2,52,94,- 50000.
2,18,Not in universe,0,0,10th grade,0,High school,Never married,Not in universe or children,Not in universe,Asian or Pacific Islander,All other,Female,Not in universe,Not in universe,Not in labor force,0,0,0,Nonfiler,Not in universe,Not in universe,Child 18+ never marr Not in a subfamily,Child 18 or older,?,?,?,Not in universe under 1 year old,?,0,Not in universe,Vietnam,Vietnam,Vietnam,Foreign born- Not a citizen of U S,0,Not in universe,2,0,95,- 50000.
3,9,Not in universe,0,0,Children,0,Not in universe,Never married,Not in universe or children,Not in universe,White,All other,Female,Not in universe,Not in universe,Children or Armed Forces,0,0,0,Nonfiler,Not in universe,Not in universe,Child <18 never marr not in subfamily,Child under 18 never married,Nonmover,Nonmover,Nonmover,Yes,Not in universe,0,Both parents present,United-States,United-States,United-States,Native- Born in the United States,0,Not in universe,0,0,94,- 50000.
4,10,Not in universe,0,0,Children,0,Not in universe,Never married,Not in universe or children,Not in universe,White,All other,Female,Not in universe,Not in universe,Children or Armed Forces,0,0,0,Nonfiler,Not in universe,Not in universe,Child <18 never marr not in subfamily,Child under 18 never married,Nonmover,Nonmover,Nonmover,Yes,Not in universe,0,Both parents present,United-States,United-States,United-States,Native- Born in the United States,0,Not in universe,0,0,94,- 50000.
5,48,Private,40,10,Some college but no degree,1200,Not in universe,Married-civilian spouse present,Entertainment,Professional specialty,Amer Indian Aleut or Eskimo,All other,Female,No,Not in universe,Full-time schedules,0,0,0,Joint both under 65,Not in universe,Not in universe,Spouse of householder,Spouse of householder,?,?,?,Not in universe under 1 year old,?,1,Not in universe,Philippines,United-States,United-States,Native- Born in the United States,2,Not in universe,2,52,95,- 50000.
6,42,Private,34,3,Bachelors degree(BA AB BS),0,Not in universe,Married-civilian spouse present,Finance insurance and real estate,Executive admin and managerial,White,All other,Male,Not in universe,Not in universe,Children or Armed Forces,5178,0,0,Joint both under 65,Not in universe,Not in universe,Householder,Householder,Nonmover,Nonmover,Nonmover,Yes,Not in universe,6,Not in universe,United-States,United-States,United-States,Native- Born in the United States,0,Not in universe,2,52,94,- 50000.
7,28,Private,4,40,High school graduate,0,Not in universe,Never married,Construction,Handlers equip cleaners etc,White,All other,Female,Not in universe,Job loser - on layoff,Unemployed full-time,0,0,0,Single,Not in universe,Not in universe,Secondary individual,Nonrelative of householder,?,?,?,Not in universe under 1 year old,?,4,Not in universe,United-States,United-States,United-States,Native- Born in the United States,0,Not in universe,2,30,95,- 50000.
8,47,Local government,43,26,Some college but no degree,876,Not in universe,Married-civilian spouse present,Education,Adm support including clerical,White,All other,Female,No,Not in universe,Full-time schedules,0,0,0,Joint both under 65,Not in universe,Not in universe,Spouse of householder,Spouse of householder,?,?,?,Not in universe under 1 year old,?,5,Not in universe,United-States,United-States,United-States,Native- Born in the United States,0,Not in universe,2,52,95,- 50000.
9,34,Private,4,37,Some college but no degree,0,Not in universe,Married-civilian spouse present,Construction,Machine operators assmblrs & inspctrs,White,All other,Male,Not in universe,Not in universe,Children or Armed Forces,0,0,0,Joint both under 65,Not in universe,Not in universe,Householder,Householder,Nonmover,Nonmover,Nonmover,Yes,Not in universe,6,Not in universe,United-States,United-States,United-States,Native- Born in the United States,0,Not in universe,2,52,94,- 50000.


In [41]:
# Interactive help lookup for pd.read_csv (IPython `?` syntax).
# NOTE(review): looks like an exploratory leftover -- consider removing before sharing.
?pd.read_csv

In [8]:
# Read the training csv as one raw text string (no parsing) for manual inspection.
census_raw = (DATA_DIR / 'census_income_learn.csv').read_text()

In [12]:
# Break the raw csv text into a list with one string per line.
# Guarded so the cell can be re-run safely: once `census_raw` is already a list,
# calling `.split` on it again would raise AttributeError.
if isinstance(census_raw, str):
    census_raw = census_raw.split('\n')

In [16]:
# Number of comma-separated fields in the first raw line (one more than its comma count).
census_raw[0].count(',') + 1

42