In [98]:
import os
import pickle
from typing import List, Optional

import pandas as pd

class DataPrep:
    """Prepare a census-income CSV for modelling.

    The CSV has no header row, so the column names are parsed out of the
    accompanying metadata text file. The intended call order is::

        create_dataframe -> get_col_names -> set_col_names
        -> feature_engineering -> pickle_data

    ``apply_all`` runs that whole sequence in one call.

    Attributes:
        data_path: Path of the CSV to load.
        metadata_path: Path of the metadata text file.
        df: The working DataFrame; exists only after ``create_dataframe``.
        metadata: Full text of the metadata file; loaded by ``__init__``.
        col_names: Parsed column labels; exists only after ``get_col_names``.
    """

    df: pd.DataFrame
    metadata: str
    col_names: List[str]

    # Text in the metadata file after which the attribute listing starts.
    _ATTRIBUTES_MARKER = '- 50000, 50000+.'
    # Entry of the attribute listing that is not used as a column label.
    _IGNORED_ENTRY = '| instance weight'
    # Raw label text (the leading space is part of the CSV value) -> class id.
    _TARGET_ENCODING = {' - 50000.': -1, ' 50000+.': 1}

    def __init__(self, data_path: str,
                 metadata_path: str = './census_data/census_income_metadata.txt'):
        """Remember the input paths and load the metadata text.

        Args:
            data_path: Path of the headerless CSV to prepare.
            metadata_path: Path of the metadata text file describing the
                columns. Defaults to the location that was previously
                hard-coded.

        Raises:
            OSError: If the metadata file cannot be opened.
        """
        self.data_path = data_path
        self.metadata_path = metadata_path
        self.inhale_metadata()

    def create_dataframe(self, print_=False):
        """Read the CSV at ``self.data_path`` into ``self.df``.

        The file is read with ``header=None``, so the columns are labelled
        0..n-1 until ``set_col_names`` is called.

        Args:
            print_: When true, print the first three rows.
        """
        self.df = pd.read_csv(self.data_path, header=None)
        if print_:
            print(self.df.head(3))

    def inhale_metadata(self):
        """Read the whole metadata file into ``self.metadata``."""
        with open(self.metadata_path, 'r') as f:
            self.metadata = f.read()

    def get_col_names(self, print_=False):
        """Parse the column labels out of the metadata text into ``self.col_names``.

        Everything after the ``'- 50000, 50000+.'`` marker is treated as one
        ``name: description`` entry per line, and the text before the first
        ``':'`` is the label. The ``'| instance weight'`` entry is discarded
        and ``'target'`` is appended as the label of the last column.

        Args:
            print_: When true, print the resulting list.

        Raises:
            IndexError: If the marker is missing from the metadata.
            ValueError: If the ``'| instance weight'`` entry is missing.
        """
        attribute_section = self.metadata.split(self._ATTRIBUTES_MARKER)[1].strip()
        names = [entry.split(':')[0] for entry in attribute_section.split('\n')]
        names.remove(self._IGNORED_ENTRY)
        names.append('target')
        self.col_names = names
        if print_:
            print(self.col_names)

    def set_col_names(self, print_=False):
        """Label the columns of ``self.df`` with ``self.col_names``.

        Args:
            print_: When true, print the first five rows.

        Raises:
            ValueError: If the number of labels differs from the number of
                columns (raised by pandas).
        """
        self.df.columns = self.col_names
        if print_:
            print(self.df.head())

    def feature_engineering(self):
        """Drop, combine and encode columns to obtain the modelling frame.

        Steps:
            1. Drop ``'instance weight'`` (the metadata indicated this
               feature should be dropped).
            2. Replace ``'capital gains'`` and ``'capital losses'`` with their
               difference ``'capital_change'``. This relies on the original
               author's assumption that one individual cannot have both a
               gain and a loss.
            3. Replace the target label strings with the integers -1 / 1.

        The result is built on a new frame and ``self.df`` is rebound only
        after every step succeeded. Call this once per loaded frame: a second
        call raises ``KeyError`` because the source columns are gone.
        """
        frame = self.df.drop(columns=['instance weight'])

        frame = frame.assign(
            capital_change=frame['capital gains'] - frame['capital losses']
        ).drop(columns=['capital gains', 'capital losses'])

        # Assign the encoded column back explicitly. Calling
        # replace(inplace=True) on a column selection only changes a temporary
        # object under pandas Copy-on-Write and would leave the frame as-is.
        # infer_objects() gives an integer dtype when every label was mapped,
        # on pandas versions where replace() no longer downcasts by itself.
        encoded_target = frame['target'].replace(self._TARGET_ENCODING).infer_objects()
        frame = frame.assign(target=encoded_target)

        self.df = frame

    def get_val_counts(self, cols: Optional[List[str]] = None):
        """Print the value counts of the given columns.

        Args:
            cols: Column labels to report; every column of ``self.df`` when
                omitted.
        """
        if cols is None:
            cols = self.df.columns
        for col in cols:
            print(f'{col}: \n{self.df[col].value_counts()}\n')

    def apply_all(self, filename: str):
        """Run the full preparation pipeline and pickle the result.

        Loads the CSV, names the columns, runs ``feature_engineering`` and
        writes the frame with ``pickle_data``, so the pickled frame has the
        same schema as one prepared step by step.

        Args:
            filename: Base name (without extension) of the pickle file.
        """
        self.create_dataframe()
        # `metadata` is only an annotation on the class, so it may not exist
        # as an attribute at all; getattr keeps this guard from raising.
        if getattr(self, 'metadata', None) is None:
            self.inhale_metadata()
        self.get_col_names()
        self.set_col_names()
        self.feature_engineering()

        self.pickle_data(filename)

    def pickle_data(self, filename: str, directory: str = './pickles'):
        """Pickle ``self.df`` to ``<directory>/<filename>.obj``.

        Args:
            filename: Base name (without extension) of the pickle file.
            directory: Folder to write into; created if it does not exist.
                Defaults to the location that was previously hard-coded.
        """
        target_path = os.path.join(directory, f'{filename}.obj')
        parent = os.path.dirname(target_path)
        if parent:
            os.makedirs(parent, exist_ok=True)
        with open(target_path, 'wb') as f:
            pickle.dump(self.df, f)

In [99]:
# Build the prep object for the training CSV; the constructor also reads the metadata file.
train_data = DataPrep('./census_data/census_income_learn.csv')

In [100]:
# Load the raw CSV (read without a header row) and preview the first three rows.
train_data.create_dataframe(print_=True)

   0                                1   2   3                            4   \
0  73                  Not in universe   0   0         High school graduate   
1  58   Self-employed-not incorporated   4  34   Some college but no degree   
2  18                  Not in universe   0   0                   10th grade   

   5                 6               7                             8   \
0   0   Not in universe         Widowed   Not in universe or children   
1   0   Not in universe        Divorced                  Construction   
2   0       High school   Never married   Not in universe or children   

                                     9   ...              32              33  \
0                       Not in universe  ...   United-States   United-States   
1   Precision production craft & repair  ...   United-States   United-States   
2                       Not in universe  ...         Vietnam         Vietnam   

               34                                    35 36           

In [101]:
# The constructor already loaded the metadata, so this call just re-reads the same file.
train_data.inhale_metadata()
# Parse the column names out of the metadata text and print them.
train_data.get_col_names(print_=True)

['age', 'class of worker', 'detailed industry recode', 'detailed occupation recode', 'education', 'wage per hour', 'enroll in edu inst last wk', 'marital stat', 'major industry code', 'major occupation code', 'race', 'hispanic origin', 'sex', 'member of a labor union', 'reason for unemployment', 'full or part time employment stat', 'capital gains', 'capital losses', 'dividends from stocks', 'tax filer stat', 'region of previous residence', 'state of previous residence', 'detailed household and family stat', 'detailed household summary in household', 'instance weight', 'migration code-change in msa', 'migration code-change in reg', 'migration code-move within reg', 'live in this house 1 year ago', 'migration prev res in sunbelt', 'num persons worked for employer', 'family members under 18', 'country of birth father', 'country of birth mother', 'country of birth self', 'citizenship', 'own business or self employed', "fill inc questionnaire for veteran's admin", 'veterans benefits', 'week

In [102]:
# Apply the parsed names as the DataFrame's column labels and show the first rows.
train_data.set_col_names(print_=True)

   age                  class of worker  detailed industry recode  \
0   73                  Not in universe                         0   
1   58   Self-employed-not incorporated                         4   
2   18                  Not in universe                         0   
3    9                  Not in universe                         0   
4   10                  Not in universe                         0   

   detailed occupation recode                    education  wage per hour  \
0                           0         High school graduate              0   
1                          34   Some college but no degree              0   
2                           0                   10th grade              0   
3                           0                     Children              0   
4                           0                     Children              0   

  enroll in edu inst last wk    marital stat           major industry code  \
0            Not in universe         Widowed

In [103]:
# Drop 'instance weight', fold capital gains/losses into 'capital_change', and encode the target as -1/1.
# Run once per loaded frame: a second run raises KeyError because 'instance weight' is already gone.
train_data.feature_engineering()

In [104]:
# Check column dtypes and non-null counts after feature engineering.
train_data.df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 199523 entries, 0 to 199522
Data columns (total 40 columns):
 #   Column                                      Non-Null Count   Dtype 
---  ------                                      --------------   ----- 
 0   age                                         199523 non-null  int64 
 1   class of worker                             199523 non-null  object
 2   detailed industry recode                    199523 non-null  int64 
 3   detailed occupation recode                  199523 non-null  int64 
 4   education                                   199523 non-null  object
 5   wage per hour                               199523 non-null  int64 
 6   enroll in edu inst last wk                  199523 non-null  object
 7   marital stat                                199523 non-null  object
 8   major industry code                         199523 non-null  object
 9   major occupation code                       199523 non-null  object
 10  race    

In [105]:
# No column list is passed, so value counts are printed for every column.
train_data.get_val_counts()

age: 
34    3489
35    3450
36    3353
31    3351
33    3340
      ... 
85     423
86     348
87     301
88     241
89     195
Name: age, Length: 91, dtype: int64

class of worker: 
 Not in universe                   100245
 Private                            72028
 Self-employed-not incorporated      8445
 Local government                    7784
 State government                    4227
 Self-employed-incorporated          3265
 Federal government                  2925
 Never worked                         439
 Without pay                          165
Name: class of worker, dtype: int64

detailed industry recode: 
0     100684
33     17070
43      8283
4       5984
42      4683
45      4482
29      4209
37      4022
41      3964
32      3596
35      3380
39      2937
34      2765
44      2549
2       2196
11      1764
50      1704
40      1651
47      1644
38      1629
24      1503
12      1350
19      1346
30      1181
31      1178
25      1084
9        993
22       952
36       945

In [107]:
# Disabled: uncomment to write the prepared training frame to ./pickles/df_train.obj.
# train_data.pickle_data('df_train')

In [108]:
# Disabled: would build a DataPrep for the test CSV and run apply_all,
# which ends by pickling the frame to ./pickles/df_test.obj.
# test_data = DataPrep('./census_data/census_income_test.csv')
# test_data.apply_all('df_test')