In [5]:
import pandas as pd
from typing import List

class DataPrep:
    """Load a headerless CSV and label its columns from a metadata text file.

    Intended call order (as used in this notebook):
    ``create_dataframe()`` -> ``get_col_names()`` -> ``set_col_names()``.
    The metadata text itself is read once by ``__init__``.

    Attributes:
        df: Frame produced by ``create_dataframe``.
        metadata: Raw text of the metadata file, set by ``inhale_metadata``.
        col_names: Column labels produced by ``get_col_names``.
    """

    # Default metadata location; kept identical to the previously hard-coded path
    # so existing ``DataPrep(data_path)`` calls behave exactly as before.
    DEFAULT_METADATA_PATH = './census_data/census_income_metadata.txt'

    df: pd.DataFrame
    metadata: str
    col_names: List[str]

    def __init__(self, data_path: str, metadata_path: str = DEFAULT_METADATA_PATH):
        """Remember both paths and read the metadata text immediately.

        Args:
            data_path: Path of the CSV file read by ``create_dataframe``.
            metadata_path: Path of the metadata text file read by
                ``inhale_metadata``. Defaults to ``DEFAULT_METADATA_PATH``.

        Raises:
            OSError: If the metadata file cannot be opened.
        """
        self.data_path = data_path
        self.metadata_path = metadata_path
        self.inhale_metadata()

    def create_dataframe(self, print_=False):
        """Read ``data_path`` into ``self.df`` without treating any row as a header.

        Args:
            print_: When true, print the first three rows of the frame.
        """
        # header=None: the file has no header row, so columns get integer labels.
        self.df = pd.read_csv(self.data_path, header=None)
        if print_:
            print(self.df.head(3))

    def inhale_metadata(self):
        """Read the whole metadata file at ``metadata_path`` into ``self.metadata``."""
        with open(self.metadata_path, 'r') as f:
            self.metadata = f.read()

    def get_col_names(self, print_=False):
        """Derive ``self.col_names`` from the attribute section of the metadata.

        Everything after the first ``'- 50000, 50000+.'`` marker is split into
        lines, and the text before the first ``':'`` on each line is taken as a
        column name. The ``'| instance weight'`` entry is dropped and
        ``'target'`` is appended as the final column name.

        Args:
            print_: When true, print the resulting list of names.

        Raises:
            IndexError: If the marker is absent from the metadata text.
            ValueError: If no ``'| instance weight'`` entry is present.
        """
        attr_section = self.metadata.split('- 50000, 50000+.')[1].strip()
        names = [line.split(':')[0] for line in attr_section.split('\n')]
        # Presumably a commented-out duplicate: the plain 'instance weight'
        # entry stays in the list (see the printed names in this notebook).
        names.remove('| instance weight')
        names.append('target')
        # Assign only once the list is complete, so a failure above does not
        # leave a half-processed list on the instance.
        self.col_names = names
        if print_:
            print(self.col_names)

    def set_col_names(self, print_=False):
        """Apply ``self.col_names`` as the column labels of ``self.df``.

        Requires ``create_dataframe`` and ``get_col_names`` to have run first.

        Args:
            print_: When true, print the first rows of the relabelled frame.
        """
        self.df.columns = self.col_names
        if print_:
            print(self.df.head())


In [6]:
# Create the prep helper for the training CSV; the constructor also reads the metadata text.
data_prep = DataPrep('./census_data/census_income_learn.csv')

In [7]:
# Read the CSV with header=None (columns are labelled 0, 1, 2, ... for now) and print the first 3 rows.
data_prep.create_dataframe(print_=True)

   0                                1   2   3                            4   \
0  73                  Not in universe   0   0         High school graduate   
1  58   Self-employed-not incorporated   4  34   Some college but no degree   
2  18                  Not in universe   0   0                   10th grade   

   5                 6               7                             8   \
0   0   Not in universe         Widowed   Not in universe or children   
1   0   Not in universe        Divorced                  Construction   
2   0       High school   Never married   Not in universe or children   

                                     9   ...              32              33  \
0                       Not in universe  ...   United-States   United-States   
1   Precision production craft & repair  ...   United-States   United-States   
2                       Not in universe  ...         Vietnam         Vietnam   

               34                                    35 36           

In [8]:
# NOTE(review): DataPrep.__init__ already calls inhale_metadata(), so this re-read looks redundant — confirm before removing.
data_prep.inhale_metadata()
# Parse the column names out of the metadata text and print the resulting list.
data_prep.get_col_names(print_=True)

['age', 'class of worker', 'detailed industry recode', 'detailed occupation recode', 'education', 'wage per hour', 'enroll in edu inst last wk', 'marital stat', 'major industry code', 'major occupation code', 'race', 'hispanic origin', 'sex', 'member of a labor union', 'reason for unemployment', 'full or part time employment stat', 'capital gains', 'capital losses', 'dividends from stocks', 'tax filer stat', 'region of previous residence', 'state of previous residence', 'detailed household and family stat', 'detailed household summary in household', 'instance weight', 'migration code-change in msa', 'migration code-change in reg', 'migration code-move within reg', 'live in this house 1 year ago', 'migration prev res in sunbelt', 'num persons worked for employer', 'family members under 18', 'country of birth father', 'country of birth mother', 'country of birth self', 'citizenship', 'own business or self employed', "fill inc questionnaire for veteran's admin", 'veterans benefits', 'week

In [9]:
# Replace the integer column labels with the parsed names and print the first rows of the frame.
data_prep.set_col_names(print_=True)

   age                  class of worker  detailed industry recode  \
0   73                  Not in universe                         0   
1   58   Self-employed-not incorporated                         4   
2   18                  Not in universe                         0   
3    9                  Not in universe                         0   
4   10                  Not in universe                         0   

   detailed occupation recode                    education  wage per hour  \
0                           0         High school graduate              0   
1                          34   Some college but no degree              0   
2                           0                   10th grade              0   
3                           0                     Children              0   
4                           0                     Children              0   

  enroll in edu inst last wk    marital stat           major industry code  \
0            Not in universe         Widowed