# Cleaning and Testing

In [1]:
import numpy as np
import pandas as pd
import pyarrow.parquet as pq
import pyarrow as pa

In [2]:
# Load the raw CSV into `data`; later cells work on a copy so this frame stays as read.
# NOTE(review): the df.head() preview shows an 'Unnamed: 0' column, which usually means the
# CSV carries a saved index -- consider index_col=0 here; confirm against the file.
data = pd.read_csv('/data/cewalden/soc16.csv')

Made a copy of the dataframe to perform cleaning and formatting on.

Dropped the CLOS column, as it is not applicable for years after 2007.

In [3]:
# Clean a copy so the frame held in `data` is left as loaded; CLOS is removed up front.
df = data.copy().drop(columns=['CLOS'])
df.head()

Unnamed: 0,ACS,AGER,ASSOC,BASE,CAT,CON,DECK,DET,DIV,FINC,...,FFNSQ_F,SLPR_F,FSLPR_F,CONPR_F,FCONPR_F,LOTV_F,SQFS_F,FSQFS_F,PVALU_F,AUTH
0,2,2,1,1,1,2,1,2,1,0,...,0,0,0,0,0,0,0,0,0,201510
1,1,2,1,1,1,1,1,2,1,1,...,0,0,0,0,0,0,0,0,0,201605
2,1,2,2,1,1,2,1,1,1,0,...,0,0,0,0,0,0,0,0,0,201506
3,1,1,1,1,1,1,1,1,1,1,...,0,0,0,0,0,0,0,0,0,201609
4,1,2,1,1,1,1,1,1,1,1,...,0,0,0,0,0,0,0,0,0,201511


In [4]:
# Number of rows in the working frame.
print(df.shape[0])

22661


## Cleaning

I decided to add columns of pandas datetime objects alongside the provided "YYYYMM" integer columns, so both representations are available.

In [4]:
def toDate(x):
    """Convert a column of YYYYMM integers into a column of pandas dates.

    Values of 0 cannot be parsed as a date, so they are mapped to the
    placeholder month 201001 (which parses to 2010-01-01) before conversion.

    Parameters
    ----------
    x: pd.Series
        A column of integers that are either 0 or in a YYYYMM format

    Returns
    -------
    dates: pd.Series
        A column of pandas date-time objects"""
    # Work on the string form; only cells that are exactly '0' are swapped
    # for the placeholder, everything else is parsed as year + month.
    asText = x.astype(str).replace({'0': '201001'})
    return pd.to_datetime(asText, format="%Y%m")

def reformatDf(df):
    """Add datetime versions of the integer date columns to a dataframe.

    For each of COMP, SALE, STRT and AUTH a new column named ``<name>_D`` is
    written with the result of ``toDate``. The original integer columns are
    kept, and the dataframe passed in is modified in place.

    Parameters
    ----------
    df: pd.DataFrame
        A dataframe with certain columns formatted to be integer representations of dates

    Local Variables
    ---------------
    colsDate: tuple
        The names of the integer columns that get a datetime companion column

    Returns
    -------
    df: pd.DataFrame
        The same dataframe object, now carrying the extra ``*_D`` datetime columns"""
    colsDate = ('COMP', 'SALE', 'STRT', 'AUTH')
    for name in colsDate:
        # Keep the source column and store the converted dates next to it.
        df[name + '_D'] = toDate(df[name])
    return df

In [5]:
# Adds the COMP_D / SALE_D / STRT_D / AUTH_D datetime columns; reformatDf writes them
# onto the frame it is given and returns that same frame.
df = reformatDf(df)

In [6]:
# 0 is the default missing value, so NaN becomes 0 before the cast to int
for name in ('FOYER', 'FRAME', 'MFGS'):
    df[name] = df[name].replace(np.nan, 0).astype(int)

## Testing

In [7]:
def checkIfEqualToSet(dataCol, setList):
    """Asserts that two unordered lists of unique values have the same contents

    The check fails with an AssertionError (never an IndexError) when
    ``dataCol`` holds a value that is not in ``setList``, when a value of
    ``setList`` never occurs in ``dataCol``, or when the two differ in length
    (for example because of duplicates).

    Parameters
    ----------
    dataCol: np.ndarray
        A unique array of numbers to compare to setList
    setList: np.ndarray
        A unique array of numbers to compare dataCol against

    Raises
    ------
    AssertionError
        If the two collections do not hold exactly the same values"""
    observed = list(dataCol)
    expected = list(setList)
    # Membership is tested with ``in`` on plain lists so numpy scalars compare
    # by value and no hashing of the elements is required.
    unexpected = [value for value in observed if value not in expected]
    missing = [value for value in expected if value not in observed]
    assert not unexpected, f"unexpected values {unexpected}; allowed values are {expected}"
    assert not missing, f"expected values {missing} are absent from {observed}"
    assert len(expected) == len(observed), (
        f"expected {len(expected)} unique values but got {len(observed)}: {observed}"
    )

In [8]:
# Every remaining NaN, in any column, is replaced by the missing-value code 0.
for name in df.columns:
    df[name] = df[name].replace(to_replace=np.nan, value=0)

#### Assertion for categorical columns

In [9]:
# Columns that share one set of allowed codes are grouped under a single list.
flags = ['AREA_F', 'FNSQ_F', 'SLPR_F', 'FSLPR_F',
         'CONPR_F', 'FCONPR_F', 'LOTV_F',
         'SQFS_F', 'FSQFS_F', 'PVALU_F']
zeroToTwo = ['ACS', 'AGER', 'ASSOC', 'CON',
             'DECK', 'DET', 'FNBS', 'FOYER',
             'PATI', 'PRCH', 'WALS']
zeroToFour = ['BASE', 'FRAME', 'GAR', 'HEAT', 'HEAT2']
zeroToThree = ['MFGS', 'SEWER', 'STOR', 'WATER']

# (columns, allowed codes) pairs, checked in this order.
groupedChecks = [
    (flags, [0,1]),
    (zeroToTwo, [0,1,2]),
    (zeroToFour, [0,1,2,3,4]),
    (zeroToThree, [0,1,2,3]),
]
for columns, allowed in groupedChecks:
    for i in columns:
        checkIfEqualToSet(df[i].unique(), allowed)

# Columns whose set of allowed codes is not shared with any other column.
singleChecks = {
    'CAT': [1,2,3,4],
    'DIV': [1,2,3,4,5,6,7,8,9],
    'FINC': [0,1,2,3,4,5],
    'LNDR': [0,1,2,3,4,5,6],
    'METRO': [1,2],
    'WAL1': [0,1,2,4,5,6,7,8,9],
    'WAL2': [0,1,2,4,5,6,7,8,9],
    'BEDR': [0,2,3,4,5],
    'FPLS': [0,1,2,9],
    'FULB': [1,2,3,4,9],
    'HAFB': [0,1,2,9],
    'FUEL': [0,1,2,3,4,5],
    'FUEL2': [0,1,2,3,4,5],
    'FFNSQ_F': [0],
}
for i, allowed in singleChecks.items():
    checkIfEqualToSet(df[i].unique(), allowed)

Zero was the default missing value for most columns.

Some column oddness included:
* SEWER: The description lists the values 1 and 2, with 3 meaning "other". However, over 300 rows had 0 as the value. I assumed 0 was again being used as the missing value and left those rows unchanged
* WAL1 and WAL2: Both include "Aluminum siding (not covered with vinyl)" as a siding option; however, no house in the data actually used it
* FFNSQ_F: The flag was 0 for every row, i.e. it never indicated that the final square foot area of finished basement at time of completion had been changed

Quantitative columns:
* AREA
* FNSQ
* FFNSQ
* LOTV
* PVALU
* FSQFS
* CONPR
* SLPR
* SQFS
* WEIGHT
* FCONPR
* FSLPR

Columns containing dates:
* COMP and COMP_D
* SALE and SALE_D
* STRT and STRT_D
* AUTH and AUTH_D

In [10]:
# Persist the cleaned frame as Parquet via an intermediate Arrow table.
filename = '/data/cewalden/soc16.parquet'
table = pa.Table.from_pandas(df=df)
pq.write_table(table, where=filename)