In [11]:
"""
Title:       Build a preprocessing pipeline that helps users preprocess training
             and test data from the corresponding CSV input files.

Description: Fill in missing values, discretize continuous variables, generate
             new features, deal with categorical variables with multiple levels,
             scale data, and save preprocessed data.

Author:      Kunyu He, CAPP'20, The University of Chicago

"""

import warnings

warnings.filterwarnings("ignore")

import argparse
import logging
import os
import re
import time

import numpy as np
import pandas as pd
from sklearn.externals import joblib
from sklearn.preprocessing import OrdinalEncoder, StandardScaler, MinMaxScaler


#----------------------------------------------------------------------------#
# Directories. INPUT_DIR is prepended to file names when reading data; an
# empty string means paths are resolved against the working directory.
# NOTE(review): OUTPUT_DIR is not used in the visible cells — presumably the
# destination for preprocessed data; confirm against the rest of the notebook.
INPUT_DIR = ""
OUTPUT_DIR = ""
LOG_DIR = "../logs/featureEngineering/"

# File names, relative to the directories above.
TRAIN_FILE = "train.csv"
TEST_FILE = "test.csv"
TRAIN_FEATURES_FILE = 'train_features.txt'
TEST_FEATURES_FILE = 'test_features.txt'

# logging
logger = logging.getLogger('featureEngineering')
logger.setLevel(logging.INFO)

# logging.getLogger returns the same object for the same name, so re-running
# this cell would otherwise stack another pair of handlers on it and emit
# every message several times. Detach and close whatever is already attached.
for stale_handler in list(logger.handlers):
    logger.removeHandler(stale_handler)
    stale_handler.close()

# FileHandler opens its file immediately and fails if the directory is missing.
os.makedirs(LOG_DIR, exist_ok=True)

ch = logging.StreamHandler()
logger.addHandler(ch)
# One log file per run of this cell, named by the current timestamp.
fh = logging.FileHandler(LOG_DIR + time.strftime("%Y%m%d-%H%M%S") + '.log')
logger.addHandler(fh)

pd.set_option('mode.chained_assignment', None)
# Re-applied after the library imports; presumably because some of them
# install their own warning filters — TODO confirm it is still needed.
warnings.filterwarnings("ignore")

In [14]:
def read_data(file_name, drop_na=False, input_dir=None):
    """
    Read a .csv data file into a DataFrame.

    Inputs:
        - file_name (string): name of the data file, resolved relative to
            `input_dir`.
        - drop_na (bool): whether to drop rows with any missing values
        - input_dir (string): directory that holds the file; when None, the
            module-level INPUT_DIR is used.

    Returns:
        (DataFrame) contents of the file, without the rows that contain
            missing values if `drop_na` is set

    """
    if input_dir is None:
        input_dir = INPUT_DIR

    # os.path.join adds the separator when the directory lacks a trailing
    # one and returns the bare file name when the directory is "".
    data = pd.read_csv(os.path.join(input_dir, file_name))

    if drop_na:
        data = data.dropna(axis=0)

    return data

In [15]:
# Load the test split; drop_na is left at its default, so rows with missing
# values are kept for inspection below.
test = read_data(TEST_FILE)

In [16]:
# List the column names available in the test split.
test.columns

Index(['elevation', 'DIS_LAKE', 'DIS_MAJOR_RIVER', 'DIS_OCEAN', 'DIS_RIVER',
       'PRECAVNEW80_08', 'TEMPAV_8008', 'ethin_div', 'HighRelig', 'ChrCatP',
       'ReligCatP', 'year', 'loc_id', 'MER_40', 'POPGPW_40', 'attacked',
       'nkill_past_5', 'nwound_past_5', 'attacktype_past_5',
       'targettype_past_5', 'group_name_past_5'],
      dtype='object')

In [19]:
# Per-column count of missing entries in the test split.
test.isna().sum()

elevation             0
DIS_LAKE              7
DIS_MAJOR_RIVER       7
DIS_OCEAN             7
DIS_RIVER             7
PRECAVNEW80_08       18
TEMPAV_8008          18
ethin_div             0
HighRelig             0
ChrCatP               0
ReligCatP             0
year                  0
loc_id                0
MER_40               76
POPGPW_40             0
attacked              0
nkill_past_5          0
nwound_past_5         0
attacktype_past_5     0
targettype_past_5     0
group_name_past_5     0
dtype: int64

In [23]:
# Frequency of each ReligCatP category, most common first.
test["ReligCatP"].value_counts()

95-100%    368
60-75%     146
75-85%     122
90-95%     100
85-90%      77
40-60%      70
10-40%       6
Name: ReligCatP, dtype: int64