In [2]:
import json
# Location of the JSON file that holds the preprocessing configuration.
_preprocessing_file_PATH = 'preprocessing.json'

# Pin the encoding: without it open() falls back to the platform's locale
# encoding, so the same JSON file could parse differently across machines.
with open(_preprocessing_file_PATH, encoding='utf-8') as f:
    preprocessing_param = json.load(f)

print(preprocessing_param)

{'_comment': {'_col_to_load': 'columns important to the target', '_train_val_eval': 'train val and evaluation split ratio', '_fill_strategy': 'missing value imputation strategy', '_drop_strategy': 'dropping strategy, 1 indicate axis 1(column)', '_col_to_dummied': 'feature that will be dummied, leave empty to dummy all feature', '_processed_data_PATH': 'path to dump the prosessed data'}, 'col_to_load': [], 'train_val_eval': [], 'fill_strategy': {'Age': 'Median', 'Fare': 'Median', 'Embarked': 'Mode'}, 'drop_strategy': {'Age': 1, 'Name': 1, 'Fare': 1}, 'col_to_dummied': [], 'processed_data_PATH': ''}


In [20]:
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
from io import StringIO
import sys
import json
from sklearn.preprocessing import LabelEncoder

class Preprocess():
    """
    DataFrame preprocessing helpers: missing-value imputation, dropping
    labels, label encoding and one-hot ("dummy") encoding.

    Every method returns a dataframe and leaves the frame it was given
    unmodified, so a notebook cell can be re-run without corrupting the
    variables produced by earlier cells.
    """

    def __init__(self):
        print("Preprocess object created")

    def fillna(self, data, fill_strategies):
        """
        Fill missing values for each column using 'None', 'Zero', or feature 'Mode', 'Mean', or 'Median'
        :param data: dataframe whose missing values will be filled
        :param fill_strategies: mapping of column name -> strategy name
            ('None' fills with the string 'None', 'Zero' with 0, the others
            with the column's mode / mean / median)
        :return: a new dataframe with the requested columns filled
        """
        # Work on a copy so the caller's frame is not modified in place.
        data = data.copy()

        for column, strategy in fill_strategies.items():
            if strategy == 'None':
                data[column] = data[column].fillna('None')
            elif strategy == 'Zero':
                data[column] = data[column].fillna(0)
            elif strategy == 'Mode':
                modes = data[column].mode()
                # mode() is empty when the column has no non-missing value;
                # there is nothing to fill with, so leave the column as is.
                if not modes.empty:
                    data[column] = data[column].fillna(modes.iloc[0])
            elif strategy == 'Mean':
                data[column] = data[column].fillna(data[column].mean())
            elif strategy == 'Median':
                data[column] = data[column].fillna(data[column].median())
            else:
                print("{}: There is no such thing as preprocess strategy".format(strategy))

        return data

    def drop(self, data, drop_strategies):
        """
        Dropping the non informative features
        :param data: dataframe to drop labels from
        :param drop_strategies: mapping of label -> axis; the axis value is
            handed to DataFrame.drop (1 drops the column with that name)
        :return: the processed dataframe
        """
        for label, axis in drop_strategies.items():
            # DataFrame.drop returns a new frame; the input is not modified.
            data = data.drop(labels=[label], axis=axis)

        return data


    def _label_encoder(self,data):
        """
        Encoding the categorical variable
        :param data: dataframe whose non-numeric columns will be encoded
        :return: a new dataframe in which every non-numeric column has been
            replaced by its LabelEncoder codes
        """
        # Work on a copy so the caller's frame is not modified in place.
        data = data.copy()

        # Walk the dtypes in column order so the encoding (and the printed
        # names) come out in a deterministic order on every run.
        cat_cols = [
            column
            for column, dtype in data.dtypes.items()
            if not pd.api.types.is_numeric_dtype(dtype)
        ]

        labelEncoder = LabelEncoder()
        for column in cat_cols:
            print(column)
            data[column] = labelEncoder.fit_transform(data[column])
        return data

    def _get_dummies(self, data, prefered_columns=None):
        """
        One-hot encode columns with pandas.get_dummies
        :param data: dataframe to encode
        :param prefered_columns: columns to encode; None encodes every column
        :return: a new dataframe holding the dummy columns (prefixed with the
            source column name) followed by the columns that were not encoded
        """
        if prefered_columns is None:
            columns = data.columns.values
            non_dummies = []
        else:
            columns = prefered_columns
            non_dummies = [col for col in data.columns.values if col not in prefered_columns]

        pieces = [pd.get_dummies(data[col], prefix=col) for col in columns]
        pieces.extend(data[col] for col in non_dummies)

        if not pieces:
            # pd.concat raises on an empty list (frame without any column).
            return data.copy()

        return pd.concat(pieces, axis=1)

In [36]:
# Load the Titanic CSV into a DataFrame and render it in the cell output.
# NOTE(review): the path is absolute and machine-specific.
data_path = '/home/akash/Desktop/AmEx/data/titanic.csv'
dataset = pd.read_csv(data_path)
display(dataset)

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.2500,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.9250,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1000,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.0500,,S
...,...,...,...,...,...,...,...,...,...,...,...,...
886,887,0,2,"Montvila, Rev. Juozas",male,27.0,0,0,211536,13.0000,,S
887,888,1,1,"Graham, Miss. Margaret Edith",female,19.0,0,0,112053,30.0000,B42,S
888,889,0,3,"Johnston, Miss. Catherine Helen ""Carrie""",female,,1,2,W./C. 6607,23.4500,,S
889,890,1,1,"Behr, Mr. Karl Howell",male,26.0,0,0,111369,30.0000,C148,C


In [37]:
# Drop strategy taken from preprocessing.json: 'Age': 1, 'Name': 1, 'Fare': 1
# ('Cabin' is dropped here as well). Each value is the axis given to DataFrame.drop.
pp = Preprocess()

newdf = pp.drop(dataset, {'Age': 1, 'Name': 1, 'Fare': 1, 'Cabin':1})
# Fill missing 'Embarked' entries with the column's mode.
newdf = pp.fillna(newdf, {'Embarked': 'Mode'})
display(newdf)

Preprocess object created


Unnamed: 0,PassengerId,Survived,Pclass,Sex,SibSp,Parch,Ticket,Embarked
0,1,0,3,male,1,0,A/5 21171,S
1,2,1,1,female,1,0,PC 17599,C
2,3,1,3,female,0,0,STON/O2. 3101282,S
3,4,1,1,female,1,0,113803,S
4,5,0,3,male,0,0,373450,S
...,...,...,...,...,...,...,...,...
886,887,0,2,male,0,0,211536,S
887,888,1,1,female,0,0,112053,S
888,889,0,3,female,1,2,W./C. 6607,S
889,890,1,1,male,0,0,111369,C


In [38]:
# Label-encode the non-numeric columns; the name of each encoded column is printed.
newdf2 = pp._label_encoder(newdf)
display(newdf2)

Sex
Embarked
Ticket


Unnamed: 0,PassengerId,Survived,Pclass,Sex,SibSp,Parch,Ticket,Embarked
0,1,0,3,1,1,0,523,2
1,2,1,1,0,1,0,596,0
2,3,1,3,0,0,0,669,2
3,4,1,1,0,1,0,49,2
4,5,0,3,1,0,0,472,2
...,...,...,...,...,...,...,...,...
886,887,0,2,1,0,0,101,2
887,888,1,1,0,0,0,14,2
888,889,0,3,0,1,2,675,2
889,890,1,1,1,0,0,8,0


In [1]:
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
from io import StringIO
import sys
import json

from preprocessing import Information, Preprocess

"""
Helper script to run the preprocessing functions
"""

class runPreprocess():
    """
    Runs the preprocessing pipeline on one CSV file, driven by the settings
    stored in 'preprocessing.json' (read from the working directory).
    """

    def __init__(self, data_input_path):
        """
        :param data_input_path: path of the CSV file to preprocess
        """
        self._input_path = data_input_path
        # Filled by get_dataset_info() or, lazily, by start_preprocessing().
        self.data = None

        self._preprocessor = Preprocess()
        self._getInfo = Information()

        print("Please Ensure to fill up the 'preprocessing.json' file first!")

        _preprocessing_file_PATH = 'preprocessing.json'
        # Pin the encoding so parsing does not depend on the platform locale.
        with open(_preprocessing_file_PATH, encoding='utf-8') as f:
            self.ppc_parameters = json.load(f)

    def _read_csv_file(self):
        """
        Read the input CSV, restricted to "col_to_load" when that list is not empty.
        :return: the loaded dataframe
        """
        col_to_read = self.ppc_parameters["col_to_load"]
        if col_to_read:
            # NOTE(review): index_col=0 is only applied on this branch, so one
            # of the selected columns becomes the index — confirm this is intended.
            return pd.read_csv(self._input_path, usecols=col_to_read, index_col=0)
        return pd.read_csv(self._input_path)

    def get_dataset_info(self):
        """Load the dataset into self.data and print its summary."""
        self.data = self._read_csv_file()
        self._getInfo.info(self.data)

    def start_preprocessing(self):
        """
        Run the configured preprocessing steps.
        :return: the preprocessed dataframe (also stored in self.data)
        """
        # Allow calling this without get_dataset_info() having run first.
        if self.data is None:
            self.data = self._read_csv_file()

        self._strategy()
        return self.data


    def _strategy(self):
        """
        Apply, in order: drop, fillna, label encoding, dummy encoding and
        normalisation of every column except "target_col".
        """
        drop_strategy = self.ppc_parameters["drop_strategy"]
        print(drop_strategy)

        if drop_strategy:
            self.data = self._preprocessor.drop(self.data, drop_strategy)

        fill_strategy = self.ppc_parameters["fill_strategy"]

        if fill_strategy:
            self.data = self._preprocessor.fillna(self.data, fill_strategy)

        self.data = self._preprocessor._label_encoder(self.data)

        # An empty "col_to_dummied" list means "dummy every column".
        pfcol = self.ppc_parameters["col_to_dummied"]
        self.data = self._preprocessor._get_dummies(self.data, prefered_columns=pfcol or None)

        target_col = self.ppc_parameters["target_col"]
        # Compare by value: `is not` tests object identity, which would let
        # the target column slip into the features to normalise.
        feature_col = [col for col in self.data.columns if col != target_col]

        # NOTE(review): _get_normalized must be provided by the imported Preprocess.
        self.data = self._preprocessor._get_normalized(self.data, feature_col)
        

In [1]:
from pp_runner import runPreprocess
# NOTE(review): the path to the loan CSV is absolute and machine-specific.
data_path = '/home/akash/Desktop/AmEx/data/loan.csv'

# Build the preprocessing runner for that CSV.
ppr = runPreprocess(data_path)

Preprocess object created
Information object created
Please Ensure to fill up the 'preprocessing.json' file first!


In [2]:
# Load the CSV and print the dataset summary shown below.
ppr.get_dataset_info()

Reading the dataset...
No of rows:  300000
No of columns:  12
Memory Used:  99878423
FEATURE NAME     DATA FORMAT      # OF MISSING VALUES       10 SAMPLES      
dti                float64             654                 18.24,26.52,10.51,16.74,26.35,37.94,2.4,30.1,21.16,17.43,
last_pymnt_amnt    float64             0                   84.92,777.23,180.69,146.51,731.78,192.45,72.28,208.31,180.69,206.44,
total_pymnt        float64             0                   167.02,1507.11,353.89,286.71,1423.21,377.95,141.56,201.53,353.89,405.64,
out_prncp_inv      float64             0                   2386.02,29387.75,4787.21,3831.93,29339.02,5302.5,1914.71,5864.01,4786.79,5730.2,
out_prncp          float64             0                   2386.02,29387.75,4787.21,3831.93,29339.02,5302.5,1914.71,5864.01,4786.79,5730.2,
purpose            object              0                   debt_consolidation,debt_consolidation,debt_consolidation,debt_consolidation,debt_consolidation,credit_card,debt_consolidat

In [3]:
# Run the preprocessing steps and keep the returned DataFrame.
datss = ppr.start_preprocessing()
#ppr.get_dataset_info()

{'dti': 0, 'annual_inc': 0}
cols before droping:  Index(['loan_amnt', 'term', 'int_rate', 'grade', 'annual_inc', 'loan_status',
       'purpose', 'dti', 'out_prncp', 'out_prncp_inv', 'total_pymnt',
       'last_pymnt_amnt'],
      dtype='object')
cols after droping:  Index(['loan_amnt', 'term', 'int_rate', 'grade', 'annual_inc', 'loan_status',
       'purpose', 'dti', 'out_prncp', 'out_prncp_inv', 'total_pymnt',
       'last_pymnt_amnt'],
      dtype='object')
loan_status
purpose
term
grade


In [4]:
# Render the preprocessed DataFrame.
display(datss)

Unnamed: 0,purpose_0,purpose_1,purpose_2,purpose_3,purpose_4,purpose_5,purpose_6,purpose_7,purpose_8,purpose_9,...,term_1,loan_amnt,int_rate,annual_inc,loan_status,dti,out_prncp,out_prncp_inv,total_pymnt,last_pymnt_amnt
0,0,0,1,0,0,0,0,0,0,0,...,0,2500,13.56,55000.0,1,18.24,2386.02,2386.02,167.020000,84.92
1,0,0,1,0,0,0,0,0,0,0,...,1,30000,18.94,90000.0,1,26.52,29387.75,29387.75,1507.110000,777.23
2,0,0,1,0,0,0,0,0,0,0,...,0,5000,17.97,59280.0,1,10.51,4787.21,4787.21,353.890000,180.69
3,0,0,1,0,0,0,0,0,0,0,...,0,4000,18.94,92000.0,1,16.74,3831.93,3831.93,286.710000,146.51
4,0,0,1,0,0,0,0,0,0,0,...,1,30000,16.14,57250.0,1,26.35,29339.02,29339.02,1423.210000,731.78
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
299995,0,0,0,0,0,1,0,0,0,0,...,0,15000,11.05,40000.0,1,21.45,11700.49,11700.49,4404.540000,491.44
299996,0,0,1,0,0,0,0,0,0,0,...,0,15000,14.52,35000.0,1,20.16,12205.56,12205.56,4107.560000,516.47
299997,0,0,0,1,0,0,0,0,0,0,...,1,25000,9.58,215000.0,1,5.34,21966.39,21966.39,4707.660000,526.03
299998,1,0,0,0,0,0,0,0,0,0,...,1,13000,26.77,62000.0,1,21.62,11964.44,11964.44,3537.290000,395.18


In [5]:
# List the column names of the preprocessed DataFrame.
print(datss.columns)

Index(['purpose_0', 'purpose_1', 'purpose_2', 'purpose_3', 'purpose_4',
       'purpose_5', 'purpose_6', 'purpose_7', 'purpose_8', 'purpose_9',
       'purpose_10', 'purpose_11', 'grade_0', 'grade_1', 'grade_2', 'grade_3',
       'grade_4', 'grade_5', 'grade_6', 'term_0', 'term_1', 'loan_amnt',
       'int_rate', 'annual_inc', 'loan_status', 'dti', 'out_prncp',
       'out_prncp_inv', 'total_pymnt', 'last_pymnt_amnt'],
      dtype='object')


In [10]:
import pandas as pd

# NOTE(review): the path to the loan CSV is absolute and machine-specific.
data_path = '/home/akash/Desktop/AmEx/data/loan.csv'

# Number of leading rows to load for a quick look at the file.
chunksize = 10000

# nrows= reads only the first `chunksize` rows. It replaces the
# "iterate in chunks and break" loop, which left the chunk reader (and its
# file handle) open and left `df` undefined for a file without data rows.
df = pd.read_csv(data_path, nrows=chunksize)

print(df.shape)

(10000, 145)


In [None]:
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
from io import StringIO
import sys
import json
import tensorflow as tf

from pp_runner import runPreprocess


import os

# Expected invocation: <script> <datasetName> <datasetPath>
if len(sys.argv) < 3:
    raise SystemExit("usage: <script> <datasetName> <datasetPath>")

datasetName = sys.argv[1]
datasetPath = sys.argv[2]

if datasetName=='loan':
    ppr = runPreprocess(datasetPath)
    ppr.get_dataset_info()
    datss = ppr.start_preprocessing()

    # Shuffle the rows (unseeded, so the split differs between runs), then
    # cut at 60% and 80% of the row count: train / validate / test.
    # Positional slicing is used instead of np.split on a DataFrame.
    shuffled = datss.sample(frac=1)
    train_end = int(.6*len(datss))
    validate_end = int(.8*len(datss))
    train = shuffled.iloc[:train_end]
    validate = shuffled.iloc[train_end:validate_end]
    test = shuffled.iloc[validate_end:]

    # to_csv does not create missing directories.
    os.makedirs('data/loan', exist_ok=True)

    train.to_csv('data/loan/train.csv', index=False, encoding='utf-8')
    print('Train data saved at data/loan/train.csv')
    validate.to_csv('data/loan/validate.csv', index=False, encoding='utf-8')
    print('validate data saved at data/loan/validate.csv')
    test.to_csv('data/loan/test.csv', index=False, encoding='utf-8')
    print('test data saved at data/loan/test.csv')
else:
    # Previously an unknown name was ignored without any feedback.
    print("Unknown dataset name '{}': nothing was written (only 'loan' is supported)".format(datasetName))
    