In [2]:
import json
# Load the preprocessing settings from the JSON parameter file and show them.
# The encoding is given explicitly: JSON text is UTF-8, while open() would
# otherwise fall back to the platform's locale encoding.
_preprocessing_file_PATH = 'preprocessing.json'
with open(_preprocessing_file_PATH, encoding='utf-8') as f:
    preprocessing_param = json.load(f)

print(preprocessing_param)

{'_comment': {'_col_to_load': 'columns important to the target', '_train_val_eval': 'train val and evaluation split ratio', '_fill_strategy': 'missing value imputation strategy', '_drop_strategy': 'dropping strategy, 1 indicate axis 1(column)', '_col_to_dummied': 'feature that will be dummied, leave empty to dummy all feature', '_processed_data_PATH': 'path to dump the prosessed data'}, 'col_to_load': [], 'train_val_eval': [], 'fill_strategy': {'Age': 'Median', 'Fare': 'Median', 'Embarked': 'Mode'}, 'drop_strategy': {'Age': 1, 'Name': 1, 'Fare': 1}, 'col_to_dummied': [], 'processed_data_PATH': ''}


In [20]:

import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
from io import StringIO
import sys
import json
from sklearn.preprocessing import LabelEncoder

class Preprocess():
    """
    DataFrame preprocessing steps: dropping features, imputing missing values,
    label-encoding non-numeric columns and one-hot ("dummy") encoding.

    No method modifies the frame it is given; callers must use the return value.
    """

    def __init__(self):
        print("Preprocess object created")

    def fillna(self, data, fill_strategies):
        """
        Fill missing values for each column using the constant 'None', 'Zero',
        or the column's 'Mode', 'Mean' or 'Median'.
        :param data: dataframe to impute; it is copied, not modified
        :param fill_strategies: dict mapping column name -> strategy name
        :return: a new dataframe with the requested columns filled

        A column that is not present in ``data`` (for example because it was
        dropped in an earlier step) and an unknown strategy name are both
        reported with a printed message and skipped.
        """
        # Work on a copy so the caller's frame is never changed behind its back.
        data = data.copy()

        for column, strategy in fill_strategies.items():
            if column not in data.columns:
                print("{}: column not found in data, skipping fill".format(column))
                continue

            if strategy == 'None':
                fill_value = 'None'
            elif strategy == 'Zero':
                fill_value = 0
            elif strategy == 'Mode':
                modes = data[column].mode()
                if modes.empty:
                    # Column holds only missing values: there is no mode to use.
                    print("{}: column has no mode, skipping fill".format(column))
                    continue
                fill_value = modes.iloc[0]
            elif strategy == 'Mean':
                fill_value = data[column].mean()
            elif strategy == 'Median':
                fill_value = data[column].median()
            else:
                print("{}: There is no such thing as preprocess strategy".format(strategy))
                continue

            data[column] = data[column].fillna(fill_value)

        return data

    def drop(self, data, drop_strategies):
        """
        Dropping the non informative features
        :param data: dataframe to drop from; it is not modified
        :param drop_strategies: dict mapping label -> axis passed to
            ``DataFrame.drop`` (1 drops the column with that name)
        :return: the dataframe without the dropped labels
        """
        for label, axis in drop_strategies.items():
            # DataFrame.drop returns a new frame, so the input stays intact.
            data = data.drop(labels=[label], axis=axis)

        return data


    def _label_encoder(self, data):
        """
        Encoding the categorical variable
        :param data: dataframe to encode; it is copied, not modified
        :return: a new dataframe in which every non-numeric column has been
            replaced by its LabelEncoder integer codes
        """
        data = data.copy()

        # Numeric and boolean columns are left alone; everything else is encoded.
        # Columns are visited in frame order so the printed log is deterministic.
        numeric_cols = set(data.select_dtypes(include=['number', 'bool']).columns)
        cat_cols = [col for col in data.columns if col not in numeric_cols]

        labelEncoder = LabelEncoder()
        for column in cat_cols:
            print(column)
            data[column] = labelEncoder.fit_transform(data[column])
        return data

    def _get_dummies(self, data, prefered_columns=None):
        """
        One-hot encode selected columns.
        :param data: dataframe to encode; it is not modified
        :param prefered_columns: columns to expand into dummy columns;
            ``None`` expands every column. An empty list expands nothing.
        :return: a new dataframe with the dummy columns first (prefixed with
            the source column name) followed by the untouched columns
        :raises KeyError: if a requested column is not present in ``data``

        NOTE(review): the commentary in preprocessing.json says an empty
        'col_to_dummied' means "dummy all features", but only ``None`` does
        that here - confirm which behaviour is intended.
        """
        if prefered_columns is None:
            columns = list(data.columns)
            non_dummies = []
        else:
            columns = list(prefered_columns)
            # Fail with the full list of unknown names instead of the first one.
            missing = [col for col in columns if col not in data.columns]
            if missing:
                raise KeyError("columns to dummy not found in data: {}".format(missing))
            non_dummies = [col for col in data.columns if col not in columns]

        pieces = [pd.get_dummies(data[col], prefix=col) for col in columns]
        pieces.extend(data[col] for col in non_dummies)

        return pd.concat(pieces, axis=1)

In [36]:
# Load the Titanic CSV into a DataFrame and render it in the notebook.
# NOTE(review): data_path is an absolute path on one machine; the notebook will
# not run elsewhere until it is replaced by a configurable/relative location.
data_path = '/home/akash/Desktop/AmEx/data/titanic.csv'
dataset = pd.read_csv(data_path)
display(dataset)

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.2500,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.9250,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1000,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.0500,,S
...,...,...,...,...,...,...,...,...,...,...,...,...
886,887,0,2,"Montvila, Rev. Juozas",male,27.0,0,0,211536,13.0000,,S
887,888,1,1,"Graham, Miss. Margaret Edith",female,19.0,0,0,112053,30.0000,B42,S
888,889,0,3,"Johnston, Miss. Catherine Helen ""Carrie""",female,,1,2,W./C. 6607,23.4500,,S
889,890,1,1,"Behr, Mr. Karl Howell",male,26.0,0,0,111369,30.0000,C148,C


In [37]:
# Drop the Age, Name, Fare and Cabin columns (axis 1), then fill the missing
# Embarked values with that column's mode.
pp = Preprocess()

newdf = pp.drop(dataset, {'Age': 1, 'Name': 1, 'Fare': 1, 'Cabin':1})
newdf = pp.fillna(newdf, {'Embarked': 'Mode'})
display(newdf)

Preprocess object created


Unnamed: 0,PassengerId,Survived,Pclass,Sex,SibSp,Parch,Ticket,Embarked
0,1,0,3,male,1,0,A/5 21171,S
1,2,1,1,female,1,0,PC 17599,C
2,3,1,3,female,0,0,STON/O2. 3101282,S
3,4,1,1,female,1,0,113803,S
4,5,0,3,male,0,0,373450,S
...,...,...,...,...,...,...,...,...
886,887,0,2,male,0,0,211536,S
887,888,1,1,female,0,0,112053,S
888,889,0,3,female,1,2,W./C. 6607,S
889,890,1,1,male,0,0,111369,C


In [38]:
# Label-encode the non-numeric columns. A copy is passed in so that `newdf`
# keeps its string values even if the encoder assigns into the frame it
# receives; this keeps the cell idempotent when it is re-run.
newdf2 = pp._label_encoder(newdf.copy())
display(newdf2)

Sex
Embarked
Ticket


Unnamed: 0,PassengerId,Survived,Pclass,Sex,SibSp,Parch,Ticket,Embarked
0,1,0,3,1,1,0,523,2
1,2,1,1,0,1,0,596,0
2,3,1,3,0,0,0,669,2
3,4,1,1,0,1,0,49,2
4,5,0,3,1,0,0,472,2
...,...,...,...,...,...,...,...,...
886,887,0,2,1,0,0,101,2
887,888,1,1,0,0,0,14,2
888,889,0,3,0,1,2,675,2
889,890,1,1,1,0,0,8,0


In [1]:
#!/usr/bin/env python
# coding: utf-8


"""
This script runs the preprocessing pipeline.
"""

import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
from io import StringIO
import sys
import json

from preprocessing import Information, Preprocess

class runPreprocess():
    """
    Drive the preprocessing pipeline for one CSV file, using the settings
    stored in a JSON parameter file.

    The parameter file is read for the keys 'col_to_load', 'drop_strategy',
    'fill_strategy' and 'col_to_dummied'.
    """

    def __init__(self, data_input_path, preprocessing_file_path='preprocessing.json'):
        """
        :param data_input_path: path of the CSV file to preprocess
        :param preprocessing_file_path: path of the JSON parameter file;
            defaults to 'preprocessing.json' in the working directory
        """
        self._input_path = data_input_path
        self.data = None

        self._preprocessor = Preprocess()
        self._getInfo = Information()

        print("Please Ensure to fill up the 'preprocessing.json' file first!")

        # JSON text is UTF-8; do not depend on the platform's locale encoding.
        with open(preprocessing_file_path, encoding='utf-8') as f:
            self.ppc_parameters = json.load(f)

    def _read_csv_file(self):
        """
        Read the input CSV.
        :return: the loaded dataframe, restricted to 'col_to_load' when that
            list is non-empty, otherwise with every column
        """
        col_to_read = self.ppc_parameters["col_to_load"]
        # An empty list means "no restriction": usecols=None reads all columns.
        return pd.read_csv(self._input_path, usecols=col_to_read or None)

    def get_dataset_info(self):
        """
        Load the CSV into ``self.data`` and pass it to the Information
        helper's ``info`` method.
        """
        self.data = self._read_csv_file()
        self._getInfo.info(self.data)

    def start_preprocessing(self):
        """
        Run the configured preprocessing steps.
        :return: the processed dataframe (also stored in ``self.data``)

        The CSV is loaded here if ``get_dataset_info`` has not been called yet.
        The steps run on whatever ``self.data`` currently holds, so calling
        this twice re-applies them to already processed data.
        """
        if self.data is None:
            self.data = self._read_csv_file()
        self._strategy()
        return self.data


    def _strategy(self):
        """
        Apply, in order: drop, fill missing values, label encoding and dummy
        encoding, storing each intermediate result back in ``self.data``.
        """
        drop_strategy = self.ppc_parameters["drop_strategy"]
        self.data = self._preprocessor.drop(self.data, drop_strategy)

        fill_strategy = self.ppc_parameters["fill_strategy"]
        self.data = self._preprocessor.fillna(self.data, fill_strategy)

        self.data = self._preprocessor._label_encoder(self.data)

        self.data = self._preprocessor._get_dummies(
            self.data, prefered_columns=self.ppc_parameters["col_to_dummied"]
        )

'\nclass runPreprocess():\n\n    def __init__(self, data_input_path):\n        self._input_path = data_input_path\n        self.data = None\n        self.ppc_parameters = None\n\n        self._preprocessor  =Preprocess()\n        self._getInfo = Information()\n\n\n        print("Please Ensure to fill up the \'preprocessing.json\' file first!")\n        \n        _preprocessing_file_PATH = \'preprocessing.json\'\n        with open(_preprocessing_file_PATH) as f:\n            preprocessing_param = json.load(f)\n\n        self.ppc_parameters = preprocessing_param\n\n    def _read_csv_file(self):\n        col_to_read = self.ppc_parameters["col_to_load"]\n        if len(col_to_read):\n            return pd.read_csv(self._input_path, usecols=col_to_read)\n        else:\n            return pd.read_csv(self._input_path)\n\n    def get_dataset_info(self):\n        self.data = self._read_csv_file()\n        self._getInfo.info(self.data)\n\n    def start_preprocessing(self):\n        self._strate

In [22]:
# Build the pipeline runner for the Titanic CSV; the constructor also reads the
# preprocessing parameters from 'preprocessing.json'.
# NOTE(review): absolute, machine-specific path - make it configurable.
data_path = '/home/akash/Desktop/AmEx/data/titanic.csv'

ppr = runPreprocess(data_path)

Preprocess object created
Information object created
Please Ensure to fill up the 'preprocessing.json' file first!


In [23]:
# Read the CSV into ppr.data and print the dataset summary.
ppr.get_dataset_info()

No of rows:  891
No of columns:  12
Memory Used:  329748
FEATURE NAME     DATA FORMAT      # OF MISSING VALUES       10 SAMPLES      
Cabin              object              687                 nan,C85,nan,C123,nan,nan,E46,nan,nan,nan,
Age                float64             177                 22.0,38.0,26.0,35.0,35.0,nan,54.0,2.0,27.0,14.0,
Embarked           object              2                   S,C,S,S,S,Q,S,S,S,C,
Fare               float64             0                   7.25,71.2833,7.925,53.1,8.05,8.4583,51.8625,21.075,11.1333,30.0708,
Ticket             object              0                   A/5 21171,PC 17599,STON/O2. 3101282,113803,373450,330877,17463,349909,347742,237736,
Parch              int64               0                   0,0,0,0,0,0,0,1,2,0,
SibSp              int64               0                   1,1,0,1,0,0,0,3,0,1,
Sex                object              0                   male,female,female,female,male,male,male,male,female,female,
Name               object 

In [24]:
# Run the drop -> fill -> label-encode -> dummy steps and keep the processed frame.
# NOTE(review): the recorded run of this cell ended with KeyError: 'Title' -
# presumably a column named in preprocessing.json that is not in the data; verify.
datss = ppr.start_preprocessing()

Ticket
Sex
Embarked


KeyError: 'Title'