In [1]:
#### Import libraries
import pandas as pd
import numpy as np
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.model_selection import GridSearchCV
from time import time
import matplotlib.pyplot as plt
from operator import itemgetter

In [2]:
# Load training and test data into pandas dataframes
# ACCCESSING DATA FOLDER REFER THIS LINK 
# (https://stackoverflow.com/questions/7165749/open-file-in-a-relative-location-in-python)

import os

#TO ACCESS THE PATH IN WHICH CURRENT SCRIPT IS RUNNING
fileDir = os.path.dirname(os.path.realpath('__file__'))

#For accessing the file inside a sibling folder.
#data_type = 'raw' or 'processed'
#country   = 'A' or 'B' or 'C'
#person_type = 'hhold' or 'indiv'
#dataset = 'test' or 'train'

def return_file_name(data_type = 'raw', country = 'A', person_type = 'hhold', dataset = 'train'):
    path = '../data/' + data_type + '/' + country + '/' + country + '_' + person_type + '_' + dataset +'.csv'
    print("Building path for file : (" + path +")")
    filename = os.path.join(fileDir, path)
    filename = os.path.abspath(os.path.realpath(filename))
    #print(filename + "\n")
    return filename

def return_train_test(data_type = 'raw', country = 'A', person_type = 'hhold'):
    train = pd.read_csv(return_file_name(data_type = data_type, country = country, person_type = person_type, dataset = 'train'))
    test = pd.read_csv(return_file_name(data_type = data_type, country = country, person_type = person_type, dataset = 'test'))
    # merge training and test sets into one dataframe
    full = pd.concat([train, test])
    return train,test,full

In [98]:
# Get size of dataframes
train_set,test_set,full_set = return_train_test(data_type = 'raw', country = 'A', person_type = 'indiv')

#shape[0] -> # of rows
#shape[1] -> # of cols
print("\nTrain Shape : " + str(train_set.shape))
print("Test Shape : " + str(test_set.shape))
print("Full Shape : " + str(full_set.shape) + "\n")

Building path for file : (../data/raw/A/A_indiv_train.csv)
Building path for file : (../data/raw/A/A_indiv_test.csv)

Train Shape : (37560, 44)
Test Shape : (18535, 43)
Full Shape : (56095, 44)



In [99]:
#Looking for Nans
#return a formatted percentage from a fraction
def percentage(numerator, denomenator):
    
    if type(numerator) == pd.core.series.Series:
        return (numerator/denomenator*100)
    
    elif type(numerator) == int or type(numerator) == float:
        return '{:.1f}%'.format(float(numerator)/float(denomenator)*100) 
    
    else:
        print("check type")

In [100]:
#Get percentage by variable of values which are not NaN
def return_bad_cols(dataframe):
    fill_precent_counts = percentage(dataframe.count()-1, dataframe.shape[0]-1)
    bad_cols = fill_precent_counts[fill_precent_counts < 100]
    return bad_cols

train_bad_cols = return_bad_cols(train_set)
test_bad_cols = return_bad_cols(test_set)
test_bad_cols

OdXpbPGJ    83.473616
dtype: float64

In [101]:
#make directory for asving bad columns statistics
# refer link (https://stackoverflow.com/questions/273192/how-can-i-create-a-directory-if-it-does-not-exist)
stats_folder_path = '../data/stats/'

def make_dir(directory):
    if not os.path.exists(directory):
        os.makedirs(directory)


def create_stats_folder():
    for name in ['A', 'B', 'C']:
        foldername = os.path.join(fileDir, stats_folder_path + name + '/individual')
        foldername = os.path.abspath(os.path.realpath(foldername))
        make_dir(foldername)
        foldername = os.path.join(fileDir, stats_folder_path + name + '/household')
        foldername = os.path.abspath(os.path.realpath(foldername))
        make_dir(foldername)
        print("Trying to create folder for " + name)
        
def return_bad_col_dtype(bad_columns, dataframe):
    data_type = []
    for name in bad_columns:
        data_type.append(dataframe[name].dtype)
    return data_type

create_stats_folder()



Trying to create folder for A
Trying to create folder for B
Trying to create folder for C


In [102]:
def return_unique_values(bad_columns, dataframe):
    unique_vals = []
    for name in bad_columns:
        vals = dataframe[dataframe[name].notnull()][name].unique()
        unique_vals.append(vals.tolist())
    return unique_vals

def return_max_val_repeated(bad_columns, dataframe):
    freq_count_num = []
    val_colon_freq_precent = []
    for name in bad_columns:
        #refer (https://www.w3resource.com/python-exercises/numpy/python-numpy-exercise-94.php)
        unique_elements, counts_elements = np.unique(dataframe[dataframe[name].notnull()][name], return_counts=True)
        val_with_max_freq = unique_elements[np.argmax(counts_elements)]
        max_freq = int(np.max(counts_elements) / np.sum(counts_elements) * 100)
        dict_f = {val_with_max_freq : max_freq}
    
        dict_temp = {}
        for val, count in zip(unique_elements, counts_elements):
            dict_temp[val] = int(count / np.sum(counts_elements) * 100)
        
        freq_count_num.append(dict_f)
        val_colon_freq_precent.append(dict_temp)
    return freq_count_num, val_colon_freq_precent



In [103]:
#############################
##    data set analysis    ##
#############################

def data_set_analysis(bad_columns, dataset):
    #make a data frame 
    #refer (https://pandas.pydata.org/pandas-docs/stable/generated/pandas.DataFrame.html)
    bad_col_data = {'name': bad_columns.keys(), 'per_non_nan': bad_columns.values}
    bad_col_frame = pd.DataFrame(data=bad_col_data)
    
    #Add data type col
    bad_col_data_type = return_bad_col_dtype(bad_col_frame['name'], dataset)
    bad_col_frame['datatype'] = bad_col_data_type
    
    #unique vals col (not used)
    bad_col_unique_vals = return_unique_values(bad_col_frame['name'], dataset)
    #bad_col_frame['unique_vals'] = bad_col_unique_vals

    #show max freq count 
    bad_col_freq_count, temp = return_max_val_repeated(bad_col_frame['name'], dataset)
    bad_col_frame['freq_count'] = bad_col_freq_count
    bad_col_frame['unique_vals_colon_precent'] = temp

    return bad_col_frame
    
bad_frame_train = data_set_analysis(train_bad_cols, train_set)
bad_frame_test = data_set_analysis(test_bad_cols, test_set)
#make a data frame 
'''
#refer (https://pandas.pydata.org/pandas-docs/stable/generated/pandas.DataFrame.html)
train_bad_col_data = {'name': train_bad_cols.keys(), 'per_non_nan': train_bad_cols.values}
train_bad_col_frame = pd.DataFrame(data=train_bad_col_data)

#Add data type col
bad_col_data_type = return_bad_col_dtype(train_bad_col_frame['name'], train_set)
train_bad_col_frame['datatype'] = bad_col_data_type

#unique vals col (not used)
bad_col_unique_vals = return_unique_values(train_bad_col_frame['name'], train_set)
#train_bad_col_frame['unique_vals'] = bad_col_unique_vals

#show max freq count 
bad_col_freq_count, temp = return_max_val_repeated(train_bad_col_frame['name'], train_set)
train_bad_col_frame['freq_count'] = bad_col_freq_count
train_bad_col_frame['unique_vals_colon_precent'] = temp
'''

#train_set[train_set.FGWqGkmD.isnull()].groupby('poor')['poor'].count()
#train[train.FGWqGkmD.notnull()].groupby('poor')['poor'].count()

#for col_name, precent in zip(train_bad_cols.keys(),train_bad_cols.values):
    #print(col_name + " " + str(precent))

"\n#refer (https://pandas.pydata.org/pandas-docs/stable/generated/pandas.DataFrame.html)\ntrain_bad_col_data = {'name': train_bad_cols.keys(), 'per_non_nan': train_bad_cols.values}\ntrain_bad_col_frame = pd.DataFrame(data=train_bad_col_data)\n\n#Add data type col\nbad_col_data_type = return_bad_col_dtype(train_bad_col_frame['name'], train_set)\ntrain_bad_col_frame['datatype'] = bad_col_data_type\n\n#unique vals col (not used)\nbad_col_unique_vals = return_unique_values(train_bad_col_frame['name'], train_set)\n#train_bad_col_frame['unique_vals'] = bad_col_unique_vals\n\n#show max freq count \nbad_col_freq_count, temp = return_max_val_repeated(train_bad_col_frame['name'], train_set)\ntrain_bad_col_frame['freq_count'] = bad_col_freq_count\ntrain_bad_col_frame['unique_vals_colon_precent'] = temp\n"

In [104]:
#country = 'A'
country = 'B'
#country = 'C'

type_people = 'household'
#type_people = 'individual'

stats_name_train = os.path.join(fileDir, stats_folder_path + country + '/'+ type_people + '/' + country +'_' + type_people + '_train.csv')
stats_name_train = os.path.abspath(os.path.realpath(stats_name_train))

stats_name_test = os.path.join(fileDir, stats_folder_path + country + '/'+ type_people + '/' + country +'_' + type_people + '_test.csv')
stats_name_test = os.path.abspath(os.path.realpath(stats_name_test))


bad_frame_train.to_csv(stats_name_train, index=False)

bad_frame_test.to_csv(stats_name_test, index=False)

In [105]:
train_bad_cols

OdXpbPGJ    83.31159
dtype: float64

In [106]:
test_bad_cols

OdXpbPGJ    83.473616
dtype: float64

In [107]:
col_float_64 = []
for i  in train_set.columns:
    if (train_set[i].dtype == 'float64'):
        col_float_64.append(i)
        #print(train_set[i].dtype)

len(col_float_64)

1

In [108]:
same_cols = []
for x,y in zip(col_float_64, train_bad_cols.keys()):
    if (x == y):
        same_cols.append(x)
print(same_cols)
len(same_cols)

['OdXpbPGJ']


1

In [109]:
a = train_set.describe()
names = ['count','mean', 'std', 'min', '25%', '50%', '75%', 'max']
    
a['name'] = names
a.to_csv("a_indiv_train.csv", index=False)


b = test_set.describe()
names = ['count','mean', 'std', 'min', '25%', '50%', '75%', 'max']
    
b['name'] = names
b.to_csv("a_indiv_test.csv", index=False)

In [47]:
a

Unnamed: 0,id,wJthinfa,ZvEApWrk,vuQrLzvK,FGWqGkmD,qrOrXLPM,BXOWgPgL,umkFMfvA,McFBIGsm,NjDdhqIe,...,IrxBnWxE,BRzuVmyf,dnlnKrAg,VyHofjLM,GrLBZowF,oszSdLhD,aAufyreG,cDhZjxaW,OSmfjCbE,IOMvIGQS
count,3255.0,3255.0,3255.0,3255.0,602.0,3255.0,2504.0,890.0,2504.0,3255.0,...,272.0,1794.0,532.0,3255.0,3255.0,3255.0,909.0,3255.0,2504.0,3255.0
mean,50057.635023,43.38126,96.04086,17.427343,-7.509967,22.203379,158.354633,-33.279775,301.10623,88.597849,...,0.647059,45.675585,-15965.135338,1.974808,-249.528111,0.670661,45.782178,-85.93702,-339.568291,78.568356
std,28615.901302,22.728441,105.556895,72.057949,9.499141,6.962658,124.535287,8.231694,155.904844,107.268927,...,9.09769,41.675286,39.715899,1.565015,322.468103,1.833827,49.499821,114.537914,147.833796,63.123421
min,17.0,-126.0,-2.0,-125.0,-53.0,8.0,-40.0,-63.0,-43.0,-7.0,...,-61.0,9.0,-16047.0,-2.0,-5044.0,-23.0,-6.0,-3639.0,-506.0,0.0
25%,25938.0,26.0,33.0,-39.0,-13.0,16.0,50.0,-36.0,185.0,28.0,...,3.0,21.0,-15999.0,2.0,-364.0,1.0,12.0,-119.0,-501.0,50.0
50%,50299.0,42.0,68.0,27.0,-8.0,24.0,150.0,-36.0,305.0,63.0,...,3.0,36.0,-15959.0,2.0,-184.0,1.0,39.0,-59.0,-356.0,50.0
75%,74848.0,58.0,138.0,77.0,2.0,24.0,250.0,-27.0,425.0,98.0,...,3.0,51.0,-15927.0,2.0,-64.0,1.0,48.0,-39.0,-256.0,100.0
max,99979.0,122.0,1069.0,127.0,2.0,48.0,500.0,-18.0,605.0,1253.0,...,3.0,276.0,-15911.0,8.0,-4.0,1.0,426.0,1.0,34.0,900.0


Index(['FGWqGkmD', 'BXOWgPgL', 'umkFMfvA', 'McFBIGsm', 'IrxBnWxE', 'BRzuVmyf',
       'dnlnKrAg', 'aAufyreG', 'OSmfjCbE'],
      dtype='object')

Index(['count', 'mean', 'std', 'min', '25%', '50%', '75%', 'max'], dtype='object')