###  Classification Targets

Target - the target is an ordinal variable indicating groups of income levels.

1 = extreme poverty \
2 = moderate poverty \
3 = vulnerable households \
4 = non vulnerable households


https://www.kaggle.com/competitions/costa-rican-household-poverty-prediction/data

### Setup

In [14]:
%pip install -r ../requirements.txt -q

Note: you may need to restart the kernel to use updated packages.


In [2]:
import pandas as pd

In [3]:
DATA_DIR = '../data/'

TEST_CSV = DATA_DIR + 'test.csv'
TRAIN_CSV = DATA_DIR + 'train.csv'
TEST_CSV = DATA_DIR + 'test.csv'

TARGET_COLUMN = "Target"
ID_COLUMN = "Id"
HOUSE_HOLD_ID_COLUMN = "idhogar"

DEFAULT_RANDOM_STATE = 369
DEFAULT_TEST_SIZE = 0.3
DEFAULT_VALIDATION_SIZE = 0.3
DEFAULT_CROSS_VALIDATION = 4

In [182]:
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder
from copy import deepcopy

pipeline = None

def fill_and_encode(data, fit=False):
    global pipeline
    num_cols = data.select_dtypes(include=['int64', 'float64']).columns
    cat_cols = data.select_dtypes(include=['object', 'bool']).columns  

    num_transformer = SimpleImputer(strategy='median') 

    preprocessor = ColumnTransformer(
        transformers=[
            ('num', num_transformer, num_cols),
        ])

    if pipeline is None or fit:
        pipeline = Pipeline(steps=[('preprocessor', preprocessor)])
        fit_data = pipeline.fit_transform(data)
    else:
        fit_data = pipeline.transform(data)

    feature_names = num_cols
    return pd.DataFrame(fit_data, columns = feature_names)

def map_string_to_int(data):
    mapping = {"yes": 1, "no": 0}
    # For the following fields the values 0,1 are represented as yes,no. odd!
    # edjefe,years of education of male head of household
    # edjefa,years of education of female head of household
    # dependency, Dependency rate
    for col in ["edjefe", "edjefa", 'dependency']:
        data[col] = data[col].apply(lambda x: mapping[x] if x in mapping else x).astype(float)
    return data

def remove_columns(data):
    AGE_SQUARE = 'SQBage'
    cols_to_remove = [ ID_COLUMN, HOUSE_HOLD_ID_COLUMN, AGE_SQUARE ]
    for col in cols_to_remove:
        if col in data.columns:
            data = data.drop(columns = col)
    return data

def prepare(csv_path = None, data = None):
    if csv_path is None and data is None:
        raise ValueError("Either csv_path or data must be provided")
    if csv_path is not None and data is not None:
        raise ValueError("Only one of csv_path or data must be provided")
    raw_data = pd.read_csv(csv_path) if csv_path is not None else deepcopy(data)
    raw_data = remove_columns(raw_data)
    raw_data = map_string_to_int(raw_data)
    return fill_and_encode(raw_data)

In [183]:
train_data = pd.read_csv(TRAIN_CSV)
processed_train_data = prepare(csv_path=TRAIN_CSV)

| column | description | values | 
| --- | --- | --- |
| bedrooms | number of bedrooms | 1-8 |
| overcrowding | # persons per room | 0.2 - 6 |
| meaneduc | meaneduc,average years of education for adults (18+) | 0-37 |



In [96]:
sample = train_data.groupby(TARGET_COLUMN).apply(lambda x: x.sample(20))
sample[sample['dependency'] > 1][['overcrowding','meaneduc' ,'rooms', 'bedrooms', 'hogar_total' ,'edjefa','edjefe','dependency','SQBdependency', 'Target']].head()

Unnamed: 0_level_0,Unnamed: 1_level_0,overcrowding,meaneduc,rooms,bedrooms,hogar_total,edjefa,edjefe,dependency,SQBdependency,Target
Target,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
1,3394,1.666667,6.0,6,3,5,0.0,3.0,1.5,2.25,1
1,6213,4.0,3.0,4,2,7,0.0,3.0,2.5,6.25,1
1,3004,1.5,9.0,8,4,6,0.0,11.0,2.0,4.0,1
1,4491,2.5,5.0,4,2,5,0.0,1.0,1.5,2.25,1
1,8441,2.0,7.5,4,3,6,0.0,6.0,2.0,4.0,1


In [43]:
# edjefe,years of education of male head of household
# edjefa,years of education of female head of household

processed_train_data['dependency']


0       0.00
1       8.00
2       8.00
3       1.00
4       1.00
        ... 
9552    0.25
9553    0.25
9554    0.25
9555    0.25
9556    0.25
Name: dependency, Length: 9557, dtype: float64

In [51]:
processed_train_data.groupby([TARGET_COLUMN, 'tamviv']).size().unstack()

tamviv,1,2,3,4,5,6,7,8,9,10,11,12,13,15
Target,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1
1,18.0,94.0,154.0,161.0,139.0,73.0,45.0,20.0,18.0,20.0,,,13.0,
2,46.0,169.0,293.0,337.0,343.0,227.0,75.0,28.0,47.0,4.0,15.0,,13.0,
3,52.0,122.0,232.0,300.0,237.0,125.0,79.0,20.0,6.0,,11.0,12.0,4.0,9.0
4,239.0,935.0,1527.0,1602.0,908.0,443.0,185.0,43.0,27.0,46.0,29.0,12.0,,


In [54]:
processed_train_data.groupby([TARGET_COLUMN, 'paredblolad']).size().unstack()

paredblolad,0,1
Target,Unnamed: 1_level_1,Unnamed: 2_level_1
1,460,295
2,988,609
3,589,620
4,1843,4153


In [55]:
processed_train_data.groupby([TARGET_COLUMN, 'paredzocalo']).size().unstack()

paredzocalo,0,1
Target,Unnamed: 1_level_1,Unnamed: 2_level_1
1,674,81
2,1416,181
3,1091,118
4,5633,363


In [56]:
processed_train_data.groupby([TARGET_COLUMN, 'paredpreb']).size().unstack()

paredpreb,0,1
Target,Unnamed: 1_level_1,Unnamed: 2_level_1
1,568,187
2,1205,392
3,927,282
4,5060,936


In [63]:
processed_train_data.groupby([TARGET_COLUMN, 'pisomoscer']).size().unstack()

pisomoscer,0,1
Target,Unnamed: 1_level_1,Unnamed: 2_level_1
1,403,352
2,808,789
3,471,738
4,1254,4742


In [66]:
processed_train_data.groupby([TARGET_COLUMN, 'pisoother']).size().unstack()

pisoother,0,1
Target,Unnamed: 1_level_1,Unnamed: 2_level_1
1,755.0,
2,1597.0,
3,1209.0,
4,5987.0,9.0


In [72]:
processed_train_data.groupby([TARGET_COLUMN, 'techozinc']).size().unstack()

techozinc,0,1
Target,Unnamed: 1_level_1,Unnamed: 2_level_1
1,29,726
2,74,1523
3,10,1199
4,173,5823


In [75]:
processed_train_data.groupby([TARGET_COLUMN, 'techootro']).size().unstack()

techootro,0,1
Target,Unnamed: 1_level_1,Unnamed: 2_level_1
1,755.0,
2,1597.0,
3,1209.0,
4,5975.0,21.0


In [95]:
processed_train_data.groupby([TARGET_COLUMN, 'epared3']).size().unstack()

epared3,0,1
Target,Unnamed: 1_level_1,Unnamed: 2_level_1
1,538,217
2,1019,578
3,611,598
4,1940,4056


In [99]:
processed_train_data.groupby([TARGET_COLUMN, 'eviv1']).size().unstack()

eviv1,0,1
Target,Unnamed: 1_level_1,Unnamed: 2_level_1
1,603,152
2,1258,339
3,1064,145
4,5666,330


In [224]:
processed_train_data.groupby([TARGET_COLUMN, 'eviv3']).size().unstack()

eviv3,0.0,1.0
Target,Unnamed: 1_level_1,Unnamed: 2_level_1
1.0,454,301
2.0,908,689
3.0,539,670
4.0,1481,4515


In [102]:
processed_train_data.groupby([TARGET_COLUMN, 'eviv3']).size().unstack()

eviv3,0,1
Target,Unnamed: 1_level_1,Unnamed: 2_level_1
1,454,301
2,908,689
3,539,670
4,1481,4515


In [103]:
processed_train_data.groupby([TARGET_COLUMN, 'dis']).size().unstack()

dis,0,1
Target,Unnamed: 1_level_1,Unnamed: 2_level_1
1,702,53
2,1462,135
3,1132,77
4,5711,285


In [122]:
processed_train_data.groupby([TARGET_COLUMN, 'hogar_nin']).size().unstack()

hogar_nin,0,1,2,3,4,5,6,7,8,9
Target,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
1,135.0,153.0,168.0,140.0,77.0,43.0,7.0,,19.0,13.0
2,306.0,311.0,402.0,348.0,144.0,33.0,44.0,9.0,,
3,248.0,321.0,342.0,218.0,49.0,19.0,12.0,,,
4,2308.0,1744.0,1366.0,404.0,125.0,40.0,,9.0,,


In [218]:
processed_train_data['educated_adults'] = processed_train_data['hogar_adul'] * (processed_train_data['meaneduc']) 

In [219]:
processed_train_data.groupby([TARGET_COLUMN, 'educated_adults']).size().unstack()

educated_adults,0.000000,1.000000,1.000000,2.000000,2.000000,3.000000,4.000000,4.000000,5.000000,6.000000,7.000000,7.000000,8.000000,8.000000,9.000000,10.000000,10.000000,11.000000,11.000000,12.000000,13.000000,13.000001,13.999999,14.000000,15.000000,16.000000,16.000000,17.000000,17.000000,17.999999,18.000000,19.000000,19.000000,19.999999,20.000000,20.999999,21.000000,22.000000,22.000001,22.999999,23.000000,23.000000,24.000000,24.999999,25.000000,25.999999,26.000000,26.000001,27.000000,27.000000,27.999999,28.000000,28.000000,29.000000,29.000001,30.000000,30.999999,31.000000,32.000000,32.000000,32.000001,33.000000,33.999999,34.000000,35.000000,35.000001,35.999999,36.000000,36.999999,37.000000,37.000000,38.000000,38.000000,38.000001,39.000000,39.000001,39.000002,39.999999,40.000000,40.999999,41.000000,41.000001,41.999998,42.000000,42.999999,43.000000,43.000002,43.999999,44.000000,44.000001,44.000001,45.000000,45.999999,45.999999,46.000000,46.999998,47.000000,47.000001,48.000000,48.000002,49.000000,49.000001,49.000002,50.000000,51.000000,52.000000,52.000002,52.999998,53.000000,54.000000,55.000000,56.000000,57.000000,58.000000,59.000000,60.999999,61.000000,61.999997,62.000000,63.000000,64.000000,66.000000,66.999998,67.000002,68.000000,69.000000,69.999999,70.000002,71.000000,75.000000,78.000000,80.000000,84.000000,87.000000
Target,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1,Unnamed: 23_level_1,Unnamed: 24_level_1,Unnamed: 25_level_1,Unnamed: 26_level_1,Unnamed: 27_level_1,Unnamed: 28_level_1,Unnamed: 29_level_1,Unnamed: 30_level_1,Unnamed: 31_level_1,Unnamed: 32_level_1,Unnamed: 33_level_1,Unnamed: 34_level_1,Unnamed: 35_level_1,Unnamed: 36_level_1,Unnamed: 37_level_1,Unnamed: 38_level_1,Unnamed: 39_level_1,Unnamed: 40_level_1,Unnamed: 41_level_1,Unnamed: 42_level_1,Unnamed: 43_level_1,Unnamed: 44_level_1,Unnamed: 45_level_1,Unnamed: 46_level_1,Unnamed: 47_level_1,Unnamed: 48_level_1,Unnamed: 49_level_1,Unnamed: 50_level_1,Unnamed: 51_level_1,Unnamed: 52_level_1,Unnamed: 53_level_1,Unnamed: 54_level_1,Unnamed: 55_level_1,Unnamed: 56_level_1,Unnamed: 57_level_1,Unnamed: 58_level_1,Unnamed: 59_level_1,Unnamed: 60_level_1,Unnamed: 61_level_1,Unnamed: 62_level_1,Unnamed: 63_level_1,Unnamed: 64_level_1,Unnamed: 65_level_1,Unnamed: 66_level_1,Unnamed: 67_level_1,Unnamed: 68_level_1,Unnamed: 69_level_1,Unnamed: 70_level_1,Unnamed: 71_level_1,Unnamed: 72_level_1,Unnamed: 73_level_1,Unnamed: 74_level_1,Unnamed: 75_level_1,Unnamed: 76_level_1,Unnamed: 77_level_1,Unnamed: 78_level_1,Unnamed: 79_level_1,Unnamed: 80_level_1,Unnamed: 81_level_1,Unnamed: 82_level_1,Unnamed: 83_level_1,Unnamed: 84_level_1,Unnamed: 85_level_1,Unnamed: 86_level_1,Unnamed: 87_level_1,Unnamed: 88_level_1,Unnamed: 89_level_1,Unnamed: 90_level_1,Unnamed: 91_level_1,Unnamed: 92_level_1,Unnamed: 93_level_1,Unnamed: 94_level_1,Unnamed: 95_level_1,Unnamed: 96_level_1,Unnamed: 97_level_1,Unnamed: 98_level_1,Unnamed: 99_level_1,Unnamed: 100_level_1,Unnamed: 101_level_1,Unnamed: 102_level_1,Unnamed: 103_level_1,Unnamed: 104_level_1,Unnamed: 105_level_1,Unnamed: 106_level_1,Unnamed: 107_level_1,Unnamed: 108_level_1,Unnamed: 109_level_1,Unnamed: 110_level_1,Unnamed: 111_level_1,Unnamed: 112_level_1,Unnamed: 113_level_1,Unnamed: 114_level_1,Unnamed: 115_level_1,Unnamed: 116_level_1,Unnamed: 117_level_1,Unnamed: 118_level_1,Unnamed: 119_level_1,Unnamed: 120_level_1,Unnamed: 121_level_1,Unnamed: 122_level_1,Unnamed: 123_level_1,Unnamed: 124_level_1,Unnamed: 125_level_1,Unnamed: 126_level_1,Unnamed: 127_level_1,Unnamed: 128_level_1,Unnamed: 129_level_1,Unnamed: 130_level_1,Unnamed: 131_level_1,Unnamed: 132_level_1,Unnamed: 133_level_1,Unnamed: 134_level_1
1.0,17.0,,,6.0,4.0,16.0,32.0,3.0,22.0,98.0,11.0,24.0,39.0,,24.0,1.0,13.0,28.0,4.0,48.0,19.0,9.0,1.0,19.0,23.0,32.0,5.0,,36.0,,15.0,20.0,7.0,7.0,17.0,,5.0,17.0,,7.0,,2.0,5.0,14.0,7.0,,,6.0,1.0,,16.0,,,2.0,,5.0,3.0,8.0,3.0,,3.0,,10.0,6.0,,9.0,,11.0,,,,,4.0,,,,,,,,7.0,,,,,,,,,,,,,,,,,,4.0,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
2.0,28.0,13.0,,12.0,3.0,16.0,13.0,9.0,22.0,116.0,4.0,36.0,46.0,3.0,74.0,3.0,36.0,71.0,3.0,130.0,55.0,9.0,5.0,34.0,99.0,47.0,,22.0,59.0,,79.0,28.0,9.0,23.0,35.0,6.0,46.0,40.0,8.0,9.0,7.0,39.0,30.0,4.0,17.0,11.0,12.0,8.0,12.0,9.0,17.0,,7.0,13.0,5.0,25.0,5.0,8.0,12.0,,5.0,11.0,,6.0,,4.0,,14.0,7.0,,,,,,,,,,4.0,,7.0,,,5.0,,,6.0,,5.0,,,10.0,5.0,,,,,,,,,7.0,,,,,,,,,,,,,,,5.0,,,,,,,4.0,,,,,,,,,,
3.0,13.0,4.0,3.0,14.0,,12.0,9.0,,7.0,56.0,,19.0,24.0,3.0,43.0,3.0,18.0,52.0,3.0,136.0,20.0,11.0,7.0,42.0,42.0,18.0,,4.0,35.0,7.0,37.0,39.0,13.0,5.0,35.0,,16.0,39.0,23.0,11.0,,24.0,11.0,31.0,18.0,,15.0,13.0,18.0,,16.0,,8.0,7.0,8.0,18.0,5.0,8.0,14.0,6.0,,13.0,15.0,5.0,12.0,,,13.0,,6.0,7.0,7.0,,4.0,7.0,,,,6.0,,,,,4.0,,,,,,,,9.0,,,5.0,,5.0,,,,,,6.0,5.0,1.0,,,,,5.0,,6.0,,,,,,,,,,,,7.0,,,,6.0,,,,,12.0,
4.0,18.0,19.0,,7.0,3.0,18.0,19.0,,15.0,125.0,6.0,48.0,70.0,6.0,68.0,16.0,40.0,116.0,6.0,230.0,91.0,16.0,11.0,113.0,147.0,108.0,33.0,21.0,228.0,,161.0,93.0,32.0,46.0,100.0,,161.0,209.0,31.0,55.0,,112.0,151.0,55.0,117.0,10.0,77.0,51.0,188.0,,90.0,5.0,154.0,108.0,70.0,196.0,56.0,114.0,98.0,,22.0,101.0,56.0,101.0,43.0,38.0,6.0,97.0,30.0,78.0,,17.0,77.0,28.0,91.0,5.0,7.0,37.0,47.0,17.0,31.0,27.0,11.0,74.0,51.0,46.0,12.0,11.0,21.0,5.0,45.0,48.0,13.0,7.0,28.0,5.0,17.0,22.0,48.0,5.0,26.0,,3.0,24.0,42.0,40.0,8.0,4.0,32.0,20.0,29.0,24.0,15.0,15.0,8.0,11.0,4.0,9.0,8.0,27.0,5.0,11.0,7.0,,12.0,5.0,10.0,,6.0,7.0,6.0,5.0,,10.0


In [123]:
processed_train_data.groupby([TARGET_COLUMN, 'hogar_adul']).size().unstack()

hogar_adul,0,1,2,3,4,5,6,7,8,9
Target,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
1,,197.0,356.0,134.0,49.0,,19.0,,,
2,,257.0,832.0,278.0,170.0,56.0,4.0,,,
3,,127.0,594.0,257.0,138.0,52.0,41.0,,,
4,5.0,496.0,2680.0,1530.0,890.0,259.0,43.0,52.0,20.0,21.0


In [124]:
processed_train_data.groupby([TARGET_COLUMN, 'hogar_mayor']).size().unstack()

hogar_mayor,0,1,2,3
Target,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
1,627.0,81.0,38.0,9.0
2,1218.0,250.0,122.0,7.0
3,932.0,212.0,65.0,
4,4752.0,843.0,372.0,29.0


In [129]:
processed_train_data.groupby([TARGET_COLUMN, 'meaneduc']).size().unstack()

meaneduc,0.000000,0.333333,0.500000,0.666667,1.000000,1.333333,1.500000,2.000000,2.333333,2.500000,...,22.000000,23.000000,23.500000,24.500000,25.000000,27.000000,28.000000,29.000000,32.000000,37.000000
Target,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,17.0,,,4.0,9.0,3.0,4.0,28.0,11.0,7.0,...,5.0,,,,,,,,,
2,28.0,,9.0,3.0,12.0,9.0,5.0,17.0,4.0,17.0,...,7.0,3.0,,,,,,,,
3,13.0,3.0,,,10.0,,4.0,13.0,,4.0,...,3.0,,,,,,,,,
4,13.0,,10.0,3.0,11.0,,7.0,20.0,6.0,12.0,...,2.0,10.0,3.0,5.0,11.0,2.0,5.0,7.0,2.0,3.0


In [130]:
processed_train_data.groupby([TARGET_COLUMN, 'instlevel1']).size().unstack()

instlevel1,0,1
Target,Unnamed: 1_level_1,Unnamed: 2_level_1
1,570,185
2,1282,315
3,1009,200
4,5409,587


In [141]:
processed_train_data.groupby([TARGET_COLUMN, 'bedrooms']).size().unstack()

bedrooms,1,2,3,4,5,6,8
Target,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
1,78.0,345.0,292.0,37.0,3.0,,
2,143.0,713.0,560.0,129.0,37.0,15.0,
3,93.0,445.0,487.0,116.0,68.0,,
4,288.0,1880.0,2646.0,902.0,191.0,85.0,4.0


In [145]:
pd.set_option('display.max_columns', None)

processed_train_data.groupby([TARGET_COLUMN, 'overcrowding']).size().unstack()

overcrowding,0.200000,0.250000,0.333333,0.400000,0.500000,0.600000,0.666667,0.750000,0.800000,0.833333,1.000000,1.166667,1.200000,1.250000,1.333333,1.400000,1.500000,1.600000,1.666667,1.750000,1.800000,1.833333,2.000000,2.200000,2.250000,2.333333,2.500000,2.666667,3.000000,3.333333,3.500000,3.666667,4.000000,4.333334,4.500000,5.000000,5.500000,6.000000
Target,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1,Unnamed: 23_level_1,Unnamed: 24_level_1,Unnamed: 25_level_1,Unnamed: 26_level_1,Unnamed: 27_level_1,Unnamed: 28_level_1,Unnamed: 29_level_1,Unnamed: 30_level_1,Unnamed: 31_level_1,Unnamed: 32_level_1,Unnamed: 33_level_1,Unnamed: 34_level_1,Unnamed: 35_level_1,Unnamed: 36_level_1,Unnamed: 37_level_1,Unnamed: 38_level_1
1,,,3.0,,9.0,,24.0,7.0,,,121.0,,,7.0,50.0,,80.0,3.0,68.0,3.0,,,134.0,,,14.0,54.0,,70.0,20.0,28.0,,31.0,13.0,,10.0,,6.0
2,,3.0,7.0,,37.0,,43.0,5.0,4.0,5.0,210.0,,,20.0,112.0,13.0,207.0,,145.0,14.0,9.0,,305.0,4.0,,41.0,146.0,8.0,151.0,4.0,7.0,,26.0,13.0,9.0,20.0,11.0,18.0
3,,1.0,10.0,,26.0,,35.0,15.0,1.0,,203.0,,6.0,35.0,141.0,26.0,132.0,8.0,105.0,14.0,,,183.0,7.0,,32.0,72.0,12.0,86.0,,7.0,,27.0,4.0,,14.0,4.0,3.0
4,1.0,11.0,41.0,15.0,177.0,26.0,289.0,118.0,43.0,11.0,1499.0,2.0,30.0,253.0,893.0,21.0,740.0,3.0,469.0,94.0,,18.0,779.0,,27.0,68.0,136.0,8.0,141.0,3.0,,11.0,30.0,,,30.0,,9.0


In [190]:
processed_train_data.groupby([TARGET_COLUMN, 'tamhog']).size().unstack()

tamhog,1.0,2.0,3.0,4.0,5.0,6.0,7.0,8.0,9.0,10.0,11.0,12.0,13.0
Target,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1
1.0,20.0,101.0,164.0,165.0,130.0,67.0,49.0,8.0,18.0,20.0,,,13.0
2.0,49.0,174.0,303.0,367.0,344.0,212.0,65.0,32.0,36.0,,15.0,,
3.0,57.0,120.0,231.0,315.0,241.0,126.0,75.0,16.0,9.0,,7.0,12.0,
4.0,260.0,967.0,1560.0,1604.0,875.0,414.0,175.0,40.0,27.0,40.0,22.0,12.0,


In [191]:
processed_train_data.groupby([TARGET_COLUMN, 'hhsize']).size().unstack()

hhsize,1.0,2.0,3.0,4.0,5.0,6.0,7.0,8.0,9.0,10.0,11.0,12.0,13.0
Target,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1
1.0,20.0,101.0,164.0,165.0,130.0,67.0,49.0,8.0,18.0,20.0,,,13.0
2.0,49.0,174.0,303.0,367.0,344.0,212.0,65.0,32.0,36.0,,15.0,,
3.0,57.0,120.0,231.0,315.0,241.0,126.0,75.0,16.0,9.0,,7.0,12.0,
4.0,260.0,967.0,1560.0,1604.0,875.0,414.0,175.0,40.0,27.0,40.0,22.0,12.0,


In [238]:

processed_train_data['housing_score'] = (processed_train_data['epared3']*3 + processed_train_data['etecho3']*3 + processed_train_data['eviv3']*3 + processed_train_data['tipovivi1'] * 3)
processed_train_data['housing_score'] += (processed_train_data['etecho2'] + processed_train_data['epared2'] + processed_train_data['eviv2'])
processed_train_data['housing_score'] += (processed_train_data['etecho1']/2 + processed_train_data['epared1']/2 + processed_train_data['eviv1']/2 )


processed_train_data['housing_score'] += processed_train_data['tipovivi1'] * 3

# processed_train_data['housing_score'] = 
# processed_train_data['housing_score'] += (processed_train_data['etecho2'] + processed_train_data['epared2'] + processed_train_data['eviv2']) * 1.5


In [239]:
processed_train_data.groupby([TARGET_COLUMN, 'housing_score']).size().unstack()

housing_score,1.5,2.0,2.5,3.0,4.0,4.5,5.0,6.5,7.0,9.0
Target,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
1.0,87,50,72,176,26,46,54,7,84,153
2.0,194,60,82,363,19,106,153,42,159,419
3.0,81,42,61,233,6,41,126,33,149,437
4.0,205,80,112,715,58,110,459,77,672,3508


In [214]:
import numpy as np

processed_train_data['adult_ratio'] = np.where((processed_train_data['hogar_nin'] + processed_train_data['hogar_mayor']) > 0, 
                                                        (processed_train_data['hogar_adul']/ (processed_train_data['hogar_total'])), 
                                                        processed_train_data['hogar_adul'])

processed_train_data['adult_ratio'] += processed_train_data['meaneduc'] 
processed_train_data['adult_ratio'] = np.sqrt(processed_train_data['adult_ratio'])

In [215]:
processed_train_data.groupby([TARGET_COLUMN, 'adult_ratio']).size().unstack()

adult_ratio,0.500000,0.577350,0.632456,0.707107,0.816497,0.912871,0.948683,1.000000,1.154701,1.190238,1.224745,1.290994,1.290994,1.414214,1.471960,1.511858,1.527525,1.527525,1.549193,1.581139,1.632993,1.658312,1.712698,1.732051,1.755942,1.779513,1.812654,1.825742,1.825742,1.843909,1.870829,1.884144,1.914854,1.914854,1.936492,1.974842,2.000000,2.020726,2.027588,2.041241,2.073644,2.075498,2.077086,2.081666,2.081666,2.097618,2.101587,2.121320,2.144761,2.152518,2.160247,2.160247,2.179449,2.198484,2.213594,2.217356,2.221111,2.236068,2.243509,2.247221,2.251983,2.254625,2.273030,2.291288,2.305273,2.309401,2.309401,2.323790,2.327373,2.327373,2.345208,2.355844,2.366432,2.371708,2.377929,2.380476,2.380476,2.387467,2.397916,2.408319,2.412764,2.415229,2.428992,2.435843,2.440401,2.449490,2.459675,2.466441,2.483277,2.483277,2.489980,2.500000,2.503331,2.507133,2.509980,2.516611,2.516611,2.516612,2.529822,2.533114,2.533114,2.549510,2.549510,2.559297,2.563480,2.563480,2.569046,2.569047,2.581989,2.581989,2.581989,2.587362,2.588436,2.598076,2.607681,2.614065,2.614065,2.626785,2.630984,2.633122,2.645751,2.661453,2.663689,2.677063,2.677063,2.689486,2.692582,2.695676,2.705814,2.708013,2.708013,2.712405,2.720294,2.723356,2.738613,2.747726,2.756077,2.756810,2.768875,2.768875,2.768875,2.778889,2.783882,2.786020,2.790289,2.792848,2.796682,2.798809,2.798809,2.810694,2.813657,2.816617,2.828427,2.834314,2.837252,2.841026,2.843120,2.848559,2.857738,2.857738,2.863564,2.872281,2.875181,2.880972,2.883450,2.886751,2.886751,2.890872,2.898275,2.901149,2.901149,2.915476,2.924038,2.932576,2.932576,2.943920,2.943920,2.943920,2.945807,2.958040,2.960051,2.964071,2.966479,2.966479,2.972092,2.972092,2.979094,2.983287,2.988868,3.000000,3.005550,3.008322,3.013857,3.027650,3.027650,3.038640,3.041381,3.044120,3.049590,3.051932,3.055050,3.055050,3.055050,3.065942,3.068659,3.068659,3.071373,3.082207,3.090307,3.093773,3.098387,3.104017,3.109126,3.109126,3.109126,3.109126,3.113590,3.115820,3.122499,3.124405,3.130495,3.135815,3.135815,3.142451,3.146427,3.148696,3.149074,3.151719,3.162278,3.170173,3.175426,3.188521,3.188521,3.201562,3.204164,3.207135,3.209361,3.212698,3.214550,3.214550,3.214550,3.224903,3.227486,3.227486,3.240370,3.242574,3.248076,3.251373,3.255764,3.255764,3.265986,3.265986,3.273268,3.278719,3.289594,3.301515,3.304038,3.306559,3.316625,3.321646,3.324154,3.329164,3.333809,3.338092,3.341656,3.346640,3.354102,3.356586,3.361547,3.363671,3.366502,3.366502,3.376389,3.378856,3.391165,3.393271,3.398529,3.401680,3.405877,3.415650,3.415650,3.422298,3.427827,3.429563,3.433033,3.435113,3.439961,3.449638,3.454466,3.464102,3.468910,3.471311,3.476109,3.488075,3.497618,3.500000,3.502380,3.507136,3.511885,3.511885,3.521363,3.523729,3.525418,3.535534,3.542598,3.549648,3.549648,3.559026,3.559026,3.566822,3.570714,3.577709,3.580702,3.582364,3.591657,3.596294,3.605551,3.612478,3.617089,3.628590,3.633180,3.640055,3.642344,3.646917,3.651484,3.651484,3.660601,3.660601,3.662877,3.662877,3.674235,3.681032,3.687818,3.687818,3.696846,3.696846,3.708099,3.714835,3.714835,3.719319,3.719319,3.730505,3.732738,3.741657,3.746109,3.748333,3.752777,3.754363,3.763863,3.774917,3.777124,3.785939,3.785939,3.794733,3.794733,3.796929,3.796929,3.799123,3.807887,3.829708,3.829708,3.840573,3.851407,3.860052,3.864367,3.872983,3.877284,3.879433,3.883727,3.885259,3.885872,3.894440,3.905125,3.907258,3.911521,3.915780,3.915780,3.924283,3.926406,3.937004,3.949684,3.958114,3.958114,3.964125,3.968627,3.974921,3.991658,4.000000,4.006245,4.010403,4.020779,4.031129,4.033196,4.041452,4.041452,4.049691,4.051749,4.062019,4.068169,4.070802,4.074310,4.082483,4.082483,4.092676,4.102845,4.110961,4.123106,4.143268,4.153312,4.159327,4.163332,4.163332,4.183300,4.203173,4.203173,4.213075,4.219005,4.230839,4.234777,4.242641,4.262237,4.281744,4.281744,4.281744,4.289522,4.291464,4.301163,4.320494,4.320494,4.330127,4.335897,4.339739,4.358899,4.377975,4.396969,4.396969,4.404543,4.415880,4.434712,4.444097,4.472136,4.527693,4.546061,4.571652,4.582576,4.600725,4.609772,4.613644,4.618802,4.626013,4.636809,4.654747,4.679744,4.690416,4.708149,4.716991,4.725816,4.743416,4.830459,4.847680,4.915960,4.989990,5.019960,5.024938,5.049752,5.244044,5.322906,5.338539,5.422177,5.431390,5.686241,6.110101
Target,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1,Unnamed: 23_level_1,Unnamed: 24_level_1,Unnamed: 25_level_1,Unnamed: 26_level_1,Unnamed: 27_level_1,Unnamed: 28_level_1,Unnamed: 29_level_1,Unnamed: 30_level_1,Unnamed: 31_level_1,Unnamed: 32_level_1,Unnamed: 33_level_1,Unnamed: 34_level_1,Unnamed: 35_level_1,Unnamed: 36_level_1,Unnamed: 37_level_1,Unnamed: 38_level_1,Unnamed: 39_level_1,Unnamed: 40_level_1,Unnamed: 41_level_1,Unnamed: 42_level_1,Unnamed: 43_level_1,Unnamed: 44_level_1,Unnamed: 45_level_1,Unnamed: 46_level_1,Unnamed: 47_level_1,Unnamed: 48_level_1,Unnamed: 49_level_1,Unnamed: 50_level_1,Unnamed: 51_level_1,Unnamed: 52_level_1,Unnamed: 53_level_1,Unnamed: 54_level_1,Unnamed: 55_level_1,Unnamed: 56_level_1,Unnamed: 57_level_1,Unnamed: 58_level_1,Unnamed: 59_level_1,Unnamed: 60_level_1,Unnamed: 61_level_1,Unnamed: 62_level_1,Unnamed: 63_level_1,Unnamed: 64_level_1,Unnamed: 65_level_1,Unnamed: 66_level_1,Unnamed: 67_level_1,Unnamed: 68_level_1,Unnamed: 69_level_1,Unnamed: 70_level_1,Unnamed: 71_level_1,Unnamed: 72_level_1,Unnamed: 73_level_1,Unnamed: 74_level_1,Unnamed: 75_level_1,Unnamed: 76_level_1,Unnamed: 77_level_1,Unnamed: 78_level_1,Unnamed: 79_level_1,Unnamed: 80_level_1,Unnamed: 81_level_1,Unnamed: 82_level_1,Unnamed: 83_level_1,Unnamed: 84_level_1,Unnamed: 85_level_1,Unnamed: 86_level_1,Unnamed: 87_level_1,Unnamed: 88_level_1,Unnamed: 89_level_1,Unnamed: 90_level_1,Unnamed: 91_level_1,Unnamed: 92_level_1,Unnamed: 93_level_1,Unnamed: 94_level_1,Unnamed: 95_level_1,Unnamed: 96_level_1,Unnamed: 97_level_1,Unnamed: 98_level_1,Unnamed: 99_level_1,Unnamed: 100_level_1,Unnamed: 101_level_1,Unnamed: 102_level_1,Unnamed: 103_level_1,Unnamed: 104_level_1,Unnamed: 105_level_1,Unnamed: 106_level_1,Unnamed: 107_level_1,Unnamed: 108_level_1,Unnamed: 109_level_1,Unnamed: 110_level_1,Unnamed: 111_level_1,Unnamed: 112_level_1,Unnamed: 113_level_1,Unnamed: 114_level_1,Unnamed: 115_level_1,Unnamed: 116_level_1,Unnamed: 117_level_1,Unnamed: 118_level_1,Unnamed: 119_level_1,Unnamed: 120_level_1,Unnamed: 121_level_1,Unnamed: 122_level_1,Unnamed: 123_level_1,Unnamed: 124_level_1,Unnamed: 125_level_1,Unnamed: 126_level_1,Unnamed: 127_level_1,Unnamed: 128_level_1,Unnamed: 129_level_1,Unnamed: 130_level_1,Unnamed: 131_level_1,Unnamed: 132_level_1,Unnamed: 133_level_1,Unnamed: 134_level_1,Unnamed: 135_level_1,Unnamed: 136_level_1,Unnamed: 137_level_1,Unnamed: 138_level_1,Unnamed: 139_level_1,Unnamed: 140_level_1,Unnamed: 141_level_1,Unnamed: 142_level_1,Unnamed: 143_level_1,Unnamed: 144_level_1,Unnamed: 145_level_1,Unnamed: 146_level_1,Unnamed: 147_level_1,Unnamed: 148_level_1,Unnamed: 149_level_1,Unnamed: 150_level_1,Unnamed: 151_level_1,Unnamed: 152_level_1,Unnamed: 153_level_1,Unnamed: 154_level_1,Unnamed: 155_level_1,Unnamed: 156_level_1,Unnamed: 157_level_1,Unnamed: 158_level_1,Unnamed: 159_level_1,Unnamed: 160_level_1,Unnamed: 161_level_1,Unnamed: 162_level_1,Unnamed: 163_level_1,Unnamed: 164_level_1,Unnamed: 165_level_1,Unnamed: 166_level_1,Unnamed: 167_level_1,Unnamed: 168_level_1,Unnamed: 169_level_1,Unnamed: 170_level_1,Unnamed: 171_level_1,Unnamed: 172_level_1,Unnamed: 173_level_1,Unnamed: 174_level_1,Unnamed: 175_level_1,Unnamed: 176_level_1,Unnamed: 177_level_1,Unnamed: 178_level_1,Unnamed: 179_level_1,Unnamed: 180_level_1,Unnamed: 181_level_1,Unnamed: 182_level_1,Unnamed: 183_level_1,Unnamed: 184_level_1,Unnamed: 185_level_1,Unnamed: 186_level_1,Unnamed: 187_level_1,Unnamed: 188_level_1,Unnamed: 189_level_1,Unnamed: 190_level_1,Unnamed: 191_level_1,Unnamed: 192_level_1,Unnamed: 193_level_1,Unnamed: 194_level_1,Unnamed: 195_level_1,Unnamed: 196_level_1,Unnamed: 197_level_1,Unnamed: 198_level_1,Unnamed: 199_level_1,Unnamed: 200_level_1,Unnamed: 201_level_1,Unnamed: 202_level_1,Unnamed: 203_level_1,Unnamed: 204_level_1,Unnamed: 205_level_1,Unnamed: 206_level_1,Unnamed: 207_level_1,Unnamed: 208_level_1,Unnamed: 209_level_1,Unnamed: 210_level_1,Unnamed: 211_level_1,Unnamed: 212_level_1,Unnamed: 213_level_1,Unnamed: 214_level_1,Unnamed: 215_level_1,Unnamed: 216_level_1,Unnamed: 217_level_1,Unnamed: 218_level_1,Unnamed: 219_level_1,Unnamed: 220_level_1,Unnamed: 221_level_1,Unnamed: 222_level_1,Unnamed: 223_level_1,Unnamed: 224_level_1,Unnamed: 225_level_1,Unnamed: 226_level_1,Unnamed: 227_level_1,Unnamed: 228_level_1,Unnamed: 229_level_1,Unnamed: 230_level_1,Unnamed: 231_level_1,Unnamed: 232_level_1,Unnamed: 233_level_1,Unnamed: 234_level_1,Unnamed: 235_level_1,Unnamed: 236_level_1,Unnamed: 237_level_1,Unnamed: 238_level_1,Unnamed: 239_level_1,Unnamed: 240_level_1,Unnamed: 241_level_1,Unnamed: 242_level_1,Unnamed: 243_level_1,Unnamed: 244_level_1,Unnamed: 245_level_1,Unnamed: 246_level_1,Unnamed: 247_level_1,Unnamed: 248_level_1,Unnamed: 249_level_1,Unnamed: 250_level_1,Unnamed: 251_level_1,Unnamed: 252_level_1,Unnamed: 253_level_1,Unnamed: 254_level_1,Unnamed: 255_level_1,Unnamed: 256_level_1,Unnamed: 257_level_1,Unnamed: 258_level_1,Unnamed: 259_level_1,Unnamed: 260_level_1,Unnamed: 261_level_1,Unnamed: 262_level_1,Unnamed: 263_level_1,Unnamed: 264_level_1,Unnamed: 265_level_1,Unnamed: 266_level_1,Unnamed: 267_level_1,Unnamed: 268_level_1,Unnamed: 269_level_1,Unnamed: 270_level_1,Unnamed: 271_level_1,Unnamed: 272_level_1,Unnamed: 273_level_1,Unnamed: 274_level_1,Unnamed: 275_level_1,Unnamed: 276_level_1,Unnamed: 277_level_1,Unnamed: 278_level_1,Unnamed: 279_level_1,Unnamed: 280_level_1,Unnamed: 281_level_1,Unnamed: 282_level_1,Unnamed: 283_level_1,Unnamed: 284_level_1,Unnamed: 285_level_1,Unnamed: 286_level_1,Unnamed: 287_level_1,Unnamed: 288_level_1,Unnamed: 289_level_1,Unnamed: 290_level_1,Unnamed: 291_level_1,Unnamed: 292_level_1,Unnamed: 293_level_1,Unnamed: 294_level_1,Unnamed: 295_level_1,Unnamed: 296_level_1,Unnamed: 297_level_1,Unnamed: 298_level_1,Unnamed: 299_level_1,Unnamed: 300_level_1,Unnamed: 301_level_1,Unnamed: 302_level_1,Unnamed: 303_level_1,Unnamed: 304_level_1,Unnamed: 305_level_1,Unnamed: 306_level_1,Unnamed: 307_level_1,Unnamed: 308_level_1,Unnamed: 309_level_1,Unnamed: 310_level_1,Unnamed: 311_level_1,Unnamed: 312_level_1,Unnamed: 313_level_1,Unnamed: 314_level_1,Unnamed: 315_level_1,Unnamed: 316_level_1,Unnamed: 317_level_1,Unnamed: 318_level_1,Unnamed: 319_level_1,Unnamed: 320_level_1,Unnamed: 321_level_1,Unnamed: 322_level_1,Unnamed: 323_level_1,Unnamed: 324_level_1,Unnamed: 325_level_1,Unnamed: 326_level_1,Unnamed: 327_level_1,Unnamed: 328_level_1,Unnamed: 329_level_1,Unnamed: 330_level_1,Unnamed: 331_level_1,Unnamed: 332_level_1,Unnamed: 333_level_1,Unnamed: 334_level_1,Unnamed: 335_level_1,Unnamed: 336_level_1,Unnamed: 337_level_1,Unnamed: 338_level_1,Unnamed: 339_level_1,Unnamed: 340_level_1,Unnamed: 341_level_1,Unnamed: 342_level_1,Unnamed: 343_level_1,Unnamed: 344_level_1,Unnamed: 345_level_1,Unnamed: 346_level_1,Unnamed: 347_level_1,Unnamed: 348_level_1,Unnamed: 349_level_1,Unnamed: 350_level_1,Unnamed: 351_level_1,Unnamed: 352_level_1,Unnamed: 353_level_1,Unnamed: 354_level_1,Unnamed: 355_level_1,Unnamed: 356_level_1,Unnamed: 357_level_1,Unnamed: 358_level_1,Unnamed: 359_level_1,Unnamed: 360_level_1,Unnamed: 361_level_1,Unnamed: 362_level_1,Unnamed: 363_level_1,Unnamed: 364_level_1,Unnamed: 365_level_1,Unnamed: 366_level_1,Unnamed: 367_level_1,Unnamed: 368_level_1,Unnamed: 369_level_1,Unnamed: 370_level_1,Unnamed: 371_level_1,Unnamed: 372_level_1,Unnamed: 373_level_1,Unnamed: 374_level_1,Unnamed: 375_level_1,Unnamed: 376_level_1,Unnamed: 377_level_1,Unnamed: 378_level_1,Unnamed: 379_level_1,Unnamed: 380_level_1,Unnamed: 381_level_1,Unnamed: 382_level_1,Unnamed: 383_level_1,Unnamed: 384_level_1,Unnamed: 385_level_1,Unnamed: 386_level_1,Unnamed: 387_level_1,Unnamed: 388_level_1,Unnamed: 389_level_1,Unnamed: 390_level_1,Unnamed: 391_level_1,Unnamed: 392_level_1,Unnamed: 393_level_1,Unnamed: 394_level_1,Unnamed: 395_level_1,Unnamed: 396_level_1,Unnamed: 397_level_1,Unnamed: 398_level_1,Unnamed: 399_level_1,Unnamed: 400_level_1,Unnamed: 401_level_1,Unnamed: 402_level_1,Unnamed: 403_level_1,Unnamed: 404_level_1,Unnamed: 405_level_1,Unnamed: 406_level_1,Unnamed: 407_level_1,Unnamed: 408_level_1,Unnamed: 409_level_1,Unnamed: 410_level_1,Unnamed: 411_level_1,Unnamed: 412_level_1,Unnamed: 413_level_1,Unnamed: 414_level_1,Unnamed: 415_level_1,Unnamed: 416_level_1,Unnamed: 417_level_1,Unnamed: 418_level_1,Unnamed: 419_level_1,Unnamed: 420_level_1,Unnamed: 421_level_1,Unnamed: 422_level_1,Unnamed: 423_level_1,Unnamed: 424_level_1,Unnamed: 425_level_1,Unnamed: 426_level_1,Unnamed: 427_level_1,Unnamed: 428_level_1,Unnamed: 429_level_1,Unnamed: 430_level_1,Unnamed: 431_level_1,Unnamed: 432_level_1,Unnamed: 433_level_1,Unnamed: 434_level_1,Unnamed: 435_level_1,Unnamed: 436_level_1,Unnamed: 437_level_1,Unnamed: 438_level_1,Unnamed: 439_level_1,Unnamed: 440_level_1,Unnamed: 441_level_1,Unnamed: 442_level_1,Unnamed: 443_level_1,Unnamed: 444_level_1,Unnamed: 445_level_1,Unnamed: 446_level_1,Unnamed: 447_level_1,Unnamed: 448_level_1,Unnamed: 449_level_1,Unnamed: 450_level_1,Unnamed: 451_level_1,Unnamed: 452_level_1,Unnamed: 453_level_1,Unnamed: 454_level_1,Unnamed: 455_level_1,Unnamed: 456_level_1,Unnamed: 457_level_1,Unnamed: 458_level_1,Unnamed: 459_level_1,Unnamed: 460_level_1,Unnamed: 461_level_1,Unnamed: 462_level_1,Unnamed: 463_level_1,Unnamed: 464_level_1,Unnamed: 465_level_1,Unnamed: 466_level_1,Unnamed: 467_level_1,Unnamed: 468_level_1,Unnamed: 469_level_1,Unnamed: 470_level_1,Unnamed: 471_level_1,Unnamed: 472_level_1
1.0,5.0,3.0,,2.0,,,,5.0,,4.0,,3.0,,8.0,,7.0,6.0,3.0,5.0,4.0,3.0,,5.0,8.0,4.0,3.0,7.0,2.0,9.0,5.0,5.0,,6.0,,,1.0,11.0,1.0,9.0,12.0,,13.0,,,,,4.0,14.0,,,9.0,,,,11.0,,5.0,11.0,,,,4.0,,2.0,,10.0,,,,,6.0,,5.0,,,1.0,,,2.0,,,,,5.0,,4.0,,,,,15.0,12.0,10.0,7.0,,,15.0,,5.0,,,24.0,9.0,,,,,,,10.0,,,,4.0,,,,,,,20.0,4.0,,6.0,4.0,,8.0,,,3.0,3.0,,5.0,1.0,15.0,,,,,,,,,,,,,6.0,,,,,9.0,,,,,,2.0,6.0,,4.0,5.0,,,10.0,,,,,,12.0,,,,,,6.0,,,7.0,7.0,,,,1.0,,10.0,,12.0,,,4.0,7.0,,,4.0,5.0,,,,6.0,,,,1.0,,2.0,,,,,,,9.0,,,8.0,,,,,,,16.0,,,5.0,6.0,,,,,4.0,,,,,,4.0,,10.0,,,,,,,,,,3.0,,,7.0,,,,5.0,,,,,7.0,6.0,15.0,,,,,3.0,3.0,,,2.0,,,,,,,,,,,,,2.0,,7.0,,,,,,,,,3.0,,,,,4.0,,,5.0,,,,,,,,,,6.0,,,3.0,,5.0,,,3.0,,,,,,5.0,,,,,3.0,,,,,1.0,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,2.0,,,,,,,,,,3.0,,,,,,8.0,,,,,,,,,,,,,,3.0,,,,,3.0,,4.0,,,6.0,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,3.0,,,,,,,2.0,1.0,2.0,,,,,,,,,,,,,,
2.0,3.0,,5.0,2.0,4.0,,5.0,14.0,,,4.0,6.0,3.0,6.0,3.0,,,9.0,,4.0,,8.0,,11.0,,3.0,,4.0,18.0,,19.0,5.0,6.0,3.0,4.0,14.0,17.0,3.0,,3.0,,,,,9.0,10.0,,22.0,,,,,9.0,,9.0,,2.0,19.0,6.0,,7.0,4.0,9.0,6.0,7.0,2.0,3.0,5.0,,,20.0,5.0,5.0,,11.0,5.0,15.0,,2.0,5.0,7.0,6.0,,,9.0,27.0,,,6.0,18.0,10.0,16.0,10.0,,,,36.0,,15.0,,,69.0,,5.0,,,,10.0,6.0,27.0,3.0,9.0,,8.0,,6.0,,12.0,,,55.0,8.0,7.0,6.0,7.0,,,,,4.0,1.0,,5.0,7.0,15.0,,,5.0,,,6.0,9.0,20.0,,7.0,1.0,,12.0,,10.0,6.0,5.0,33.0,,,7.0,,,4.0,9.0,,,2.0,,,20.0,3.0,,10.0,,,18.0,5.0,,,3.0,,15.0,,4.0,,2.0,,,,11.0,,8.0,,31.0,,5.0,4.0,14.0,,,16.0,5.0,,,,6.0,,20.0,,3.0,6.0,15.0,,7.0,,,9.0,3.0,18.0,,,,,,,,,,4.0,,,5.0,19.0,,,3.0,,4.0,5.0,,,,,2.0,,10.0,,,21.0,7.0,,,,,,,,4.0,7.0,5.0,,5.0,13.0,,,,,,,,4.0,5.0,,,,20.0,20.0,,11.0,,,,5.0,3.0,,4.0,,,,5.0,,16.0,,14.0,,,,3.0,,9.0,,,3.0,3.0,6.0,4.0,,11.0,,,,3.0,,,,,,,,,7.0,,4.0,,,3.0,,,,,,9.0,,,5.0,,,,3.0,,,,,,5.0,,,1.0,,,,,3.0,4.0,,,,,,,,,6.0,,,,6.0,,,1.0,,,,,,1.0,,,,,3.0,1.0,,2.0,5.0,,,,,,5.0,,5.0,,3.0,8.0,,,6.0,,,2.0,,,,,,,,,,,,,,,,,,,,,,2.0,,,,,,,,,,,,,,,3.0,,,,,,,,,,,6.0,,,,5.0,,,,,,2.0,5.0,,3.0,,,,,,,,,,,,,
3.0,,,,,2.0,,,9.0,3.0,,4.0,,,6.0,,,6.0,,,5.0,,,,7.0,,,,,,,10.0,,3.0,3.0,,,21.0,,,6.0,,,7.0,3.0,,5.0,,8.0,5.0,,,3.0,15.0,6.0,6.0,,8.0,33.0,,,,,,,,,,,4.0,,8.0,,,8.0,,3.0,15.0,,4.0,,,12.0,5.0,,,20.0,,,,9.0,,,,,,,8.0,,20.0,4.0,12.0,56.0,,10.0,7.0,,,,,27.0,,,,4.0,,,6.0,8.0,,5.0,44.0,,,,4.0,6.0,4.0,1.0,7.0,1.0,5.0,7.0,12.0,4.0,25.0,,,5.0,,,3.0,,,,,4.0,7.0,,12.0,6.0,,,21.0,,,,8.0,7.0,,27.0,,,3.0,,7.0,3.0,,,,8.0,,18.0,5.0,,,,,9.0,,,,5.0,,,,,,3.0,20.0,18.0,,5.0,8.0,12.0,,,4.0,,5.0,,,,,5.0,12.0,12.0,,15.0,,,,,,,10.0,,,,4.0,7.0,5.0,6.0,3.0,,20.0,,,,13.0,,,3.0,,,5.0,7.0,,,3.0,,3.0,5.0,,,18.0,,,,5.0,,,15.0,,,,,,5.0,13.0,,,,,,,,4.0,,,,3.0,13.0,18.0,,5.0,,,,,10.0,1.0,7.0,4.0,7.0,,,6.0,2.0,,3.0,6.0,,,,,,,5.0,,,4.0,,,5.0,5.0,,,,9.0,,,,,,,,9.0,,,,,,,5.0,,,,1.0,,4.0,10.0,,,,,,,,,,,,,7.0,,,,,3.0,,,2.0,,,,,,,14.0,,,4.0,,,,4.0,,,,,,,,,,,,4.0,,,,,,,,5.0,,2.0,,,3.0,4.0,,,3.0,,,,,,,,,1.0,6.0,,,1.0,,,,,,,,,,,,,,,,,1.0,,,,,,,6.0,,,,,,,,,,,,,,,4.0,,,,,,,,,,3.0,,,,,,,,,,,,,,,
4.0,,,,2.0,,6.0,,11.0,,,6.0,,3.0,7.0,3.0,,,,,10.0,,,,9.0,,6.0,,6.0,,,12.0,,,6.0,4.0,,46.0,4.0,,9.0,5.0,,,9.0,,,,28.0,5.0,6.0,9.0,6.0,8.0,,5.0,6.0,5.0,40.0,,5.0,7.0,8.0,9.0,,,,,10.0,8.0,6.0,22.0,,5.0,,,,9.0,10.0,,,,,,5.0,,40.0,5.0,16.0,,20.0,,4.0,,,5.0,3.0,18.0,6.0,19.0,3.0,,76.0,,,,7.0,5.0,5.0,9.0,78.0,,,10.0,16.0,5.0,,,,11.0,,132.0,8.0,7.0,,60.0,,,9.0,,4.0,18.0,,,12.0,81.0,5.0,11.0,5.0,3.0,3.0,39.0,,12.0,7.0,,5.0,,,6.0,10.0,,,99.0,6.0,10.0,7.0,12.0,,,27.0,5.0,20.0,20.0,5.0,7.0,9.0,6.0,7.0,5.0,8.0,6.0,36.0,5.0,5.0,5.0,6.0,9.0,33.0,10.0,20.0,,7.0,5.0,5.0,6.0,,8.0,15.0,20.0,96.0,12.0,10.0,8.0,57.0,6.0,6.0,4.0,10.0,5.0,7.0,9.0,3.0,9.0,5.0,,8.0,,88.0,10.0,,15.0,9.0,,15.0,21.0,15.0,9.0,,39.0,,10.0,12.0,6.0,8.0,,7.0,6.0,10.0,122.0,20.0,32.0,39.0,6.0,12.0,20.0,,5.0,7.0,21.0,3.0,6.0,5.0,6.0,20.0,101.0,,10.0,7.0,15.0,5.0,21.0,36.0,7.0,24.0,,5.0,6.0,5.0,111.0,6.0,5.0,24.0,7.0,,15.0,,16.0,10.0,5.0,7.0,18.0,18.0,30.0,6.0,153.0,7.0,20.0,7.0,5.0,59.0,18.0,,8.0,,7.0,5.0,6.0,20.0,2.0,127.0,12.0,5.0,20.0,27.0,6.0,11.0,10.0,,15.0,,10.0,16.0,7.0,74.0,10.0,5.0,,36.0,30.0,9.0,28.0,5.0,7.0,6.0,5.0,5.0,106.0,10.0,16.0,36.0,5.0,12.0,10.0,5.0,27.0,3.0,5.0,10.0,12.0,12.0,68.0,5.0,10.0,5.0,27.0,6.0,32.0,10.0,5.0,6.0,,6.0,10.0,76.0,6.0,10.0,12.0,7.0,42.0,,5.0,28.0,6.0,5.0,20.0,6.0,12.0,6.0,80.0,42.0,6.0,28.0,,15.0,5.0,90.0,6.0,5.0,16.0,7.0,10.0,17.0,4.0,15.0,5.0,9.0,3.0,10.0,8.0,80.0,5.0,24.0,9.0,7.0,12.0,,,146.0,,4.0,39.0,4.0,15.0,12.0,6.0,10.0,4.0,66.0,5.0,7.0,10.0,21.0,3.0,15.0,,5.0,67.0,14.0,4.0,5.0,21.0,12.0,60.0,30.0,12.0,8.0,5.0,5.0,5.0,63.0,12.0,6.0,6.0,3.0,5.0,4.0,12.0,6.0,3.0,8.0,5.0,,44.0,6.0,6.0,3.0,5.0,16.0,15.0,4.0,2.0,14.0,3.0,5.0,5.0,3.0,,7.0,,,3.0,3.0,5.0,3.0,3.0,,,2.0,6.0,4.0,3.0,5.0,5.0,4.0,2.0,2.0,3.0,2.0,5.0,2.0,2.0,3.0


In [71]:
# cumulative education is sum of educ level minus absolute value of educ level difference
train_data['cumulative_education'] = train_data['edjefa'] + train_data['edjefe'] - abs(train_data['edjefa'] - train_data['edjefe'])
train_data[(train_data['Target'] == 1 )][['overcrowding','meaneduc' ,'rooms', 'bedrooms', 'hogar_total' ,'edjefa','edjefe','cumulative_education', 'SQBdependency', 'Target']]


Unnamed: 0,overcrowding,meaneduc,rooms,bedrooms,hogar_total,edjefa,edjefe,cumulative_education,SQBdependency,Target
154,1.666667,9.5,5,3,5,0,8,0,2.25,1
155,1.666667,9.5,5,3,5,0,8,0,2.25,1
156,1.666667,9.5,5,3,5,0,8,0,2.25,1
157,1.666667,9.5,5,3,5,0,8,0,2.25,1
158,1.666667,9.5,5,3,5,0,8,0,2.25,1
...,...,...,...,...,...,...,...,...,...,...
9508,1.000000,4.0,4,2,2,0,3,0,0.00,1
9509,1.000000,4.0,4,2,2,0,3,0,0.00,1
9514,1.000000,5.0,3,2,2,5,0,0,1.00,1
9515,1.000000,5.0,3,2,2,5,0,0,1.00,1


In [70]:
train_data[(train_data['edjefa'] > 0) & (train_data['edjefe'] > 0)][['edjefa','edjefe']]

Unnamed: 0,edjefa,edjefe


In [67]:
train_data[(train_data['cumulative_education'] > 5) & (train_data['Target'] == 1 )][['overcrowding','meaneduc' ,'rooms', 'bedrooms', 'hogar_total' ,'edjefa','edjefe','cumulative_education', 'Target']]

Unnamed: 0,overcrowding,meaneduc,rooms,bedrooms,hogar_total,edjefa,edjefe,cumulative_education,Target


In [60]:
train_data[(train_data['edjefa'] > 1) & (train_data['edjefe'] > 5) & (train_data['Target'] == 1 )][['overcrowding','meaneduc' ,'rooms', 'bedrooms', 'hogar_total' ,'edjefa','edjefe', 'Target']]

Unnamed: 0,overcrowding,meaneduc,rooms,bedrooms,hogar_total,edjefa,edjefe,Target


In [14]:
train_data[(train_data['rooms'] > 5) & (train_data['Target'] == 1)]

Unnamed: 0,Id,v2a1,hacdor,rooms,hacapo,v14a,refrig,v18q,v18q1,r4h1,...,SQBescolari,SQBage,SQBhogar_total,SQBedjefe,SQBhogar_nin,SQBovercrowding,SQBdependency,SQBmeaned,agesq,Target
237,ID_65ae0bd1f,,0,6,0,1,1,0,,0,...,64,7744,1,0,0,0.25,64.00,64.00,7744,1
322,ID_dab0d86a2,160000.0,0,6,0,1,1,0,,0,...,36,900,36,0,16,4.00,4.00,36.00,900,1
987,ID_5152dd62c,,0,7,0,1,1,0,,1,...,9,121,9,0,4,2.56,4.00,36.00,121,1
988,ID_0a204866b,,0,7,0,1,1,0,,1,...,36,900,9,0,4,2.56,4.00,36.00,900,1
989,ID_0b4c30044,,0,7,0,1,1,0,,1,...,0,36,9,0,4,2.56,4.00,36.00,36,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
9305,ID_ae71f8d9b,,0,6,0,1,0,0,,2,...,36,3136,36,36,4,4.00,0.25,12.25,3136,1
9306,ID_f1355afb3,,0,6,0,1,0,0,,2,...,0,16,36,36,4,4.00,0.25,12.25,16,1
9307,ID_ce6435b64,,0,6,0,1,0,0,,2,...,0,0,36,36,4,4.00,0.25,12.25,0,1
9308,ID_e0338b503,,0,6,0,1,0,0,,2,...,9,729,36,36,4,4.00,0.25,12.25,729,1


In [20]:
train_data[(train_data['rooms'] > 8) & (train_data['Target'] == 2)]

Unnamed: 0,Id,v2a1,hacdor,rooms,hacapo,v14a,refrig,v18q,v18q1,r4h1,...,SQBescolari,SQBage,SQBhogar_total,SQBedjefe,SQBhogar_nin,SQBovercrowding,SQBdependency,SQBmeaned,agesq,Target
1660,ID_562104189,,0,9,0,1,1,0,,1,...,4,100,49,0,9,2.25,0.5625,27.5625,100,2
1661,ID_5feecc29c,,0,9,0,1,1,0,,1,...,36,841,49,0,9,2.25,0.5625,27.5625,841,2
1662,ID_e0b20fae2,,0,9,0,1,1,0,,1,...,25,625,49,0,9,2.25,0.5625,27.5625,625,2
1663,ID_0b37aa51b,,0,9,0,1,1,0,,1,...,1,49,49,0,9,2.25,0.5625,27.5625,49,2
1664,ID_064a73aeb,,0,9,0,1,1,0,,1,...,16,441,49,0,9,2.25,0.5625,27.5625,441,2
1665,ID_6c22df966,,0,9,0,1,1,0,,1,...,36,2025,49,0,9,2.25,0.5625,27.5625,2025,2
1666,ID_f3715e469,,0,9,0,1,1,0,,1,...,0,16,49,0,9,2.25,0.5625,27.5625,16,2
4135,ID_904781cfa,,0,9,0,1,1,0,,1,...,16,5184,25,0,9,0.694444,16.0,64.0,5184,2
4136,ID_82a6f208b,,0,9,0,1,1,0,,1,...,0,0,25,0,9,0.694444,16.0,64.0,0,2
4137,ID_943f4445b,,0,9,0,1,1,0,,1,...,36,1600,25,0,9,0.694444,16.0,64.0,1600,2


In [18]:
train_data[(train_data['rooms'] > 5) & (train_data['Target'] == 3)]

Unnamed: 0,Id,v2a1,hacdor,rooms,hacapo,v14a,refrig,v18q,v18q1,r4h1,...,SQBescolari,SQBage,SQBhogar_total,SQBedjefe,SQBhogar_nin,SQBovercrowding,SQBdependency,SQBmeaned,agesq,Target
406,ID_7d5b483e6,180000.0,0,7,0,1,1,0,,0,...,121,1521,49,0,25,1.960000,6.2500,72.250000,1521,3
407,ID_16669b3ac,180000.0,0,7,0,1,1,0,,0,...,0,25,49,0,25,1.960000,6.2500,72.250000,25,3
408,ID_14b967c3b,180000.0,0,7,0,1,1,0,,0,...,16,144,49,0,25,1.960000,6.2500,72.250000,144,3
409,ID_aa0e92ddc,180000.0,0,7,0,1,1,0,,0,...,0,49,49,0,25,1.960000,6.2500,72.250000,49,3
412,ID_6895c3ee3,180000.0,0,7,0,1,1,0,,0,...,36,2025,49,0,25,1.960000,6.2500,72.250000,2025,3
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
9453,ID_2f5ad837a,,0,6,0,1,1,0,,0,...,4,64,49,0,4,3.062500,0.5625,12.959999,64,3
9454,ID_d74271567,,0,6,0,1,1,0,,0,...,36,1681,49,0,4,3.062500,0.5625,12.959999,1681,3
9455,ID_dc7068500,,0,6,0,1,1,0,,0,...,0,3481,49,0,4,3.062500,0.5625,12.959999,3481,3
9456,ID_cd592016b,,0,6,0,1,1,0,,0,...,36,256,49,0,4,3.062500,0.5625,12.959999,256,3


In [5]:
train_data.corr()

ValueError: could not convert string to float: 'ID_279628684'