In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python docker image: https://github.com/kaggle/docker-python
# For example, here are several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the "../input/" directory.
# For example, running this (by clicking run or pressing Shift+Enter) will list the files in the input directory

import os
print(os.listdir("../input"))

# Any results you write to the current directory are saved as output.

In [None]:
## Silence every warning category so library notices do not clutter cell output
import warnings
warnings.simplefilter('ignore')

# Lift the column cap so a DataFrame is rendered with all of its columns instead of a truncated summary
pd.options.display.max_columns = None

# Modelling Algorithms
from sklearn.linear_model import LinearRegression
from sklearn.linear_model import Lasso
from sklearn.linear_model import Ridge
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.svm import LinearSVC
from sklearn.svm import SVC
from sklearn.ensemble import ExtraTreesClassifier
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier
from sklearn.cluster import KMeans

# Modelling Helpers
from sklearn.cross_validation import StratifiedKFold
from sklearn.feature_selection import RFECV

from sklearn.metrics import r2_score
from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix
from sklearn.metrics import roc_curve
from sklearn.metrics import roc_auc_score
from sklearn.metrics import mean_squared_error

from sklearn.model_selection import train_test_split
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import RandomizedSearchCV

from sklearn.preprocessing import Imputer
from sklearn.preprocessing import scale
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import Normalizer
from sklearn.preprocessing import MinMaxScaler
from sklearn.preprocessing import minmax_scale
from sklearn.preprocessing import MaxAbsScaler
from sklearn.preprocessing import RobustScaler
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import CategoricalEncoder
from sklearn.preprocessing.data import QuantileTransformer

from sklearn.pipeline import Pipeline

# Visualisation
import matplotlib as mpl
import matplotlib.pyplot as plt
import matplotlib.pylab as pylab
import seaborn as sns

# Configure visualisations
#%matplotlib inline
# Start from matplotlib's 'ggplot' sheet, then layer seaborn's 'white' axes style on top.
plt.style.use('ggplot')
sns.set_style('white')
# Default figure size as (width, height) in inches.
mpl.rcParams['figure.figsize'] = (8, 6)

In [None]:
#load data
# Read the training and test CSVs from the input directory in one statement.
train, test = pd.read_csv("../input/train.csv"), pd.read_csv("../input/test.csv")

In [None]:
# Dimensions of the training frame followed by its first rows.
display(train.shape, train.head())

In [None]:
# Dimensions of the test frame followed by its first rows.
display(test.shape, test.head())

In [None]:
# Summary statistics of the training frame (rendered as the cell output).
train.describe()

In [None]:
# Summary statistics of the test frame (rendered as the cell output).
test.describe()

In [None]:
# List the column names of each dtype family (object, float, int), train first and
# test second, with a blank separator printed between families.
for position, kind in enumerate(('O', 'float', 'int')):
    if position:
        print('\n')
    for frame in (train, test):
        print([name for name in frame.columns if frame[name].dtype == kind])

In [None]:
# Count the % null columns
def count_nulls(df):
    """Display a table of the columns of ``df`` that contain missing values.

    The table is indexed by column name and has two columns: ``count`` (number
    of null cells) and ``percent`` (null cells as a percentage of the row
    count). Columns without any null are left out. Nothing is returned; the
    table is shown through ``display``.
    """
    missing = df.isnull().sum(axis=0)
    share = missing / df.shape[0] * 100
    # Keep only the columns that actually have something missing.
    summary = pd.concat([missing[missing > 0], share[share > 0]], axis=1)
    summary.columns = ['count', 'percent']
    display(summary)
    
# Count the null columns
#null_columns=train.columns[train.isnull().any()]
#train[null_columns].isnull().sum()

# Null summary for the training frame, then for the test frame.
for frame in (train, test):
    count_nulls(frame)

In [None]:
# How much of the "v2a1" column is populated in the training set?
rent_train = train.v2a1
print('Non Null Count of "v2a1" = {}'.format(rent_train.notnull().sum()))
print('Null Count of "v2a1" = {}'.format(rent_train.isnull().sum()))

# Rows flagged tipovivi2 or tipovivi3: their shape, then how many of them lack "v2a1".
payers_train = train[(train.tipovivi2 == 1) | (train.tipovivi3 == 1)]
print('\n')
print(payers_train.shape)
print(payers_train.v2a1.isnull().sum())

# Rows flagged tipovivi1, tipovivi4 or tipovivi5: the same two figures.
non_payers_train = train[(train.tipovivi1 == 1) | (train.tipovivi4 == 1) | (train.tipovivi5 == 1)]
print('\n')
print(non_payers_train.shape)
print(non_payers_train.v2a1.isnull().sum())

# Conclusion:
# Households living in a rented house, or in an owned house still being paid in installments, pay rent.
# Households living in a fully owned house or a precarious/assigned/borrowed house pay no rent.
# All null values will be filled with zeros.

In [None]:
# How much of the "v2a1" column is populated in the test set?
rent_test = test.v2a1
print('Non Null Count of "v2a1" = {}'.format(rent_test.notnull().sum()))
print('Null Count of "v2a1" = {}'.format(rent_test.isnull().sum()))

# Rows flagged tipovivi2 or tipovivi3: their shape, then how many of them lack "v2a1".
payers_test = test[(test.tipovivi2 == 1) | (test.tipovivi3 == 1)]
print('\n')
print(payers_test.shape)
print(payers_test.v2a1.isnull().sum())

# Rows flagged tipovivi1, tipovivi4 or tipovivi5: the same two figures.
non_payers_test = test[(test.tipovivi1 == 1) | (test.tipovivi4 == 1) | (test.tipovivi5 == 1)]
print('\n')
print(non_payers_test.shape)
print(non_payers_test.v2a1.isnull().sum())

# Conclusion:
# Households living in a rented house, or in an owned house still being paid in installments, pay rent.
# Households living in a fully owned house or a precarious/assigned/borrowed house pay no rent.
# All null values will be filled with zeros.

In [None]:
# Impute missing values of 'v2a1' with zero.
# The filled column is assigned back to the frame instead of calling
# fillna(inplace=True) on the attribute-accessed Series: the in-place form
# mutates an intermediate object and is not guaranteed to reach the parent
# DataFrame (it is a no-op under pandas Copy-on-Write).
train['v2a1'] = train['v2a1'].fillna(0)
test['v2a1'] = test['v2a1'].fillna(0)

# Re-check which columns still contain nulls.
count_nulls(train)
count_nulls(test)

In [None]:
# Populated vs. missing "v18q1" values in the training set.
tablets_train = train.v18q1
print('Non Null Count of "v18q1" = {}'.format(tablets_train.notnull().sum()))
print('Null Count of "v18q1" = {}'.format(tablets_train.isnull().sum()))

# Compare with the number of rows where the "v18q" flag is 1 and where it is 0.
print('\n')
print('Non Zero Count of "v18q" = {}'.format((train['v18q'] == 1).sum()))
print('Zero Count of "v18q" = {}'.format((train['v18q'] == 0).sum()))

# Conclusion:
# "v18q1" is null when the household does not own any tablet.
# All null values will be filled with zeros.

In [None]:
# Populated vs. missing "v18q1" values in the test set.
tablets_test = test.v18q1
print('Non Null Count of "v18q1" = {}'.format(tablets_test.notnull().sum()))
print('Null Count of "v18q1" = {}'.format(tablets_test.isnull().sum()))

# Compare with the number of rows where the "v18q" flag is 1 and where it is 0.
print('\n')
print('Non Zero Count of "v18q" = {}'.format((test['v18q'] == 1).sum()))
print('Zero Count of "v18q" = {}'.format((test['v18q'] == 0).sum()))

# Conclusion:
# "v18q1" is null when the household does not own any tablet.
# All null values will be filled with zeros.

In [None]:
# Impute missing values of 'v18q1' with zero.
# Assign the result back rather than using fillna(inplace=True) on the
# attribute-accessed Series, which is a chained in-place mutation that may not
# propagate to the DataFrame (it never does under pandas Copy-on-Write).
train['v18q1'] = train['v18q1'].fillna(0)
test['v18q1'] = test['v18q1'].fillna(0)

# Re-check which columns still contain nulls.
count_nulls(train)
count_nulls(test)

In [None]:
# Distinct "rez_esc" values in the training set, then in the test set.
for frame in (train, test):
    print(frame['rez_esc'].unique())

In [None]:
# For each data set: populated / missing counts of "rez_esc", then the age range
# of the rows where "rez_esc" is populated.
for position, (heading, frame) in enumerate((("Training Set:", train), ("Testing Set:", test))):
    if position:
        print('\n')
    print(heading)
    print('Non Null Count of "rez_esc" = {}'.format(frame.rez_esc.notnull().sum()))
    print('Null Count of "rez_esc" = {}'.format(frame.rez_esc.isnull().sum()))

    print('\n')
    ages_with_value = frame.loc[frame['rez_esc'].notnull(), 'age']
    print('Max Age = {}'.format(ages_with_value.max()))
    print('Min Age = {}'.format(ages_with_value.min()))

# Observation:
# The variable "rez_esc" is non-null for ages between 7 and 17 years

In [None]:
# Rows of the training set where "rez_esc" is populated, restricted to the three relevant columns.
train.loc[train['rez_esc'].notnull(), ['age', 'escolari', 'rez_esc']]

# Observation:
# Looking at the data, the ideal age to start schooling is 7 years and the ideal age to finish is 17 years.
# So someone who is 15 years old ("age") should have completed 8 years ("escolari") of schooling (15 - 7).
# Thus years behind in school ("rez_esc") = "age" - 7 - "escolari".
# "rez_esc" is positive if someone is behind in school, otherwise zero.
# Outside schooling age ("age" < 7 or "age" > 17), "rez_esc" is not calculated and is therefore null.

In [None]:
# Validating the observation for the variable "rez_esc":
# recompute it as age - escolari - 7, floor it at zero, and blank it outside
# the 7..17 age range.
# Masked writes go through .loc on the frame itself. The previous chained form
# `frame.column[mask] = value` assigns into an intermediate Series, which
# raises SettingWithCopyWarning and does not update the frame under pandas
# Copy-on-Write.
for frame in (train, test):
    frame['rez_esc_calculated'] = frame['age'] - frame['escolari'] - 7
    frame.loc[frame['rez_esc_calculated'] < 0, 'rez_esc_calculated'] = 0
    frame.loc[(frame['age'] < 7) | (frame['age'] > 17), 'rez_esc_calculated'] = np.nan
    count_nulls(frame)

# There is an extra null value in "rez_esc" in both the training and the testing set

In [None]:
# Validating the observation for the variable "rez_esc":
# flag the rows where the provided value equals the recomputed one, then show the
# rows that disagree although a recomputed value exists.
audit_columns = ['age', 'escolari', 'rez_esc', 'rez_esc_calculated'] + [
    'instlevel{}'.format(level) for level in range(1, 10)
]

for heading, frame in (("Training Set:", train), ("Testing Set:", test)):
    print(heading)
    frame['rez_esc_bol'] = frame['rez_esc'].eq(frame['rez_esc_calculated'])
    disagreeing = ~frame['rez_esc_bol'] & frame['rez_esc_calculated'].notnull()
    display(frame.loc[disagreeing, audit_columns])


# The calculated value of "rez_esc" does not match for the rows shown above.
# Looking at the data, the calculated values appear to be the correct ones.
# All null values will be filled with zeros.

In [None]:
# Impute "rez_esc": drop the two helper columns, then overwrite "rez_esc" with
# age - escolari - 7, floored at zero and forced to zero outside ages 7..17.
# `axis` is passed by keyword because pandas 2 no longer accepts it positionally
# in DataFrame.drop.
train = train.drop(['rez_esc_calculated', 'rez_esc_bol'], axis=1)
test = test.drop(['rez_esc_calculated', 'rez_esc_bol'], axis=1)

# Masked writes use .loc on the frame; the chained form `frame.rez_esc[mask] = 0`
# writes into an intermediate Series and is not guaranteed to reach the frame.
for frame in (train, test):
    frame['rez_esc'] = frame['age'] - frame['escolari'] - 7
    frame.loc[frame['rez_esc'] < 0, 'rez_esc'] = 0
    frame.loc[(frame['age'] < 7) | (frame['age'] > 17), 'rez_esc'] = 0

# Re-check which columns still contain nulls.
count_nulls(train)
count_nulls(test)

In [None]:
print("Training Set:")
# Rows with a missing "meaneduc", shown with the household composition columns.
household_columns = ['idhogar', 'age', 'escolari', 'meaneduc', 'hogar_adul',
                     'hogar_nin', 'hogar_mayor', 'hogar_total', 'parentesco1']
display(train.loc[train["meaneduc"].isnull(), household_columns])

print("Testing Set:")
display(test.loc[test["meaneduc"].isnull(), household_columns])

In [None]:
# Rows whose "hogar_adul" is 0, shown with the household composition columns.
household_columns = ['idhogar', 'age', 'escolari', 'meaneduc', 'hogar_adul',
                     'hogar_nin', 'hogar_mayor', 'hogar_total', 'parentesco1']
for heading, frame in (("Training Set:", train), ("Testing Set:", test)):
    print(heading)
    display(frame.loc[frame["hogar_adul"] == 0, household_columns])

# Observation:
# "meaneduc" is null for households with no adults ("hogar_adul" = 0).
# All null values will be filled with zeros.

In [None]:
# Impute missing values of "meaneduc" and "SQBmeaned" with zero.
# Each filled column is assigned back to its frame; calling fillna(inplace=True)
# on `frame.column` is a chained in-place mutation that may not propagate to
# the DataFrame (and never does under pandas Copy-on-Write).
for frame in (train, test):
    for column in ('meaneduc', 'SQBmeaned'):
        frame[column] = frame[column].fillna(0)

# Re-check which columns still contain nulls.
count_nulls(train)
count_nulls(test)

In [None]:
# The variable "dependency" has a lot of "yes" and "no" values.
# Recompute it as (hogar_nin + hogar_mayor) / (hogar_adul - hogar_mayor) for comparison.
for frame in (train, test):
    dependants = frame.hogar_nin + frame.hogar_mayor
    remaining_adults = frame.hogar_adul - frame.hogar_mayor
    frame['dependency_calculated'] = dependants / remaining_adults

print("Unique values in Training Set:")
print(train['dependency'].unique())

print("\nUnique values in Testing Set:")
print(test['dependency'].unique())

# Count of textual yes/no entries next to the count of recomputed values equal to 1 or 0.
for heading, frame in (("\nTraining Set:", train), ("\nTesting Set:", test)):
    print(heading)
    textual = frame['dependency'].isin(['yes', 'no'])
    print(frame.loc[textual, ['dependency']].count())
    binary = (frame['dependency_calculated'] == 1) | (frame['dependency_calculated'] == 0)
    print(frame.loc[binary, ['dependency_calculated']].count())

# Observation:
# All "yes" should be '1' and "no" should be '0'

In [None]:
# The variables "edjefe" and "edjefa" have a lot of "yes" and "no" values.
# Show the first 20 rows of the related columns for each data set.
head_columns = ['idhogar', 'edjefe', 'edjefa', 'escolari', 'male', 'parentesco1']
for frame in (train, test):
    display(frame[head_columns].head(20))

# Observation:
# "edjefe" for a household = "escolari" x "male" x "parentesco1"
# "edjefa" for a household = "escolari" x "female" x "parentesco1"
# All "yes" should be '1' and "no" should be '0'

In [None]:
# Convert the textual "no"/"yes" entries of dependency, edjefe and edjefa to
# '0'/'1' and cast the columns to float.
# The converted column is assigned back to the frame: replace(inplace=True) on
# `frame.column` mutates an intermediate Series and is not guaranteed to update
# the DataFrame (it does not under pandas Copy-on-Write). One loop replaces the
# eighteen copy-pasted statements.
for frame in (train, test):
    for column in ('dependency', 'edjefe', 'edjefa'):
        frame[column] = frame[column].replace({'no': '0', 'yes': '1'}).astype('float')

# The helper column is no longer needed. `axis` is passed by keyword because
# pandas 2 no longer accepts it positionally in DataFrame.drop.
train = train.drop(['dependency_calculated'], axis=1)
test = test.drop(['dependency_calculated'], axis=1)

display(train[['idhogar','edjefe','escolari','male','parentesco1']][:20])
display(test[['idhogar','edjefe','escolari','male','parentesco1']][:20])

In [None]:
# Distinct household identifiers in the training set, and how many there are.
household_id = train.idhogar.unique()
display(household_id, len(household_id))

In [None]:
# Checking if all members of the same household have the same Target.
# A household is inconsistent when it has a head row (parentesco1 == 1) and at
# least one member whose Target differs from the Target of its first head row.
# This is computed with one map/compare pass instead of filtering the whole
# frame once per household, and it no longer shadows the builtin `id`.
head_target = (
    train.loc[train['parentesco1'] == 1]
         .drop_duplicates(subset='idhogar')      # keep the first head row of each household
         .set_index('idhogar')['Target']
)
expected_target = train['idhogar'].map(head_target)   # NaN for households without a head
mismatch = expected_target.notnull() & (train['Target'] != expected_target)

hh = train.loc[mismatch, 'idhogar'].unique()
display(train[train['idhogar'].isin(hh)][['idhogar', 'Target', 'parentesco1']])
print(len(hh))

In [None]:
# Correcting the Target of all members of the same household to match the Target
# of the household's (first) head row.
for household in hh:
    in_household = train['idhogar'] == household
    head_rows = train[in_household & (train['parentesco1'] == 1)]
    train.loc[in_household, 'Target'] = head_rows.Target.iloc[0]

display(train[train['idhogar'].isin(hh)][['idhogar', 'Target', 'parentesco1']])

# Re-run the consistency check; the list is expected to be empty now.
# It is computed with one map/compare pass rather than a per-household scan, and
# it no longer depends on `household_id` having been defined by another cell.
head_target = (
    train.loc[train['parentesco1'] == 1]
         .drop_duplicates(subset='idhogar')      # first head row of each household
         .set_index('idhogar')['Target']
)
expected_target = train['idhogar'].map(head_target)   # NaN for households without a head
mismatch = expected_target.notnull() & (train['Target'] != expected_target)

hh = train.loc[mismatch, 'idhogar'].unique()
display(train[train['idhogar'].isin(hh)][['idhogar', 'Target', 'parentesco1']])
print(len(hh))

In [None]:
# Keep the row identifiers aside (needed for the submission files), then remove
# them from the feature frames.
train_id = train.Id
test_id = test.Id

# `axis` is passed by keyword: pandas 2 no longer accepts it positionally in drop().
train = train.drop(['Id'], axis=1)
test = test.drop(['Id'], axis=1)

# Label vector and feature matrix for modelling.
y = train.Target
X = train.drop(['Target'], axis=1)

# Stack train and test with a fresh index. pd.concat replaces DataFrame.append,
# which was removed in pandas 2.0.
full = pd.concat([train, test], ignore_index=True, sort=False)

# Human-readable names of the four Target classes.
target = {1:'Extereme Poverty', 2:'Moderate Poverty', 3: 'Vulnerable Households', 4:'Non Vulnerable Households'}

In [None]:
# Number of rows per Target class, bars ordered from most to least frequent.
plt.figure(figsize=(8, 4))
class_order = train['Target'].value_counts().index
sns.countplot(x='Target', data=train, order=class_order)

In [None]:
# Mean of each appliance-ownership flag per Target class, one panel per flag.
# The panels are driven by a (column, y-label) table instead of five copy-pasted
# stanzas; ending on a loop also stops the cell from echoing the Text object
# returned by the last plt.ylabel call.
appliance_panels = [
    ('computer', "Computer"),
    ('v18q', "Tablet"),
    ('television', "Television"),
    ('refrig', "Refrigerator"),
    ('mobilephone', "Mobile Phone"),
]

plt.figure(figsize=(20,10))
for slot, (column, label) in enumerate(appliance_panels, start=1):
    plt.subplot(2, 3, slot)
    sns.barplot(x='Target', y=column, data=train)
    plt.ylabel(label)

In [None]:
# Mean of each region flag (lugar1..lugar6) per Target class, one panel per flag.
# The panels are driven by a (column, y-label) table instead of six copy-pasted
# stanzas. Three labels were mis-encoded or misspelled ("Pacafafico Central",
# "Hueter Atlafantica", "Hueter Norte") and are restored to the region names.
region_panels = [
    ('lugar1', "Central"),
    ('lugar2', "Chorotega"),
    ('lugar3', "Pacífico Central"),
    ('lugar4', "Brunca"),
    ('lugar5', "Huetar Atlántica"),
    ('lugar6', "Huetar Norte"),
]

plt.figure(figsize=(20,10))
for slot, (column, label) in enumerate(region_panels, start=1):
    plt.subplot(2, 3, slot)
    sns.barplot(x='Target', y=column, data=train)
    plt.ylabel(label)

In [None]:
# Mean of the area1 / area2 flags per Target class, side by side.
plt.figure(figsize=(20,5))
for slot, (column, label) in enumerate([('area1', "Urban"), ('area2', "Rural")], start=1):
    plt.subplot(1, 2, slot)
    sns.barplot(x='Target', y=column, data=train)
    plt.ylabel(label)

# Correlation between the two flags (shown as the cell output).
train['area1'].corr(train['area2'])

In [None]:
# Mean of the male / female flags per Target class, side by side.
plt.figure(figsize=(20,5))
for slot, (column, label) in enumerate([('male', "Male"), ('female', "Female")], start=1):
    plt.subplot(1, 2, slot)
    sns.barplot(x='Target', y=column, data=train)
    plt.ylabel(label)

# Correlation between the two flags (shown as the cell output).
train['male'].corr(train['female'])

In [None]:
# Correlation of "tamhog" with "tamviv" and then with "hhsize".
for other in ('tamviv', 'hhsize'):
    print(train['tamhog'].corr(train[other]))

In [None]:
# Bar plots of the 19 consecutive columns starting at "estadocivil1" against Target.
# The slice start is taken from get_loc instead of the hard-coded 75, so the
# cell keeps selecting the same columns if the frame's layout shifts. The width
# of 19 is the width of the original 75:94 slice
# (presumably the estadocivil* and parentesco* flags; verify against the data dictionary).
relation_start = train.columns.get_loc("estadocivil1")
print(relation_start)

# .copy() makes this an independent frame, so adding Target below does not
# write into a slice of `train`.
relation = train.iloc[:, relation_start:relation_start + 19].copy()
relation['Target'] = train.Target

plt.figure(figsize=(20,15))
for slot, column in enumerate(relation.columns[:-1], start=1):   # every column except Target
    plt.subplot(7, 3, slot)
    sns.barplot(x='Target', y=column, data=relation)

In [None]:
# Bar plots of the 9 consecutive columns starting at "instlevel1" against Target.
# The slice start is taken from get_loc instead of the hard-coded 103, so the
# cell keeps selecting the same columns if the frame's layout shifts; the width
# of 9 is the width of the original 103:112 slice.
education_start = train.columns.get_loc("instlevel1")
print(education_start)

# .copy() makes this an independent frame, so adding Target below does not
# write into a slice of `train`.
education = train.iloc[:, education_start:education_start + 9].copy()
education['Target'] = train.Target

plt.figure(figsize=(20,12))
for slot, column in enumerate(education.columns[:-1], start=1):   # every column except Target
    plt.subplot(3, 3, slot)
    sns.barplot(x='Target', y=column, data=education)

In [None]:
# Mean of each housing-tenure flag (tipovivi1..5) per Target class.
# The panels are driven by a (column, y-label) table instead of five copy-pasted
# stanzas; ending on a loop also stops the cell from echoing the Text object
# returned by the last plt.ylabel call.
tenure_panels = [
    ('tipovivi1', "Own & Fully Paid"),
    ('tipovivi2', "Own & Paying Installment"),
    ('tipovivi3', "Rented"),
    ('tipovivi4', "Precarious"),
    ('tipovivi5', "Assigned/Borrowed"),
]

plt.figure(figsize=(20,10))
for slot, (column, label) in enumerate(tenure_panels, start=1):
    plt.subplot(2, 3, slot)
    sns.barplot(x='Target', y=column, data=train)
    plt.ylabel(label)

In [None]:
# Bar plots of the 8 consecutive columns starting at "paredblolad" against Target.
# The slice start is taken from get_loc instead of the hard-coded 22, so the
# cell keeps selecting the same columns if the frame's layout shifts; the width
# of 8 is the width of the original 22:30 slice.
wall_start = train.columns.get_loc("paredblolad")
print(wall_start)

# .copy() makes this an independent frame, so adding Target below does not
# write into a slice of `train`.
outside_wall = train.iloc[:, wall_start:wall_start + 8].copy()
outside_wall['Target'] = train.Target
display(outside_wall.head())

plt.figure(figsize=(20,10))
for slot, column in enumerate(outside_wall.columns[:-1], start=1):   # every column except Target
    plt.subplot(2, 4, slot)
    sns.barplot(x='Target', y=column, data=outside_wall)

In [None]:
# Bar plots of the 6 consecutive columns starting at "pisomoscer" against Target.
# The slice start is taken from get_loc instead of the hard-coded 30, so the
# cell keeps selecting the same columns if the frame's layout shifts; the width
# of 6 is the width of the original 30:36 slice.
floor_start = train.columns.get_loc("pisomoscer")
print(floor_start)

# .copy() makes this an independent frame, so adding Target below does not
# write into a slice of `train`.
floor = train.iloc[:, floor_start:floor_start + 6].copy()
floor['Target'] = train.Target
display(floor.head())

plt.figure(figsize=(20,10))
for slot, column in enumerate(floor.columns[:-1], start=1):   # every column except Target
    plt.subplot(2, 3, slot)
    sns.barplot(x='Target', y=column, data=floor)

In [None]:
# Bar plots of the 4 consecutive columns starting at "techozinc" against Target.
# The slice start is taken from get_loc instead of the hard-coded 36, so the
# cell keeps selecting the same columns if the frame's layout shifts; the width
# of 4 is the width of the original 36:40 slice.
roof_start = train.columns.get_loc("techozinc")
print(roof_start)

# .copy() makes this an independent frame, so adding Target below does not
# write into a slice of `train`.
roof = train.iloc[:, roof_start:roof_start + 4].copy()
roof['Target'] = train.Target
# Preview the roof frame (the cell previously displayed `floor` here, a
# leftover from the cell it was copied from).
display(roof.head())

plt.figure(figsize=(20,5))
for slot, column in enumerate(roof.columns[:-1], start=1):   # every column except Target
    plt.subplot(1, 4, slot)
    sns.barplot(x='Target', y=column, data=roof)

In [None]:
# Mean of the two water-provision flags per Target class.
# The panels are driven by a (column, y-label) table instead of copy-pasted
# stanzas; ending on a loop also stops the cell from echoing the Text object
# returned by the last plt.ylabel call.
water_panels = [
    ('abastaguadentro', "Water Provision Inside Dwelling"),
    ('abastaguafuera', "Water Provision Outside Dwelling"),
]

plt.figure(figsize=(20,5))
for slot, (column, label) in enumerate(water_panels, start=1):
    plt.subplot(1, 2, slot)
    sns.barplot(x='Target', y=column, data=train)
    plt.ylabel(label)

In [None]:
# Mean of each electricity-source flag per Target class.
# The panels are driven by a (column, y-label) table instead of copy-pasted
# stanzas; ending on a loop also stops the cell from echoing the Text object
# returned by the last plt.ylabel call.
electricity_panels = [
    ('public', "Electricity from CNFL, ICE, ESPH/JASEC"),
    ('planpri', "Electricity from Private Plant"),
    ('coopele', "Electricity from Cooperative"),
]

plt.figure(figsize=(20,5))
for slot, (column, label) in enumerate(electricity_panels, start=1):
    plt.subplot(1, 3, slot)
    sns.barplot(x='Target', y=column, data=train)
    plt.ylabel(label)

In [None]:
# Mean of each toilet-connection flag per Target class.
# The panels are driven by a (column, y-label) table instead of copy-pasted
# stanzas; ending on a loop also stops the cell from echoing the Text object
# returned by the last plt.ylabel call.
sanitation_panels = [
    ('sanitario2', "Toilet Connected to Sewer"),
    ('sanitario3', "Toilet Connected to Septic Tank"),
    ('sanitario5', "Toilet Connected to Black Hole"),
    ('sanitario6', "Toilet Connected to Other System"),
]

plt.figure(figsize=(20,5))
for slot, (column, label) in enumerate(sanitation_panels, start=1):
    plt.subplot(1, 4, slot)
    sns.barplot(x='Target', y=column, data=train)
    plt.ylabel(label)

In [None]:
# Mean of each cooking-energy flag per Target class.
# The panels are driven by a (column, y-label) table instead of copy-pasted
# stanzas; ending on a loop also stops the cell from echoing the Text object
# returned by the last plt.ylabel call.
cooking_panels = [
    ('energcocinar2', "Cooking Energy - Electricity"),
    ('energcocinar3', "Cooking Energy - Gas"),
    ('energcocinar4', "Cooking Energy - Wood Charcoal"),
]

plt.figure(figsize=(20,5))
for slot, (column, label) in enumerate(cooking_panels, start=1):
    plt.subplot(1, 3, slot)
    sns.barplot(x='Target', y=column, data=train)
    plt.ylabel(label)

In [None]:
# Mean of each rubbish-disposal flag (elimbasu1..6) per Target class.
# The panels are driven by a (column, y-label) table instead of six copy-pasted
# stanzas; ending on a loop also stops the cell from echoing the Text object
# returned by the last plt.ylabel call.
rubbish_panels = [
    ('elimbasu1', "Rubbish Disposal by Tanker Truck"),
    ('elimbasu2', "Rubbish Disposal by Buried"),
    ('elimbasu3', "Rubbish Disposal by Burning"),
    ('elimbasu4', "Rubbish Disposal by Throwing in Space"),
    ('elimbasu5', "Rubbish Disposal by Throwing in River/Sea"),
    ('elimbasu6', "Rubbish Disposal by Other"),
]

plt.figure(figsize=(20,10))
for slot, (column, label) in enumerate(rubbish_panels, start=1):
    plt.subplot(2, 3, slot)
    sns.barplot(x='Target', y=column, data=train)
    plt.ylabel(label)

In [None]:
# Mean of each wall / roof / floor quality flag per Target class, in a 3x3 grid
# (one row per building part; columns are bad, regular, good).
# The panels are driven by a (column, y-label) table instead of nine copy-pasted
# stanzas; ending on a loop also stops the cell from echoing the Text object
# returned by the last plt.ylabel call.
quality_panels = [
    ('epared1', "Wall Quality - Bad"),
    ('epared2', "Wall Quality - Regular"),
    ('epared3', "Wall Quality - Good"),
    ('etecho1', "Roof Quality - Bad"),
    ('etecho2', "Roof Quality - Regular"),
    ('etecho3', "Roof Quality - Good"),
    ('eviv1', "Floor Quality - Bad"),
    ('eviv2', "Floor Quality - Regular"),
    ('eviv3', "Floor Quality - Good"),
]

plt.figure(figsize=(20,10))
for slot, (column, label) in enumerate(quality_panels, start=1):
    plt.subplot(3, 3, slot)
    sns.barplot(x='Target', y=column, data=train)
    plt.ylabel(label)

In [None]:
#corr = train.corr().abs()
#corr

#np.fill_diagonal(corr.values, 0)
#np.fill_diagonal(df.values, 0)

#corr_unstacked= corr.unstack()
#corr_unstacked
#corr_sorted = corr_unstacked.sort_values(ascending=False, kind='quicksort')
#to_drop = corr_sorted[corr_sorted>=0.95].to_frame(name='corr')
#to_drop

In [None]:
# Shapes of every frame that feeds the modelling step.
display(train.shape)
display(test.shape)
display(X.shape)
display(y.shape)
display(full.shape)

# Absolute correlation matrix of the numeric/boolean features. Non-numeric
# columns (X still holds the string column 'idhogar' here) are excluded
# explicitly, because recent pandas raises on them instead of skipping them.
corr_matrix = X.select_dtypes(include=['number', 'bool']).corr().abs()

# Keep only the strict upper triangle so each feature pair is examined once.
# The builtin `bool` replaces `np.bool`, which was removed from NumPy.
upper = corr_matrix.where(np.triu(np.ones(corr_matrix.shape), k=1).astype(bool))

# Columns whose correlation with an earlier column is at least 0.95
to_drop = [column for column in upper.columns if (upper[column] >= 0.95).any()]
display(to_drop)

# Remove the redundant columns from both the training features and the test frame.
X = X.drop(to_drop, axis=1)
pred_X = test.drop(to_drop, axis=1)

# The household identifier is not a feature. `axis` is passed by keyword because
# pandas 2 no longer accepts it positionally in drop().
X = X.drop(['idhogar'], axis=1)
pred_X = pred_X.drop(['idhogar'], axis=1)

display(X.shape)

display(pred_X.shape)

In [None]:
# Logistic regression tuned over C and the penalty type with 5-fold grid search.
# The solver is pinned to 'liblinear' because the grid includes penalty='l1':
# liblinear supports both 'l1' and 'l2', whereas the default solver of current
# scikit-learn ('lbfgs') rejects 'l1' and would make half of the fits fail.
logreg = LogisticRegression(solver='liblinear')

c_space = np.logspace(-5, 8, 15)
param_grid = {'C': c_space, 'penalty': ['l1', 'l2']}

logreg_cv = GridSearchCV(logreg, param_grid, cv=5)
logreg_cv.fit(X, y)

print("Tuned Logistic Regression Parameters: {}".format(logreg_cv.best_params_)) 
print("Best score is {}".format(logreg_cv.best_score_))

# Predict the test rows with the refitted best estimator and write the submission file.
pred_Y = logreg_cv.predict(pred_X)
pred = pd.DataFrame({'Id': test_id, 'Target': pred_Y})
print(pred.shape)
print(pred.head())
pred.to_csv('costa_rican_logreg.csv', index = False)

In [None]:
# Support vector classifier tuned over C and gamma with 10-fold grid search.
svc = SVC()

#param_grid = {'kernel': ['rbf', 'poly', 'sigmoid'],
#              'C': [.0001, .001, .01, .1, 1, 10, 100, 1000, 10000],
#              'gamma': [.00001, .0001, .001, .01, .1, 1, 10]}

param_grid = dict(C=[10, 100, 1000, 10000], gamma=[.00001, .0001, .001])

svc_cv = GridSearchCV(estimator=svc, param_grid=param_grid, cv=10)
svc_cv.fit(X, y)

print("Tuned SVM Parameters: {}".format(svc_cv.best_params_))
print("Best score is {}".format(svc_cv.best_score_))

# Predict the test rows with the refitted best estimator and write the submission file.
pred_Y = svc_cv.predict(pred_X)
pred = pd.DataFrame({'Id': test_id}).assign(Target=pred_Y)
print(pred.shape)
print(pred.head())
pred.to_csv('costa_rican_svc.csv', index=False)

In [None]:
# Random forest tuned with a 5-fold randomized search.
# Both the forest and the search are seeded so the sampled candidates, the best
# parameters and the submission file are reproducible across runs.
rfclass = RandomForestClassifier(random_state=42)

# min_samples_split starts at 2: the integer value 1 is rejected by
# scikit-learn, so any candidate that sampled it would fail to fit.
param_grid = {"n_estimators": np.arange(5, 100, 5),
              "max_depth": np.arange(1, 50, 5),
              "min_samples_split": np.arange(2,50,1),
              "min_samples_leaf": np.arange(1,50,1),
              "max_leaf_nodes": np.arange(2,50,2)}

rfclass_cv = RandomizedSearchCV(rfclass, param_grid, cv=5, random_state=42)
rfclass_cv.fit(X, y)

print("Tuned Random Forest Parameters: {}".format(rfclass_cv.best_params_)) 
print("Best score is {}".format(rfclass_cv.best_score_))

# Predict the test rows with the refitted best estimator and write the submission file.
pred_Y = rfclass_cv.predict(pred_X)
pred = pd.DataFrame({'Id': test_id, 'Target': pred_Y})
print(pred.shape)
print(pred.head())
pred.to_csv('costa_rican_rfclass.csv', index = False)

In [None]:
# k-nearest-neighbours classifier tuned over the neighbour count, the weighting
# scheme and the leaf size with 5-fold grid search.
knn = KNeighborsClassifier()

step_grid = np.arange(5, 100, 5)
param_grid = dict(n_neighbors=step_grid,
                  weights=['uniform', 'distance'],
                  leaf_size=np.arange(5, 100, 5))

knn_cv = GridSearchCV(estimator=knn, param_grid=param_grid, cv=5)
knn_cv.fit(X, y)

print("Tuned KNN Parameters: {}".format(knn_cv.best_params_))
print("Best score is {}".format(knn_cv.best_score_))

# Predict the test rows with the refitted best estimator and write the submission file.
pred_Y = knn_cv.predict(pred_X)
pred = pd.DataFrame({'Id': test_id}).assign(Target=pred_Y)
print(pred.shape)
print(pred.head())
pred.to_csv('costa_rican_knn.csv', index=False)

In [None]:
#household_id = pd.unique(train.idhogar)
#household_id

#cc = train[['idhogar','parentesco1']]
#dd = test[['idhogar','parentesco1']]

#hh_cc = pd.unique(cc.idhogar)
#hh_dd = pd.unique(dd.idhogar)

#ee = cc.groupby('idhogar')
#ff = cc.groupby('idhogar')['parentesco1'].sum()
#gg = ff[ff==0]

#hh = pd.DataFrame(columns=train.columns)
#for i in range(0,len(train)):
    #for j in range(0, len(gg)):
        #if train.idhogar[i]==gg.index[j]:
            #hh = hh.append(train.iloc[i,:])
            
#hh
#gg