In [104]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline

In [105]:
train = pd.read_csv("train.csv")
test = pd.read_csv("test.csv")

In [106]:
train.head()

Unnamed: 0,Id,v2a1,hacdor,rooms,hacapo,v14a,refrig,v18q,v18q1,r4h1,...,SQBescolari,SQBage,SQBhogar_total,SQBedjefe,SQBhogar_nin,SQBovercrowding,SQBdependency,SQBmeaned,agesq,Target
0,ID_279628684,190000.0,0,3,0,1,1,0,,0,...,100,1849,1,100,0,1.0,0.0,100.0,1849,4
1,ID_f29eb3ddd,135000.0,0,4,0,1,1,1,1.0,0,...,144,4489,1,144,0,1.0,64.0,144.0,4489,4
2,ID_68de51c94,,0,8,0,1,1,0,,0,...,121,8464,1,0,0,0.25,64.0,121.0,8464,4
3,ID_d671db89c,180000.0,0,5,0,1,1,1,1.0,0,...,81,289,16,121,4,1.777778,1.0,121.0,289,4
4,ID_d56d6f5f5,180000.0,0,5,0,1,1,1,1.0,0,...,121,1369,16,121,4,1.777778,1.0,121.0,1369,4


In [107]:
test.head()

Unnamed: 0,Id,v2a1,hacdor,rooms,hacapo,v14a,refrig,v18q,v18q1,r4h1,...,age,SQBescolari,SQBage,SQBhogar_total,SQBedjefe,SQBhogar_nin,SQBovercrowding,SQBdependency,SQBmeaned,agesq
0,ID_2f6873615,,0,5,0,1,1,0,,1,...,4,0,16,9,0,1,2.25,0.25,272.25,16
1,ID_1c78846d2,,0,5,0,1,1,0,,1,...,41,256,1681,9,0,1,2.25,0.25,272.25,1681
2,ID_e5442cf6a,,0,5,0,1,1,0,,1,...,41,289,1681,9,0,1,2.25,0.25,272.25,1681
3,ID_a8db26a79,,0,14,0,1,1,1,1.0,0,...,59,256,3481,1,256,0,1.0,0.0,256.0,3481
4,ID_a62966799,175000.0,0,4,0,1,1,1,1.0,0,...,18,121,324,1,0,1,0.25,64.0,,324


**Explore the data**

In [108]:
train.shape

(9557, 143)

In [109]:
test.shape

(23856, 142)

**Let us identify our output variable**

In [110]:
for i in train.columns:
    if i not in test.columns:
        print("Our output variable is {}".format(i))

Our output variable is Target


**Lets Understand the type of data.**

In [111]:
print(train.dtypes.value_counts())

int64      130
float64      8
object       5
dtype: int64


In [112]:
print(train.info())

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 9557 entries, 0 to 9556
Columns: 143 entries, Id to Target
dtypes: float64(8), int64(130), object(5)
memory usage: 10.4+ MB
None


**Types of Datasets**

In [113]:
for i in train.columns:
    a=train[i].dtype
    if a == 'object':
        print(i)

Id
idhogar
dependency
edjefe
edjefa


In [114]:
train.drop(['Id', 'idhogar'],axis=1, inplace=True)

In [115]:
train['dependency'].value_counts()

yes          2192
no           1747
.5           1497
2             730
1.5           713
.33333334     598
.66666669     487
8             378
.25           260
3             236
4             100
.75            98
.2             90
.40000001      84
1.3333334      84
2.5            77
5              24
.80000001      18
1.25           18
3.5            18
2.25           13
.71428573      12
1.75           11
.22222222      11
1.2            11
.83333331      11
.2857143        9
1.6666666       8
.60000002       8
6               7
.16666667       7
Name: dependency, dtype: int64

**Lets Convert object variables into numerical data.**

In [116]:
def map(i):
    
    if i=='yes':
        return(float(1))
    elif i=='no':
        return(float(0))
    else:
        return(float(i))

In [117]:
train['dependency']=train['dependency'].apply(map)

In [118]:
for i in train.columns:
    a=train[i].dtype
    if a == 'object':
        print(i)

edjefe
edjefa


In [119]:
train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 9557 entries, 0 to 9556
Columns: 141 entries, v2a1 to Target
dtypes: float64(9), int64(130), object(2)
memory usage: 10.3+ MB


In [120]:
train['edjefe']=train['edjefe'].apply(map)
train['edjefa']=train['edjefa'].apply(map)

In [121]:
train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 9557 entries, 0 to 9556
Columns: 141 entries, v2a1 to Target
dtypes: float64(11), int64(130)
memory usage: 10.3 MB


**Lets identify variable with 0 varinace**

In [122]:
var_df=pd.DataFrame(np.var(train,0),columns=['variance'])
var_df.sort_values(by='variance').head(15)
print('Below are columns with variance 0.')
col=list((var_df[var_df['variance']==0]).index)
print(col)

Below are columns with variance 0.
['elimbasu5']


In [127]:
train.drop('r4t3',axis=1,inplace=True)

**Check if there is a house without a family head.**
- "parentesco1" =1 if household head

In [128]:
train.parentesco1.value_counts()

0    6584
1    2973
Name: parentesco1, dtype: int64

In [129]:
pd.crosstab(train['edjefa'],train['edjefe'])

edjefe,0.0,1.0,2.0,3.0,4.0,5.0,6.0,7.0,8.0,9.0,...,12.0,13.0,14.0,15.0,16.0,17.0,18.0,19.0,20.0,21.0
edjefa,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
0.0,435,123,194,307,137,222,1845,234,257,486,...,113,103,208,285,134,202,19,14,7,43
1.0,69,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2.0,84,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3.0,152,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4.0,136,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
5.0,176,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
6.0,947,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
7.0,179,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
8.0,217,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
9.0,237,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


From above data there are 435 families with no family head

**Count how many null values are existing in columns.**

In [130]:
train.isna().sum().value_counts()

0       135
5         2
7928      1
6860      1
7342      1
dtype: int64

In [131]:
train['Target'].isna().sum()

0

There are no null values in target variable

**Identify and fillna of other variable.**

In [132]:
float_col=[]
for i in train.columns:
    a=train[i].dtype
    if a == 'float64':
        float_col.append(i)
print(float_col)

['v2a1', 'v18q1', 'rez_esc', 'dependency', 'edjefe', 'edjefa', 'meaneduc', 'overcrowding', 'SQBovercrowding', 'SQBdependency', 'SQBmeaned']


In [133]:
train[float_col].isna().sum()

v2a1               6860
v18q1              7342
rez_esc            7928
dependency            0
edjefe                0
edjefa                0
meaneduc              5
overcrowding          0
SQBovercrowding       0
SQBdependency         0
SQBmeaned             5
dtype: int64

In [134]:
train['v18q1'].value_counts()

1.0    1586
2.0     444
3.0     129
4.0      37
5.0      13
6.0       6
Name: v18q1, dtype: int64

In [135]:
pd.crosstab(train['tipovivi1'],train['v2a1'])

v2a1,0.0,12000.0,13000.0,14000.0,15000.0,16000.0,17000.0,20000.0,23000.0,25000.0,...,570540.0,600000.0,620000.0,684648.0,700000.0,770229.0,800000.0,855810.0,1000000.0,2353477.0
tipovivi1,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
0,29,3,4,3,3,2,4,22,5,21,...,25,11,3,3,7,3,4,11,7,2


- tipovivi3, =1 rented
- v18q, owns a tablet

In [136]:
pd.crosstab(train['v18q1'],train['v18q'])

v18q,1
v18q1,Unnamed: 1_level_1
1.0,1586
2.0,444
3.0,129
4.0,37
5.0,13
6.0,6


In [137]:
train['v2a1'].fillna(0,inplace=True)
train['v18q1'].fillna(0,inplace=True)

In [138]:
train.drop(['tipovivi3', 'v18q','rez_esc','elimbasu5'],axis=1,inplace=True)

In [139]:
train['meaneduc'].fillna(np.mean(train['meaneduc']),inplace=True)
train['SQBmeaned'].fillna(np.mean(train['SQBmeaned']),inplace=True)
print(train.isna().sum().value_counts())

0    136
dtype: int64


In [140]:
int_col=[]
for i in train.columns:
    a=train[i].dtype
    if a == 'int64':
        int_col.append(i)
print(int_col)

['hacdor', 'rooms', 'hacapo', 'v14a', 'refrig', 'r4h1', 'r4h2', 'r4h3', 'r4m1', 'r4m2', 'r4m3', 'r4t1', 'r4t2', 'tamhog', 'tamviv', 'escolari', 'hhsize', 'paredblolad', 'paredzocalo', 'paredpreb', 'pareddes', 'paredmad', 'paredzinc', 'paredfibras', 'paredother', 'pisomoscer', 'pisocemento', 'pisoother', 'pisonatur', 'pisonotiene', 'pisomadera', 'techozinc', 'techoentrepiso', 'techocane', 'techootro', 'cielorazo', 'abastaguadentro', 'abastaguafuera', 'abastaguano', 'public', 'planpri', 'noelec', 'coopele', 'sanitario1', 'sanitario2', 'sanitario3', 'sanitario5', 'sanitario6', 'energcocinar1', 'energcocinar2', 'energcocinar3', 'energcocinar4', 'elimbasu1', 'elimbasu2', 'elimbasu3', 'elimbasu4', 'elimbasu6', 'epared1', 'epared2', 'epared3', 'etecho1', 'etecho2', 'etecho3', 'eviv1', 'eviv2', 'eviv3', 'dis', 'male', 'female', 'estadocivil1', 'estadocivil2', 'estadocivil3', 'estadocivil4', 'estadocivil5', 'estadocivil6', 'estadocivil7', 'parentesco1', 'parentesco2', 'parentesco3', 'parentesco

In [141]:
train[int_col].isna().sum().value_counts()

0    126
dtype: int64

In [142]:
train.Target.value_counts()

4    5996
2    1597
3    1209
1     755
Name: Target, dtype: int64

**Set the poverty level of the members and the head of the house same in a family.**

In [143]:
Poverty_level=train[train['v2a1'] !=0]

In [144]:
Poverty_level.shape

(2668, 136)

In [145]:
poverty_level=Poverty_level.groupby('area1')['v2a1'].apply(np.median)

In [146]:
poverty_level

area1
0     80000.0
1    140000.0
Name: v2a1, dtype: float64

In [147]:
def povert(x):
    if x<8000:
        return('Below poverty level')
    
    elif x>140000:
        return('Above poverty level')
    elif x<140000:
        return('Below poverty level: Ur-ban ; Above poverty level : Rural ')

In [148]:
c=Poverty_level['v2a1'].apply(povert)

In [149]:
c.shape

(2668,)

In [150]:
pd.crosstab(c,Poverty_level['area1'])

area1,0,1
v2a1,Unnamed: 1_level_1,Unnamed: 2_level_1
Above poverty level,139,1103
Below poverty level: Ur-ban ; Above poverty level : Rural,306,1081


In [151]:
X_data=train.drop('Target',axis=1)
Y_data=train.Target

In [152]:
X_data_col=X_data.columns

**Predict the accuracy using random forest classifier.**

In [160]:
x_features= train.iloc[:,0:-1]
y_features= train.iloc[:,-1]
print(x_features.shape)
print(y_features.shape)

(9557, 135)
(9557,)


In [161]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score,confusion_matrix,f1_score,classification_report

x_train,x_test,y_train,y_test=train_test_split(x_features,y_features,test_size=0.2,random_state=1)
rmclassifier = RandomForestClassifier()

In [162]:
rmclassifier.fit(x_train,y_train)

RandomForestClassifier()

In [163]:
RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
                       max_depth=None, max_features='auto', max_leaf_nodes=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, n_estimators=10,
                       n_jobs=None, oob_score=False, random_state=None,
                       verbose=0, warm_start=False)

RandomForestClassifier(n_estimators=10)

In [164]:
y_predict = rmclassifier.predict(x_test)

In [165]:
print(accuracy_score(y_test,y_predict))
print(confusion_matrix(y_test,y_predict))
print(classification_report(y_test,y_predict))

0.9314853556485355
[[ 113    9    0   25]
 [   3  293    5   32]
 [   0    6  178   45]
 [   0    5    1 1197]]
              precision    recall  f1-score   support

           1       0.97      0.77      0.86       147
           2       0.94      0.88      0.91       333
           3       0.97      0.78      0.86       229
           4       0.92      1.00      0.96      1203

    accuracy                           0.93      1912
   macro avg       0.95      0.86      0.90      1912
weighted avg       0.93      0.93      0.93      1912

