In [1]:
%matplotlib inline

from pathlib import Path

import numpy as np
import pandas as pd

import matplotlib.pyplot as plt
import seaborn as sns



# Load data

In [2]:
# Read the feature table and the label table; both use building_id as index.
train_values, train_labels = (
    pd.read_csv(csv_path, index_col='building_id')
    for csv_path in ('../data/train_values.csv', '../data/train_labels.csv')
)


# Explore features

In [3]:
# Names of the categorical columns, i.e. those pandas stores with object dtype.
categorical_columns = train_values.select_dtypes(include=['object']).columns.tolist()
for column_name in categorical_columns:
    print(column_name)

land_surface_condition
foundation_type
roof_type
ground_floor_type
other_floor_type
position
plan_configuration
legal_ownership_status


In [4]:
# Discard the building_id index of both the features (X) and the labels (y),
# replacing it in place with a default 0..n-1 range index.
for frame in (train_values, train_labels):
    frame.reset_index(drop=True, inplace=True)

# Splitting the data

In [5]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the rows for validation; the fixed random_state keeps the
# split identical from one run to the next.
X_train, X_valid, y_train, y_valid = train_test_split(
    train_values,
    train_labels,
    test_size=0.2,
    random_state=0,
)

## Chi-square test of independence (categorical features vs. damage grade)

In [7]:
from scipy.stats import chi2_contingency

In [12]:
# The split keeps the original row labels of the sampled rows; renumber the
# training features and labels from 0 in place, then report their shapes.
for split_part in (X_train, y_train):
    split_part.reset_index(drop=True, inplace=True)
    print(split_part.shape)

(208480, 38)
(208480, 1)


In [8]:
# Chi-square test of independence between every categorical feature and the
# target: cross-tabulate the two, run the test, keep statistic and p-value.
# squeeze() turns the one-column label frame into a Series.
damage_grade = y_train.squeeze()

# chi2_contingency returns (statistic, p-value, dof, expected); only the first
# two entries are recorded for each feature.
chi2_reults = [
    (feat, *chi2_contingency(pd.crosstab(X_train[feat], damage_grade))[:2])
    for feat in categorical_columns
]

chi2_square_results_df = pd.DataFrame(chi2_reults, columns=["Feature", "Chi-square", "P-value"])

In [9]:
# Show the per-feature chi-square statistic and p-value table.
# A p-value displayed as 0.0 has underflowed float64; it is not exactly zero.
chi2_square_results_df

Unnamed: 0,Feature,Chi-square,P-value
0,land_surface_condition,358.589261,2.4508350000000002e-76
1,foundation_type,38989.276303,0.0
2,roof_type,24275.220527,0.0
3,ground_floor_type,29176.796855,0.0
4,other_floor_type,25379.130262,0.0
5,position,851.454353,1.170333e-180
6,plan_configuration,1460.078797,1.7945110000000001e-299
7,legal_ownership_status,2051.755414,0.0


In [10]:
## Target Encoding

In [11]:
# Put the training features and their labels side by side (index-aligned left
# join) so that per-group statistics of the target can be computed on one frame.
# NOTE(review): the saved output below shows non-reset row labels, so this cell
# was last executed before X_train/y_train were re-indexed — re-run top to bottom.
full_data = X_train.join(y_train, how="left")
full_data.head()

Unnamed: 0,geo_level_1_id,geo_level_2_id,geo_level_3_id,count_floors_pre_eq,age,area_percentage,height_percentage,land_surface_condition,foundation_type,roof_type,...,has_secondary_use_hotel,has_secondary_use_rental,has_secondary_use_institution,has_secondary_use_school,has_secondary_use_industry,has_secondary_use_health_post,has_secondary_use_gov_office,has_secondary_use_use_police,has_secondary_use_other,damage_grade
5654,20,281,7097,2,15,6,7,t,r,q,...,0,0,0,0,0,0,0,0,0,1
28094,26,886,12157,2,10,6,6,t,w,n,...,0,0,0,0,0,0,0,0,0,1
151910,26,36,1125,1,0,5,3,n,r,n,...,0,0,0,0,0,0,0,0,0,1
53449,13,1372,4555,2,10,6,5,t,r,n,...,0,0,0,0,0,0,0,0,0,2
202567,8,206,6064,3,35,6,5,t,r,q,...,0,0,0,0,0,0,0,0,0,3


In [12]:
# Mean-target encoding of geo_level_3_id: every row receives the average
# damage_grade of all rows that share its geo_level_3_id.
# NOTE(review): each row's own label is part of its group mean, so this feature
# leaks the target on the training rows — consider out-of-fold or smoothed means.
geo3_mean_damage = full_data.groupby("geo_level_3_id")["damage_grade"].mean()
full_data["geo3_encoded"] = full_data["geo_level_3_id"].map(geo3_mean_damage)