In [1]:
%matplotlib inline

import numpy as np
import pandas as pd

from pathlib import Path
# Route DataFrame/Series .plot() calls through plotly instead of the default backend.
pd.options.plotting.backend = "plotly"
# None removes the column cap, so displayed DataFrames show every column.
pd.set_option('display.max_columns', None)


import matplotlib.pyplot as plt
import seaborn as sns

## Load the data

In [2]:
# Read the three CSVs the same way: each one is indexed by its building_id column.
# Order matters only for the unpacking: train features, train labels, test features.
train_X, train_y, X_test = (
    pd.read_csv(csv_path, index_col='building_id')
    for csv_path in (
        '../data/train_values.csv',
        '../data/train_labels.csv',
        '../data/test_values.csv',
    )
)

## Explore Features

In [3]:
# Geographic ids are labels, not quantities, so they are stored as strings.
geo_columns = ['geo_level_1_id', 'geo_level_2_id', 'geo_level_3_id']
# Measurements that are cast to float.
float_columns = ['count_floors_pre_eq', 'age', 'area_percentage', 'height_percentage']

# Apply exactly the same conversions to the training and the test features so
# both frames end up with the same columns and dtypes (previously the float
# cast was applied to train_X only).
for features in (train_X, X_test):
    features[geo_columns] = features[geo_columns].astype(str)
    features[float_columns] = features[float_columns].astype(float)
    # Derived feature: product of the area and height percentages.
    features['volume_percentage'] = features['area_percentage'] * features['height_percentage']

# Categorical columns: everything stored with object dtype.
categorical_columns = list(train_X.select_dtypes(include=['object']).columns)
# Every remaining column, kept in DataFrame column order. A set difference
# would return them in an arbitrary order that can change between kernel runs.
numerical_columns = [c for c in train_X.columns if c not in categorical_columns]

In [4]:
# Remove rows whose building_id repeats *before* discarding building_id.
# Once the index has been reset it is a fresh RangeIndex, which never contains
# duplicates, so a duplicate check made afterwards cannot find anything.
repeated_building_id = train_X.index.duplicated(keep='first')

# Drop building_id (index) from X and y.
# The mask is applied by position to train_y as well; this assumes train_X and
# train_y list the buildings in the same row order -- TODO confirm.
train_X = train_X[~repeated_building_id].reset_index(drop=True)
train_y = train_y[~repeated_building_id].reset_index(drop=True)

In [5]:
# Keep only the first row for each distinct index label, in features and labels alike.
# NOTE(review): this filters rows only if the index still carries repeated labels
# at this point; a RangeIndex produced by reset_index has none -- confirm the
# intended order of the reset and this check.
duplicate_index_mask = train_X.index.duplicated(keep='first')
keep_first = ~duplicate_index_mask
X = train_X.loc[keep_first]
y = train_y.loc[keep_first]

## Split the data

In [6]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the rows for validation; the fixed random_state makes the
# split reproducible across runs.
X_train, X_valid, y_train, y_valid = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=0,
)

In [7]:
from scipy.stats import chi2_contingency


In [8]:
# Chi-square test of independence between each categorical feature and the
# training label, computed on the training split only.
target = y_train.squeeze()  # single-column label frame -> Series

chi2_reults = []
for feat in categorical_columns:
    # Contingency table: feature category (rows) x label value (columns).
    observed = pd.crosstab(X_train[feat], target)
    # Only the statistic and the p-value are kept from the test result.
    statistic, p_value = chi2_contingency(observed)[:2]
    chi2_reults.append((feat, statistic, p_value))

chi2_square_results_df = pd.DataFrame(
    chi2_reults,
    columns=["Feature", "Chi-square", "P-value"],
)

In [9]:
# Display the per-feature table: one row per categorical feature with its
# chi-square statistic and p-value.
chi2_square_results_df

Unnamed: 0,Feature,Chi-square,P-value
0,geo_level_1_id,58148.229423,0.0
1,geo_level_2_id,107939.61177,0.0
2,geo_level_3_id,149625.719464,0.0
3,land_surface_condition,358.589261,2.4508350000000002e-76
4,foundation_type,38989.276303,0.0
5,roof_type,24275.220527,0.0
6,ground_floor_type,29176.796855,0.0
7,other_floor_type,25379.130262,0.0
8,position,851.454353,1.170333e-180
9,plan_configuration,1460.078797,1.7945110000000001e-299


In [10]:
import plotly.graph_objects as go
from plotly.subplots import make_subplots

# Plot the chi-square results computed above (chi2_square_results_df is reused
# as-is rather than rebuilt).

# Every p-value in the table is either exactly 0.0 (floating-point underflow)
# or far below 1e-70, so on a linear axis all markers collapse onto zero.
# Use a log axis instead, and floor the values at the smallest positive normal
# float because log(0) cannot be drawn. Markers sitting on the floor therefore
# mean "p underflowed to 0", not a literal p of ~2e-308.
p_value_floor = np.finfo(float).tiny
p_values_for_plot = chi2_square_results_df["P-value"].clip(lower=p_value_floor)

# Create subplots: one for Chi-square values and one for P-values
fig = make_subplots(rows=2, cols=1, subplot_titles=("Chi-square values", "P-values"))

# Bar plot for Chi-square values
fig.add_trace(
    go.Bar(x=chi2_square_results_df["Feature"], y=chi2_square_results_df["Chi-square"], name="Chi-square"),
    row=1, col=1
)

# Scatter plot for (floored) P-values
fig.add_trace(
    go.Scatter(x=chi2_square_results_df["Feature"], y=p_values_for_plot, mode="markers", name="P-value"),
    row=2, col=1
)

# Update layout
fig.update_layout(height=600, width=800, title_text="Chi-square Analysis Results")
fig.update_yaxes(title_text="Chi-square Value", row=1, col=1)
fig.update_yaxes(title_text="P-value (log scale)", type="log", row=2, col=1)
fig.update_xaxes(title_text="Features", row=2, col=1)

# Show plot
fig.show()
