In [1]:
import pandas as pd
import numpy as np

In [15]:
# Read the building ownership/use and building structure CSV files into DataFrames
building_use_df = pd.read_csv(
    'data/extracted/eq_data/csv_building_ownership_and_use.csv'
)
building_structure_df = pd.read_csv(
    'data/extracted/eq_data/csv_building_structure.csv'
)

## Building Ownership and Use data

In [16]:
# Preview the first five rows of the ownership/use table
building_use_df.head(n=5)

Unnamed: 0,building_id,district_id,vdcmun_id,ward_id,legal_ownership_status,count_families,has_secondary_use,has_secondary_use_agriculture,has_secondary_use_hotel,has_secondary_use_rental,has_secondary_use_institution,has_secondary_use_school,has_secondary_use_industry,has_secondary_use_health_post,has_secondary_use_gov_office,has_secondary_use_use_police,has_secondary_use_other
0,120101000011,12,1207,120703,Private,1.0,0.0,0,0,0,0,0,0,0,0,0,0
1,120101000021,12,1207,120703,Private,1.0,0.0,0,0,0,0,0,0,0,0,0,0
2,120101000031,12,1207,120703,Private,1.0,0.0,0,0,0,0,0,0,0,0,0,0
3,120101000041,12,1207,120703,Private,1.0,0.0,0,0,0,0,0,0,0,0,0,0
4,120101000051,12,1207,120703,Private,1.0,0.0,0,0,0,0,0,0,0,0,0,0


In [17]:
# List the column labels of the ownership/use table
building_use_df.columns

Index(['building_id', 'district_id', 'vdcmun_id', 'ward_id',
       'legal_ownership_status', 'count_families', 'has_secondary_use',
       'has_secondary_use_agriculture', 'has_secondary_use_hotel',
       'has_secondary_use_rental', 'has_secondary_use_institution',
       'has_secondary_use_school', 'has_secondary_use_industry',
       'has_secondary_use_health_post', 'has_secondary_use_gov_office',
       'has_secondary_use_use_police', 'has_secondary_use_other'],
      dtype='object')

In [18]:
# Print per-column non-null counts and dtypes for the ownership/use table
building_use_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 762106 entries, 0 to 762105
Data columns (total 17 columns):
building_id                      762106 non-null int64
district_id                      762106 non-null int64
vdcmun_id                        762106 non-null int64
ward_id                          762106 non-null int64
legal_ownership_status           762106 non-null object
count_families                   762104 non-null float64
has_secondary_use                762096 non-null float64
has_secondary_use_agriculture    762106 non-null int64
has_secondary_use_hotel          762106 non-null int64
has_secondary_use_rental         762106 non-null int64
has_secondary_use_institution    762106 non-null int64
has_secondary_use_school         762106 non-null int64
has_secondary_use_industry       762106 non-null int64
has_secondary_use_health_post    762106 non-null int64
has_secondary_use_gov_office     762106 non-null int64
has_secondary_use_use_police     762106 non-null int64
has_

All the features are of numeric dtype except for `legal_ownership_status`. We need to look into this feature and possibly convert it to a numeric dtype.

In [23]:
# Share of each ownership status as a percentage, rounded to 2 decimals
(
    building_use_df['legal_ownership_status']
    .value_counts(normalize=True)
    .mul(100)
    .round(2)
)

Private          95.97
Public            2.52
Institutional     1.03
Other             0.48
Name: legal_ownership_status, dtype: float64

About 96% of the buildings in the data are privately owned, about 2.5% are publicly owned, about 1% belong to institutions, and the remaining 0.5% fall under other ownership. To convert the column to a numeric dtype, we assign the labels the codes 1 to 4 in order of frequency: Private → 1, Public → 2, Institutional → 3, Other → 4.

In [27]:
# Encode the text ownership labels as integer codes 1-4.
# A single lookup table replaces the np.select condition list, so there is
# no commented-out branch to keep in sync with the choices.
ownership_codes = {'Private': 1, 'Public': 2, 'Institutional': 3, 'Other': 4}

ownership_status = building_use_df['legal_ownership_status']

# Guard against re-running this cell: once the column is numeric, none of the
# text labels match any more and every row would be rewritten to the
# fallback code 4.
if not pd.api.types.is_numeric_dtype(ownership_status):
    building_use_df['legal_ownership_status'] = (
        ownership_status
        .map(ownership_codes)
        # Labels missing from the table (or NaN) fall back to 4, as before
        .fillna(ownership_codes['Other'])
        .astype(int)
    )

In [29]:
# Percentage of rows per ownership status value, rounded to 2 decimals
(
    building_use_df['legal_ownership_status']
    .value_counts(normalize=True)
    .mul(100)
    .round(2)
)

1    95.97
2     2.52
3     1.03
4     0.48
Name: legal_ownership_status, dtype: float64

In [33]:
# Number of missing values in each column
building_use_df.isna().sum(axis=0)

building_id                       0
district_id                       0
vdcmun_id                         0
ward_id                           0
legal_ownership_status            0
count_families                    2
has_secondary_use                10
has_secondary_use_agriculture     0
has_secondary_use_hotel           0
has_secondary_use_rental          0
has_secondary_use_institution     0
has_secondary_use_school          0
has_secondary_use_industry        0
has_secondary_use_health_post     0
has_secondary_use_gov_office      0
has_secondary_use_use_police      0
has_secondary_use_other           0
dtype: int64

There are two features with missing values: `count_families` is missing 2 values and `has_secondary_use` is missing 10. According to the table information on the data source [site](https://eq2015.npc.gov.np/#/download), `count_families` records the number of families in a building, and `has_secondary_use` is a flag variable that indicates whether the building is used for any secondary purpose.

In [37]:
# Show the rows where `count_families` is missing
building_use_df.loc[building_use_df['count_families'].isna()]

Unnamed: 0,building_id,district_id,vdcmun_id,ward_id,legal_ownership_status,count_families,has_secondary_use,has_secondary_use_agriculture,has_secondary_use_hotel,has_secondary_use_rental,has_secondary_use_institution,has_secondary_use_school,has_secondary_use_industry,has_secondary_use_health_post,has_secondary_use_gov_office,has_secondary_use_use_police,has_secondary_use_other
83715,203201090341,20,2009,200910,1,,0.0,0,0,0,0,0,0,0,0,0,0
83766,203202000521,20,2009,200910,1,,,0,0,0,0,0,0,0,0,0,0


In [48]:
# Frequency of each `count_families` value
building_use_df.loc[:, 'count_families'].value_counts()

1.0     643418
0.0      71578
2.0      39753
3.0       5685
4.0       1215
5.0        302
6.0        104
7.0         27
8.0         15
9.0          8
11.0         1
Name: count_families, dtype: int64

In [46]:
# Show the rows where `has_secondary_use` is missing
building_use_df.loc[building_use_df['has_secondary_use'].isna()]

Unnamed: 0,building_id,district_id,vdcmun_id,ward_id,legal_ownership_status,count_families,has_secondary_use,has_secondary_use_agriculture,has_secondary_use_hotel,has_secondary_use_rental,has_secondary_use_institution,has_secondary_use_school,has_secondary_use_industry,has_secondary_use_health_post,has_secondary_use_gov_office,has_secondary_use_use_police,has_secondary_use_other
83766,203202000521,20,2009,200910,1,0.0,,0,0,0,0,0,0,0,0,0,0
131558,212402000211,21,2107,210703,1,1.0,,0,0,0,0,0,0,0,0,0,0
131579,212402000221,21,2107,210703,1,2.0,,0,0,0,0,0,0,0,0,0,0
131627,212402000071,21,2107,210703,1,1.0,,0,0,0,0,0,0,0,0,0,0
131629,212402000091,21,2107,210703,1,1.0,,0,0,0,0,0,0,0,0,0,0
131640,212402000201,21,2107,210703,1,1.0,,0,0,0,0,0,0,0,0,0,0
131652,212402000341,21,2107,210703,1,1.0,,0,0,0,0,0,0,0,0,0,0
131654,212402000361,21,2107,210703,1,2.0,,0,0,0,0,0,0,0,0,0,0
131655,212402000371,21,2107,210703,1,1.0,,0,0,0,0,0,0,0,0,0,0
131929,212404000861,21,2107,210703,1,1.0,,0,0,0,0,0,0,0,0,0,0


In [47]:
# Frequency of each `has_secondary_use` value
building_use_df.loc[:, 'has_secondary_use'].value_counts()

0.0    669732
1.0     92364
Name: has_secondary_use, dtype: int64

In [49]:
# Fill the few missing values in both columns with 0.
# The filled frame is assigned back instead of calling
# `df[col].fillna(..., inplace=True)`: that form operates on an intermediate
# Series and, under pandas Copy-on-Write, leaves the DataFrame unchanged.
# NOTE(review): 0 treats the two unrecorded `count_families` rows as having no
# families, although 1 is by far the most common value — confirm this choice.
building_use_df = building_use_df.fillna(
    value={'count_families': 0, 'has_secondary_use': 0}
)

In [53]:
# Per-column flag: does any missing value remain?
building_use_df.isna().any(axis=0)

building_id                      False
district_id                      False
vdcmun_id                        False
ward_id                          False
legal_ownership_status           False
count_families                   False
has_secondary_use                False
has_secondary_use_agriculture    False
has_secondary_use_hotel          False
has_secondary_use_rental         False
has_secondary_use_institution    False
has_secondary_use_school         False
has_secondary_use_industry       False
has_secondary_use_health_post    False
has_secondary_use_gov_office     False
has_secondary_use_use_police     False
has_secondary_use_other          False
dtype: bool

## Building Structure data

In [54]:
# Preview the first five rows of the building structure table
building_structure_df.head(n=5)

Unnamed: 0,building_id,district_id,vdcmun_id,ward_id,count_floors_pre_eq,count_floors_post_eq,age_building,plinth_area_sq_ft,height_ft_pre_eq,height_ft_post_eq,...,has_superstructure_mud_mortar_brick,has_superstructure_cement_mortar_brick,has_superstructure_timber,has_superstructure_bamboo,has_superstructure_rc_non_engineered,has_superstructure_rc_engineered,has_superstructure_other,condition_post_eq,damage_grade,technical_solution_proposed
0,120101000011,12,1207,120703,1,1,9,288,9,9,...,0,0,0,1,0,0,0,Damaged-Used in risk,Grade 3,Major repair
1,120101000021,12,1207,120703,1,1,15,364,9,9,...,0,0,0,1,0,0,0,Damaged-Repaired and used,Grade 5,Reconstruction
2,120101000031,12,1207,120703,1,1,20,384,9,9,...,0,0,0,0,0,0,0,Damaged-Repaired and used,Grade 2,Minor repair
3,120101000041,12,1207,120703,1,1,20,312,9,9,...,0,0,0,0,0,0,0,Damaged-Repaired and used,Grade 2,Minor repair
4,120101000051,12,1207,120703,1,1,30,308,9,9,...,0,0,0,0,0,0,0,Damaged-Repaired and used,Grade 1,Minor repair


In [55]:
# List the column labels of the building structure table
building_structure_df.columns

Index(['building_id', 'district_id', 'vdcmun_id', 'ward_id',
       'count_floors_pre_eq', 'count_floors_post_eq', 'age_building',
       'plinth_area_sq_ft', 'height_ft_pre_eq', 'height_ft_post_eq',
       'land_surface_condition', 'foundation_type', 'roof_type',
       'ground_floor_type', 'other_floor_type', 'position',
       'plan_configuration', 'has_superstructure_adobe_mud',
       'has_superstructure_mud_mortar_stone', 'has_superstructure_stone_flag',
       'has_superstructure_cement_mortar_stone',
       'has_superstructure_mud_mortar_brick',
       'has_superstructure_cement_mortar_brick', 'has_superstructure_timber',
       'has_superstructure_bamboo', 'has_superstructure_rc_non_engineered',
       'has_superstructure_rc_engineered', 'has_superstructure_other',
       'condition_post_eq', 'damage_grade', 'technical_solution_proposed'],
      dtype='object')

In [56]:
# Print per-column non-null counts and dtypes for the building structure table
building_structure_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 762106 entries, 0 to 762105
Data columns (total 31 columns):
building_id                               762106 non-null int64
district_id                               762106 non-null int64
vdcmun_id                                 762106 non-null int64
ward_id                                   762106 non-null int64
count_floors_pre_eq                       762106 non-null int64
count_floors_post_eq                      762106 non-null int64
age_building                              762106 non-null int64
plinth_area_sq_ft                         762106 non-null int64
height_ft_pre_eq                          762106 non-null int64
height_ft_post_eq                         762106 non-null int64
land_surface_condition                    762106 non-null object
foundation_type                           762106 non-null object
roof_type                                 762106 non-null object
ground_floor_type                         762106 non-n

In [57]:
# Frequency of each `land_surface_condition` category
building_structure_df.loc[:, 'land_surface_condition'].value_counts()

Flat              631675
Moderate slope    105640
Steep slope        24791
Name: land_surface_condition, dtype: int64