In [1]:
import pandas as pd

In [2]:
train_df = pd.read_csv(r"C:\Users\bindu\Desktop\nepal_earthquake\train_values.csv")
train_df.head()

Unnamed: 0,building_id,geo_level_1_id,geo_level_2_id,geo_level_3_id,count_floors_pre_eq,age,area_percentage,height_percentage,land_surface_condition,foundation_type,...,has_secondary_use_agriculture,has_secondary_use_hotel,has_secondary_use_rental,has_secondary_use_institution,has_secondary_use_school,has_secondary_use_industry,has_secondary_use_health_post,has_secondary_use_gov_office,has_secondary_use_use_police,has_secondary_use_other
0,802906,6,487,12198,2,30,6,5,t,r,...,0,0,0,0,0,0,0,0,0,0
1,28830,8,900,2812,2,10,8,7,o,r,...,0,0,0,0,0,0,0,0,0,0
2,94947,21,363,8973,2,10,5,5,t,r,...,0,0,0,0,0,0,0,0,0,0
3,590882,22,418,10694,2,10,6,5,t,r,...,0,0,0,0,0,0,0,0,0,0
4,201944,11,131,1488,3,30,8,9,t,r,...,0,0,0,0,0,0,0,0,0,0


In [3]:
labels = pd.read_csv(r"C:\Users\bindu\Desktop\nepal_earthquake\train_labels.csv")
labels = labels-1
labels.head()

Unnamed: 0,building_id,damage_grade
0,802905,2
1,28829,1
2,94946,2
3,590881,1
4,201943,2


In [4]:
test_df = pd.read_csv(r"C:\Users\bindu\Desktop\nepal_earthquake\test_values.csv")
test_df.head()

Unnamed: 0,building_id,geo_level_1_id,geo_level_2_id,geo_level_3_id,count_floors_pre_eq,age,area_percentage,height_percentage,land_surface_condition,foundation_type,...,has_secondary_use_agriculture,has_secondary_use_hotel,has_secondary_use_rental,has_secondary_use_institution,has_secondary_use_school,has_secondary_use_industry,has_secondary_use_health_post,has_secondary_use_gov_office,has_secondary_use_use_police,has_secondary_use_other
0,300051,17,596,11307,3,20,7,6,t,r,...,0,0,0,0,0,0,0,0,0,0
1,99355,6,141,11987,2,25,13,5,t,r,...,1,0,0,0,0,0,0,0,0,0
2,890251,22,19,10044,2,5,4,5,t,r,...,0,0,0,0,0,0,0,0,0,0
3,745817,26,39,633,1,0,19,3,t,r,...,0,0,1,0,0,0,0,0,0,0
4,421793,17,289,7970,3,15,8,7,t,r,...,0,0,0,0,0,0,0,0,0,0


In [5]:
train = train_df.merge(labels,left_on= 'building_id',right_on='building_id')

In [6]:
categorical_columns=['land_surface_condition','foundation_type','roof_type',
                                        'legal_ownership_status','ground_floor_type','other_floor_type','position','plan_configuration'] #'legal_ownership_status',

In [7]:
for col in categorical_columns:
    train[f'{col}_encoded'] = train[col].astype('category').cat.codes

for col in categorical_columns:
    test_df[f'{col}_encoded'] = test_df[col].astype('category').cat.codes

In [8]:
train.drop(columns=categorical_columns,inplace=True)
test_df.drop(columns=categorical_columns,inplace=True)

In [9]:
import torch
import torch.nn as nn
import torch.optim as optim

In [10]:
# Dummy input (batch_size=5, geo_level_1_id, geo_level_2_id, geo_level_3_id)
input_geo_data = torch.tensor(train[['geo_level_1_id','geo_level_2_id','geo_level_3_id']].to_numpy(), dtype=torch.long)

# Sample labels for damage grades (output classes, e.g., damage grade 1, 2, 3)
labels = torch.tensor(train['damage_grade'].to_numpy(), dtype=torch.long)

train.drop(columns='damage_grade',inplace=True)


In [35]:

class_weights = torch.tensor([3.45, 0.59, 0.99])  
#criterion = nn.CrossEntropyLoss()
#{1: 3.441821852417879, 2: 0.5856029379662802, 3: 0.9981887924005888}

In [24]:
class GeospatialEmbeddingModel(nn.Module):
    def __init__(self, geo_lv1_size, geo_lv2_size, geo_lv3_size, latent_dim):
        super(GeospatialEmbeddingModel, self).__init__()
        # Embedding layers
        self.geo_level1_embedding = nn.Embedding(geo_lv1_size, 16)
        self.geo_level2_embedding = nn.Embedding(geo_lv2_size, 128)
        self.geo_level3_embedding = nn.Embedding(geo_lv3_size, 128)

        # Initialize embeddings
        nn.init.uniform_(self.geo_level1_embedding.weight, -0.1, 0.1)
        nn.init.uniform_(self.geo_level2_embedding.weight, -0.1, 0.1)
        nn.init.uniform_(self.geo_level3_embedding.weight, -0.1, 0.1)

        # Compressor layer
        self.compressor = nn.Linear(16 + 128 + 128, latent_dim)
        
        # Batch Normalization
        self.batch_norm = nn.BatchNorm1d(latent_dim)
        
        # Output layer
        self.output = nn.Linear(latent_dim, 3)

    def forward(self, x):
        geo1_embedded = self.geo_level1_embedding(x[:, 0])
        geo2_embedded = self.geo_level2_embedding(x[:, 1])
        geo3_embedded = self.geo_level3_embedding(x[:, 2])

        concatenated = torch.cat([geo1_embedded, geo2_embedded, geo3_embedded], dim=1)
        compressed = torch.relu(self.compressor(concatenated))
        compressed = self.batch_norm(compressed)
        output = self.output(compressed)
        return output

# Example data sizes
geo_level_1_size, geo_level_2_size, geo_level_3_size = 31, 1428, 12568
latent_dim = 16

# Instantiate model
embds_model = GeospatialEmbeddingModel(geo_level_1_size, geo_level_2_size, geo_level_3_size, latent_dim)

# Loss and optimizer
criterion = nn.CrossEntropyLoss(weight = torch.tensor([3.45, 0.59, 0.99]))  # Optionally, apply class weights
optimizer = optim.Adam(embds_model.parameters(), lr=0.0001)

# Training loop
num_epochs = 100  # Train for enough epochs
for epoch in range(num_epochs):
    #for inputs, labels in data_loader:  # Assume data_loader is defined
    optimizer.zero_grad()  # Zero the gradients
    outputs = embds_model(input_geo_data)  # Forward pass
    loss = criterion(outputs, labels)  # Compute loss
    loss.backward()  # Backward pass
    optimizer.step()  # Update weights
    
    print(f'Epoch [{epoch+1}/{num_epochs}], Loss: {loss.item():.4f}')


Epoch [1/100], Loss: 1.2209
Epoch [2/100], Loss: 1.2148
Epoch [3/100], Loss: 1.2088
Epoch [4/100], Loss: 1.2030
Epoch [5/100], Loss: 1.1973
Epoch [6/100], Loss: 1.1917
Epoch [7/100], Loss: 1.1863
Epoch [8/100], Loss: 1.1810
Epoch [9/100], Loss: 1.1758
Epoch [10/100], Loss: 1.1708
Epoch [11/100], Loss: 1.1658
Epoch [12/100], Loss: 1.1610
Epoch [13/100], Loss: 1.1563
Epoch [14/100], Loss: 1.1517
Epoch [15/100], Loss: 1.1473
Epoch [16/100], Loss: 1.1429
Epoch [17/100], Loss: 1.1387
Epoch [18/100], Loss: 1.1345
Epoch [19/100], Loss: 1.1305
Epoch [20/100], Loss: 1.1266
Epoch [21/100], Loss: 1.1228
Epoch [22/100], Loss: 1.1191
Epoch [23/100], Loss: 1.1155
Epoch [24/100], Loss: 1.1120
Epoch [25/100], Loss: 1.1086
Epoch [26/100], Loss: 1.1052
Epoch [27/100], Loss: 1.1020
Epoch [28/100], Loss: 1.0988
Epoch [29/100], Loss: 1.0958
Epoch [30/100], Loss: 1.0928
Epoch [31/100], Loss: 1.0898
Epoch [32/100], Loss: 1.0870
Epoch [33/100], Loss: 1.0842
Epoch [34/100], Loss: 1.0815
Epoch [35/100], Loss: 1

In [25]:
import numpy as np
train_2 = train.iloc[:,4:].to_numpy()
final_1 = np.hstack((outputs.detach().numpy(), train_2))

In [26]:
final_1.shape

(64854, 38)

In [27]:
# Define a simple feed-forward neural network
class ClassificationModel(nn.Module):
    def __init__(self,input_dim):
        super(ClassificationModel, self).__init__()
        self.linear_relu_norm = nn.Sequential(
            nn.Linear(input_dim, 32),  # First hidden layer
            nn.ReLU(),
            nn.BatchNorm1d(32),        # Normalize the layer
            nn.Dropout(0.3),           # Dropout layer to prevent overfitting
            nn.Linear(32, 16),         # Second hidden layer
            nn.ReLU(),
            nn.Linear(16, 3)  # Output layer
        )
        
    
    def forward(self, x):
        logits = self.linear_relu_norm(x)
        return logits

# Example: Input size is 10, hidden layer size is 64, output i
# s 3 classes
input_dim = final_1.shape[1]

classfication_model = ClassificationModel(input_dim)


In [28]:
# Loss and optimizer
criterion = nn.CrossEntropyLoss()  # Optionally, apply class weights
optimizer = optim.Adam(classfication_model.parameters(), lr=0.0001)

# Training loop
num_epochs = 100  # Train for enough epochs
for epoch in range(num_epochs):
    #for inputs, labels in data_loader:  # Assume data_loader is defined
    optimizer.zero_grad()  # Zero the gradients
    logits = classfication_model(torch.tensor(final_1,dtype=torch.float32))  # Forward pass
    loss = criterion(logits, labels)  # Compute loss
    loss.backward()  # Backward pass
    optimizer.step()  # Update weights
    
    print(f'Epoch [{epoch+1}/{num_epochs}], Loss: {loss.item():.4f}')


Epoch [1/100], Loss: 1.1752
Epoch [2/100], Loss: 1.1761
Epoch [3/100], Loss: 1.1748
Epoch [4/100], Loss: 1.1745
Epoch [5/100], Loss: 1.1747
Epoch [6/100], Loss: 1.1739
Epoch [7/100], Loss: 1.1727
Epoch [8/100], Loss: 1.1726
Epoch [9/100], Loss: 1.1718
Epoch [10/100], Loss: 1.1718
Epoch [11/100], Loss: 1.1709
Epoch [12/100], Loss: 1.1705
Epoch [13/100], Loss: 1.1691
Epoch [14/100], Loss: 1.1691
Epoch [15/100], Loss: 1.1684
Epoch [16/100], Loss: 1.1686
Epoch [17/100], Loss: 1.1678
Epoch [18/100], Loss: 1.1666
Epoch [19/100], Loss: 1.1669
Epoch [20/100], Loss: 1.1661
Epoch [21/100], Loss: 1.1653
Epoch [22/100], Loss: 1.1648
Epoch [23/100], Loss: 1.1643
Epoch [24/100], Loss: 1.1643
Epoch [25/100], Loss: 1.1634
Epoch [26/100], Loss: 1.1628
Epoch [27/100], Loss: 1.1633
Epoch [28/100], Loss: 1.1627
Epoch [29/100], Loss: 1.1618
Epoch [30/100], Loss: 1.1614
Epoch [31/100], Loss: 1.1607
Epoch [32/100], Loss: 1.1610
Epoch [33/100], Loss: 1.1600
Epoch [34/100], Loss: 1.1590
Epoch [35/100], Loss: 1

In [29]:
input_geo_data2 = torch.tensor(test_df[['geo_level_1_id','geo_level_2_id','geo_level_3_id']].to_numpy(), dtype=torch.long)
outputs2 = embds_model(input_geo_data2)

In [30]:
test_2 = test_df.iloc[:,4:].to_numpy()
final_2 = np.hstack((outputs2.detach().numpy(), test_2))

In [31]:
final_2.shape

(86868, 38)

In [32]:
test_logits = classfication_model(torch.tensor(final_2,dtype=torch.float32))

In [33]:
pred_probab = nn.Softmax(dim=1)(test_logits)
y_pred = pred_probab.argmax(1)

In [37]:
from collections import Counter
Counter(y_pred.numpy())

Counter({2: 65468, 0: 20447, 1: 953})

In [None]:
sub2 = pd.DataFrame({'building_id' : test_df['building_id'],'damage_grade' : y_pred+1})
sub2.head()

In [None]:
from collections import Counter
Counter(y_pred.numpy())

In [23]:
sub2.to_csv("geo_embdngs_nn_1.csv",index=False)

In [53]:
import numpy as np
preds = np.argmax(outputs2.detach().numpy(),axis=1)

In [None]:
preds+1

In [None]:
cnt = 0
for i in range(len(labels)):
    if preds[i] + 1 != labels[i]:
        cnt += 1
cnt

In [None]:
len(labels)

In [11]:
torch.save(outputs, 'output_tensor.pt')

In [None]:
loaded_tensor = torch.load('output_tensor.pt')
loaded_tensor.shape

In [None]:
geo_embds = loaded_tensor.detach().numpy()
geo_embds.shape

In [127]:
corr_mtrx = train.corr()

In [None]:
import matplotlib.pyplot as plt
import seaborn as sns
plt.figure(figsize=[20,20])
sns.heatmap(corr_mtrx,annot=True,cmap='Reds')
plt.title("Correlation between Variables")
plt.show()

In [None]:
test_2 = test_df.iloc[:,4:].to_numpy()
geo_embds_2 = outputs2.detach().numpy()
final_2 = np.hstack((geo_embds_2, test_2))
final_2.shape

In [69]:
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn.metrics import f1_score

y = y
X = final_1

# Split data
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=0)


In [None]:
from sklearn.utils.class_weight import compute_class_weight
import numpy as np
class_weights = compute_class_weight(class_weight='balanced', 
                                     classes=np.unique(y_train), 
                                     y=y_train)
class_weight_dict = {}
for i in range(1,len(class_weights)+1):
    class_weight_dict[i] = class_weights[i-1]

class_weight_dict

In [None]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score

# Initialize and train model
model = RandomForestClassifier(n_estimators=100,min_samples_leaf=5 )#, class_weight=class_weight_dict)
model.fit(X_train, y_train)

# Predict and evaluate
y_pred = model.predict(X_test)
print(f'Random Forest Accuracy: {accuracy_score(y_test, y_pred)}')

f1_micro = f1_score(y_test, y_pred, average='micro')
f1_micro

In [None]:
# import XGBClassifier
from xgboost import XGBClassifier


# declare parameters
params = {
            'objective':'multi:softmax',
            'max_depth': 6,
            'n_estimators':100,
            'num_classes':3,
            'eval_metric' : 'merror' 
        }
            
            
            
# instantiate the classifier 
xgb_clf = XGBClassifier(**params)



# fit the classifier to the training data
xgb_clf.fit(X,y-1)

In [112]:
# make predictions on test data
y_pred = xgb_clf.predict(final_2)

In [None]:
f1_micro = f1_score(y_test-1, y_pred, average='micro')
f1_micro

In [116]:
sub2.to_csv("geo_embdngs_xgb_1.csv",index=False)

In [None]:
train_df.info()

In [None]:
# legal_ownership_status
# land_surface_condition  
# foundation_type         
# roof_type               
# ground_floor_type       
# other_floor_type        
# position                
# plan_configuration          
set(train_df['plan_configuration'])

In [36]:
import matplotlib.pyplot as plt
import seaborn as sns

In [None]:
from collections import Counter
xx = Counter(train['geo_level_1_id'])
xx

In [None]:
# Group by 'damage_grade' and sum or average 'count_floors_pre_eq'
geo_damage = train.groupby(['geo_level_1_id','damage_grade'])['building_id'].count().reset_index()
geo_damage

In [None]:
max_geo_damage = geo_damage.loc[geo_damage.groupby('geo_level_1_id')['building_id'].idxmax()]
max_geo_damage

In [None]:
max_geo_damage[max_geo_damage['damage_grade'] == 3]

In [None]:
# Group by 'damage_grade' and sum or average 'count_floors_pre_eq'
geo_damage2 = train.groupby(['geo_level_2_id','damage_grade'])['building_id'].count().reset_index()
geo_damage2

In [None]:
max_geo_damage2 = geo_damage2.loc[geo_damage2.groupby('geo_level_2_id')['building_id'].idxmax()]
max_geo_damage2

In [None]:
max_geo_damage2[max_geo_damage2['damage_grade'] == 3]

In [23]:
geo12 = train.groupby(['geo_level_1_id','geo_level_2_id'])['building_id'].count().reset_index()

In [None]:
geo12[geo12['geo_level_2_id'] == 21] #8,17,18,21,27

In [None]:
concrete_damage = train.groupby(['has_superstructure_rc_non_engineered','damage_grade'])['building_id'].count().reset_index()

concrete_damage

In [None]:
concrete_damage2 = train.groupby(['has_superstructure_rc_engineered','damage_grade'])['building_id'].count().reset_index()

concrete_damage2

In [None]:
concrete_damage3 = train.groupby(['has_superstructure_other','damage_grade'])['building_id'].count().reset_index()

concrete_damage3

In [None]:
floors_damage = train.groupby(['count_floors_pre_eq','damage_grade'])['building_id'].count().reset_index()

floors_damage

In [None]:
g = sns.catplot(
    data=floors_damage, kind="bar",
    x="count_floors_pre_eq", y="building_id", hue="damage_grade",
    errorbar="sd", palette="dark", alpha=.6, height=6
)
g.despine(left=True)
g.set_axis_labels("Number of floors", "Count of buildings")
g.legend.set_title("Relation of number of floors to Damage")

In [88]:
# Group by 'damage_grade' and sum or average 'count_floors_pre_eq'
age_damage = train.groupby(['age','damage_grade'])['building_id'].count().reset_index()

age_damage = age_damage[age_damage['age'] < 500]

In [None]:
plt.scatter(age_damage['age'],age_damage['building_id'],c=age_damage['damage_grade'])
plt.colorbar(label='Color intensity')

In [None]:
plt.scatter(train['height_percentage'],train['area_percentage'],c=train['damage_grade'])
plt.colorbar(label='Color intensity')

In [94]:
# Group by 'damage_grade' and sum or average 'count_floors_pre_eq'
area_damage = train.groupby(['area_percentage','damage_grade'])['building_id'].count().reset_index()


In [None]:
plt.scatter(area_damage['area_percentage'],area_damage['building_id'],c=area_damage['damage_grade'])
plt.colorbar(label='Color intensity')

In [96]:
# Group by 'damage_grade' and sum or average 'count_floors_pre_eq'
height_damage = train.groupby(['height_percentage','damage_grade'])['building_id'].count().reset_index()


In [None]:
plt.scatter(height_damage['height_percentage'],height_damage['building_id'],c=height_damage['damage_grade'])
plt.colorbar(label='Color intensity')

In [6]:
categorical_columns=['legal_ownership_status','land_surface_condition','foundation_type','roof_type',
                                        'ground_floor_type','other_floor_type','position','plan_configuration']

In [7]:
for col in categorical_columns:
    train[f'{col}_encoded'] = train[col].astype('category').cat.codes

In [8]:
train.drop(columns=categorical_columns,inplace=True)

In [None]:
train.shape

In [None]:
test = pd.read_csv("test_values.csv")
test.head()

In [11]:
for col in categorical_columns:
    test[f'{col}_encoded'] = test[col].astype('category').cat.codes

In [12]:
test.drop(columns=categorical_columns,inplace=True)

In [None]:
test.shape

In [None]:
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn.metrics import f1_score

y = train['damage_grade']
X = train.drop(columns = ['damage_grade'])

# Split data
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=0)



In [None]:
from sklearn.tree import DecisionTreeClassifier

# Initialize and train model
model = DecisionTreeClassifier()
model.fit(X_train, y_train)

# Predict and evaluate
y_pred = model.predict(X_test)
print(f'Decision Tree Accuracy: {accuracy_score(y_test, y_pred)}')

f1_micro = f1_score(y_test, y_pred, average='micro')
f1_micro


In [32]:
from sklearn.ensemble import RandomForestClassifier

# Initialize and train model
model = RandomForestClassifier()
model.fit(X, y)

# Predict and evaluate
y_pred = model.predict(test)
# print(f'Random Forest Accuracy: {accuracy_score(y_test, y_pred)}')

# f1_micro = f1_score(y_test, y_pred, average='micro')
# f1_micro

In [None]:
sub1 = pd.DataFrame({'building_id' : test['building_id'],'damage_grade' : y_pred})
sub1.head()

In [35]:
sub1.to_csv("rf_predictions.csv",index=False)

In [39]:
from sklearn.ensemble import GradientBoostingClassifier

# Initialize and train model
model = GradientBoostingClassifier()
model.fit(X, y)

# Predict and evaluate
y_pred = model.predict(test)
# print(f'Gradient Boosting Accuracy: {accuracy_score(y_test, y_pred)}')

# f1_micro = f1_score(y_test, y_pred, average='micro')
# f1_micro


In [None]:
sub3 = pd.DataFrame({'building_id' : test['building_id'],'damage_grade' : y_pred})
sub3.head()

In [41]:
sub3.to_csv("gbc_predictions.csv",index=False)

In [None]:
import lightgbm as lgb

# Initialize and train model
model = lgb.LGBMClassifier()
model.fit(X, y)

# Predict and evaluate
y_pred = model.predict(test)
# print(f'LightGBM Accuracy: {accuracy_score(y_test, y_pred)}')

# f1_micro = f1_score(y_test, y_pred, average='micro')
# f1_micro

In [None]:
sub2 = pd.DataFrame({'building_id' : test['building_id'],'damage_grade' : y_pred})
sub2.head()

In [38]:
sub2.to_csv("lgb_predictions.csv",index=False)