In [1]:
!pip install torch_geometric

Defaulting to user installation because normal site-packages is not writeable

[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip is available: [0m[31;49m23.2.1[0m[39;49m -> [0m[32;49m25.0.1[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49mpython3 -m pip install --upgrade pip[0m


In [20]:
import numpy as np
import pandas as pd
import torch
from torch_geometric.data import Data
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.model_selection import train_test_split
from scipy.spatial import KDTree
import torch.nn as nn
import torch.nn.functional as F

# ---------------------
# Load and Prepare Data
# ---------------------
df = pd.read_csv("preprocessed_mining_data.csv")

# Define target (oil mine presence)
# conditions = (
#     (df['P Wave Velocity (km/s)'] < 4.5) &
#     (df['S Wave Velocity (km/s)'] < 2.5) &
#     (df['Carbon Emission (ppm)'] > 350) &
#     (df['hrock_type'].isin([2, 3]))
# )
# df['oil_mine_presence'] = np.where(conditions, 1, 0)

# Replace your original condition with this:
conditions = (
    (df['P Wave Velocity (km/s)'] < 5.0) &
    (df['S Wave Velocity (km/s)'] < 3.0) &
    (df['Carbon Emission (ppm)'] > 300) &
    (df['hrock_type'].isin([1, 2, 3]))  # allow one more rock type
)
df['oil_mine_presence'] = np.where(conditions, 1, 0)


# Encode categorical features
categorical_cols = ['hrock_type', 'arock_type', 'structure', 'orebody_fm']
label_encoders = {col: LabelEncoder() for col in categorical_cols}
for col in categorical_cols:
    df[col] = label_encoders[col].fit_transform(df[col])

# Feature selection
features = ['latitude', 'longitude', 'P Wave Velocity (km/s)', 'S Wave Velocity (km/s)',
            'Humidity (%)', 'Carbon Emission (ppm)', 'hrock_type', 'arock_type',
            'structure', 'orebody_fm']
df_features = df[features]
df_target = df['oil_mine_presence']

# Normalize features
scaler = StandardScaler()
df_features = pd.DataFrame(scaler.fit_transform(df_features), columns=features)
nodes_np = df_features.values
y = torch.tensor(df_target.values, dtype=torch.long)

# ---------------------
# Graph Construction
# ---------------------
tree = KDTree(nodes_np)
k = 10
distances, indices = tree.query(nodes_np, k=k)

edges = []
for i, neighbors in enumerate(indices):
    for j in neighbors:
        if i != j:
            edges.append((i, j))

# Convert edges to tensor format for PyTorch
edge_index = torch.tensor(edges, dtype=torch.long).t().contiguous()
x = torch.tensor(nodes_np, dtype=torch.float)

# ---------------------
# Custom GAT Layer
# ---------------------
class GATLayer(nn.Module):
    def __init__(self, in_features, out_features, dropout=0.6, alpha=0.2):
        super(GATLayer, self).__init__()
        self.W = nn.Parameter(torch.empty(size=(in_features, out_features)))
        self.a = nn.Parameter(torch.empty(size=(2 * out_features, 1)))
        self.leakyrelu = nn.LeakyReLU(alpha)
        self.dropout = nn.Dropout(dropout)
        self.reset_parameters()

    def reset_parameters(self):
        nn.init.xavier_uniform_(self.W.data)
        nn.init.xavier_uniform_(self.a.data)

    def forward(self, x, edge_index):
        h = torch.mm(x, self.W)  # [N, out_features]
        row, col = edge_index
        edge_h = torch.cat([h[row], h[col]], dim=1)  # [E, 2*out_features]
        e = self.leakyrelu(torch.matmul(edge_h, self.a)).squeeze()  # [E]
        alpha = F.softmax(e, dim=0)
        alpha = self.dropout(alpha)
        h_prime = torch.zeros_like(h)
        h_prime.index_add_(0, row, alpha.unsqueeze(1) * h[col])
        return F.elu(h_prime)

# ---------------------
# GAT Model
# ---------------------
class GATNet(nn.Module):
    def __init__(self, input_dim, hidden_dim, output_dim):
        super(GATNet, self).__init__()
        self.gat1 = GATLayer(input_dim, hidden_dim)
        self.gat2 = GATLayer(hidden_dim, output_dim)

    def forward(self, x, edge_index):
        x = self.gat1(x, edge_index)
        x = self.gat2(x, edge_index)
        return x

# ---------------------
# Prepare Train/Test Split
# ---------------------
X_train, X_test, y_train, y_test = train_test_split(x, y, test_size=0.2, stratify=y, random_state=42)

# ---------------------
# Instantiate Model
# ---------------------
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model = GATNet(input_dim=x.shape[1], hidden_dim=8, output_dim=2).to(device)

x = x.to(device)
edge_index = edge_index.to(device)
y = y.to(device)

# Example forward pass (not training yet)
out = model(x, edge_index)
pred = out.argmax(dim=1)

# Accuracy check
acc = (pred == y).sum().item() / y.size(0)
print("Initial Accuracy (no training):", acc)


Initial Accuracy (no training): 0.4899180156757065


In [21]:
print(df['oil_mine_presence'].value_counts())

oil_mine_presence
0    210772
Name: count, dtype: int64


In [3]:
import numpy as np
import pandas as pd
import torch
from torch_geometric.data import Data
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.model_selection import train_test_split
from scipy.spatial import KDTree
import torch.nn as nn
import torch.nn.functional as F

# --------------------- Load and Prepare Data ---------------------
df = pd.read_csv("preprocessed_mining_data.csv")

# Define target (oil mine presence)
conditions = (
    (df['P Wave Velocity (km/s)'] < 4.5) &
    (df['S Wave Velocity (km/s)'] < 2.5) &
    (df['Carbon Emission (ppm)'] > 350) &
    (df['hrock_type'].isin([2, 3]))
)
df['oil_mine_presence'] = np.where(conditions, 1, 0)

# Encode categorical features
categorical_cols = ['hrock_type', 'arock_type', 'structure', 'orebody_fm']
label_encoders = {col: LabelEncoder() for col in categorical_cols}
for col in categorical_cols:
    df[col] = label_encoders[col].fit_transform(df[col])

# Feature selection
features = ['latitude', 'longitude', 'P Wave Velocity (km/s)', 'S Wave Velocity (km/s)',
            'Humidity (%)', 'Carbon Emission (ppm)', 'hrock_type', 'arock_type',
            'structure', 'orebody_fm']
df_features = df[features]
df_target = df['oil_mine_presence']

# Normalize features
scaler = StandardScaler()
df_features = pd.DataFrame(scaler.fit_transform(df_features), columns=features)
nodes_np = df_features.values
y = torch.tensor(df_target.values, dtype=torch.long)

In [4]:
# --------------------- Graph Construction ---------------------
tree = KDTree(nodes_np)
k = 10
distances, indices = tree.query(nodes_np, k=k)

edges = []
for i, neighbors in enumerate(indices):
    for j in neighbors:
        if i != j:
            edges.append((i, j))

# Convert edges to tensor format
edge_index = torch.tensor(edges, dtype=torch.long).t().contiguous()
x = torch.tensor(nodes_np, dtype=torch.float)

In [5]:
num_nodes = x.shape[0]
num_edges = edge_index.shape[1]
num_heads = 4  # your current GATNet setting

print(f"Total nodes: {num_nodes}")
print(f"Total edges: {num_edges}")
print(f"Total attention computations per forward pass: {num_edges * num_heads}")


Total nodes: 210772
Total edges: 1896948
Total attention computations per forward pass: 7587792


In [6]:
density = num_edges / (num_nodes * (num_nodes - 1))
print(f"Edge density: {density:.6f}")


Edge density: 0.000043


In [7]:
from torch_geometric.utils import softmax

In [8]:
# --------------------- Custom GAT Layer ---------------------
class GATLayer(nn.Module):
    def __init__(self, in_features, out_features, dropout=0.6, alpha=0.2, concat=True):
        super(GATLayer, self).__init__()
        self.dropout = dropout
        self.concat = concat
        self.in_features = in_features
        self.out_features = out_features

        self.W = nn.Parameter(torch.empty(size=(in_features, out_features)))
        self.a = nn.Parameter(torch.empty(size=(2 * out_features, 1)))
        nn.init.xavier_uniform_(self.W.data, gain=1.414)
        nn.init.xavier_uniform_(self.a.data, gain=1.414)

        self.leakyrelu = nn.LeakyReLU(alpha)

    def forward(self, h, edge_index):
        Wh = torch.mm(h, self.W)  # [N, out_features]

        # Compute attention coefficients e_ij for each edge
        Wh_i = Wh[edge_index[0]]  # source node features
        Wh_j = Wh[edge_index[1]]  # target node features
        edge_input = torch.cat([Wh_i, Wh_j], dim=1)  # [E, 2*out_features]

        e = self.leakyrelu(torch.matmul(edge_input, self.a).squeeze(-1))  # [E]

        # Normalize coefficients per source node using softmax
        e = F.dropout(e, self.dropout, training=self.training)
        # alpha = F.softmax(e, index=edge_index[0], dim=0)  # ⬅️ normalize per source node
        alpha = softmax(e, edge_index[0])

        # Message passing: weighted sum of Wh_j using alpha
        h_prime = torch.zeros_like(Wh)
        h_prime = h_prime.index_add(0, edge_index[0], alpha.unsqueeze(1) * Wh_j)

        if self.concat:
            return F.elu(h_prime)
        else:
            return h_prime

In [9]:
# --------------------- GAT Model ---------------------
class GATNet(nn.Module):
    def __init__(self, in_features, hidden_features, out_features, dropout=0.6, alpha=0.2, heads=4):
        super(GATNet, self).__init__()
        self.dropout = dropout
        self.attentions = nn.ModuleList([
            GATLayer(in_features, hidden_features, dropout=dropout, alpha=alpha, concat=True)
            for _ in range(heads)
        ])
        self.out_att = GATLayer(hidden_features * heads, out_features, dropout=dropout, alpha=alpha, concat=False)

    def forward(self, x, edge_index):
        x = F.dropout(x, self.dropout, training=self.training)
        x = torch.cat([att(x, edge_index) for att in self.attentions], dim=1)
        x = F.dropout(x, self.dropout, training=self.training)
        x = self.out_att(x, edge_index)
        return F.log_softmax(x, dim=1)


In [10]:
# --------------------- Train/Test Split ---------------------
data = Data(x=x, edge_index=edge_index, y=y)
num_nodes = data.num_nodes
train_mask, test_mask = train_test_split(np.arange(num_nodes), test_size=0.2, random_state=42, stratify=y)

train_mask = torch.tensor(train_mask, dtype=torch.long)
test_mask = torch.tensor(test_mask, dtype=torch.long)

# --------------------- Training Setup ---------------------
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model = GATNet(in_features=x.shape[1], hidden_features=8, out_features=2, heads=4).to(device)
data = data.to(device)
optimizer = torch.optim.Adam(model.parameters(), lr=0.005, weight_decay=5e-4)
loss_fn = nn.NLLLoss()

In [11]:
# --------------------- Training Loop ---------------------
for epoch in range(1, 70):
    model.train()
    optimizer.zero_grad()
    out = model(data.x, data.edge_index)
    loss = loss_fn(out[train_mask], data.y[train_mask])
    loss.backward()
    optimizer.step()

    # model.eval()
    # _, pred = out[test_mask].max(dim=1)
    # correct = pred.eq(data.y[test_mask]).sum().item()
    # acc = correct / test_mask.size(0)

    # if epoch % 10 == 0 or epoch == 1:
    #     print(f"Epoch {epoch:03d} | Loss: {loss.item():.4f} | Test Accuracy: {acc:.4f}")

    model.eval()
    out = model(data.x, data.edge_index)
    loss = loss_fn(out[train_mask], data.y[train_mask])

    # Predictions
    log_probs = out[test_mask]
    prob = log_probs.exp()  # [num_test_samples, num_classes]
    _, pred = prob.max(dim=1)
    correct = pred.eq(data.y[test_mask]).sum().item()
    acc = correct / test_mask.size(0)

    if epoch % 10 == 0 or epoch == 1:
        print(f"Epoch {epoch:03d} | Loss: {loss.item():.4f} | Test Accuracy: {acc:.4f}")


Epoch 001 | Loss: 1.5443 | Test Accuracy: 0.4536
Epoch 010 | Loss: 1.0208 | Test Accuracy: 0.5295
Epoch 020 | Loss: 0.6928 | Test Accuracy: 0.6745
Epoch 030 | Loss: 0.5462 | Test Accuracy: 0.8310
Epoch 040 | Loss: 0.4674 | Test Accuracy: 0.8804
Epoch 050 | Loss: 0.4182 | Test Accuracy: 0.8984
Epoch 060 | Loss: 0.3861 | Test Accuracy: 0.9074


In [12]:
# import numpy as np
# import pandas as pd
# import torch
# from torch_geometric.data import Data
# from sklearn.preprocessing import LabelEncoder, StandardScaler
# from sklearn.model_selection import train_test_split
# from scipy.spatial import KDTree
# import torch.nn as nn
# import torch.nn.functional as F

# # --------------------- Load and Prepare Data ---------------------
# df = pd.read_csv("preprocessed_mining_data.csv")

# # Define target (oil mine presence)
# conditions = (
#     (df['P Wave Velocity (km/s)'] < 4.5) &
#     (df['S Wave Velocity (km/s)'] < 2.5) &
#     (df['Carbon Emission (ppm)'] > 350) &
#     (df['hrock_type'].isin([2, 3]))
# )
# df['oil_mine_presence'] = np.where(conditions, 1, 0)

# # Encode categorical features
# categorical_cols = ['hrock_type', 'arock_type', 'structure', 'orebody_fm']
# label_encoders = {col: LabelEncoder() for col in categorical_cols}
# for col in categorical_cols:
#     df[col] = label_encoders[col].fit_transform(df[col])

# # Feature selection
# features = ['latitude', 'longitude', 'P Wave Velocity (km/s)', 'S Wave Velocity (km/s)',
#             'Humidity (%)', 'Carbon Emission (ppm)', 'hrock_type', 'arock_type',
#             'structure', 'orebody_fm']
# df_features = df[features]
# df_target = df['oil_mine_presence']

# # Normalize features
# scaler = StandardScaler()
# df_features = pd.DataFrame(scaler.fit_transform(df_features), columns=features)
# nodes_np = df_features.values
# y = torch.tensor(df_target.values, dtype=torch.long)

# # --------------------- Graph Construction ---------------------
# tree = KDTree(nodes_np)
# k = 10
# distances, indices = tree.query(nodes_np, k=k)

# edges = []
# for i, neighbors in enumerate(indices):
#     for j in neighbors:
#         if i != j:
#             edges.append((i, j))

# # Convert edges to tensor format
# edge_index = torch.tensor(edges, dtype=torch.long).t().contiguous()
# x = torch.tensor(nodes_np, dtype=torch.float)

# # --------------------- Custom GAT Layer ---------------------
# class GATLayer(nn.Module):
#     def __init__(self, in_features, out_features, dropout=0.6, alpha=0.2, concat=True):
#         super(GATLayer, self).__init__()
#         self.dropout = dropout
#         self.in_features = in_features
#         self.out_features = out_features
#         self.concat = concat

#         self.W = nn.Parameter(torch.empty(size=(in_features, out_features)))
#         self.a = nn.Parameter(torch.empty(size=(2 * out_features, 1)))
#         nn.init.xavier_uniform_(self.W.data, gain=1.414)
#         nn.init.xavier_uniform_(self.a.data, gain=1.414)

#         self.leakyrelu = nn.LeakyReLU(alpha)

#     def forward(self, h, edge_index):
#         Wh = torch.mm(h, self.W)
#         edge_h = torch.cat((Wh[edge_index[0]], Wh[edge_index[1]]), dim=1)
#         e = self.leakyrelu(torch.matmul(edge_h, self.a).squeeze(1))

#         # Normalize attention coefficients
#         e = F.dropout(e, self.dropout, training=self.training)
#         attention = torch.zeros(h.size(0), h.size(0), device=h.device)
#         attention[edge_index[0], edge_index[1]] = e
#         attention = F.softmax(attention, dim=1)

#         h_prime = torch.matmul(attention, Wh)

#         if self.concat:
#             return F.elu(h_prime)
#         else:
#             return h_prime

# # --------------------- GAT Model ---------------------
# class GATNet(nn.Module):
#     def __init__(self, in_features, hidden_features, out_features, dropout=0.6, alpha=0.2, heads=4):
#         super(GATNet, self).__init__()
#         self.dropout = dropout
#         self.attentions = nn.ModuleList([
#             GATLayer(in_features, hidden_features, dropout=dropout, alpha=alpha, concat=True)
#             for _ in range(heads)
#         ])
#         self.out_att = GATLayer(hidden_features * heads, out_features, dropout=dropout, alpha=alpha, concat=False)

#     def forward(self, x, edge_index):
#         x = F.dropout(x, self.dropout, training=self.training)
#         x = torch.cat([att(x, edge_index) for att in self.attentions], dim=1)
#         x = F.dropout(x, self.dropout, training=self.training)
#         x = self.out_att(x, edge_index)
#         return F.log_softmax(x, dim=1)

# # --------------------- Train/Test Split ---------------------
# data = Data(x=x, edge_index=edge_index, y=y)
# num_nodes = data.num_nodes
# train_mask, test_mask = train_test_split(np.arange(num_nodes), test_size=0.2, random_state=42, stratify=y)

# train_mask = torch.tensor(train_mask, dtype=torch.long)
# test_mask = torch.tensor(test_mask, dtype=torch.long)

# # --------------------- Training Setup ---------------------
# device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
# model = GATNet(in_features=x.shape[1], hidden_features=8, out_features=2, heads=4).to(device)
# data = data.to(device)
# optimizer = torch.optim.Adam(model.parameters(), lr=0.005, weight_decay=5e-4)
# loss_fn = nn.NLLLoss()

# # --------------------- Training Loop ---------------------
# for epoch in range(1, 70):
#     model.train()
#     optimizer.zero_grad()
#     out = model(data.x, data.edge_index)
#     loss = loss_fn(out[train_mask], data.y[train_mask])
#     loss.backward()
#     optimizer.step()

#     model.eval()
#     _, pred = out[test_mask].max(dim=1)
#     correct = pred.eq(data.y[test_mask]).sum().item()
#     acc = correct / test_mask.size(0)

#     if epoch % 10 == 0 or epoch == 1:
#         print(f"Epoch {epoch:03d} | Loss: {loss.item():.4f} | Test Accuracy: {acc:.4f}")


In [13]:
# Initialize prediction columns
df['predicted_oil_presence'] = np.nan
df['prediction_probability'] = np.nan

# Fill predictions for test set
df.loc[test_mask.cpu().numpy(), 'predicted_oil_presence'] = pred.cpu().numpy()
df.loc[test_mask.cpu().numpy(), 'prediction_probability'] = prob.max(dim=1).values.detach().cpu().numpy()

# Save to CSV
df[['latitude', 'longitude', 'predicted_oil_presence', 'prediction_probability']].to_csv("oil_predictions.csv", index=False)


In [14]:
# # --- Export Predictions ---
# # df['predicted_oil_presence'] = pred.cpu().numpy()  # Ensure 'pred' is defined
# # df['prediction_probability'] = prob.cpu().numpy()  # Ensure 'prob' is defined
# # df[['latitude', 'longitude', 'predicted_oil_presence', 'prediction_probability']].to_csv("oil_predictions.csv", index=False)
# # --- Export Predictions ---
# # Create a new column initialized to NaN
# df['predicted_oil_presence'] = np.nan
# df['prediction_probability'] = np.nan

# # Assign predicted values and probabilities only to the test set
# df.loc[test_mask.numpy(), 'predicted_oil_presence'] = pred.cpu().numpy()  # Ensure 'pred' is defined
# df.loc[test_mask.numpy(), 'prediction_probability'] = prob.cpu().numpy()  # Ensure 'prob' is defined

# # Save the relevant columns to CSV
# df[['latitude', 'longitude', 'predicted_oil_presence', 'prediction_probability']].to_csv("oil_predictions.csv", index=False)



visualization attention matrix

In [17]:
print("True label distribution:", np.unique(true, return_counts=True))
print("Predicted label distribution:", np.unique(pred, return_counts=True))


True label distribution: (array([0]), array([42155]))
Predicted label distribution: (array([0, 1]), array([38486,  3669]))


In [18]:
print(df['oil_mine_presence'].value_counts())


oil_mine_presence
0    210772
Name: count, dtype: int64


In [19]:
# How many class 1s do we even have?
n_class_1 = (df['oil_mine_presence'] == 1).sum()
print(f"Number of class 1 samples: {n_class_1}")


Number of class 1 samples: 0


In [16]:
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.metrics import precision_score, recall_score, f1_score

# --- Skip Attention Visualization (or make a safe dummy function) ---
def visualize_attention(*args, **kwargs):
    print("Attention visualization skipped to avoid memory overhead.")

# Call after model.eval():
visualize_attention(None, node_index=5)

# --- Validation Metrics ---
true = y_test.cpu().numpy()
pred = pred.cpu().numpy()

precision = precision_score(true, pred)
recall = recall_score(true, pred)
f1 = f1_score(true, pred)

print(f"Precision: {precision:.3f}")
print(f"Recall:    {recall:.3f}")
print(f"F1-score:  {f1:.3f}")


Attention visualization skipped to avoid memory overhead.
Precision: 0.000
Recall:    0.000
F1-score:  0.000


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


In [None]:
# import matplotlib.pyplot as plt
# import seaborn as sns
# from sklearn.metrics import precision_score, recall_score, f1_score

# # --- Visualize Attention Weights (assuming attention_scores is collected during forward pass) ---
# def visualize_attention(attention_scores, node_index, top_k=10):
#     # attention_scores: (num_edges, )
#     # node_index: int - which node to visualize
#     attention_scores = attention_scores.view(-1)
#     neighbors = (edge_index[0] == node_index).nonzero(as_tuple=True)[0]
#     values = attention_scores[neighbors].detach().cpu().numpy()
#     neighbor_ids = edge_index[1][neighbors].cpu().numpy()

#     plt.figure(figsize=(10, 4))
#     sns.barplot(x=neighbor_ids[:top_k], y=values[:top_k])
#     plt.title(f"Attention Weights for Node {node_index}")
#     plt.xlabel("Neighbor Node ID")
#     plt.ylabel("Attention Score")
#     plt.show()

# # Call after model.eval():
# visualize_attention(attention_weights, node_index=5)

# # --- Validation Metrics ---
# true = y_test.cpu().numpy()
# pred = pred.cpu().numpy()

# precision = precision_score(true, pred)
# recall = recall_score(true, pred)
# f1 = f1_score(true, pred)

# print(f"Precision: {precision:.3f}")
# print(f"Recall:    {recall:.3f}")
# print(f"F1-score:  {f1:.3f}")


In [None]:
import folium
from folium.plugins import MarkerCluster

# --------------------- Evaluation and Prediction ---------------------
model.eval()
with torch.no_grad():
    pred = model(data.x, data.edge_index).argmax(dim=1)

# Original labels and predictions
true_labels = data.y.cpu().numpy()
pred_labels = pred.cpu().numpy()

# Get lat/lon for test data
# Convert test_mask to actual indices
test_indices = test_mask.nonzero(as_tuple=True)[0].cpu().numpy()

# Create a test DataFrame using these indices
df_test = df.iloc[test_indices].copy()

# Add true and predicted labels for these test samples
df_test["true_label"] = true_labels[test_mask.cpu().numpy()]
df_test["pred_label"] = pred_labels[test_mask.cpu().numpy()]

# df_test = df.iloc[test_mask.cpu().numpy()]
# df_test = df_test.copy()
# df_test["true_label"] = true_labels[test_mask.cpu().numpy()]
# df_test["pred_label"] = pred_labels[test_mask.cpu().numpy()]

# --------------------- Folium Map ---------------------
map_center = [df["latitude"].mean(), df["longitude"].mean()]
m = folium.Map(location=map_center, zoom_start=6)
marker_cluster = MarkerCluster().add_to(m)

# Plot all actual oil mines in test set
for _, row in df_test[df_test["true_label"] == 1].iterrows():
    folium.CircleMarker(
        location=[row["latitude"], row["longitude"]],
        radius=4,
        color="green",
        fill=True,
        fill_color="green",
        popup="Actual Oil Mine"
    ).add_to(marker_cluster)

# Plot predicted oil mines not originally labeled (false positives)
for _, row in df_test[(df_test["true_label"] == 0) & (df_test["pred_label"] == 1)].iterrows():
    folium.CircleMarker(
        location=[row["latitude"], row["longitude"]],
        radius=6,
        color="red",
        fill=True,
        fill_color="red",
        popup="Predicted Oil Mine"
    ).add_to(marker_cluster)

# Save or display the map
m.save("predicted_oil_mines_map.html")
m


In [None]:
import folium

# Filter test predictions where predicted != true
false_predictions = df_test[df_test["true_label"] != df_test["pred_label"]]

# Debug: Check if we have any
print("Number of incorrect predictions:", len(false_predictions))
print(false_predictions[['latitude', 'longitude', 'true_label', 'pred_label']].head())

# Optional: Use mean center of all false predictions
if not false_predictions.empty:
    map_center = [false_predictions["latitude"].mean(), false_predictions["longitude"].mean()]
else:
    map_center = [0, 0]  # Fallback center if no predictions

# Create map
m = folium.Map(location=map_center, zoom_start=2)

# Add markers
for _, row in false_predictions.iterrows():
    folium.CircleMarker(
        location=[row["latitude"], row["longitude"]],
        radius=5,
        color="red",
        fill=True,
        fill_color="red",
        popup=f'True: {row["true_label"]}, Pred: {row["pred_label"]}',
    ).add_to(m)

# Display map in notebook
m


In [None]:
# Use all test nodes, not just incorrect ones
all_predictions = df_test.copy()

m = folium.Map(location=[all_predictions["latitude"].mean(), all_predictions["longitude"].mean()], zoom_start=3)

for _, row in all_predictions.iterrows():
    color = "green" if row["true_label"] == row["pred_label"] else "red"
    folium.CircleMarker(
        location=[row["latitude"], row["longitude"]],
        radius=4,
        color=color,
        fill=True,
        fill_opacity=0.7,
        fill_color=color,
        popup=f'True: {row["true_label"]}, Pred: {row["pred_label"]}',
    ).add_to(m)

m


In [None]:
sample = df_test.sample(n=10)  # Adjust 'n' based on your data

m = folium.Map(location=[sample["latitude"].mean(), sample["longitude"].mean()], zoom_start=3)

for _, row in sample.iterrows():
    folium.Marker(
        location=[row["latitude"], row["longitude"]],
        popup=f'True: {row["true_label"]}, Pred: {row["pred_label"]}',
        icon=folium.Icon(color='blue')
    ).add_to(m)

m


Below is a complete code block that:

Loads your new test.csv.

Preprocesses it (assuming it has latitude, longitude, and a label column).

Feeds it into your trained model.

Compares predictions to true labels.

Plots the results on a Folium map, highlighting:

Correct predictions in green

Incorrect predictions in red



##############################################################

In [None]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
import pandas as pd
import torch
from torch_geometric.data import Data
import folium

# Load test data (no labels expected)
df_test = pd.read_csv("preprocessed_mining_data.csv")

# Feature columns (drop lat/lon)
feature_cols = df_test.columns.difference(["latitude", "longitude"])
x_test = torch.tensor(df_test[feature_cols].values, dtype=torch.float)

# Build PyG Data
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
test_data = Data(x=x_test).to(device)
model = model.to(device)

# Predict
model.eval()
with torch.no_grad():
    out = model(test_data.x)
    pred = out.argmax(dim=1).cpu().numpy()

# Add predictions to dataframe
df_test["predicted_oil_mine"] = pred

# Map
center_lat = df_test["latitude"].mean()
center_lon = df_test["longitude"].mean()
m = folium.Map(location=[center_lat, center_lon], zoom_start=4)

# Visualize predictions
for _, row in df_test.iterrows():
    color = "green" if row["predicted_oil_mine"] == 1 else "gray"
    folium.CircleMarker(
        location=[row["latitude"], row["longitude"]],
        radius=5,
        color=color,
        fill=True,
        fill_color=color,
        fill_opacity=0.7,
        popup=f"Prediction: {'Oil Mine' if row['predicted_oil_mine'] == 1 else 'No Mine'}"
    ).add_to(m)

m.save("predicted_test_map.html")
m


In [None]:
from scipy.spatial import KDTree
import folium

# --- Step 1: Align test features with training ---
test_features = df_test[features]  # Use same feature list as training
test_scaled = scaler.transform(test_features)  # Reuse training scaler
x_test = torch.tensor(test_scaled, dtype=torch.float).to(device)

# --- Step 2: Build graph (KDTree) ---
tree = KDTree(test_scaled)
_, indices = tree.query(test_scaled, k=10)

edges = []
for i, neighbors in enumerate(indices):
    for j in neighbors:
        if i != j:
            edges.append((i, j))

test_edge_index = torch.tensor(edges, dtype=torch.long).t().contiguous().to(device)

# --- Step 3: Run inference ---
model.eval()
with torch.no_grad():
    out = model(x_test, test_edge_index)
    pred = out.argmax(dim=1).cpu().numpy()

# --- Step 4: Add predictions to DataFrame ---
df_test["predicted_oil_mine"] = pred

# --- Step 5: Visualize on Folium map ---
m = folium.Map(location=[df_test["latitude"].mean(), df_test["longitude"].mean()], zoom_start=5)

for _, row in df_test.iterrows():
    color = "green" if row["predicted_oil_mine"] == 1 else "gray"
    folium.CircleMarker(
        location=[row["latitude"], row["longitude"]],
        radius=5,
        color=color,
        fill=True,
        fill_color=color,
        fill_opacity=0.7,
        popup=f"Prediction: {'Oil Mine' if row['predicted_oil_mine'] == 1 else 'No Mine'}"
    ).add_to(m)

m.save("predicted_test_map.html")
m


In [None]:
from scipy.spatial import KDTree
import folium

# Build new edge_index for test set
tree = KDTree(x_test.numpy())
_, indices = tree.query(x_test.numpy(), k=10)

edges = []
for i, neighbors in enumerate(indices):
    for j in neighbors:
        if i != j:
            edges.append((i, j))
test_edge_index = torch.tensor(edges, dtype=torch.long).t().contiguous().to(device)

# Run prediction
model.eval()
test_data = Data(x=x_test.to(device))
with torch.no_grad():
    out = model(test_data.x, test_edge_index)
    pred = out.argmax(dim=1).cpu().numpy()

# Add predictions to DataFrame
df_test["predicted_oil_mine"] = pred

# Create Folium map
m = folium.Map(location=[df_test["latitude"].mean(), df_test["longitude"].mean()], zoom_start=5)

for _, row in df_test.iterrows():
    color = "green" if row["predicted_oil_mine"] == 1 else "gray"
    folium.CircleMarker(
        location=[row["latitude"], row["longitude"]],
        radius=5,
        color=color,
        fill=True,
        fill_color=color,
        fill_opacity=0.7,
        popup=f"Prediction: {'Oil Mine' if row['predicted_oil_mine'] == 1 else 'No Mine'}"
    ).add_to(m)

m.save("predicted_test_map.html")
m


In [None]:
# # Train/val/test split
# X_train, X_test, y_train, y_test, train_idx, test_idx = train_test_split(
#     x, y, np.arange(x.size(0)), test_size=0.2, stratify=y, random_state=42
# )
# X_train, X_val, y_train, y_val, train_idx, val_idx = train_test_split(
#     X_train, y_train, train_idx, test_size=0.2, stratify=y_train, random_state=42
# )

# # Convert to tensor indices
# train_mask = torch.zeros(x.size(0), dtype=torch.bool)
# val_mask = torch.zeros(x.size(0), dtype=torch.bool)
# test_mask = torch.zeros(x.size(0), dtype=torch.bool)
# train_mask[train_idx] = True
# val_mask[val_idx] = True
# test_mask[test_idx] = True


In [None]:
# device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
# model = GATNet(in_features=x.shape[1], hidden_features=8, out_features=2).to(device)
# optimizer = torch.optim.Adam(model.parameters(), lr=0.005, weight_decay=5e-4)

# x, edge_index, y = x.to(device), edge_index.to(device), y.to(device)
# train_mask, val_mask, test_mask = train_mask.to(device), val_mask.to(device), test_mask.to(device)

# # Training loop
# def train():
#     model.train()
#     optimizer.zero_grad()
#     out = model(x, edge_index)
#     loss = F.nll_loss(out[train_mask], y[train_mask])
#     loss.backward()
#     optimizer.step()
#     return loss.item()

# # Evaluation
# def evaluate(mask):
#     model.eval()
#     with torch.no_grad():
#         out = model(x, edge_index)
#         pred = out[mask].argmax(dim=1)
#         acc = (pred == y[mask]).sum().item() / mask.sum().item()
#     return acc

# # Run training
# for epoch in range(1, 201):
#     loss = train()
#     val_acc = evaluate(val_mask)
#     if epoch % 10 == 0:
#         print(f"Epoch {epoch}, Loss: {loss:.4f}, Val Acc: {val_acc:.4f}")

# # Test accuracy
# test_acc = evaluate(test_mask)
# print(f"Test Accuracy: {test_acc:.4f}")


In [None]:
# import numpy as np
# import pandas as pd
# import torch
# import torch.nn as nn
# import torch.nn.functional as F
# import matplotlib.pyplot as plt
# import seaborn as sns
# from torch.utils.data import DataLoader
# from sklearn.preprocessing import LabelEncoder, StandardScaler
# from sklearn.model_selection import train_test_split
# from sklearn.metrics import precision_score, recall_score, f1_score
# from scipy.spatial import KDTree

# # Load data
# df = pd.read_csv("preprocessed_mining_data.csv")

# # Define conditions for oil presence
# oil_conditions = (
#     (df['P Wave Velocity (km/s)'] < 5.5) &
#     (df['S Wave Velocity (km/s)'] < 3.5) &
#     (df['Carbon Emission (ppm)'] > 350) &
#     (df['hrock_type'].isin([1, 2, 3]))
# )
# df['oil_mine_presence'] = np.where(oil_conditions, 1, 0)
# print(df['oil_mine_presence'].value_counts())

# # Encode categorical columns
# categorical_cols = ['hrock_type', 'arock_type', 'structure', 'orebody_fm']
# label_encoders = {col: LabelEncoder() for col in categorical_cols}
# for col in categorical_cols:
#     df[col] = label_encoders[col].fit_transform(df[col])

# # Feature columns
# features = ['latitude', 'longitude', 'P Wave Velocity (km/s)', 'S Wave Velocity (km/s)',
#             'Humidity (%)', 'Carbon Emission (ppm)', 'hrock_type', 'arock_type',
#             'structure', 'orebody_fm']

# df_features = df[features]
# df_target = df['oil_mine_presence']

# # Normalize features
# scaler = StandardScaler()
# df_features = pd.DataFrame(scaler.fit_transform(df_features), columns=features)
# nodes = df_features.values

# # Build KDTree for edge construction
# tree = KDTree(nodes)
# k = 10
# distances, indices = tree.query(nodes, k=k)

# # Build edge index
# graph_edges = []
# for src, neighbors in enumerate(indices):
#     for dst in neighbors:
#         if src != dst:
#             graph_edges.append([src, dst])
# edge_index = torch.tensor(graph_edges, dtype=torch.long).t().contiguous()

# # Convert to PyTorch tensors
# x = torch.tensor(nodes, dtype=torch.float)
# y = torch.tensor(df_target.values, dtype=torch.long)

# # Split train/test
# X_train, X_test, y_train, y_test, idx_train, idx_test = train_test_split(x, y, np.arange(len(y)), test_size=0.2, random_state=42)

# # Define GAT Layer
# class GATLayer(nn.Module):
#     def __init__(self, in_features, out_features):
#         super(GATLayer, self).__init__()
#         self.W = nn.Linear(in_features, out_features, bias=False)
#         self.a = nn.Parameter(torch.zeros(size=(2 * out_features, 1)))
#         nn.init.xavier_uniform_(self.a.data, gain=1.414)

#     def forward(self, x, edge_index):
#         h = self.W(x)
#         N = h.size(0)

#         src, dst = edge_index
#         a_input = torch.cat([h[src], h[dst]], dim=1)
#         e = F.leaky_relu(torch.matmul(a_input, self.a).squeeze())

#         attention = torch.zeros(N, N)
#         attention[src, dst] = e
#         attention = F.softmax(attention, dim=1)
#         h_prime = torch.matmul(attention, h)

#         return h_prime, attention.detach()

# # Model class
# class GAT(nn.Module):
#     def __init__(self, in_features, hidden_features, out_features):
#         super(GAT, self).__init__()
#         self.gat1 = GATLayer(in_features, hidden_features)
#         self.out = nn.Linear(hidden_features, out_features)

#     def forward(self, x, edge_index):
#         x, attn_weights = self.gat1(x, edge_index)
#         x = F.elu(x)
#         x = self.out(x)
#         return x, attn_weights

# # Initialize model
# model = GAT(x.size(1), 8, 2)
# criterion = nn.CrossEntropyLoss()
# optimizer = torch.optim.Adam(model.parameters(), lr=0.01)

# # Training loop
# epochs = 100
# for epoch in range(epochs):
#     model.train()
#     optimizer.zero_grad()
#     out, _ = model(x, edge_index)
#     loss = criterion(out[idx_train], y[idx_train])
#     loss.backward()
#     optimizer.step()
#     if (epoch+1) % 10 == 0:
#         print(f"Epoch {epoch+1}, Loss: {loss.item():.4f}")

# # Evaluate
# model.eval()
# with torch.no_grad():
#     out, attention_weights = model(x, edge_index)
#     preds = out.argmax(dim=1)
#     y_true = y[idx_test].cpu().numpy()
#     y_pred = preds[idx_test].cpu().numpy()

#     precision = precision_score(y_true, y_pred)
#     recall = recall_score(y_true, y_pred)
#     f1 = f1_score(y_true, y_pred)
#     print(f"Precision: {precision:.4f}, Recall: {recall:.4f}, F1-score: {f1:.4f}")

#     # Export predictions
#     pred_df = pd.DataFrame({
#         "Index": idx_test,
#         "True Label": y_true,
#         "Predicted Label": y_pred
#     })
#     pred_df.to_csv("gat_predictions.csv", index=False)

#     # Visualize attention weights for a few nodes
#     attn_np = attention_weights.numpy()
#     avg_attention = attn_np.mean(axis=0)
#     plt.figure(figsize=(10, 6))
#     sns.heatmap(attn_np[:50, :50], cmap="viridis")
#     plt.title("Attention Weights (First 50 Nodes)")
#     plt.show()
