<a href="https://colab.research.google.com/github/andrewbowen19/mastersThesisData698/blob/main/GNN_EDA.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [2]:
pip install --q torch_geometric

In [3]:
import torch_geometric
from torch_geometric.datasets import Reddit
import pandas as pd
from pandas import DataFrame
import matplotlib.pyplot as plt
import numpy as np
import networkx as nx
import torch
from torch_geometric.data import HeteroData

In [4]:
class DataCleaner:

  def __init__(self):
    pass

  @staticmethod
  def snake_case_columns(df: DataFrame) -> DataFrame:
    """Convert all columns in a pandas DataFrame to `snake_case`"""
    new_cols = [c.lower().replace(" ", "_").replace("-", "_") for c in df.columns]

    df.columns = new_cols
    return df

  @staticmethod
  def trim_whitespace(df: DataFrame) -> DataFrame:
    """Trim all leading and trailing whitespace"""
    for c in df.columns:
      if isinstance(df.dtypes[c], np.dtypes.ObjectDType):
        df[c] = df[c].str.strip()
    return df

  @staticmethod
  def date_parser(df: DataFrame, column_name: str, format: str = "%m/%d/%y") -> DataFrame:
    """Parse date-like columns in a dataframe"""
    df[column_name] = pd.to_datetime(df[column_name], format=format)
    return df

cleaner = DataCleaner()


## Datasets

The [CEDR Data Catalog](https://oriseapps.orau.gov/cedr/pdf/cedr-catalog-2021-508.pdf) is a good resource with documentation on the available datasets from the database.

- [Dataset 1](https://oriseapps.orau.gov/cedr/search_results.aspx?DataSet=MFMM98W1)
- [Dataset 2](https://oriseapps.orau.gov/cedr/search_results.aspx?DataSet=MFMM98W2)

## Reading in and Preprocessing our datasets

The datasets are available for [download here](https://oriseapps.orau.gov/cedr/search_results.aspx?DataSet=MFMM98W1). There are chemical agent, industrial hygiene, and building lists for the below sites:

- Hanford Site
- Los Alamos Natinoal Laboratory
- Savannah River Site
- Oak Ridge National Laboratory

We'll need to do some data wrangling before we put this into a [`HeteroData` dataset suitable for PyG](https://pytorch-geometric.readthedocs.io/en/latest/generated/torch_geometric.data.HeteroData.html#torch_geometric.data.HeteroData)

In [5]:
site_mapping = {
    "Hanford Site": {"code": "hanford",
                     "filename_prefix": "Hanford"},
    "Los Alamos National Laboratory": {"code": "lanl",
                     "filename_prefix": "LANL"},
    "Savannah River Site": {"code": "srs",
                     "filename_prefix": "SRS"},
    "Oak Ridge National Laboratory": {"code": "ornl",
                     "filename_prefix": "ORNL"},
}

data_types = ["Buildings", "ChemicalAgents", "IH"]

def construct_df(data_type: str = "Buildings"):
  dfs = []
  for site, info in site_mapping.items():
    prefix = info.get("filename_prefix")
    filename = f"/content/{prefix}-{data_type}.csv"

    dat = pd.read_csv(filename)
    dfs.append(dat)


  df = pd.concat(dfs)

  return df

# Read in datasets and combine across sites
buildings = construct_df("Buildings")
agents = construct_df("ChemicalAgents")
ih_data = construct_df("IH")

In [6]:
# Do some basic data cleaning/preprocessing

# Trim whitespace
buildings = cleaner.snake_case_columns(buildings)
agents = cleaner.snake_case_columns(agents)
ih_data = cleaner.snake_case_columns(ih_data)

buildings = cleaner.trim_whitespace(buildings)
agents = cleaner.trim_whitespace(agents)
ih_data = cleaner.trim_whitespace(ih_data)



## Converting to a Graph Dataset

In our case, the nodes fo our grpah represent different types of entities. Hence, we'll need to constrct the a dataset representing a [heterogeneous graph for PyG](https://pytorch-geometric.readthedocs.io/en/latest/notes/heterogeneous.html?highlight=heterogeneous%20graph#creating-heterogeneous-graphs)

Here's a [helpful tutorial](https://colab.research.google.com/drive/1_eR7DXBF3V4EwH946dDPOxeclDBeKNMD?usp=sharing#scrollTo=ljgXqQRsfqNs) on converting tabular datasets to a heterogeneous graph dataste

In [37]:
# OneHotEncode
ih_data


Unnamed: 0,date,dep_grp,descript,facility,jobtitle,location,quantity,referenc,room,sampleid,...,uranyl nitrate,vanadium,vinyl chloride,"welding fumes, NOS","wood dust, NOS",xylene,zinc,zinc oxide,zirconium,building_id_y
0,8/19/81,200W,06437:Graphite dust sampling machine shop; n=1...,HANF,461,202-S,0.860000,HEX78_83,,081981_001,...,0,0,0,0,0,0,0,0,0,254
1,10/23/81,200E,06572:Microwave survey. perimeter microwave se...,HANF,,209-E,5.000000,HEX78_83,,102381_001,...,0,0,0,0,0,0,0,0,0,263
2,5/28/81,300,06293:Evaluation of employee exposure to solve...,HANF,151,305,2.000000,HEX78_83,BLDPS,052881_001,...,0,0,0,0,0,0,0,0,0,402
3,1/7/57,,Sample industrial vacuum cleaner exhaust to de...,LANL,,305,0.050000,LANL/IH/205,,010757_001,...,0,0,0,0,0,0,0,0,0,402
4,2/4/81,300,06075:Stick and heliarc welding-308 shop area;...,HANF,680 Welder,308,0.077000,HEX78_83,231,020481_001,...,0,0,0,0,0,0,0,0,0,408
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
74462,12/31/73,BIOLOGY,"Use in mutagenicity and enzyme studies, Room 1...",ORNL,,9210,,ORNL/IH/179,133,123173_048,...,0,0,0,0,0,0,0,0,0,752
74463,12/31/73,BIOLOGY,"Use in mutagenicity and enzyme studies, Room 1...",ORNL,,9210,,ORNL/IH/179,133,123173_048,...,0,0,0,0,0,0,0,0,0,749
74464,12/31/73,BIOLOGY,"Use in mutagenicity and enzyme studies, Room 1...",ORNL,,9210,,ORNL/IH/179,133,123173_048,...,0,0,0,0,0,0,0,0,0,750
74465,12/31/73,BIOLOGY,"Use in mutagenicity and enzyme studies, Room 1...",ORNL,,9210,,ORNL/IH/179,133,123173_048,...,0,0,0,0,0,0,0,0,0,751


In [38]:
# Lookup the building ID number

buildings['building_id'] = buildings.index.values

ih_data = ih_data.merge(buildings[['location', 'building_id']], on="location", how="inner")
ih_data['chemical_id'] = ih_data.index.values

# Map the site feature to integer values
site_mapping = {"LANL": 1, "SRP": 2, "ORNL": 3, "HANF": 4}
buildings['site'] = buildings['facility'].map(site_mapping)

buildings[['building_id', "site"]]

Unnamed: 0,building_id,site
0,0,4
1,1,4
2,2,4
3,3,4
4,4,4
...,...,...
791,791,3
792,792,3
793,793,3
794,794,3


In [42]:
# Set up feature tensores for both buildings and chemicals
building_features = torch.tensor(buildings[['building_id', "site"]].values.transpose(), dtype=torch.long)
chemical_features = torch.tensor(ih_data['chemical_id'].values.transpose(), dtype=torch.long)

# Create a HeteroData object
data = HeteroData()

# Add node features (buildings and chemicals) to the HeteroData object
data['building'].x = building_features
data['chemical'].x = chemical_indices

In [43]:
# Flip the order for the edge index so it's building (contains) chemical
edge_index = ih_data[['chemical_id', "building_id"]].values.transpose()
data['building', 'contains', 'chemical'].edge_index = torch.tensor(edge_index, dtype=torch.long)

In [None]:
# Visualizing our graph dataset
import networkx as nx
import matplotlib.pyplot as plt
from torch_geometric.utils import to_networkx

# Convert HeteroData to NetworkX graph
graph = to_networkx(data)

# Visualize the NetworkX graph
plt.figure(figsize=(15, 12))

# Draw nodes for each type with different colors
node_colors = {'building': ('blue',  5),
               'chemical': ('green', 3)}
# for node_type, node_setting in node_colors.items():
    # nodes = [node for node, data in graph.nodes(data=True) if data['type'] == node_type]
nx.draw_networkx_nodes(graph, pos=nx.spring_layout(graph),
                           nodelist=nodes, node_color=node_setting[0],
                           node_size=node_setting[1], label=None)

# Draw edges representing relationship between buildings and chemicals
edges = data['building', 'contains', 'chemical'].edge_index.t().tolist()
graph.add_edges_from(edges, type=('building', 'contains', 'chemical'))
nx.draw_networkx_edges(graph, pos=nx.spring_layout(graph), edge_color='black', arrows=False, width=0.5)

# Add legend
node_legend = [plt.Line2D([0], [0], marker='o', color='w', markerfacecolor=color[0], markersize=3, label=label) for label, color in node_colors.items()]
plt.legend(handles=node_legend)

plt.title("Department of Energy Chemical Agents and Buildings: 1998")
plt.axis('off')
plt.show()


## Model Building


TODO: Parse this code from the [PyG tutorial](https://pytorch-geometric.readthedocs.io/en/latest/get_started/introduction.html)

In [14]:
import torch
import torch.nn.functional as F
from torch_geometric.nn import GCNConv, HeteroConv, Linear, SAGEConv


class HeteroGNN(torch.nn.Module):
    """
    Heterogenous Model:
      - https://pytorch-geometric.readthedocs.io/en/latest/tutorial/heterogeneous.html#using-the-heterogeneous-convolution-wrapper
    """
    def __init__(self, hidden_channels, out_channels, num_layers):
        super().__init__()

        self.convs = torch.nn.ModuleList()
        for _ in range(num_layers):
            conv = HeteroConv({
                ('building', 'contains', 'chemical'): SAGEConv((-1, -1), hidden_channels)
            }, aggr='sum')
            self.convs.append(conv)

        self.lin = Linear(hidden_channels, out_channels)

    def forward(self, x_dict, edge_index_dict):
        for conv in self.convs:
            x_dict = conv(x_dict, edge_index_dict)
            x_dict = {key: x.relu() for key, x in x_dict.items()}
        return self.lin(x_dict['building'])

model = HeteroGNN(hidden_channels=64, out_channels=1,
                  num_layers=2)



In [16]:
def train():
    model.train()
    optimizer.zero_grad()
    out = model(data.x_dict, data.edge_index_dict)
    loss = F.cross_entropy(out['building'], data['chemical'].y[mask])
    loss.backward()
    optimizer.step()
    return float(loss)

train()

NameError: name 'optimizer' is not defined