In [None]:
import numpy as np
import pandas as pd
import seaborn as sns

import os
import torch
import torch_geometric
import torch_scatter
import torch_sparse
import torch_cluster
import torch_spline_conv
import networkx as nx

import matplotlib.pyplot as plt

import sklearn
from sklearn.svm import SVC
from sklearn.metrics import balanced_accuracy_score, roc_auc_score, accuracy_score, confusion_matrix, roc_curve
from scipy.stats import zscore, pearsonr, uniform
from sklearn.model_selection import KFold, StratifiedKFold, RandomizedSearchCV

from scipy.io import loadmat

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import roc_auc_score
from sklearn.metrics import accuracy_score


# **Load in Data Frames**

In [None]:
# Load the categorical training variables (TRAIN_CATEGORICAL.csv) into df1.

file_path_trainC = "/content/drive/My Drive/data csv/TRAIN_CATEGORICAL.csv"
df1 = pd.read_csv(file_path_trainC)


In [None]:
# Read the categorical CSV into train_cat and preview its first rows.
train_cat = pd.read_csv(file_path_trainC)
train_cat.head()

In [None]:
# List the column names of the categorical training frame.
train_cat.columns

In [None]:
# Functional connectome matrices

file_path_trainFCM = "/content/drive/My Drive/data csv/TRAIN_FUNCTIONAL_CONNECTOME_MATRICES.csv"
# df2 must be read from the connectome path defined above, not from
# file_path_trainC (which points at the categorical CSV).
df2 = pd.read_csv(file_path_trainFCM)

In [None]:
# Load the functional connectome CSV into train_FCM.
train_FCM = pd.read_csv(file_path_trainFCM)

In [None]:
# Preview the first rows of the connectome frame.
train_FCM.head()

In [None]:
# List the column names of the connectome frame.
train_FCM.columns

In [None]:
# Show the frame through the notebook's rich display (pandas truncates the
# rows/columns shown) instead of a plain-text print().
train_FCM

## Preprocess FCM to feed into GNN

In [None]:
# Remove the identifier column so each remaining row can be passed to
# vector_to_adjacency. errors='ignore' keeps this cell re-runnable: on a second
# run the column is already gone and a plain drop() would raise KeyError.
train_FCM = train_FCM.drop(columns=['participant_id'], errors='ignore')

def vector_to_adjacency(vector, n_nodes=200):
    """Expand a flattened strict-upper-triangle vector into a symmetric matrix.

    Parameters
    ----------
    vector : array-like
        The ``n_nodes * (n_nodes - 1) / 2`` off-diagonal values, ordered like
        ``np.triu_indices(n_nodes, k=1)`` (row by row over the upper triangle).
    n_nodes : int, default 200
        Number of nodes, i.e. the side length of the returned matrix.

    Returns
    -------
    numpy.ndarray
        Float array of shape ``(n_nodes, n_nodes)``; symmetric, zero diagonal.

    Raises
    ------
    ValueError
        If ``vector`` does not hold exactly ``n_nodes * (n_nodes - 1) / 2``
        values (a length-1 input would otherwise be broadcast silently).
    """
    values = np.asarray(vector, dtype=float).ravel()
    expected = n_nodes * (n_nodes - 1) // 2
    if values.size != expected:
        raise ValueError(
            f"expected {expected} values for {n_nodes} nodes, got {values.size}"
        )

    adj_matrix = np.zeros((n_nodes, n_nodes))

    # Fill the strict upper triangle (k=1 skips the diagonal) ...
    upper = np.triu_indices(n_nodes, k=1)
    adj_matrix[upper] = values
    # ... then mirror it into the lower triangle to make the matrix symmetric.
    adj_matrix = adj_matrix + adj_matrix.T

    return adj_matrix

# Turn every row of train_FCM into its own adjacency matrix, then stack them
# into one array with one matrix per row.
adj_matrices = np.array(list(map(vector_to_adjacency, train_FCM.to_numpy())))
print(adj_matrices.shape)

In [None]:
def adjacency_to_edge_list(adj_matrix, threshold=0.1):
    """Extract the edges of ``adj_matrix`` whose weight exceeds ``threshold``.

    Only entries strictly greater than ``threshold`` are kept, so with the
    default positive threshold zero and negative entries are dropped.

    Returns
    -------
    tuple of (torch.Tensor, torch.Tensor)
        ``edge_index``: long tensor with one row of source indices and one row
        of target indices; ``edge_weight``: float tensor holding the matrix
        value of each kept edge, in the same order.
    """
    strong = adj_matrix > threshold
    # Row/column positions of the kept entries, stacked as (sources, targets).
    endpoints = np.vstack(np.nonzero(strong))
    strengths = adj_matrix[endpoints[0], endpoints[1]]

    index_tensor = torch.tensor(endpoints, dtype=torch.long)
    weight_tensor = torch.tensor(strengths, dtype=torch.float)
    return index_tensor, weight_tensor

# One (edge_index, edge_weight) pair per adjacency matrix.
edge_data = list(map(adjacency_to_edge_list, adj_matrices))

# Split the pairs into two parallel lists.
edge_indices = [index_tensor for index_tensor, _ in edge_data]
edge_weights = [weight_tensor for _, weight_tensor in edge_data]

# edge_index has two rows (source nodes, target nodes); edge_weight has one
# value per edge.
print(edge_indices[0].shape)
print(edge_weights[0].shape)

In [None]:
# Wrap each participant's edge list in a torch_geometric Data object.
# Node features are a 200x200 identity matrix (one one-hot row per node).
graphs = [
    torch_geometric.data.Data(
        x=torch.eye(200),
        edge_index=edge_indices[i],
        edge_attr=edge_weights[i],
    )
    for i in range(min(len(edge_indices), len(edge_weights)))
]

print(graphs[0])

In [None]:
# Sanity-check the conversion by plotting the first adjacency matrix.

sample_adj_matrix = adj_matrices[0]

# Heatmap drawn on a fresh 10x8 figure; sns.heatmap returns the Axes it drew
# on, so the title is set directly on that object.
plt.figure(figsize=(10, 8))
sns.heatmap(data=sample_adj_matrix, square=True, cmap="viridis").set_title("Adj Matrix Heatmap")
plt.show()


In [None]:
def visualize_graph(edge_index, title="Graph Visualization", node_limit=50):
    """Draw an undirected NetworkX graph from the first edges of ``edge_index``.

    Parameters
    ----------
    edge_index : torch.Tensor
        Tensor with one row of source node ids and one row of target node ids
        (one column per edge).
    title : str, default "Graph Visualization"
        Title shown above the plot.
    node_limit : int, default 50
        Maximum number of edges (columns of ``edge_index``) that are drawn.
        NOTE: despite its name this caps the number of *edges*, not nodes;
        the name is kept so existing callers keep working.
    """
    # detach().cpu() lets tensors that live on a GPU or track gradients be
    # converted too; for plain CPU tensors it changes nothing.
    edges = edge_index.detach().cpu().numpy().T

    G = nx.Graph()
    # Plain Python int tuples keep node labels independent of NumPy scalar types.
    G.add_edges_from(map(tuple, edges[:node_limit].tolist()))

    plt.figure(figsize=(8, 6))
    nx.draw(G, node_size=50, with_labels=False, edge_color="blue")
    plt.title(title)
    plt.show()

# Plot the graph built from the first participant's edge list.
visualize_graph(edge_indices[0])

In [None]:
# Load the quantitative training variables (TRAIN_QUANTITATIVE.csv) and preview the first rows.

file_path_trainQ = '/content/drive/My Drive/data csv/TRAIN_QUANTITATIVE.csv'
train_Quant = pd.read_csv(file_path_trainQ)
train_Quant.head()

In [None]:
# List the column names of the quantitative training frame.
train_Quant.columns

In [None]:
# Load the ADHD and sex solutions (TRAINING_SOLUTIONS.csv) used for model training
# and preview the first rows.

file_path_trainS = '/content/drive/My Drive/data csv/TRAINING_SOLUTIONS.csv'
train_Solutions = pd.read_csv(file_path_trainS)
train_Solutions.head()

In [None]:
# List the column names of the solutions frame.
train_Solutions.columns

# **Exploratory Data Analysis**

Use .info() and .describe() to summarize each dataset.

In [None]:
# Summarize the categorical frame: column dtypes and non-null counts.
train_cat.info()

Understand the distribution of the categorical variables with .value_counts().

In [None]:
# Barratt_Barratt_P2_Occ - Barratt Simplified Measure of Social Status - Parent 2 Occupation
# Count how often each occupation code occurs.
train_cat['Barratt_Barratt_P2_Occ'].value_counts()

# Look these integer codes (e.g. 0, 45, 35) up in the data dictionary to see which category each represents.

Notice that the most frequent Parent 2 occupation code is 0: homemaker / stay-at-home parent.

**Visualize distributions:**

In [None]:
# Bar chart of how often each Barratt_Barratt_P2_Occ code occurs.
# sns.countplot returns the Axes it drew on, so the title is set on that object.
occ_ax = sns.countplot(data=train_cat[['Barratt_Barratt_P2_Occ']], x='Barratt_Barratt_P2_Occ')
occ_ax.set_title("Distribution of Barratt_Barratt_P2_Occ")
plt.xticks(rotation=45)
plt.show()

In [None]:
# Histogram (20 bins) of the MRI_Track_Age_at_Scan column.
# Series.hist returns the Axes it drew on; labels are set through that object
# and the super-title through its parent figure.
age_ax = train_Quant['MRI_Track_Age_at_Scan'].hist(bins=20, figsize=(12, 10))
age_ax.figure.suptitle("MRI_Track_Age_at_Scan Distributions")
age_ax.set_xlabel('MRI_Track_Age_at_Scan')
age_ax.set_ylabel('Frequency Count')
plt.show()

In [None]:
# Gender distribution: number of rows per Sex_F value.
train_Solutions['Sex_F'].value_counts()

In [None]:
train_Solutions['Sex_F'].value_counts().plot(kind='bar', color='blue')
plt.title('Gender Distribution')
plt.xlabel('Gender (0 = Male, 1 = Female)')
plt.ylabel('Count')
plt.show()