## Iris Dataset:
Contains measurements of different species of iris flowers and is commonly used for classification tasks.

- 150 rows (samples), 5 columns (4 numeric features + 1 label)
-	Target (Label): species → The type of iris flower (Iris-setosa, Iris-versicolor, Iris-virginica)


# 1. Dataset Exploration

In [15]:
import pandas as pd

# The UCI file has no header row, so the column names are supplied explicitly.
url = "https://archive.ics.uci.edu/ml/machine-learning-databases/iris/iris.data"
columns = ["sepal_length", "sepal_width", "petal_length", "petal_width", "species"]
df = pd.read_csv(url, header=None, names=columns)

# Shape / schema summary, followed by the heading for the missing-value table.
# One print call with a newline separator emits the same lines as separate calls.
print(
    f"Number of rows: {df.shape[0]}",
    f"Number of columns: {df.shape[1]}",
    f"Column names: {df.columns}",
    "",
    "Missing values: ",
    sep="\n",
)

# Per-column count of missing entries (isna is the pandas alias of isnull).
print(df.isna().sum())
print()

# Preview of the first five samples.
print(df.head(5))


Number of rows: 150
Number of columns: 5
Column names: Index(['sepal_length', 'sepal_width', 'petal_length', 'petal_width',
       'species'],
      dtype='object')

Missing values: 
sepal_length    0
sepal_width     0
petal_length    0
petal_width     0
species         0
dtype: int64

   sepal_length  sepal_width  petal_length  petal_width      species
0           5.1          3.5           1.4          0.2  Iris-setosa
1           4.9          3.0           1.4          0.2  Iris-setosa
2           4.7          3.2           1.3          0.2  Iris-setosa
3           4.6          3.1           1.5          0.2  Iris-setosa
4           5.0          3.6           1.4          0.2  Iris-setosa


# 2. Generate Correlation Matrix

In [1]:
import pandas as pd
import numpy as np

# Step 1: fetch the Iris measurements; the raw file has no header row,
# so the column names from the dataset description are passed in.
url = "https://archive.ics.uci.edu/ml/machine-learning-databases/iris/iris.data"
columns = ["sepal_length", "sepal_width", "petal_length", "petal_width", "species"]
df = pd.read_csv(url, header=None, names=columns)
print(df.shape[0])

# The species label is categorical, so it is excluded before correlating.
df_numeric = df.drop("species", axis=1)

# Step 2: pairwise Pearson correlation (the pandas default) as a plain ndarray.
correlation_matrix = df_numeric.corr(method="pearson").to_numpy()

# Heading and matrix are emitted on consecutive lines.
print("\n🔹 Correlation Matrix from Real Dataset:", correlation_matrix, sep="\n")


150

🔹 Correlation Matrix from Real Dataset:
[[ 1.         -0.10936925  0.87175416  0.81795363]
 [-0.10936925  1.         -0.4205161  -0.35654409]
 [ 0.87175416 -0.4205161   1.          0.9627571 ]
 [ 0.81795363 -0.35654409  0.9627571   1.        ]]


# 3. Generate Dataset

### How Are We Turning the Iris Dataset into a Graph Dataset?

- 1️⃣ Nodes = Dataset features (sepal_length, petal_width, etc.).
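- 2️⃣ Edges = Pairs of features whose absolute correlation exceeds 0.6 (the diagonal also passes this test, so every node gets a self-loop).
- 3️⃣ Node Features = The mean value of each feature (one scalar per node).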
- 4️⃣ Stored in a PyTorch Geometric Data object for graph-based learning.

In [7]:
import torch
from torch_geometric.data import InMemoryDataset, Data
import pandas as pd
import numpy as np
import os #manages file system operations 

class CorrelationGraphDataset(InMemoryDataset):
    """Single-graph dataset derived from the Iris feature correlation matrix.

    Each of the four numeric Iris columns becomes a node whose only feature is
    the column mean. Two nodes are connected when the absolute value of their
    correlation exceeds 0.6.

    The processed graph is rebuilt on every construction (any cached
    ``graph_data.pt`` is discarded) and then loaded into the instance so that
    ``dataset[0]`` returns the graph.
    """

    def __init__(self, root, transform=None, pre_transform=None, pre_filter=None):
        """Build the graph from scratch and load it into this dataset.

        Args:
            root: Directory under which the ``processed/`` folder is kept.
            transform: Forwarded unchanged to ``InMemoryDataset``.
            pre_transform: Forwarded unchanged to ``InMemoryDataset``.
            pre_filter: Forwarded unchanged to ``InMemoryDataset``.
        """
        print("\n DEBUG: Dataset class initialized!")  # Print when dataset is created

        # Per the PyG docs the base constructor calls process() itself when the
        # processed file is missing. This flag records whether that happened so
        # the dataset is not downloaded and processed a second time below.
        # It must be set before super().__init__() runs.
        self._processed_this_init = False

        super().__init__(root, transform, pre_transform, pre_filter)

        processed_path = self.processed_paths[0]
        print("Printing processed path for the first time:", processed_path)

        if not self._processed_this_init:
            # A cached file satisfied the base class: drop it and rebuild so
            # the graph always reflects a fresh run of process().
            if os.path.exists(processed_path):
                print(f"\n DEBUG: Deleting existing dataset: {processed_path}")
                os.remove(processed_path)  # Remove old processed file to force reprocessing

            print("DEBUG: Calling process() now...")
            self.process()

        # Without this the saved graph never reaches the instance and indexing
        # the dataset yields no data.
        self.load(processed_path)

    @property
    def processed_file_names(self):
        """Defines the processed dataset file name."""
        return ['graph_data.pt']

    def process(self):
        """Download Iris, build the correlation graph, and save it to disk.

        Any failure (download, correlation, tensor conversion) propagates to
        the caller instead of being swallowed, so a half-built dataset is never
        silently left behind.
        """
        print("DEBUG: process() method is running...")  # Print first thing inside process()

        # Step 1: Load a real dataset (Iris dataset)
        print("\n DEBUG: Loading dataset...")
        url = "https://archive.ics.uci.edu/ml/machine-learning-databases/iris/iris.data"
        columns = ["sepal_length", "sepal_width", "petal_length", "petal_width", "species"]
        df = pd.read_csv(url, names=columns)

        print("\n DEBUG: Dataset loaded successfully!")
        print(df.head())  # Show first few rows

        # Drop categorical column
        df_numeric = df.drop(columns=["species"])

        # Step 2: Compute correlation matrix
        print("\n DEBUG: Computing correlation matrix...")
        correlation_matrix = df_numeric.corr().to_numpy()

        print("\n DEBUG: Correlation matrix computed!")
        print(correlation_matrix)  # Show matrix

        # Extract edges. np.where returns (row_indices, col_indices), which
        # stacks directly into the 2 x num_edges layout of edge_index.
        # NOTE: the matrix is symmetric, so each edge appears in both
        # directions; the diagonal is 1.0, so every node also gets a self-loop.
        print("\n DEBUG: Extracting edges...")
        threshold = 0.6  # Keep strong correlations
        edges = np.where(np.abs(correlation_matrix) > threshold)
        print("!!!!Edges", edges)
        edge_index = torch.tensor(np.array(edges), dtype=torch.long)

        print("\n DEBUG: Edges extracted!")
        print(edge_index)

        # Step 3: Create node features (one column: the mean of each feature)
        print("\n DEBUG: Computing node features...")
        feature_means = df_numeric.mean()
        print(feature_means)  # Print mean before conversion

        # reshape(-1, 1) yields one row per node and a single feature column.
        node_features = torch.tensor(
            feature_means.to_numpy().reshape(-1, 1), dtype=torch.float
        )
        print("\n DEBUG: Node features created!")

        print("\n🔹 Final Node Features (x):")
        print(node_features)

        # Create a PyG `Data` object
        print("\n DEBUG: Creating graph data object...")
        graph_data = Data(x=node_features, edge_index=edge_index)

        print("\n DEBUG: Graph Data Object created!")
        print(graph_data)

        # Save dataset
        print("\n DEBUG: Saving dataset...")
        self.save([graph_data], self.processed_paths[0])

        # Tell __init__ that a fresh processed file now exists.
        self._processed_this_init = True

        print("\n DEBUG: Dataset saved successfully!")

# Ensure script runs correctly
# Entry point: builds the correlation-graph dataset and prints its summary.
# Constructing the dataset triggers processing, which reads the Iris CSV from
# its URL, so network access is required.
if __name__ == "__main__":
    print("\n DEBUG: Starting script execution...")

    # Create dataset instance
    # The processed file is written under <root>/processed/ (here inside /tmp).
    dataset = CorrelationGraphDataset(root="/tmp/dataset_folder") 
    print(dataset) 
  
    print("\n DEBUG: Dataset instance created successfully!")



 DEBUG: Starting script execution...

 DEBUG: Dataset class initialized!
Printing processed path for the first time: /tmp/dataset_folder/processed/graph_data.pt

 DEBUG: Deleting existing dataset: /tmp/dataset_folder/processed/graph_data.pt
DEBUG: Calling process() now...
DEBUG: process() method is running...

 DEBUG: Loading dataset...

 DEBUG: Dataset loaded successfully!
   sepal_length  sepal_width  petal_length  petal_width      species
0           5.1          3.5           1.4          0.2  Iris-setosa
1           4.9          3.0           1.4          0.2  Iris-setosa
2           4.7          3.2           1.3          0.2  Iris-setosa
3           4.6          3.1           1.5          0.2  Iris-setosa
4           5.0          3.6           1.4          0.2  Iris-setosa

 DEBUG: Computing correlation matrix...

 DEBUG: Correlation matrix computed!
[[ 1.         -0.10936925  0.87175416  0.81795363]
 [-0.10936925  1.         -0.4205161  -0.35654409]
 [ 0.87175416 -0.4205161   