# Pulse AI Assistant - Data Exploration

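This notebook explores the genetic network data for the Pulse AI Assistant project. It loads the gene and edge tables, then plots the distribution of gene connections (degree), the ten most connected genes, and the number of genes per protein family.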

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

# Notebook-wide plot defaults: seaborn's "whitegrid" style and a 12x6
# default figure size (individual cells may still pass their own figsize).
sns.set_style(style="whitegrid")
plt.rcParams.update({'figure.figsize': (12, 6)})

In [None]:
# Read the two raw CSV files into DataFrames: the genes table and the edges table.
# NOTE(review): only the edges file is parsed with ';' as the separator; the
# genes file relies on the pandas default (',') -- confirm this matches the files.
genes_df = pd.read_csv('../data/raw/N_table_filtered.csv')
edges_df = pd.read_csv('../data/raw/E_table_filtered.csv', sep=';')

# Report (rows, columns) of each table as a quick sanity check on the load.
print(
    f"Genes dataset: {genes_df.shape}",
    f"Edges dataset: {edges_df.shape}",
    sep="\n",
)

In [None]:
# Basic info about genes: column dtypes / non-null counts, then a preview
# of the three columns used by the plots further down.
print("=== Genes Dataset Info ===")
# DataFrame.info() writes its report to stdout itself and returns None, so it
# must not be wrapped in print() -- that would emit a stray "None" line.
genes_df.info()
print("\nFirst 5 genes:")
# Left as the cell's last expression so the notebook renders it as a table.
genes_df[['display name', 'target::family', 'degree.layout']].head()

In [None]:
# Histogram (20 bins) of the 'degree.layout' column of the genes table,
# drawn on an explicitly created 10x5 axes.
fig, ax = plt.subplots(figsize=(10, 5))
sns.histplot(data=genes_df, x='degree.layout', bins=20, ax=ax)
ax.set_title('Distribution of Gene Connections')
ax.set_xlabel('Number of Connections (degree)')
ax.set_ylabel('Count')
plt.show()

In [None]:
# Select the ten rows with the largest 'degree.layout' value and plot them
# as a bar chart with one bar per 'display name'.
top_genes = genes_df.nlargest(n=10, columns='degree.layout')

fig, ax = plt.subplots(figsize=(12, 6))
sns.barplot(data=top_genes, x='degree.layout', y='display name', ax=ax)
ax.set_title('Top 10 Most Connected Genes')
ax.set_xlabel('Number of Connections')
ax.set_ylabel('Gene Name')
fig.tight_layout()
plt.show()

In [None]:
# Count the rows per 'target::family' value (value_counts() omits missing
# values by default) and show the counts as a bar chart.
family_counts = genes_df['target::family'].value_counts()

fig, ax = plt.subplots(figsize=(10, 6))
family_counts.plot(kind='bar', ax=ax)
ax.set_title('Distribution of Protein Families')
ax.set_xlabel('Protein Family')
ax.set_ylabel('Count')
# Tilt the family names so neighbouring tick labels are less likely to overlap.
plt.xticks(rotation=45)
fig.tight_layout()
plt.show()

## Next Steps
1. Generate embeddings for gene descriptions
2. Test semantic similarity search
3. Build hybrid search prototype