# Task
Convert this data into a graph

Here is all the data you need:
"citing.txt"
"musae_git_edges.csv"
"musae_git_features.json"
"musae_git_target.csv"
"README.txt"

## Data loading

### Subtask:
Load the provided CSV and JSON files into pandas DataFrames.


**Reasoning**:
Load the provided CSV and JSON files into pandas DataFrames and print the first few rows of each to verify.



In [None]:
import pandas as pd
import json

# Edge list (CSV). A missing file is reported and leaves df_edges as None;
# the preview is only rendered once the read has succeeded.
try:
    df_edges = pd.read_csv('musae_git_edges.csv')
except FileNotFoundError:
    print("Error: musae_git_edges.csv not found.")
    df_edges = None
else:
    display(df_edges.head())

# Node features (JSON object). Each top-level key becomes one row label
# (orient='index'); a missing or malformed file leaves df_features as None.
try:
    with open('musae_git_features.json', 'r') as f:
        data = json.load(f)
except FileNotFoundError:
    print("Error: musae_git_features.json not found.")
    df_features = None
except json.JSONDecodeError:
    print("Error: Invalid JSON format in musae_git_features.json")
    df_features = None
else:
    df_features = pd.DataFrame.from_dict(data, orient='index')
    display(df_features.head())

# Node targets (CSV), handled the same way as the edge list.
try:
    df_target = pd.read_csv('musae_git_target.csv')
except FileNotFoundError:
    print("Error: musae_git_target.csv not found.")
    df_target = None
else:
    display(df_target.head())

## Data exploration

### Subtask:
Explore the loaded dataframes (`df_edges`, `df_features`, and `df_target`) to understand their structure, identify key columns, and determine the relationships between them.


**Reasoning**:
Inspect the dataframes to understand their structure, identify key columns, and determine relationships between them.  This includes examining shapes, data types, descriptive statistics, and unique values in potential identifier columns.



In [None]:
# Inspect the shape and data types of each DataFrame.
# DataFrame.info() prints its report itself and returns None, so it is called
# directly; wrapping it in display() would only render an extra "None".
print("df_edges shape:", df_edges.shape)
print("df_edges info:")
df_edges.info()

print("\ndf_features shape:", df_features.shape)
print("df_features info:")
df_features.info()

print("\ndf_target shape:", df_target.shape)
print("df_target info:")
df_target.info()


# Identify node and edge columns: count the distinct IDs seen in each
# candidate identifier column / index.
print("\ndf_edges unique values in id_1:", df_edges['id_1'].nunique())
print("df_edges unique values in id_2:", df_edges['id_2'].nunique())

print("\ndf_features index (potential node IDs):", df_features.index.nunique())
print("First 5 values of the df_features index:", df_features.index.unique()[:5])

print("\ndf_target unique values in id:", df_target['id'].nunique())
print("First 5 values of the id column in df_target:", df_target['id'].unique()[:5])

# Relationship analysis: total count of missing cells per DataFrame
# (sum over columns, then over the resulting per-column counts).
print("\nMissing values in df_edges:", df_edges.isnull().sum().sum())
print("Missing values in df_features:", df_features.isnull().sum().sum())
print("Missing values in df_target:", df_target.isnull().sum().sum())

# Visual inspection
print("\nFirst 5 rows of df_edges:")
display(df_edges.head())

print("\nFirst 5 rows of df_features:")
display(df_features.head())

print("\nFirst 5 rows of df_target:")
display(df_target.head())

## Data wrangling

### Subtask:
Merge the three dataframes: `df_edges`, `df_features`, and `df_target` into a single dataframe suitable for graph creation.


**Reasoning**:
Convert the index of `df_features` to numeric and merge the three dataframes. Handle potential errors during index conversion and missing values after merging.



In [None]:
# Convert the index of df_features to int64 so it can be joined against the
# integer 'id' column of df_target.
try:
    df_features.index = pd.to_numeric(df_features.index, errors='raise').astype('int64')
except ValueError as e:
    print(f"Error converting index to numeric: {e}")
    # Fallback: coerce unparseable labels to NaN and drop those rows, so the
    # merges below still see an integer index instead of leftover strings
    # (string labels cannot match the integer 'id' column).
    # In a real-world scenario, investigate the dropped rows first.
    coerced_index = pd.to_numeric(df_features.index, errors='coerce')
    keep_mask = ~pd.isna(coerced_index)
    print(f"Dropping {int((~keep_mask).sum())} feature rows with non-numeric IDs.")
    df_features = df_features.loc[keep_mask].copy()
    df_features.index = coerced_index[keep_mask].astype('int64')

# Merge df_features and df_target: feature rows are matched by index against
# the 'id' column; only IDs present on both sides are kept (inner join).
df_merged = pd.merge(df_features, df_target, left_index=True, right_on='id', how='inner')

# Merge with df_edges on id == id_1. The result has one row per edge and the
# feature/target columns describe the edge's id_1 endpoint only; IDs that
# never occur in id_1 do not appear in the result at all.
df_final = pd.merge(df_merged, df_edges, left_on='id', right_on='id_1', how='inner')

# Handle missing values
# Check for missing values after merging
print("Missing values in the final dataframe:\n", df_final.isnull().sum())
# Impute missing cells in numeric columns with that column's mean.
# Non-numeric columns are deliberately left untouched.
# NOTE(review): mean imputation presumes the feature columns hold real-valued
# measurements -- confirm against the dataset README before relying on it.
for col in df_final.columns:
    if df_final[col].isnull().any() and pd.api.types.is_numeric_dtype(df_final[col]):
        df_final[col] = df_final[col].fillna(df_final[col].mean())

print(df_final.shape)
display(df_final.head())

## Data preparation

### Subtask:
Prepare the merged dataframe `df_final` for graph creation.


**Reasoning**:
Identify node and edge columns, validate data types, and handle any remaining missing values in `df_final`.



In [None]:
# Edge endpoints: id_1 is treated as the source column, id_2 as the target.
source_nodes, target_nodes = df_final['id_1'], df_final['id_2']

# Endpoint columns that are not numeric yet are converted; a failed
# conversion is reported and the column is left exactly as it was.
source_needs_cast = not pd.api.types.is_numeric_dtype(source_nodes)
if source_needs_cast:
    try:
        source_nodes = pd.to_numeric(source_nodes, errors='raise')
    except ValueError as e:
        print(f"Error converting source nodes to numeric: {e}")

target_needs_cast = not pd.api.types.is_numeric_dtype(target_nodes)
if target_needs_cast:
    try:
        target_nodes = pd.to_numeric(target_nodes, errors='raise')
    except ValueError as e:
        print(f"Error converting target nodes to numeric: {e}")

# Write the (possibly converted) endpoint columns back into df_final.
df_final['id_1'] = source_nodes
df_final['id_2'] = target_nodes

# Report the per-column count of missing values; nothing is changed here.
print("Missing values in the final dataframe:\n", df_final.isnull().sum())

# Feature scaling (optional, for visualization or analysis):
# every numeric column except the identifier columns and the target label
# is rescaled with MinMaxScaler.
non_feature_cols = ['id_1', 'id_2', 'id', 'ml_target']
numerical_cols = df_final.select_dtypes(include=['number']).columns.drop(non_feature_cols)

from sklearn.preprocessing import MinMaxScaler

scaler = MinMaxScaler()
df_final[numerical_cols] = scaler.fit_transform(df_final[numerical_cols])

display(df_final.head())

## Summary:

### Q&A
No questions were posed in the provided task.

### Data Analysis Key Findings
* Two CSV files (`musae_git_edges.csv`, `musae_git_target.csv`) and one JSON file (`musae_git_features.json`) were loaded into pandas DataFrames.
* `df_features` contained a substantial number of missing values (increasing with higher column indices) that were later imputed with the mean for numerical features.
* The index of `df_features` was of type 'object' while the corresponding 'id' columns in other dataframes were 'int64'. This required a type conversion to 'int64' to enable merging.
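* The number of unique values in `id_1` and `id_2` of `df_edges` (30,855 and 30,195 respectively) was each lower than the number of unique nodes in `df_features` and `df_target` (37,700). This only shows that some nodes never appear in one particular column; whether any node has no connections at all depends on the union of `id_1` and `id_2`, which was not checked.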
* Numerical features in `df_final` were scaled using `MinMaxScaler` to a range of 0 to 1, excluding node IDs, `id`, and the target variable `ml_target`.
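* The final merged dataframe, `df_final`, has a shape of (289003, 47). It has one row per edge (inner join on `id` = `id_1`), so the node features and target label in each row describe the edge's `id_1` node, alongside the edge columns `id_1` and `id_2`.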

### Insights or Next Steps
* Investigate the meaning of the features in `df_features` to potentially improve imputation strategies beyond mean imputation.
* Check whether any node IDs in `df_features`/`df_target` are absent from both `id_1` and `id_2` of `df_edges`; if such unconnected nodes exist, explore their significance in the context of the dataset.
* Visualize the constructed graph to identify patterns and relationships between nodes.


In [None]:
# Step 2: Install Necessary Libraries
# You can install networkx in Colab using pip
#!pip install networkx==3.2.1

# Step 3: Define Node Features and Labels
# Identify the node ID column, edge columns, feature columns, and the target label column
node_id_col = 'id'
source_node_col = 'id_1'
target_node_col = 'id_2'
target_label_col = 'ml_target' # Based on the notebook's description of the target

# Identify feature columns. Exclude IDs and the target label.
feature_cols = [col for col in df_final.columns if col not in [node_id_col, source_node_col, target_node_col, target_label_col]]


# Step 4: Create the Graph using NetworkX
import networkx as nx

# Create an empty graph. Use nx.Graph() for undirected based on "mutual follower relationships".
G = nx.Graph()

# Build per-node attribute lookups.
# The feature and label columns of a df_final row describe the node in the
# 'id' column (df_final was built by joining node data to edges on
# id == id_1). Attributes are therefore keyed by 'id' only; taking them from
# a row where the node merely appears as id_2 would attach another node's
# features and label. All rows sharing an 'id' carry the same node data, so
# the first row per 'id' is sufficient.
node_rows = df_final.drop_duplicates(subset=node_id_col).set_index(node_id_col)
node_labels = node_rows[target_label_col].to_dict()
node_features = node_rows[feature_cols].to_dict(orient='index')

# Every node ID that occurs anywhere in the edge list or the id column.
unique_nodes = pd.concat([df_final[source_node_col], df_final[target_node_col], df_final[node_id_col]]).unique()

# Add nodes with features and labels.
# Nodes that never occur in the 'id' column have no attribute row in
# df_final: they are added with label=None and an empty feature dict rather
# than with data borrowed from a neighbour.
for node_id in unique_nodes.tolist():
    G.add_node(
        node_id,
        label=node_labels.get(node_id),
        features=node_features.get(node_id, {}),
    )


# Add edges
# One edge per df_final row; nx.Graph stores each undirected pair once, so
# repeated or mirrored pairs collapse into a single edge.
G.add_edges_from(zip(df_final[source_node_col].tolist(), df_final[target_node_col].tolist()))

print(f"Graph created with {G.number_of_nodes()} nodes and {G.number_of_edges()} edges.")

In [None]:
# prompt: print node with their features

# Show the stored attributes (label and feature dict) of the first five nodes.
print("\nNode data for a few nodes:")
preview_nodes = list(G.nodes)[:5]
for node_id in preview_nodes:
    node_data = G.nodes[node_id]
    print(f"\nNode ID: {node_id}")
    print(f"  Label: {node_data.get('label')}")
    print(f"  Features: {node_data.get('features')}")

In [None]:
# prompt: print node labels

# Walk every node together with its attribute dict and report the stored
# label (None is printed when a node has no 'label' attribute).
print("\nNode labels:")
for node, data in G.nodes.items():
    print(f"Node {node}: Label = {data.get('label')}")