In [13]:
import numpy as np
import pandas as pd

# Set seed for reproducibility (legacy global NumPy RNG; every draw below uses it)
np.random.seed(0)

# Parameters for the dataset
num_samples = 10000  # Number of samples
num_features = 50    # Number of features

# Generate synthetic data: independent standard-normal columns
data = np.random.randn(num_samples, num_features)

# Introduce multicollinearity by making some features combinations of others.
# Columns are processed in groups of five: within a group, column 1 becomes a
# noisy copy of column 0 and column 3 a noisy copy of column 2; column 4 is
# left untouched.
# The range stops at num_features - 3 so the highest index written
# (group_start + 3) always exists. Without that bound, a num_features that
# leaves a partial trailing group (e.g. 51 or 52) raises IndexError.
for group_start in range(0, num_features - 3, 5):
    # Highly correlated with the first column of the group
    data[:, group_start + 1] = data[:, group_start] + np.random.normal(0, 0.1, num_samples)
    # Highly correlated with the third column of the group
    data[:, group_start + 3] = data[:, group_start + 2] + np.random.normal(0, 0.1, num_samples)

# Create a DataFrame with 1-based column names: feature_1 ... feature_<num_features>
column_names = [f'feature_{i+1}' for i in range(num_features)]
df = pd.DataFrame(data, columns=column_names)

# Save the dataset to a CSV file (optional)
# df.to_csv('large_synthetic_dataset.csv', index=False)


In [14]:
# Requires `df` from the dataset cell above.
# Pairwise Pearson correlations between all columns, with the sign discarded
# so that every entry lies in [0, 1].
corr_matrix = df.corr(method='pearson').abs()


In [15]:
import networkx as nx

# Two features are linked only when their absolute correlation exceeds this value
threshold = 0.8

# Undirected graph: one node per linked feature, one weighted edge per linked pair
G = nx.Graph()

feature_names = corr_matrix.columns
corr_values = corr_matrix.to_numpy()

# The strict lower triangle (k=-1) visits every unordered pair exactly once
# and skips the diagonal, in the same row-by-row order as nested index loops.
lower_rows, lower_cols = np.tril_indices(len(feature_names), k=-1)
for row, col in zip(lower_rows, lower_cols):
    strength = corr_values[row, col]
    if strength > threshold:
        G.add_edge(feature_names[row], feature_names[col], weight=strength)


In [16]:
import plotly.graph_objects as go

# Generate positions for nodes (force-directed layout; maps node -> (x, y))
pos = nx.spring_layout(G)

# Edge coordinates. Each edge contributes its two endpoints followed by None;
# the None breaks the line so consecutive edges are not joined together.
edge_x = []
edge_y = []
for source, target in G.edges():
    x0, y0 = pos[source]
    x1, y1 = pos[target]
    edge_x.extend([x0, x1, None])
    edge_y.extend([y0, y1, None])

edge_trace = go.Scatter(
    x=edge_x, y=edge_y,
    line=dict(width=0.5, color='#888'),
    hoverinfo='none',
    mode='lines')

# Node coordinates, labels and degrees, all in G.nodes() order so they stay aligned
node_labels = list(G.nodes())
node_x = [pos[node][0] for node in node_labels]
node_y = [pos[node][1] for node in node_labels]
# Number of edges per node; drives the colour scale titled 'Node Connections'.
# Without a `color` array the colour bar is displayed but carries no information.
node_degrees = [G.degree(node) for node in node_labels]

node_trace = go.Scatter(
    x=node_x, y=node_y,
    mode='markers+text',
    text=node_labels,
    hoverinfo='text',
    marker=dict(
        showscale=True,
        colorscale='YlGnBu',
        color=node_degrees,
        size=10,
        colorbar=dict(
            thickness=15,
            # Nested title dict replaces the deprecated `titleside` attribute
            title=dict(text='Node Connections', side='right'),
            xanchor='left'
        ),
    ))

fig = go.Figure(data=[edge_trace, node_trace],
                layout=go.Layout(
                    # Nested title dict replaces the deprecated `titlefont_size` attribute
                    title=dict(
                        text='<br>Network graph of feature correlations',
                        font=dict(size=16)),
                    showlegend=False,
                    hovermode='closest',
                    margin=dict(b=20, l=5, r=5, t=40),
                    annotations=[dict(
                        text="Data source: Synthetic Data",
                        showarrow=False,
                        xref="paper", yref="paper",
                        x=0.005, y=-0.002)],
                    xaxis=dict(showgrid=False, zeroline=False),
                    yaxis=dict(showgrid=False, zeroline=False))
                )
fig.show()
