In [None]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.feature_selection import mutual_info_classif
from sklearn.preprocessing import OneHotEncoder

# Assume 'df' is your dataframe with features and 'Label' column.
# Hold out 30% of the rows as a test partition. `stratify` keeps the class
# proportions of 'Label' the same in both partitions, and the fixed
# `random_state` makes the split (and everything derived from it) reproducible
# across kernel restarts instead of changing on every run.
train_df, test_df = train_test_split(
    df,
    test_size=0.3,
    stratify=df['Label'],
    random_state=42,
)

# Separate features and labels
X_train = train_df.drop(columns='Label')
y_train = train_df['Label']

# One-hot encode categorical string columns.
# handle_unknown='ignore' does not change the encoding of the training data, but
# it lets the fitted encoder transform held-out rows (e.g. test_df) containing
# categories that were not present during fit: those are encoded as all zeros
# instead of raising an error.
ohe = OneHotEncoder(handle_unknown='ignore')

str_cols = X_train.select_dtypes(include='object').columns
str_cols_ohe = ohe.fit_transform(X_train[str_cols]).toarray()  # densify the encoder output
str_cols_ohe_names = ohe.get_feature_names_out(str_cols)

# Replace string columns with one-hot encoded columns.
# The encoded frame reuses X_train's index, so both pieces are already row-aligned.
non_str_part = X_train.drop(columns=str_cols)
ohe_part = pd.DataFrame(str_cols_ohe, columns=str_cols_ohe_names, index=X_train.index)

# pd.concat keeps duplicate column names silently, so fail loudly if a generated
# one-hot name collides with a column that is already present.
clashing_cols = non_str_part.columns.intersection(ohe_part.columns)
if len(clashing_cols) > 0:
    raise ValueError(
        f"columns overlap between original and one-hot encoded features: {list(clashing_cols)}"
    )

# Place the two pieces side by side. concat is used instead of DataFrame.join
# because join matches rows by index label and would multiply rows if the
# index contains repeated labels, leaving the features out of step with y_train.
X_train_ohe = pd.concat([non_str_part, ohe_part], axis=1)

# Calculate mutual information scores.
# Flag the one-hot columns as discrete so they are not handled as continuous
# variables (the default treatment for a dense feature matrix); every other
# column keeps the continuous treatment. The estimator for continuous features
# uses random noise, so a fixed `random_state` keeps the scores, and therefore
# the selected features, stable between runs.
is_one_hot = X_train_ohe.columns.isin(str_cols_ohe_names)
mi_scores = mutual_info_classif(
    X_train_ohe,
    y_train,
    discrete_features=is_one_hot,
    random_state=42,
)

# Create dataframe of feature names and MI scores
feat_scores = pd.DataFrame({'Feature': X_train_ohe.columns, 'MI Score': mi_scores})

# Select top k features based on MI score.
# nlargest returns at most k rows, so fewer than k features are listed when the
# encoded frame has fewer than k columns.
k = 20
top_features = feat_scores.nlargest(k, 'MI Score')['Feature'].tolist()

print(f"Top {k} features selected by mutual information:")
print(top_features)



In [1]:
import networkx as nx

# Build a small directed multigraph on nodes 1..4.
G = nx.MultiDiGraph()
G.add_nodes_from(range(1, 5))

# (source, target, weight) triples, inserted in this order. The pair (1, 2)
# appears twice on purpose: a MultiDiGraph keeps both as parallel edges.
weighted_edges = [
    (1, 2, 0.5),
    (1, 2, 0.7),
    (2, 3, 0.6),
    (3, 4, 0.8),
    (4, 1, 0.9),
    (2, 4, 0.4),
]
for src, dst, w in weighted_edges:
    G.add_edge(src, dst, weight=w)

import pickle
from pathlib import Path

# Destination of the serialized graph, relative to the current working directory.
pickle_path = Path('../data/jp_morgan/pickled/test.pickle')

# Opening a file for writing does not create missing directories, so make sure
# the target folder exists first (no-op when it is already there).
pickle_path.parent.mkdir(parents=True, exist_ok=True)

# Serialize the graph G to disk in binary mode.
with pickle_path.open('wb') as f:
    pickle.dump(G, f)