In [7]:
import numpy as np
import seaborn as sns; sns.set()
import matplotlib as mpl
import matplotlib.pyplot as plt

plt.rcParams['figure.figsize'] = [12, 10]

font = {'size'   : 15, }
axes = {'labelsize': 'large', 'titlesize': 'large'}

mpl.rc('font', **font)
mpl.rc('axes', **axes)

import pandas as pd
import data_utils


In [19]:
# Paths to data csvs
wd = "/home/behzad/Documents/barnes_lab/cplusplus_software/speed_test/repressilator/cpp"
data_dir = wd + "/output/two_species_big_3/Population_0"
distances_path = data_dir + "/distances.csv"
eigenvalues_path = data_dir + "/eigenvalues_do_fsolve_state.csv"
model_space_report_path = data_dir + "/model_space_report.csv"

# Load dataframes
distances_df = pd.read_csv(distances_path)
eigenvalues_df = pd.read_csv(eigenvalues_path)

# Make distances numeric
distances_df = data_utils.distances_pre_processing(distances_df)

In [20]:
# Join eigenvalues and distances on the simulation index, batch number and model_ref
joint_df = pd.merge(left=eigenvalues_df, right=distances_df, how='inner', on=['sim_idx', 'batch_num', 'model_ref'])
joint_df.reset_index()

# Add columns indicating check results and summary stats
joint_df = data_utils.species_sustained(joint_df)
joint_df = data_utils.make_max_eig(joint_df)
join_df = data_utils.make_sum_eig(joint_df)
joint_df = data_utils.all_negative_eigs(joint_df)
joint_df = data_utils.all_real_eigs(joint_df)
joint_df = data_utils.all_positive_eigs(joint_df)
joint_df['sum_std'] = joint_df['d2'] + joint_df['d5']
joint_df['sum_grad'] = joint_df['d1'] + joint_df['d4']

# Set accepted or rejected particles
mask = (joint_df['d1'] < 100) & (joint_df['d2'] < 10) & (joint_df['d3'] > 0) & (joint_df['d4'] < 100) & (joint_df['d5'] < 10) & (joint_df['d6'] > 0)
joint_df.loc[mask, 'Accepted'] = True
joint_df.loc[~mask, 'Accepted'] = False

In [21]:
def generate_input_data(df):
    # Model 30 has 5 eigenlvaues, 0 to 4
    eig_cols = [x for x in df.columns if 'eig' in x][0: 10]
    real_eig_cols = [x for x in eig_cols if 'real' in x]
    imag_eig_cols = [x for x in eig_cols if 'imag' in x]
    
    all_inputs = []
    all_labels = []

    for index, row in df.iterrows():
        input_set = []
        for idx, i in enumerate(real_eig_cols):
            
            input_set.append(row[real_eig_cols[idx]])
            input_set.append(row[imag_eig_cols[idx]])

        illegal = [True for val in input_set if np.isnan(val)]

        if True in illegal:
            continue

        all_inputs.append(input_set)

        label = None
        if row['Accepted']:
            label = 1

        else:
            label = 0

        all_labels.append(label)


    illegal_inputs = []
    for idx, row in enumerate(all_inputs):
        for val in row:
            if np.isnan(val):
                print(val)
                illegal_inputs.append(idx)
    
    return all_inputs, all_labels


In [24]:
from sklearn.model_selection import train_test_split
train_df, test_df = train_test_split(joint_df, test_size=0.2)

train_inputs, train_labels = generate_input_data(train_df)    

from sklearn import tree
clf = tree.DecisionTreeClassifier()
clf = clf.fit(train_inputs, train_labels)

print(clf.feature_importances_)

test_inputs, test_labels = generate_input_data(test_df)    
print(len(test_labels))

pred = clf.predict(test_inputs)

diff = 0
for idx, i in enumerate(test_labels):
    if pred[i] - i == 0:
        continue

    else:
        diff +=1
        
print(diff)


[0.06706739 0.00076426 0.10689253 0.0008075  0.05531696 0.70490433
 0.00360705 0.         0.06063998 0.        ]
3298
1655


In [25]:
import graphviz

eig_cols = [x for x in joint_df.columns if 'eig' in x][0:10]


print(eig_cols)
dot_data = tree.export_graphviz(clf, out_file=None, 
                                feature_names=eig_cols,
                                filled=True, rounded=True)  

graph = graphviz.Source(dot_data) 
graph.render("a")


['eig_0_real', 'eig_0_imag', 'eig_1_real', 'eig_1_imag', 'eig_2_real', 'eig_2_imag', 'eig_3_real', 'eig_3_imag', 'eig_4_real', 'eig_4_imag']


'a.pdf'