# Solvent Small Experiment

Question: Could we use data from minority solvents? Does it add value or relevant information?

Description:
- 3 solvents are available: one majority solvent and two minority solvents
- sample from the majority and the minority solvents: 3 samples
- train on the majority solvent
- evaluate on the 3 samples and compare results: is there any particular improvement for the majority solvent?

In [5]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import itertools
from src.config import chemical_inventory_path, raw_data_path
from src.data import notebook_utils as utils
from src.constants import GBL_INCHI_KEY, DMSO_INCHI_KEY, DMF_INCHI_KEY, \
                        INCHI_TO_CHEMNAME, TARGET_COL, RXN_FEAT_NAME, ORGANOAMONIUM_INCHI_KEY_COL
from src import plot_utils

In [6]:
import matplotlib.pyplot as plt
import matplotlib.ticker as mtick
plt.style.reload_library()
import matplotlib.patches as mpatches

## Generate datasets

Sample from major

In [7]:
%cd ../..

/Users/mticona/Documents/tesis/licentiate-thesis-repo


In [8]:
# Experiment-wide settings. SEED is used as `random_state` for the sampling and
# cross-validation configuration further down; ALGORITHM is a key of the
# `models` dict defined later in the notebook.
SEED = 105
ALGORITHM = 'rf'

# InChI keys of the three solvents under study and their chemical names.
solvents_inchies = [GBL_INCHI_KEY, DMSO_INCHI_KEY, DMF_INCHI_KEY]
solvents = [INCHI_TO_CHEMNAME[inchie] for inchie in solvents_inchies]

# Short labels keyed by chemical name (used for figure titles).
plot_solvents = {'Gamma-Butyrolactone': "GBL",
                 'Dimethyl sulfoxide':"DMSO",
                 'Dimethylformamide': "DMF"}

# One de-duplicated DataFrame per solvent, keyed by chemical name.
# NOTE(review): `utils.read_data` is project code not shown here; presumably it
# filters the raw data by solvent and keeps the organic InChI key column - confirm.
solvents_data = {INCHI_TO_CHEMNAME[solvent_inchie]: utils.read_data(raw_data_path, organic_key=True,\
                                                                    solvent=solvent_inchie) \
                                                    .drop_duplicates()
                 for solvent_inchie in solvents_inchies}

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df.loc[:,col] = (df[TARGET_COL] == 4).astype(int)


In [1208]:
# Number of distinct organoammonium keys present in each solvent's data.
for solvent, data in solvents_data.items():
    print(solvent, len(data[ORGANOAMONIUM_INCHI_KEY_COL].unique()))

Gamma-Butyrolactone 29
Dimethyl sulfoxide 6
Dimethylformamide 9


Grouped by amine

In [14]:
# Evaluation sample: for every solvent, draw 20% of the rows inside each
# (target class, organoammonium key) group, then drop the key column.
# The sampled rows keep the index labels of the source frame.
solvent_data_eval_sample = { solvent: data.groupby([TARGET_COL, ORGANOAMONIUM_INCHI_KEY_COL])\
                            .sample(frac=0.2, random_state=SEED)\
                            .drop([ORGANOAMONIUM_INCHI_KEY_COL], axis=1) \
                            for solvent, data in solvents_data.items()}

In [16]:
# Size of each solvent's evaluation sample.
for solvent, data in solvent_data_eval_sample.items():
    print(solvent, len(data))

Gamma-Butyrolactone 1127
Dimethyl sulfoxide 165
Dimethylformamide 210


In [1211]:
solvent_data_eval_sample["Gamma-Butyrolactone"]

Unnamed: 0,_feat_WienerPolarity,_feat_BondCount,_feat_fr_NH0,_feat_Refractivity,_feat_LargestRingSize,_feat_fr_ArN,_feat_HeteroaliphaticRingCount,_feat_fr_quatN,_feat_msareaVDWp,_feat_AromaticAtomCount,...,_feat_ASA_P,_feat_Protpsa,_feat_fr_NH1,_feat_MaximalProjectionRadius,_feat_Hdonorcount,_feat_fr_pyridine,_rxn_M_inorganic,_rxn_M_organic,_rxn_M_acid,_out_crystalscore
77,2,16,0,35.09,0,0,0,1,173.44,0,...,48.89,27.64,0,4.57,1,0,0.406262,1.019233,3.921577,0
100,2,16,0,35.09,0,0,0,1,173.44,0,...,48.89,27.64,0,4.57,1,0,0.663959,0.714075,3.928258,0
119,2,16,0,35.09,0,0,0,1,173.44,0,...,48.89,27.64,0,4.57,1,0,0.259386,0.843713,2.067347,0
69,2,16,0,35.09,0,0,0,1,173.44,0,...,48.89,27.64,0,4.57,1,0,0.244019,0.542838,4.822942,0
109,2,16,0,35.09,0,0,0,1,173.44,0,...,48.89,27.64,0,4.57,1,0,0.918545,0.722086,6.021877,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
6143,7,24,0,46.96,6,0,0,1,233.86,0,...,41.18,27.64,0,4.78,1,0,0.269339,1.157782,0.265576,1
6109,7,24,0,46.96,6,0,0,1,233.86,0,...,41.18,27.64,0,4.78,1,0,0.060753,1.019259,10.251222,1
6075,7,24,0,46.96,6,0,0,1,233.86,0,...,41.18,27.64,0,4.78,1,0,0.329432,1.131126,3.657615,1
6072,7,24,0,46.96,6,0,0,1,233.86,0,...,41.18,27.64,0,4.78,1,0,0.102868,1.088934,5.184504,1


In [1212]:
# Remaining rows per solvent: every row whose index label does not appear in
# that solvent's evaluation sample (the split relies on index labels matching).
solvent_data_remain_sample = { solvent: data.loc[~data.index\
                                                 .isin(solvent_data_eval_sample[solvent].index)]\
                            for solvent, data in solvents_data.items()}

In [1213]:
# Size of each solvent's remaining (non-evaluation) data.
for solvent, data in solvent_data_remain_sample.items():
    print(solvent, len(data))

Gamma-Butyrolactone 4517
Dimethyl sulfoxide 665
Dimethylformamide 838


### Concentration distribution

In [1214]:
# Concentration feature columns plus the target column; used to select the
# columns shown in the distribution plots below.
concentrations_feats = utils.concentration_feats() + [TARGET_COL]

In [1215]:
# Stack the concentration features and target of all solvents into one frame.
full_rxn_sample = pd.concat([data[concentrations_feats] for data in solvents_data.values()], axis=0).reset_index(drop=True)

# NOTE(review): `light` is not referenced anywhere else in the visible notebook.
with plt.style.context(['science', 'bright']):
    light = sns.color_palette()

# NOTE(review): despite its name, `bright` holds the colour set requested as "light".
bright = plot_utils.tol_cset("light")
bright

def plot_pair_solvent(df, title="", save_path="figures/solvent_dist_full.pdf"):
    """Draw a histogram pair plot of the reagent concentrations, coloured by class.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame holding the `_rxn_M_*` concentration columns and TARGET_COL,
        which is used as the hue.
    title : str, optional
        Figure super-title.
    save_path : str or None, optional
        Where the figure is written. Defaults to the path that used to be
        hard-coded, so existing calls behave as before. Pass a distinct path
        per call to avoid every call overwriting the same file, or None to
        skip saving.
    """
    # Human-readable (Spanish) axis labels for the concentration columns.
    cols_name_map = {
        '_rxn_M_inorganic': "reactivo inorgánico",
        '_rxn_M_organic': "reactivo orgánico",
        '_rxn_M_acid': "ácido",
    }
    df = df.rename(cols_name_map, axis=1)

    with plt.style.context(['science', 'bright']):
        # `bright` is the notebook-level colour set built from plot_utils.tol_cset.
        paleta = [bright.light_blue, bright.orange]
        g = sns.pairplot(df, hue=TARGET_COL, kind='hist',
                         palette=paleta,
                         plot_kws={"bins": 33, "cbar": False},
                         diag_kws={'bins': 15})
        g.fig.suptitle(title, y=1.01)

        # Replace seaborn's automatic legend with explicit class patches.
        g._legend.remove()
        for ax in g.axes.flatten():
            ax.tick_params(which="both", left=True, bottom=False, top=False, right=False)
        pop_a = mpatches.Patch(facecolor=paleta[0], label="No cristaliza", edgecolor='grey')
        pop_b = mpatches.Patch(facecolor=paleta[1], label="Cristaliza", edgecolor='grey')
        plt.legend(handles=[pop_a, pop_b], bbox_to_anchor=(1.7, 1.6))

        if save_path is not None:
            g.fig.savefig(save_path, dpi=300)

import matplotlib as mpl
from matplotlib.colors import ListedColormap, LinearSegmentedColormap

cmap = mpl.cm.cmaps_listed

# NOTE(review): `tol_cmap` is not imported or defined anywhere in the visible
# notebook; it presumably lives next to `tol_cset` in `plot_utils` - confirm
# and qualify the call accordingly.
schemes = tol_cmap()
tol_cmap(schemes[0])

# Build the discrete blue colormap BEFORE it is used by the ScalarMappable;
# previously `newcmp` was referenced before its definition, which fails on a
# fresh kernel.
colors = ['#2166AC', '#4393C3', '#92C5DE', '#D1E5F0']
newcmp = ListedColormap(colors)

# Scalar-to-colour mapping over the value range [1, 300].
norm = plt.Normalize(1, 300)
sm = mpl.cm.ScalarMappable(norm=norm, cmap=newcmp)
sm.set_array([])

# Pair plot over the stacked data of all solvents.
plot_pair_solvent(full_rxn_sample)

# One pair plot per solvent's evaluation sample.
# NOTE(review): the loop variable is called `amine_name` but iterates over
# solvent names. plot_pair_solvent is called without a distinct output path,
# so every call here writes to the same figure file.
for amine_name in solvents:
    plot_pair_solvent(solvent_data_eval_sample[amine_name][concentrations_feats])

plot_title = "figures/dist_reactivos"

# Human-readable (Spanish) labels for the concentration columns. This mapping
# was previously only defined inside plot_pair_solvent, so it was undefined here.
cols_name_map = {
    '_rxn_M_inorganic': "reactivo inorgánico",
    '_rxn_M_organic': "reactivo orgánico",
    '_rxn_M_acid': "ácido",
}

def binary_name(row):
    """Return the Spanish class label for a row based on its TARGET_COL value."""
    return 'Cristaliza' if row[TARGET_COL] == 1 else "No cristaliza"

with plt.style.context(['science', 'bright']):
    # sns.pairplot creates its own figure and returns a PairGrid, so iterate
    # over the solvents directly (there is no pre-built `axes` array) and apply
    # tick settings to the grid's individual axes.
    for amine_name in solvents:
        grid = sns.pairplot(solvent_data_eval_sample[amine_name] \
                            [concentrations_feats].rename(cols_name_map, axis=1),
                            hue=TARGET_COL, kind='hist')
        for ax in grid.axes.flatten():
            ax.tick_params(which="both", left=True, bottom=False, top=False, right=False)
        grid.fig.suptitle(plot_solvents[amine_name], y=1.08)

## Proof of concept models

In [1222]:
# Candidate model columns as provided by the project helper.
raw_cols = utils.get_deafult_model_columns()

import json
# Use a context manager so the metadata file handle is closed after loading.
with open('data/metadata/map_chem_feat_names.json') as f:
    map_names = json.load(f)

df_amines = pd.read_csv("data/metadata/type_var_fq_bins.csv")

# Keep only the columns whose mapped name appears in the `propiedad` column of
# the metadata table. The lookup list is built once instead of once per column.
known_properties = list(df_amines.propiedad)
chem_feat = [ col for col in raw_cols if map_names.get(col," ") in known_properties]

len(chem_feat)

rxn_feat = utils.concentration_feats()

# Final column set: chemical features + concentration features + target.
model_cols = chem_feat + rxn_feat + [TARGET_COL]

In [48]:
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline
from sklearn.model_selection import RepeatedStratifiedKFold
from sklearn.model_selection import cross_validate
from sklearn.metrics import classification_report
import sklearn.ensemble as ensamble_models
import sklearn.neighbors as neighbors_models
from sklearn.model_selection import GridSearchCV
from sklearn import linear_model as linear_models

In [81]:
def make_model(model_name, model_config=None):
    """Instantiate a scikit-learn estimator from its class name.

    The class is looked up, in order, in `sklearn.neighbors`,
    `sklearn.ensemble` and `sklearn.linear_model` (the module aliases imported
    at the top of the notebook); the first module exposing the name wins.

    Parameters
    ----------
    model_name : str
        Estimator class name, e.g. 'RandomForestClassifier'.
    model_config : dict, optional
        Keyword arguments for the estimator constructor. None means no arguments.

    Returns
    -------
    The instantiated estimator.

    Raises
    ------
    AttributeError
        If none of the searched modules defines `model_name`.
    """
    for module in (neighbors_models, ensamble_models, linear_models):
        model_method = getattr(module, model_name, None)
        if model_method is not None:
            break
    else:
        raise AttributeError(
            "%r not found in sklearn.neighbors, sklearn.ensemble or sklearn.linear_model"
            % (model_name,))

    # `None` default avoids sharing one mutable dict between calls.
    return model_method(**(model_config or {}))

def split_X_y(df):
    """Split a frame into a feature matrix and a target vector.

    Returns a tuple `(X, y)` of NumPy arrays: `X` holds every column except
    TARGET_COL, `y` holds the TARGET_COL values.
    """
    features = df.drop(columns=[TARGET_COL])
    target = df[TARGET_COL]
    return features.values, target.values

def proof_concept(model_name, df_train, df_test, model_config=None):
    """Fit a scaled model on `df_train` and report its performance on `df_test`.

    Parameters
    ----------
    model_name : str
        Estimator class name understood by `make_model`.
    df_train, df_test : pandas.DataFrame
        Frames containing the feature columns and TARGET_COL.
    model_config : dict, optional
        Keyword arguments for the estimator constructor.

    Returns
    -------
    pandas.DataFrame
        The classification report, one row per class / aggregate and one
        column per metric (already transposed).
    """
    # Reuse the shared fitting helper instead of duplicating the
    # StandardScaler + model pipeline construction.
    pipeline = proof_concept_model(model_name, df_train, df_test, model_config or {})

    X_test, y_test = split_X_y(df_test)
    y_pred = pipeline.predict(X_test)

    report = classification_report(y_test, y_pred, labels=[0,1],
                                   output_dict=True,
                                   target_names=["No cristaliza", "Cristaliza"])

    return pd.DataFrame(report).transpose()

def proof_concept_model(model_name, df_train, df_test, model_config=None):
    """Fit and return a StandardScaler + model pipeline trained on `df_train`.

    Parameters
    ----------
    model_name : str
        Estimator class name understood by `make_model`.
    df_train : pandas.DataFrame
        Training frame containing the feature columns and TARGET_COL.
    df_test : pandas.DataFrame
        Unused; kept so the signature mirrors `proof_concept`.
    model_config : dict, optional
        Keyword arguments for the estimator constructor.

    Returns
    -------
    sklearn.pipeline.Pipeline
        The fitted pipeline with steps 'std' and 'model'.
    """
    model = make_model(model_name, model_config or {})

    # Only the training split is needed to fit; the test frame is not touched.
    X_train, y_train = split_X_y(df_train)

    pipeline = Pipeline(steps=[('std', StandardScaler()),
                               ('model', model)])
    pipeline.fit(X_train, y_train)

    return pipeline
    
def full_pipeline(X, y, model, k_fold_config):
    """Cross-validate a StandardScaler + `model` pipeline.

    `k_fold_config` is forwarded as keyword arguments to
    RepeatedStratifiedKFold. Returns the dict produced by `cross_validate`
    with recall, f1 and precision scores (train and test) and the fitted
    estimators.
    """
    estimator = Pipeline(steps=[('std', StandardScaler()), ('model', model)])
    splitter = RepeatedStratifiedKFold(**k_fold_config)

    return cross_validate(
        estimator, X, y,
        cv=splitter,
        scoring=('recall', 'f1', 'precision'),
        return_train_score=True,
        return_estimator=True,
    )

In [82]:
# Short algorithm key -> scikit-learn estimator class name (resolved by make_model).
models = {
    'knn':'KNeighborsClassifier',
    'rf':'RandomForestClassifier',
    'gbc':'GradientBoostingClassifier'
}

# Repeated stratified k-fold settings.
n_repeat_k_fold = 2
k_splits = 3

# Keyword arguments unpacked into RepeatedStratifiedKFold by full_pipeline.
k_fold_config = {
    'random_state': SEED,
    'n_repeats': n_repeat_k_fold,
    'n_splits': k_splits,
}

#### Similar distributions?

In [1226]:
# Training data: the non-evaluation Gamma-Butyrolactone rows, restricted to the model columns.
df_train = solvent_data_remain_sample['Gamma-Butyrolactone'][model_cols]

In [1227]:
# Train on the Gamma-Butyrolactone data only and evaluate on each solvent's
# evaluation sample. proof_concept fits a fresh model on every call, so one
# model is trained per evaluated solvent.
report_isolated_GBL = {solvent: proof_concept(models[ALGORITHM], df_train, df_test[model_cols]) \
                        for solvent, df_test in solvent_data_eval_sample.items() }

In [1228]:
# Write one CSV report per evaluated solvent.
# NOTE(review): proof_concept already returns a transposed DataFrame, so this
# second transpose flips it back; `index=None` also leaves the row labels out
# of the CSV - confirm this is the intended file layout.
for solvent in solvents:
    report =  pd.DataFrame(report_isolated_GBL[solvent]).transpose()
    report.to_csv('results3/only_GBL_tested_in_%s_%s_%s.csv' % (solvent, ALGORITHM, SEED), index=None)

#### Discrimination by solvent feature

Generate full dataset with solvent one hot encoded

In [1229]:
def add_column(df, solvent):
    """Set the 'solvent' entry of `df` to `solvent` (in place) and return `df`."""
    df['solvent'] = solvent
    return df

# Tag every row with its solvent name and stack all solvents into one frame.
# `assign` adds the column in one vectorised step. The previous row-wise
# `apply(add_column, axis=1)` made one Python call per row and rebuilt each row
# as a mixed-type Series, which can change column dtypes (integer features
# coming back as floats).
df_full_with_solvent = pd.concat([data[model_cols].assign(solvent=solvent) \
                             for solvent, data in solvents_data.items()], axis=0)\
                            .reset_index(drop=True)

In [1230]:
len(df_full_with_solvent.columns)

62

In [1231]:
# One-hot encode the solvent name into `solvent_<name>` indicator columns
# (the original 'solvent' column is consumed by the encoding).
df_full_dummies = pd.get_dummies(df_full_with_solvent, columns = ['solvent'])

In [1232]:
# Re-attach the plain solvent name next to its one-hot columns so rows can be
# filtered by solvent later.
df_full_dummies['solvent'] = df_full_with_solvent['solvent']

In [1233]:
# Remove exact duplicate rows, then show the resulting (rows, columns) shape.
df_full_dummies = df_full_dummies.drop_duplicates()
df_full_dummies.shape

(7522, 65)

In [1234]:
#df_full_dummies = df_full_dummies[model_cols + ['_out_crystalscore', '_rxn_organic-inchikey',
#       'solvent_Dimethyl sulfoxide', 'solvent_Dimethylformamide',
#       'solvent_Gamma-Butyrolactone', 'solvent']]

In [1235]:
# Persist the encoded dataset (the SEED is part of the file name); the index is not written.
df_full_dummies.to_csv("data/all_solvent_cleaned_data_%s.csv" % SEED, index=None)

In [17]:
# Reload the encoded dataset from disk; from here on the notebook works on the
# CSV copy, which gets a fresh 0..n-1 index.
df_full_dummies = pd.read_csv("data/all_solvent_cleaned_data_%s.csv" % SEED)

Split by solvent to sample

In [19]:
# Split the encoded dataset into one frame per solvent name; each frame keeps
# the index labels of df_full_dummies.
df_by_solvent= {solvent: df_full_dummies.query('solvent == @solvent')
                        for solvent in solvents
                        }

In [20]:
# Reference sizes of the original per-solvent evaluation samples.
for solvent, data in solvent_data_eval_sample.items():
    print(len(data))

1127
165
210


In [21]:
def regenerate_sample(original_sample_dict, target_sample_dict):
    """Keep, per solvent, the target rows that also occur in the original sample.

    Rows are matched on the values of every target column except the solvent
    columns ('solvent' and the three one-hot 'solvent_*' columns). The
    original sample frames must contain all of those matching columns.

    Returns a dict solvent -> filtered target DataFrame; the filtered frames
    keep all target columns and the target's index labels.
    """
    solvent_cols = ['solvent_Dimethyl sulfoxide',
                    'solvent_Dimethylformamide',
                    'solvent_Gamma-Butyrolactone',
                    'solvent']

    def _row_keys(frame, columns):
        # Index built from the matching columns, used for row-wise membership.
        return frame.set_index(columns).index

    filtered = {}
    for solvent, original in original_sample_dict.items():
        target = target_sample_dict[solvent]
        match_cols = list(target.drop(solvent_cols, axis=1).columns.values)
        in_original = _row_keys(target, match_cols).isin(_row_keys(original, match_cols))
        filtered[solvent] = target[in_original]
    return filtered

In [28]:
# Rebuild the per-solvent test samples inside the encoded dataset by keeping
# the encoded rows that match the original evaluation samples.
df_solvent_samples_test = regenerate_sample(original_sample_dict=solvent_data_eval_sample,
                                            target_sample_dict=df_by_solvent)

In [23]:
# Drop the raw solvent-name column from the regenerated test samples.
# NOTE(review): this result is immediately replaced by the assignment below;
# decide which of the two sampling strategies is the intended one.
df_solvent_samples_test = { solvent: data.drop(['solvent'], axis=1) 
                            for solvent, data in df_solvent_samples_test.items()}

# Stratified 20% test sample per solvent, drawn within each target class.
# The sampled rows keep their original index labels (no reset_index): the
# training split is built by excluding the test sample's index labels from
# df_by_solvent, so resetting the index here would exclude the wrong rows and
# leave test rows inside the training data.
df_solvent_samples_test = { solvent: data.groupby([TARGET_COL])\
                            .sample(frac=0.2, random_state=SEED)\
                            .drop(['solvent'], axis=1) 
                            for solvent, data in df_by_solvent.items()}

In [24]:
# Size of each solvent's test sample.
for solvent, data in df_solvent_samples_test.items():
    print(solvent, len(data))

Gamma-Butyrolactone 1125
Dimethyl sulfoxide 165
Dimethylformamide 210


In [25]:
# Training rows per solvent: rows of df_by_solvent whose index label is not in
# the test sample. This only separates train from test if the test frames
# still carry the index labels of df_by_solvent.
df_solvent_samples_train = { solvent: data.loc[~data.index\
                                                 .isin(df_solvent_samples_test[solvent].index)]\
                            for solvent, data in df_by_solvent.items()}

In [26]:
# Size of each solvent's training split.
for solvent, data in df_solvent_samples_train.items():
    print(solvent, len(data))

Gamma-Butyrolactone 4519
Dimethyl sulfoxide 665
Dimethylformamide 838


In [38]:
# Multi-solvent evaluation set: the per-solvent test samples stacked together.
EVAL_MULTISOLVENT = pd.concat([data for solvent, data in df_solvent_samples_test.items()], axis=0)

In [40]:
# Persist the multi-solvent evaluation set (SEED in the file name, index not written).
EVAL_MULTISOLVENT.to_csv("data/solvent-experiment/encoded-solvent-eval-multisolvent_%s.csv" % \
                             (SEED),
                             index=None)

In [44]:
# Evaluation set without the plain solvent-name column (one-hot columns are kept).
# NOTE(review): requires EVAL_MULTISOLVENT to still contain a 'solvent' column.
EVAL_MULTISOLVENT_sin_sv = EVAL_MULTISOLVENT.drop(["solvent"], axis=1)

In [74]:
EVAL_MULTISOLVENT

Unnamed: 0,_feat_WienerPolarity,_feat_BondCount,_feat_fr_NH0,_feat_Refractivity,_feat_LargestRingSize,_feat_HeteroaliphaticRingCount,_feat_fr_quatN,_feat_AromaticAtomCount,_feat_AtomCount_C,_feat_fr_amidine,...,_feat_Hdonorcount,_feat_fr_pyridine,_rxn_M_inorganic,_rxn_M_organic,_rxn_M_acid,_out_crystalscore,solvent_Dimethyl sulfoxide,solvent_Dimethylformamide,solvent_Gamma-Butyrolactone,solvent
2,2.0,16.0,0.0,35.09,0.0,0.0,1.0,0.0,4.0,0.0,...,1.0,0.0,0.916561,0.891593,2.583467,1.0,0,0,1,Gamma-Butyrolactone
10,2.0,16.0,0.0,35.09,0.0,0.0,1.0,0.0,4.0,0.0,...,1.0,0.0,1.053470,0.926432,1.547341,0.0,0,0,1,Gamma-Butyrolactone
14,2.0,16.0,0.0,35.09,0.0,0.0,1.0,0.0,4.0,0.0,...,1.0,0.0,0.849390,0.756310,2.583467,0.0,0,0,1,Gamma-Butyrolactone
24,2.0,16.0,0.0,35.09,0.0,0.0,1.0,0.0,4.0,0.0,...,1.0,0.0,0.491659,1.419390,3.007598,0.0,0,0,1,Gamma-Butyrolactone
36,2.0,16.0,0.0,35.09,0.0,0.0,1.0,0.0,4.0,0.0,...,1.0,0.0,0.216236,0.631936,4.194307,0.0,0,0,1,Gamma-Butyrolactone
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
7490,3.0,19.0,0.0,49.96,0.0,0.0,2.0,0.0,4.0,0.0,...,2.0,0.0,0.288151,0.499751,8.110363,0.0,0,1,0,Dimethylformamide
7504,3.0,19.0,0.0,49.96,0.0,0.0,2.0,0.0,4.0,0.0,...,2.0,0.0,0.642470,1.286300,0.265045,0.0,0,1,0,Dimethylformamide
7514,3.0,19.0,0.0,49.96,0.0,0.0,2.0,0.0,4.0,0.0,...,2.0,0.0,0.149412,0.442770,1.007169,0.0,0,1,0,Dimethylformamide
7515,3.0,19.0,0.0,49.96,0.0,0.0,2.0,0.0,4.0,0.0,...,2.0,0.0,0.434559,0.728326,1.851609,0.0,0,1,0,Dimethylformamide


Generate the training dataset by merging the training samples from all solvents

In [72]:
# Multi-solvent training set: the per-solvent training splits stacked together
# with a fresh 0..n-1 index.
df_solvent_full_train = pd.concat([data for solvent, data in df_solvent_samples_train.items()],
                                  axis=0).reset_index(drop=True)

In [1245]:
# Remove the plain solvent-name column; the one-hot solvent columns stay.
# NOTE(review): overwrites its own input, so re-running this cell raises once
# the column is gone, and later cells that filter on `.solvent` no longer work.
df_solvent_full_train = df_solvent_full_train.drop(['solvent'], axis=1)

In [1246]:
# Persist the multi-solvent training set (SEED in the file name, index not written).
df_solvent_full_train.to_csv("data/solvent-experiment/encoded-solvent-training_%s.csv" % \
                             (SEED),
                             index=None)

In [11]:
# Reload the multi-solvent training set from disk.
df_solvent_full_train = pd.read_csv("data/solvent-experiment/encoded-solvent-training_%s.csv" % \
                             (SEED))

In [62]:
# NOTE(review): overrides the ALGORITHM chosen in the configuration cell; every
# cell below (including output file names) now uses gradient boosting.
ALGORITHM = 'gbc'

In [63]:
# Experiment 1: multi-solvent model (one-hot solvent features included)
# evaluated on the multi-solvent evaluation set.
results_experiment_1 = proof_concept(models[ALGORITHM], df_solvent_full_train, EVAL_MULTISOLVENT_sin_sv)

In [None]:
results_experiment_1

In [64]:
# Same experiment with the one-hot solvent columns removed from train and test.
# NOTE(review): exp_drop_solvent is defined in a cell further down the
# notebook, so this cell fails when the notebook is run top to bottom.
results_experiment_s_sv = exp_drop_solvent(models[ALGORITHM], df_solvent_full_train, EVAL_MULTISOLVENT_sin_sv)

In [67]:
# Wrap both results as DataFrames for saving.
# NOTE(review): proof_concept already returns a transposed DataFrame, so the
# transpose here flips the table back - confirm the intended orientation.
report_multisv =  pd.DataFrame(results_experiment_1).transpose()
report_no_sol =  pd.DataFrame(results_experiment_s_sv).transpose()

In [68]:
# Save the multi-solvent report. The file name has no extension, and
# `index=None` leaves the row labels out of the file.
report_multisv.to_csv("results_exp_1/eval_multisolvente_encoded_%s" % SEED, index=None)

Unnamed: 0,precision,recall,f1-score,support
No cristaliza,0.876822,0.977254,0.924318,1231.0
Cristaliza,0.78125,0.371747,0.503778,269.0
accuracy,0.868667,0.868667,0.868667,0.868667
macro avg,0.829036,0.674501,0.714048,1500.0
weighted avg,0.859683,0.868667,0.848901,1500.0


In [69]:
# Save the no-solvent-feature report (fixed the stray second dot, which was a
# syntax error). `index=None` leaves the row labels out of the file.
report_no_sol.to_csv("results_exp_1/eval_no_solvent_%s" % SEED, index=None)

Unnamed: 0,precision,recall,f1-score,support
No cristaliza,0.875,0.978067,0.923667,1231.0
Cristaliza,0.782258,0.360595,0.493639,269.0
accuracy,0.867333,0.867333,0.867333,0.867333
macro avg,0.828629,0.669331,0.708653,1500.0
weighted avg,0.858368,0.867333,0.846549,1500.0


In [53]:
def exp_drop_solvent(model_name, df_train, df_test, model_config=None):
    """Run `proof_concept` after removing the one-hot solvent columns.

    Parameters
    ----------
    model_name : str
        Estimator class name understood by `make_model`.
    df_train, df_test : pandas.DataFrame
        Frames that contain the three `solvent_*` indicator columns; a
        KeyError is raised by `drop` if any of them is missing.
    model_config : dict, optional
        Keyword arguments for the estimator constructor. Previously accepted
        but silently ignored; it is now forwarded to `proof_concept`.

    Returns
    -------
    pandas.DataFrame
        The classification report produced by `proof_concept`.
    """
    drop_cols = ['solvent_Dimethyl sulfoxide',
                 'solvent_Dimethylformamide',
                 'solvent_Gamma-Butyrolactone']
    df_train = df_train.drop(drop_cols, axis=1)
    df_test = df_test.drop(drop_cols, axis=1)
    return proof_concept(model_name, df_train, df_test, model_config or {})

In [70]:
df_solvent_full_train

Unnamed: 0,_feat_WienerPolarity,_feat_BondCount,_feat_fr_NH0,_feat_Refractivity,_feat_LargestRingSize,_feat_HeteroaliphaticRingCount,_feat_fr_quatN,_feat_AromaticAtomCount,_feat_AtomCount_C,_feat_fr_amidine,...,_feat_MaximalProjectionRadius,_feat_Hdonorcount,_feat_fr_pyridine,_rxn_M_inorganic,_rxn_M_organic,_rxn_M_acid,_out_crystalscore,solvent_Dimethyl sulfoxide,solvent_Dimethylformamide,solvent_Gamma-Butyrolactone
0,2.0,16.0,0.0,35.09,0.0,0.0,1.0,0.0,4.0,0.0,...,4.57,1.0,0.0,0.548189,1.330881,4.461583,0.0,0,0,1
1,2.0,16.0,0.0,35.09,0.0,0.0,1.0,0.0,4.0,0.0,...,4.57,1.0,0.0,0.449406,1.239565,4.454159,0.0,0,0,1
2,2.0,16.0,0.0,35.09,0.0,0.0,1.0,0.0,4.0,0.0,...,4.57,1.0,0.0,0.601135,0.741849,6.021877,0.0,0,0,1
3,2.0,16.0,0.0,35.09,0.0,0.0,1.0,0.0,4.0,0.0,...,4.57,1.0,0.0,0.252087,0.805647,4.417409,0.0,0,0,1
4,2.0,16.0,0.0,35.09,0.0,0.0,1.0,0.0,4.0,0.0,...,4.57,1.0,0.0,0.239683,0.555402,4.410059,0.0,0,0,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
6015,3.0,19.0,0.0,49.96,0.0,0.0,2.0,0.0,4.0,0.0,...,5.15,2.0,0.0,0.189587,0.284835,13.384485,0.0,0,1,0
6016,3.0,19.0,0.0,49.96,0.0,0.0,2.0,0.0,4.0,0.0,...,5.15,2.0,0.0,0.362857,0.983597,2.915490,0.0,0,1,0
6017,3.0,19.0,0.0,49.96,0.0,0.0,2.0,0.0,4.0,0.0,...,5.15,2.0,0.0,0.010672,0.508771,11.927004,0.0,0,1,0
6018,3.0,19.0,0.0,49.96,0.0,0.0,2.0,0.0,4.0,0.0,...,5.15,2.0,0.0,0.108857,0.987078,1.961330,0.0,0,1,0


In [76]:
solvents

['Gamma-Butyrolactone', 'Dimethyl sulfoxide', 'Dimethylformamide']

In [91]:
# Experiment 2: for each solvent, compare a model trained only on that
# solvent's data with the multi-solvent model on the same test rows.
# NOTE(review): this cell needs df_solvent_full_train to still contain the
# plain 'solvent' column, which an earlier cell drops - confirm cell order.

# Loop-invariant values are built once instead of on every iteration.
drop_cols = ['solvent_Dimethyl sulfoxide', 
             'solvent_Dimethylformamide',
             'solvent_Gamma-Butyrolactone',
             'solvent']
# Multi-solvent training data: keeps the one-hot solvent columns.
df_train_sv_encoded = df_solvent_full_train.drop('solvent', axis=1)

for solvent in solvents:
    train_rows = df_solvent_full_train[df_solvent_full_train.solvent == solvent]
    test_rows = EVAL_MULTISOLVENT[EVAL_MULTISOLVENT.solvent == solvent]

    # Solvent-specific frames carry no solvent information at all.
    df_train_sv = train_rows.drop(drop_cols, axis=1)
    df_test_sv = test_rows.drop(drop_cols, axis=1)

    # Same test rows, with the one-hot solvent columns kept.
    df_test_sv_encoded = test_rows.drop('solvent', axis=1)

    # Solvent-specific model
    report_sv = proof_concept(models[ALGORITHM], df_train_sv, df_test_sv)

    # Same sample scored by the multi-solvent model
    report_multisv = proof_concept(models[ALGORITHM], df_train_sv_encoded, df_test_sv_encoded)

    report_sv.to_csv('results_exp_2/model_per_solv_%s_%s.csv' % (SEED, solvent), index=None)
    report_multisv.to_csv('results_exp_2/sample_%s_%s_in_multisolvent.csv' % (SEED, solvent), index=None)

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


In [1249]:
## Experiment solvent dropped

# Each solvent's test sample is evaluated with a model trained on the
# multi-solvent data without the solvent features ("SIN_SOLV" model).
# exp_drop_solvent fits a new model on every call, i.e. once per solvent.
df_solvent_report = {solvent: exp_drop_solvent(models[ALGORITHM], df_solvent_full_train, df_test) \
                        for solvent, df_test in df_solvent_samples_test.items() }

In [1250]:
# Write one CSV per solvent for the solvent-dropped experiment.
# NOTE(review): the report is transposed again after proof_concept already
# transposed it, and `index=None` leaves the row labels out of the file.
for solvent in solvents:
    report =  pd.DataFrame(df_solvent_report[solvent]).transpose()
    report.to_csv('results3/solvent-dropped_%s_%s_%s.csv' % (solvent, ALGORITHM, SEED), index=None)

In [1057]:
## Experiment solvent encoded
# Each solvent's test sample is evaluated with a model trained on the
# multi-solvent data including the one-hot solvent features.
# NOTE(review): reuses (and overwrites) the `df_solvent_report` name from the
# solvent-dropped experiment.
df_solvent_report = {solvent: proof_concept(models[ALGORITHM], df_solvent_full_train, df_test) \
                        for solvent, df_test in df_solvent_samples_test.items() }

# NOTE(review): these files go to 'results2/' while the solvent-dropped
# reports go to 'results3/' - confirm the directory is intentional.
for solvent in solvents:
    report =  pd.DataFrame(df_solvent_report[solvent]).transpose()
    report.to_csv('results2/solvent-encoded_%s_%s_%s.csv' % (solvent, ALGORITHM, SEED), index=None)

Feature importances are only computed for RandomForest

In [1058]:
ALGORITHM

'rf'

In [1059]:
# Fit one pipeline per solvent key. proof_concept_model trains on
# df_solvent_full_train only, so every entry is trained on the same data;
# only the 'Gamma-Butyrolactone' entry is used below.
df_solvent_models = {solvent: proof_concept_model(models[ALGORITHM], df_solvent_full_train, df_test) \
                        for solvent, df_test in df_solvent_samples_test.items() }

# NOTE(review): rebinds the notebook-level `model_cols` to the training
# frame's feature columns (target removed); earlier cells that use
# `model_cols` would behave differently if re-run afterwards.
model_cols = list(df_solvent_full_train.columns)
model_cols.remove(TARGET_COL)

# Feature importances of the fitted 'model' step, sorted from most to least important.
df_importance = pd.DataFrame({"feature": model_cols,
                             "importance": df_solvent_models['Gamma-Butyrolactone']['model'].feature_importances_})
df_importance = df_importance.sort_values(by="importance", ascending=False)

In [1060]:
#df_importance.to_csv("results/solvent-exp/random_forest_feat_importances.csv", index=None)

# NOTE(review): replaces the importances computed above with a file read from
# the working directory; this path differs from the commented-out save path.
df_importance = pd.read_csv("random_forest_feat_importances.csv")

In [1061]:
#df_importance.reset_index().head(50)

plot_title = "figures/feature_importances.pdf"
def plot_df_importances(df, save_path=None):
    """Plot feature importances as a horizontal bar chart and save the figure.

    Parameters
    ----------
    df : pandas.DataFrame
        Needs the columns "importance" (bar length), "feature" (bar label)
        and "color" (bar colours, passed to seaborn as the palette).
    save_path : str, optional
        Output file. Defaults to the notebook-level `plot_title`, which is
        what the function used before this parameter existed.
    """
    if save_path is None:
        save_path = plot_title

    with plt.style.context(['science', 'bright']):
        fig, ax = plt.subplots(1, 1, figsize=(8,5),
                               constrained_layout=True)
        sns.barplot(x="importance", y="feature", orient="h", data=df,
                    palette=list(df.color), alpha=0.7, ax=ax)

        # Remove box lines and tick marks.
        sns.despine(bottom=True, left=False, trim=False)
        ax.tick_params(which="both", left=False, bottom=False, top=False, right=False)

        # The bars are horizontal: importance runs along x, features along y.
        # The two labels were previously swapped.
        ax.set_xlabel("Importancia", fontdict={'fontsize':13})
        ax.set_ylabel("Variable", fontdict={'fontsize':13})
        plt.xticks(fontsize=11)
        plt.yticks(fontsize=12)

        # Fixed x range chosen for the importances plotted in this notebook.
        plt.xlim(0,0.015)
        fig.savefig(save_path, dpi=300)

In [1062]:
# Open questions / ideas (translated from the original Spanish notes):
# - decision boundary
# - same surface, does not change that much
# - why such a low weight?

# - combine models
# - stronger binarization

# Every 6th position among the first 60 rows.
indexes = np.arange(0,60,6)

# Skip the first three rows of the importance table and renumber from 0,
# so index labels and positions coincide for the `iloc` selection below.
tmp = df_importance.iloc[3:,:].reset_index(drop=True)
#.reset_index(drop=True).loc[indexes]

# Highlight the solvent features in red, everything else in grey.
tmp['color'] = np.where(tmp['feature'].str.startswith("solvent"), 'red', 'grey')

solvent_index = tmp[tmp.feature.str.startswith("solvent")].index

# Plot the sub-sampled features plus all solvent features, most important first.
# NOTE(review): a solvent row that falls on a multiple of 6 is selected twice.
plot_df_importances(tmp.iloc[list(indexes) + list(solvent_index)]\
                    .sort_values(by="importance", ascending=False))