In [1]:
import pandas as pd
import numpy as np

import pickle

import matplotlib.pyplot as plt
%matplotlib inline

In [2]:
from dml import LMNN, NCA, LDA, MultiDML_kNN, KLLDA, DML_eig
from dml import LSI
from dml import MCML
from dml import tune_knn

from joblib import dump, load

from sklearn.model_selection import train_test_split

In [4]:
# Absolute path of the versioned perovskite dataset (CSV) read by `_prepare`.
# Adjacent string literals are joined at compile time, so no separator is added.
DATAFILE = (
    '/home/jupyter/sd2e-community/versioned-dataframes'
    '/perovskite/perovskitedata/0057.perovskitedata.csv'
)

In [5]:
DATAFILE

'/home/jupyter/sd2e-community/versioned-dataframes/perovskite/perovskitedata/0057.perovskitedata.csv'

In [36]:
def _stratify(data0, out, inchis, sampleCutoff):

    stratifiedData0 = pd.DataFrame()
    stratifiedOut = pd.DataFrame()
    
    indicies = {}
    for i, x in enumerate(np.unique(inchis.values.flatten())):
        z = (inchis.values == x).flatten()
        # print(x, z.sum())
        if z.sum() < sampleCutoff:
            continue
        total_amine0 = data0[z].reset_index(drop=True)

        amine_out = out[z].reset_index(drop=True)

        # this is still experimental and can easily be changed.
        uniformSamples = np.random.choice(total_amine0.index, size=sampleCutoff, replace=False)
        sampled_amine0 = total_amine0.loc[uniformSamples]
        
        sampled_out = amine_out.loc[uniformSamples]

        # save pointer to where this amine lives in the stratified dataset.
        # this isn't needed for random-TTS, but makes doing the Leave-One-Amine-Out 
        # train-test-splitting VERY EASY. 
        indicies[x] = np.array(range(96)) + i*96

        stratifiedData0 = pd.concat([stratifiedData0, sampled_amine0]).reset_index(drop=True)
        stratifiedOut = pd.concat([stratifiedOut, sampled_out]).reset_index(drop=True)
        
    stratifiedOut = np.array(stratifiedOut, dtype=int)
    return stratifiedData0, stratifiedOut.squeeze(), indicies

In [7]:
def _prepare(minimal=False, sampleCutoff=95):
    """Load the perovskite CSV, filter and clean it, and return a stratified sample.

    Rows are kept only when ``_raw_expver`` equals 1.1 and the first chemical
    of reagent 0 has InChIKey ``YEJRWHAVMIAJKC-UHFFFAOYSA-N``. Reactions whose
    organic InChIKey is ``JMXLWMIFDJCGBV-UHFFFAOYSA-N`` or missing are removed.
    Every column with "raw" in its name and every object-dtype column is
    dropped, remaining NaNs become 0, and ``_out_crystalscore`` is binarised
    (1 where the score equals 4, otherwise 0) to form the label.

    Parameters
    ----------
    minimal : bool
        Accepted but not used by this function.
    sampleCutoff : int
        Forwarded unchanged to ``_stratify``.

    Returns
    -------
    The value returned by ``_stratify`` for the cleaned features, the binary
    labels and the organic InChIKeys.
    """
    organic_key = '_rxn_organic-inchikey'

    frame = pd.read_csv(DATAFILE, skiprows=4, low_memory=False)

    # Apply the row filters one after another, re-indexing after each.
    keep = frame['_raw_expver'] == 1.1
    frame = frame[keep].reset_index(drop=True)
    keep = frame['_raw_reagent_0_chemicals_0_inchikey'] == "YEJRWHAVMIAJKC-UHFFFAOYSA-N"
    frame = frame[keep].reset_index(drop=True)
    # removes three reactions
    keep = frame[organic_key] != 'JMXLWMIFDJCGBV-UHFFFAOYSA-N'
    frame = frame[keep].reset_index(drop=True)

    # Discard reactions that have no organic InChIKey at all.
    frame = frame[frame[organic_key].notna()].reset_index(drop=True)

    inchis = pd.DataFrame.from_dict({"inchis": frame[organic_key].values})

    # Keep only non-"raw", non-object columns and zero-fill the gaps.
    raw_columns = [name for name in frame.columns if "raw" in name]
    features = frame.drop(columns=raw_columns).select_dtypes(exclude=['object'])
    features = features.fillna(0)

    # Binary target: crystal score 4 versus everything else.
    features['_out_crystalscore'] = np.where(features['_out_crystalscore'] == 4, 1, 0)
    out = features['_out_crystalscore']
    features = features.drop(columns=["_out_crystalscore", 'dataset'])

    return _stratify(features, out, inchis, sampleCutoff)

In [37]:
# Build the stratified dataset from the reduced-feature CSV.
perov = pd.read_csv('minimal57Perov.csv')

# Drop rows whose relative humidity is recorded as -1.
humidity_known = perov['_raw_RelativeHumidity'] != -1
perov = perov[humidity_known].reset_index(drop=True)

# Group keys are captured before NaNs are replaced by 0.
inchis = pd.DataFrame.from_dict({"inchis": perov['_rxn_organic-inchikey'].values})
perov = perov.fillna(0)

# Positive class: crystal score of 4 or 3.
crystal_score = perov['_out_crystalscore']
perov['_out_crystalscore'] = (crystal_score == 4) | (crystal_score == 3)

out = perov['_out_crystalscore']

# Features: every non-object column except the label itself.
perov = perov.select_dtypes(exclude=['object'])
perov = perov.drop(columns=["_out_crystalscore"])

stratPerov, stratOut, indicies = _stratify(perov, out, inchis, 95)

In [31]:
print(stratPerov.shape)
print(stratOut.shape)

(3800, 52)
(3800,)


In [10]:
# Instantiate one estimator per distance-metric-learning class imported from `dml`.
# The keyword arguments are passed straight to the library constructors.
# `lmnn` is fitted in a later cell and `lda` is handed to MultiDML_kNN in the scoring loop.
lmnn = LMNN(k=5, solver='SDP', tol=1e-8, max_iter=100)
nca = NCA(max_iter=150, tol=1e-5)
lda = LDA()
# NOTE(review): `kllda` and `dml_eig` are not referenced by any other cell visible
# in this notebook — confirm they are still needed.
kllda = KLLDA() 
dml_eig = DML_eig()

In [None]:
# nca.fit(stratPerov.values, stratOut)
# nca.metadata()
# dump(nca, 'nca_limited_features.joblib')

# lda.fit(stratPerov.values, stratOut)
# lda.metadata()
# dump(lda, 'lda_limited_features.joblib')

In [None]:
# Tune NCA on the stratified data with the `dml` helper `tune_knn`, trying
# num_dims in {2, 3, None} and eta0 in {0.01, 0.1} with a constant learning rate.
# The four returned values are displayed, pickled and dumped in the cells below.
# NOTE(review): n_folds=5 / n_reps=2 presumably mean repeated 5-fold cross-validation
# scored with a 5-nearest-neighbour classifier — confirm against the dml docs.
results,best,nca_best,detailed = tune_knn(NCA,
                                          X=stratPerov.values, 
                                          y=stratOut,
                                          n_neighbors=5,
                                          dml_params={'learning_rate':'constant'},
                                          tune_args={'num_dims':[2,3,None],'eta0':[0.01,0.1]},
                                          # metrics=[1,3,5,'final_expectance'],
                                          n_folds=5,n_reps=2,seed=28,verbose=True)

In [None]:
results

In [None]:
detailed

In [None]:
# Persist the detailed NCA tuning results. The context manager closes the file
# handle even if pickling raises, which the manual open/close pair did not.
with open("./distanceResults/NCA.pkl", "wb+") as f:
    pickle.dump(detailed, f)

In [None]:
dump(nca_best, "./distanceResults/NCA_best.joblib")

In [None]:
nca_best.num_dims_

----  

In [None]:
# Tune MCML on the stratified data with `tune_knn`, trying both initial metrics
# with an adaptive learning rate.
# The second returned value now has its own name: the original unpacked both the
# second and third values into `mcml_best`, so the second was silently overwritten.
# `mcml_best` still ends up bound to the third value, as before.
# NOTE(review): by analogy with the NCA cell (where it is called `best`), the second
# value is presumably the best hyperparameter setting — confirm against the dml docs.
mcml_results, mcml_best_params, mcml_best, mcml_detailed = tune_knn(
    MCML,
    X=stratPerov.values,
    y=stratOut,
    n_neighbors=5,
    dml_params={'learning_rate': 'adaptive'},
    tune_args={'initial_metric': ['euclidean', 'scale']},
    # metrics=[1,3,5,'final_expectance'],
    n_folds=5, n_reps=2, seed=28, verbose=True,
)

In [None]:
# Persist the detailed MCML tuning results. The context manager closes the file
# handle even if pickling raises, which the manual open/close pair did not.
with open("./distanceResults/MCML.pkl", "wb+") as f:
    pickle.dump(mcml_detailed, f)

# Save `mcml_best` (returned by the MCML tuning cell) with joblib.
dump(mcml_best, "./distanceResults/mcml_best.joblib")

In [38]:
# Repeat a random train/test split 10 times, fit a 5-neighbour MultiDML_kNN that is
# given the `lda` estimator, and collect the first value returned by `score_all`.
# NOTE(review): the split seed is itself drawn from NumPy's unseeded global RNG, so
# the scores are not reproducible from run to run.
# NOTE(review): the first value of `score_all` is stored as the "euclidean" score and
# the second is discarded — presumably the plain-kNN baseline vs. the LDA-transformed
# score; confirm against the dml docs.
euclidean_score = []
for i in range(10):
    print(i)
    xtrain, xtest, ytrain, ytest = train_test_split(stratPerov.values, stratOut,
                                                    random_state=np.random.choice(range(1000)))
    mknn = MultiDML_kNN(n_neighbors=5, dmls=[lda])
    mknn.fit(xtrain, ytrain)
    # print(mknn)
    euc, _ = mknn.score_all(xtest, ytest)
    euclidean_score.append(euc)

# Summary statistics over the 10 repetitions; printed and pickled in later cells.
mean = np.mean(euclidean_score)
std = np.std(euclidean_score)

0
1
2
3
4
5
6
7
8
9


In [39]:
print(mean, std)

0.8594736842105263 0.009798524395663382


In [40]:
# Persist the mean and standard deviation of the repeated kNN scores. The context
# manager closes the file handle even if pickling raises.
with open("./distanceResults/KNN_3and4.pkl", "wb+") as f:
    pickle.dump({"SCORE": {'MEAN': mean, 'STD': std}}, f)

For fun: 
    (3&4) vs not-4
    Drop rows with a relative humidity of -1 (because we are learning a distance matrix)
    Standardizing input data
    
    
    

In [None]:
lmnn.fit(stratPerov.values, stratOut)

In [None]:
dump(lmnn, 'lmnn_limited_features.joblib')

In [None]:
# NOTE(review): `knn` is not defined in any cell visible in this notebook and
# `predict` is called without arguments — on a fresh kernel this presumably raises.
# Confirm whether this cell is a leftover stub.
knn.predict()