In [1]:
import numpy as np
import wisps
import splat
import matplotlib.pyplot as plt
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import confusion_matrix,accuracy_score
from sklearn.preprocessing import MinMaxScaler, StandardScaler
from sklearn.model_selection import train_test_split

import seaborn as sns
import pandas as pd

In [2]:
train_df=pd.read_pickle(wisps.LIBRARIES+'/training_set.pkl').rename(columns={'f_test':'ftest',
                                                                             'line_chi': 'linechi',
                                                                             'spex_chi':'spexchi'})
pred_df=wisps.Annotator.reformat_table(wisps.datasets['stars']).rename(columns={'f_test':'ftest',
                                                                             'line_chi': 'linechi',
                                                                             'spex_chi':'spexchi'})

In [3]:
cands=pd.read_pickle(wisps.OUTPUT_FILES+'/true_spectra_cands.pkl')

In [4]:
cands['grism_id']=cands.grism_id.apply(lambda x: x.lower())
pred_df['grism_id']=pred_df.grism_id.apply(lambda x: x.lower())

In [5]:
features=['ftest', 'spt']

In [6]:
pred_df=pred_df[pred_df.snr2>3.]

train_df['spt']=train_df.spt.apply(wisps.make_spt_number)
pred_df['spt']=pred_df.spt.apply(wisps.make_spt_number)


In [7]:
labels=train_df.label

In [8]:
scaler = MinMaxScaler(feature_range=(0, 1))
#train_set=train_df[features]

X_train, X_test, y_train, y_test = train_test_split(train_df[features].values,labels,
                                                    test_size=0.5)

scaler.fit(X_train)
X_train = scaler.transform(X_train)
X_test = scaler.transform(X_test)

#scale the data set to predict for 
pred_set=scaler.transform(pred_df[features])

#cleanup
X_train.shape

(4209, 2)

In [9]:
rf = RandomForestClassifier(n_estimators=1000, oob_score=True, 
                            min_samples_split=100, verbose=True)
rf.fit(X_train, y_train)

[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done 1000 out of 1000 | elapsed:    2.0s finished


RandomForestClassifier(bootstrap=True, ccp_alpha=0.0, class_weight=None,
                       criterion='gini', max_depth=None, max_features='auto',
                       max_leaf_nodes=None, max_samples=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=100,
                       min_weight_fraction_leaf=0.0, n_estimators=1000,
                       n_jobs=None, oob_score=True, random_state=None,
                       verbose=True, warm_start=False)

In [10]:
np.isnan(X_test).any(), np.isinf(X_test).any(),

(False, False)

In [11]:
pred_labels = rf.predict(X_test)
model_accuracy = accuracy_score(y_test, pred_labels)

[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done 1000 out of 1000 | elapsed:    0.3s finished


In [12]:
'accuracy score {}'.format(model_accuracy)

'accuracy score 0.961520190023753'

In [13]:
rlabels=rf.predict(pred_set)

[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done 1000 out of 1000 | elapsed:    2.5s finished


In [14]:
len(rlabels[rlabels==1]), len(rlabels[rlabels==0])

(395, 60362)

In [15]:
truep=len(pred_df[(rlabels==1) & pred_df.grism_id.isin(cands.grism_id.values)])
ps=len(rlabels[rlabels==1])

In [16]:
truep

23

In [17]:
'FP rate {}'.format((ps-truep)/ps)

'FP rate 0.9417721518987342'

In [18]:
rf_dict={'classifier': rf,
            'sclr':scaler,
            'feats':features}

In [19]:
import pickle

In [20]:
#save the random forest
output_file=wisps.OUTPUT_FILES+'/random_forest_classifier.pkl'
with open(output_file, 'wb') as file:
    pickle.dump(rf_dict,file)

In [21]:
sv_df=pred_df[(rlabels==1)]

In [22]:
sv_df.to_pickle(wisps.LIBRARIES+'/labelled_by_rf.pkl')