In [1]:
import numpy as np
import numpy.ma as ma
import pandas as pd
from keras.models import Sequential
from keras.layers import Dense
from habitablePlanets import habitable_planets

exoplanets = pd.read_csv("exoplanets3.csv", sep=",")
training_data = np.array(exoplanets[:])

'''print(training_data.shape)
print(training_data)'''

habitable_x = np.array([np.array(training_data[0, :])])
habitable_y = np.array([0])
uninhabitable_x = np.array([np.array(training_data[0, :])])
uninhabitable_y = np.array([0])

for i in training_data:
    if i[0] in habitable_planets:
        habitable_x = np.append(habitable_x, np.array([np.array(i)]), axis = 0)
        habitable_y = np.append(habitable_y, 1)
        

for i in training_data:
    if i[0] not in habitable_planets:
        uninhabitable_x = np.append(uninhabitable_x, np.array([np.array(i)]), axis = 0)
        uninhabitable_y = np.append(uninhabitable_y, 0)

        
training_x = np.append(habitable_x[:, 1:], uninhabitable_x[:, 1:], axis = 0)
training_y = np.append(habitable_y, uninhabitable_y, axis = 0)

habitable_x = habitable_x[:, 1:]
uninhabitable_x = uninhabitable_x[:, 1:]

print(habitable_x.shape, uninhabitable_x.shape, training_x.shape)
print(habitable_y.shape, uninhabitable_y.shape, training_y.shape)

(60, 25) (4366, 25) (4426, 25)
(60,) (4366,) (4426,)


In [2]:
from imblearn.combine import SMOTETomek
from imblearn.over_sampling import SMOTE
from imblearn.under_sampling import NearMiss
from collections import Counter

training_x = np.asarray(training_x).astype(np.float32)
training_x = np.where(np.isnan(training_x), ma.array(training_x, mask=np.isnan(training_x)).mean(axis=0), training_x)

oversample = SMOTE()
x_res, y_res = oversample.fit_resample(training_x, training_y)
print(x_res.shape, y_res.shape)
print("OG dataset shape: {}".format(Counter(training_y)))
print("New dataset shape: {}".format(Counter(y_res)))

(8734, 25) (8734,)
OG dataset shape: Counter({0: 4367, 1: 59})
New dataset shape: Counter({0: 4367, 1: 4367})


Unnamed: 0,pl_name,sy_snum,sy_pnum,pl_controv_flag,pl_orbper,pl_orbsmax,pl_rade,pl_radj,pl_bmasse,pl_bmassj,...,st_rad,st_mass,st_met,st_lum,st_logg,st_age,st_dens,st_vsin,st_rotp,st_radv
0,11 Com b,2,1,0,326.03,1.29,12.1,1.08,6165.6,19.4,...,19.0,2.7,-0.35,2.243,2.31,,,1.2,,43.37
1,11 UMi b,1,1,0,516.21997,1.53,12.3,1.09,4684.8142,14.74,...,29.79,2.78,-0.02,2.43,1.93,1.56,,1.5,,-17.52
2,14 And b,1,1,0,185.84,0.83,12.9,1.15,1525.5,4.8,...,11.0,2.2,-0.24,1.763,2.63,4.5,,2.6,,-59.73
3,14 Her b,1,2,0,1773.40002,2.93,12.9,1.15,1481.0878,4.66,...,0.93,0.9,0.41,-0.153,4.45,3.9,1.27393,1.0,,-13.82
4,16 Cyg B b,3,1,0,798.5,1.66,13.5,1.2,565.7374,1.78,...,1.13,1.08,0.06,0.097,4.36,7.4,1.01103,2.7,,-28.1


In [4]:
from sklearn.model_selection import train_test_split
x_train, x_test, y_train, y_test = train_test_split(x_res, y_res, test_size = 0.2, random_state = 0)

In [21]:
#feature scaling
'''
from sklearn.preprocessing import StandardScaler
sc = StandardScaler()
x_train = sc.fit_transform(x_train)
x_test = sc.fit_transform(x_test)'''

'\nfrom sklearn.preprocessing import StandardScaler\nsc = StandardScaler()\nx_train = sc.fit_transform(x_train)\nx_test = sc.fit_transform(x_test)'

In [34]:
from sklearn.ensemble import RandomForestClassifier
classifier = RandomForestClassifier(n_estimators=11, random_state=0)
classifier.fit(x_train, y_train)
print(classifier.score(x_train, y_train))
y_pred = classifier.predict(x_test)

0.9998568770573923


In [35]:
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score
print(confusion_matrix(y_test, y_pred))
print(classification_report(y_test, y_pred))
print(accuracy_score(y_test, y_pred))

[[861   2]
 [772 112]]
              precision    recall  f1-score   support

           0       0.53      1.00      0.69       863
           1       0.98      0.13      0.22       884

    accuracy                           0.56      1747
   macro avg       0.75      0.56      0.46      1747
weighted avg       0.76      0.56      0.45      1747

0.5569547796222095


In [38]:
#nonsynthetic data
x_test = np.array([np.array(training_data[0, 1:])])
y_test = np.array([0])

hab_lim = 20
cur_hab = 1

for i in training_data:
    if cur_hab >= hab_lim:
        break
    if i[0] in habitable_planets:
        y_test = np.append(y_test, 1)
        cur_hab += 1
    else:
        y_test = np.append(y_test, 0)
    x_test = np.append(x_test, np.array([np.array(i[1:])]), axis = 0)

x_test = np.asarray(x_test).astype(np.float32)
x_test = np.where(np.isnan(x_test), ma.array(x_test, mask=np.isnan(x_test)).mean(axis=0), x_test)
y_test = np.asarray(y_test).astype(np.float32)
y_test = np.where(np.isnan(y_test), ma.array(y_test, mask=np.isnan(y_test)).mean(axis=0), y_test)

y_pred = classifier.predict(x_test)
print(confusion_matrix(y_test, y_pred))
print(classification_report(y_test, y_pred))
print(accuracy_score(y_test, y_pred))

[[1416    0]
 [  19    0]]
              precision    recall  f1-score   support

         0.0       0.99      1.00      0.99      1416
         1.0       0.00      0.00      0.00        19

    accuracy                           0.99      1435
   macro avg       0.49      0.50      0.50      1435
weighted avg       0.97      0.99      0.98      1435

0.9867595818815331


  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


In [40]:
tn, fp, fn, tp = confusion_matrix(y_test, y_pred).ravel()
print(tn, fp, fn, tp)

1416 0 19 0
