In [None]:
import numpy as np
import pandas as pd
import plotly.express as px
import plotly.graph_objects as go
from plotly.subplots import make_subplots
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier

So far, this notebook has 3 topics:
1. Basic Exploration
2. Feature Importance for Kepler Identification
3. Planet Habitability

In [None]:
# Load the exoplanet table. The first 125 lines of the file are skipped
# (presumably a metadata header -- TODO confirm against data/df.csv).
exo = pd.read_csv("data/df.csv", sep=',', skiprows=125)
# exo = pd.read_csv("data/all_df.csv", skiprows=319)
print(exo.shape)
exo.head()

In [None]:
# List every column name so later cells can pick features by name.
list(exo.columns)

## 1. Basic Exploration

In [None]:
def generalize_spectype(row):
    """Collapse a detailed spectral type string into a single class letter.

    Parameters
    ----------
    row : mapping
        Anything indexable by ``"st_spectype"`` (a DataFrame row or a dict)
        whose value is a string.

    Returns
    -------
    str
        The first of ``B, A, F, G, K, M`` (checked in that order) that occurs
        anywhere in the spectral type string, or ``"Undefined"`` when none do.
    """
    # The order of this tuple is the precedence: a string containing several
    # of these letters is assigned the earliest one listed here.
    for letter in ("B", "A", "F", "G", "K", "M"):
        if letter in row["st_spectype"]:
            return letter
    return "Undefined"

# Assign the filled column back instead of calling fillna(inplace=True) on the
# selected column: the in-place form operates on an intermediate object, emits
# a FutureWarning in pandas 2.x and has no effect under Copy-on-Write.
exo["st_spectype"] = exo["st_spectype"].fillna('')
# Missing spectral types are now empty strings, which generalize_spectype maps
# to "Undefined".
exo["generalized_spectype"] = exo.apply(generalize_spectype, axis=1)
exo

In [None]:
select_columns = ["hostname", "st_spectype", "discoverymethod"]
def plot_basic_characteristic(column, data):
    """Show a pie chart of the ten most frequent values of ``column``.

    Parameters
    ----------
    column : str
        Name of a column in ``data``. For ``"st_spectype"`` the values are
        first collapsed to a single spectral class letter (B, A, F, G, K, M or
        "Undefined").
    data : pandas.DataFrame
        Frame to summarise. It is not modified.

    Notes
    -----
    Changes relative to the previous version:
    * counts are taken from ``data`` rather than from the global ``exo``;
    * K-type stars are labelled "K" (they were previously reported as "G");
    * the caller's frame is no longer mutated;
    * the chart is built from the counts directly, so the label and value
      lists always have matching length.
    """
    def _spectral_class(spectype):
        # Precedence follows the order of the letters in this string.
        for letter in "BAFGKM":
            if letter in spectype:
                return letter
        return "Undefined"

    series = data[column]
    if column == "st_spectype":
        # Missing spectral types become "" and therefore "Undefined".
        series = series.fillna('').map(_spectral_class)

    # value_counts drops missing values and sorts by descending frequency.
    top_counts = series.value_counts().head(10)
    fig = px.pie(names=top_counts.index.tolist(), values=top_counts.tolist())
    fig.show()

plot_basic_characteristic("disc_facility", exo)

In [None]:
def exploratory_pie_chart(col_list, data=None, max_slices=15):
    """Show one pie chart per categorical column, switchable via a dropdown.

    Parameters
    ----------
    col_list : list of str
        Column names to summarise; one pie trace and one dropdown button is
        created per column. The first column is the one initially visible.
    data : pandas.DataFrame, optional
        Frame to read the columns from. Defaults to the notebook-level ``exo``
        frame, which is what this function always used before.
    max_slices : int, default 15
        Maximum number of categories shown per column; only the most frequent
        ``max_slices`` values are kept.

    Notes
    -----
    Counts come from ``value_counts``, so labels and values are always aligned,
    the order is deterministic (descending frequency) and missing values are
    left out instead of appearing as a zero-sized slice.
    """
    frame = exo if data is None else data
    pie_fig = go.Figure()
    button_list = []
    for ind, column in enumerate(col_list):
        # head() is a no-op when the column has fewer than max_slices values.
        counts = frame[column].value_counts().head(max_slices)
        pie_fig.add_trace(go.Pie(labels=counts.index.tolist(), values=counts.tolist(),
                                 name=column, visible=(ind == 0)))

        # Each button shows only the trace that belongs to its own column.
        visibility = [position == ind for position in range(len(col_list))]
        button_list.append(dict(label=column,
                                args=[{"visible": visibility, "title": column}],
                                method="update"))

    pie_fig.update_layout(
        updatemenus=[dict(buttons=button_list,
                          direction="down",
                          showactive=True,
                          x=-0.1,
                          xanchor="left",
                          y=1.3,
                          yanchor="top"),
                     ],
        title=go.layout.Title(text="Exoplanet Categorical Variable Observation",
                              x=0.5,
                              y=0.95),
        width=800,
        height=500,
        autosize=True,
    )

    pie_fig.show()

exploratory_pie_chart(["hostname", "generalized_spectype", "discoverymethod"])

### Exoplanet Classification
- Gas Giant: planets the size of Saturn or Jupiter, the largest planet in our solar system, or much, much larger.
- Neptunian: similar in size to Neptune or Uranus in our solar system. (Neptune is about four times the size, or radius, of Earth and almost 17 times its mass, or weight.)
- Super-Earth: more massive than Earth, but lighter than Neptune.
- Terrestrial: Earth sized and smaller, composed of rock, silicate, water or carbon.

Source: 
- https://exoplanets.nasa.gov/what-is-an-exoplanet/planet-types/overview/#:~:text=So%20far%20scientists%20have%20categorized,%2C%20super%2DEarth%20and%20terrestrial.
- https://iopscience.iop.org/article/10.3847/1538-4357/aab205#apjaab205s4

In [None]:
# Legacy 0.1-step radius grids. Kept so the name stays available, but no
# longer used for classification: membership tests against these arrays are
# exact float comparisons and miss almost every measured radius.
all_list = {"rocky" : np.arange(0.5, 1.0, step=0.1),
            "super_earth" : np.arange(1.0, 1.75, step=0.1),
            "sub_neptune" : np.arange(1.75, 3.5, step=0.1),
            "sub_jovian" : np.arange(3.5, 6.0, step=0.1),
            "jovians" : np.arange(6.0, 14.3, step=0.1)}

# (label, inclusive lower bound, exclusive upper bound); the bounds are the
# start/stop values of the grids above.
_RADIUS_CLASSES = (
    ("Rocky", 0.5, 1.0),
    ("Super Earth", 1.0, 1.75),
    ("Sub Neptune", 1.75, 3.5),
    ("Sub Jovian", 3.5, 6.0),
    ("Jovian", 6.0, 14.3),
)

def classify_on_rad(val):
    """Return the size class for a planet radius.

    Parameters
    ----------
    val : float or None
        Planet radius; the caller passes values of the ``pl_rade`` column.

    Returns
    -------
    str
        ``"Rocky"``, ``"Super Earth"``, ``"Sub Neptune"``, ``"Sub Jovian"`` or
        ``"Jovian"`` when ``val`` lies in the matching half-open interval
        ``[low, high)``, otherwise ``"Undefined"`` (this includes NaN, None
        and radii below 0.5 or at/above 14.3).
    """
    if val is None:
        return "Undefined"
    for label, low, high in _RADIUS_CLASSES:
        # NaN fails both comparisons and falls through to "Undefined".
        if low <= val < high:
            return label
    return "Undefined"

# Label every planet with its radius class, then show the size of each class.
exo["rad_classification"] = exo["pl_rade"].map(classify_on_rad)
display(exo.groupby(["rad_classification"]).size().to_frame())
exo.head()

In [None]:
def apply_koi(row):
    """Label a row "KOI" when its hostname contains "Kepler", else "Exoplanet"."""
    if "Kepler" in row["hostname"]:
        return "KOI"
    return "Exoplanet"

exo["koi_classification"] = exo.apply(apply_koi, axis=1)
# Every missing value in the frame becomes 0 from here on; later cells see
# zeros, not NaN, in all columns of `exo`.
exo = exo.replace(np.nan, 0)
display(exo.groupby(["koi_classification"]).size().to_frame())
exo.head()

In [None]:
# Take labels and values from the same value_counts() result so every slice
# carries the right label. value_counts() orders by frequency, which does not
# in general match a hard-coded label list or the order returned by unique().
koi_counts = exo["koi_classification"].value_counts()
rad_counts = exo["rad_classification"].value_counts()

pie_fig = go.Figure()
pie_fig.add_trace(go.Pie(labels=koi_counts.index.tolist(), values=koi_counts.tolist(),
                         name="KOI", visible=True))
# rad_classification is derived from planet radius (pl_rade), so the trace and
# its dropdown entry are labelled "Radius" rather than "Mass".
pie_fig.add_trace(go.Pie(labels=rad_counts.index.tolist(), values=rad_counts.tolist(),
                         name="Radius", visible=False))

pie_fig.update_layout(
    updatemenus=[dict(buttons=[dict(label="KOI Classification",
                                    args=[{"visible": [True, False]},
                                          {"title": "KOI Classification"}],
                                    method="update"),
                               dict(label="Radius Classification",
                                    args=[{"visible": [False, True]},
                                          {"title": "Radius Classification"}],
                                    method="update"),
                               ],
                      direction="down",
                      showactive=True,
                      x=-0.1,
                      xanchor="left",
                      y=1.3,
                      yanchor="top"),
                 ],
    title=go.layout.Title(text="Exoplanet Classifications",
                          x=0.5,
                          y=0.95),
    width=700,
    height=500,
    autosize=False,
)

pie_fig.show()

## 2. Distinguish KOI
- Which features contribute the most to classifying an exoplanet as a KOI?

In [None]:
# Keep the int/float columns as model features and put the label column last,
# which is the position the train/test split cell relies on.
numeric_columns = exo.select_dtypes(include=['int', 'float']).columns
all_int_cols = [*numeric_columns, "koi_classification"]
koi = exo[all_int_cols]
koi.head()

In [None]:
# Features are every column except the last one (the label); 30% of the rows
# are held out for testing with a fixed seed.
features = koi.iloc[:, :-1]
target = koi["koi_classification"]
X_train, X_test, y_train, y_test = train_test_split(
    features,
    target,
    test_size=0.3,
    random_state=0,
)
# categorical_dtypes = list(X_train.select_dtypes(include=['object']).columns)
# col_transformer = [("cat", encoder, categorical_dtypes)]
# col_transform = ColumnTransformer(transformers=col_transformer)


#### Logistic Regression

In [None]:
def generate_h_bar_fig(df, model_type):
    """Draw a horizontal bar chart of per-feature scores and show it.

    Parameters
    ----------
    df : pandas.DataFrame
        Must provide the columns ``"coef"`` (bar length), ``"feature"`` (bar
        label) and ``"color"`` (bar colour group).
    model_type : str
        Model name inserted into the chart title.
    """
    chart_title = f"{model_type} Feature Importance for KOI Classification"
    # Centre the title horizontally and anchor it near the top.
    title_position = dict(y=0.9, x=0.5, xanchor='center', yanchor='top')

    h_bar = px.bar(
        df,
        x="coef",
        y="feature",
        orientation="h",
        height=600,
        color="color",
        title=chart_title,
    )
    h_bar.update_layout(title=title_position)
    h_bar.show()

In [None]:
# Fit a logistic regression and report accuracy on the held-out rows.
lr_model = LogisticRegression(max_iter=2000)
lr_model.fit(X_train, y_train)
lr_y_hat = lr_model.predict(X_test)
print(f"LR Model score is: {accuracy_score(y_test, lr_y_hat)}")
# (coefficient, feature name) pairs, kept for reference.
lr_importance = list(zip(lr_model.coef_[0], X_train.columns.tolist()))

# Build the table in one step, not row by row; "coef" is then a proper
# float column.
# NOTE(review): no feature scaling is visible in this notebook, so coefficient
# magnitudes may not be comparable across features -- confirm before
# reading them as importance.
lr_df = pd.DataFrame({"feature": X_train.columns.tolist(),
                      "coef": lr_model.coef_[0]})
lr_df["color"] = np.where(lr_df["coef"] >= 0, "Positive", "Negative")

display(lr_df.sort_values(by="coef", ascending=False))
generate_h_bar_fig(lr_df.sort_values(by="coef"), "Logistic Regression")

#### Decision Tree

In [None]:
# Fix the seed so the fitted tree, its score and its importances are the same
# on every Restart & Run All (0 matches the seed used for the train/test split).
tree_model = DecisionTreeClassifier(random_state=0)
tree_model.fit(X_train, y_train)
# (importance, feature name) pairs, kept for reference.
tree_importance = list(zip(tree_model.feature_importances_, X_train.columns.tolist()))
tree_y_hat = tree_model.predict(X_test)
print(f"Decision Tree Classifier Model score is: {accuracy_score(y_test, tree_y_hat)}")

# Build the table in one step, not row by row.
tree_df = pd.DataFrame({"feature": X_train.columns.tolist(),
                        "coef": tree_model.feature_importances_})
# generate_h_bar_fig colours bars by this column, so it is kept for the
# tree models too.
tree_df["color"] = np.where(tree_df["coef"] >= 0, "Positive", "Negative")

display(tree_df.sort_values(by="coef", ascending=False))
generate_h_bar_fig(tree_df.sort_values(by="coef"), "Decision Tree")

In [None]:
# Random forest. Fix the seed so the score and importances are reproducible
# (0 matches the seed used for the train/test split).
r_tree_model = RandomForestClassifier(random_state=0)
r_tree_model.fit(X_train, y_train)
# (importance, feature name) pairs, kept for reference.
r_tree_importance = list(zip(r_tree_model.feature_importances_, X_train.columns.tolist()))
r_tree_y_hat = r_tree_model.predict(X_test)
# The message previously named the decision tree model by mistake.
print(f"Random Forest Classifier Model score is: {accuracy_score(y_test, r_tree_y_hat)}")

# Build the table in one step, not row by row.
r_tree_df = pd.DataFrame({"feature": X_train.columns.tolist(),
                          "coef": r_tree_model.feature_importances_})
r_tree_df["color"] = np.where(r_tree_df["coef"] >= 0, "Positive", "Negative")

# A bare expression in the middle of a cell is not rendered; display it.
display(r_tree_df.sort_values(by="coef", ascending=False))
generate_h_bar_fig(r_tree_df.sort_values(by="coef"), "Random Forest Classifier")

## 3. Define Habitable Zone
![formula](data/hz_formula_with_e.png)
- Source: https://www.planetarybiology.com/calculating_habitable_zone.html

#### Based on Stellar Luminosity and Radius

In [None]:
# Natural log of 3.828e26 (presumably the solar luminosity in watts -- confirm).
# Not referenced by the functions in this cell.
solar_lum = np.log(3.828 * (10 ** 26))
earth_rad = 6_371_000

def revert_to_km_rade(value):
    """Scale ``value`` by 1.496e8 and divide the result by ``earth_rad``.

    Works on scalars and element-wise on array-likes such as a pandas Series.

    NOTE(review): 1.496e8 looks like one astronomical unit in kilometres while
    6371000 looks like the Earth radius in metres. If so, the quotient mixes
    units and is 1000x smaller than a distance in Earth radii -- confirm the
    intended units before relying on the absolute values.
    """
    return value * 1.496 * (10 ** 8) / earth_rad

def calculate_lum(df):
    """Add ``r_inner`` and ``r_outer`` columns derived from ``st_lum``.

    Each bound is ``sqrt(st_lum / limit)`` passed through
    ``revert_to_km_rade``, with ``limit`` 1.1 for the inner bound and 0.53 for
    the outer bound. ``df`` is modified in place and also returned.

    NOTE(review): the square root is NaN wherever ``st_lum`` is negative;
    confirm ``st_lum`` is a linear (not logarithmic) luminosity.
    """
    for bound_column, limit in (("r_inner", 1.1), ("r_outer", 0.53)):
        df[bound_column] = revert_to_km_rade((df["st_lum"] / limit) ** 0.5)
    return df

def identify_habitability(rade, r_inner, r_outer):
    """Return True when ``rade`` lies strictly between ``r_inner`` and ``r_outer``.

    Both bounds are exclusive; any NaN operand yields False.
    """
    above_inner = rade > r_inner
    below_outer = rade < r_outer
    return bool(above_inner & below_outer)

exo = calculate_lum(exo)
# NOTE(review): this tests the planet radius (pl_rade) against r_inner/r_outer,
# which calculate_lum derives from stellar luminosity -- confirm that a radius,
# not an orbital distance, is the intended quantity.
exo["habitability"] = [
    identify_habitability(rade, r_inner, r_outer)
    for rade, r_inner, r_outer in zip(exo["pl_rade"], exo["r_inner"], exo["r_outer"])
]
display(exo.groupby(["habitability"]).size().to_frame())
exo.head()

In [None]:
# Load the mass table; the first 50 lines of the file are skipped
# (presumably a metadata header -- TODO confirm against the CSV).
exo_mass = pd.read_csv("data/planetary_data_with_mass.csv", skiprows=50)
# Outer join on planet name: rows present in only one table are kept, with
# missing values for the other table's columns.
exo_merge = exo.merge(exo_mass[["pl_name", "pl_msinie", "pl_masse"]], on="pl_name", how="outer")
print(exo_merge.shape)
# Previously a bare expression in the middle of the cell, so never shown.
print(exo_merge["pl_msinie"].count())
exo_merge.head()

In [None]:
# Marker size is driven by pl_masse, so rows without a mass are left out.
exo_masse_non = exo_merge.dropna(subset=["pl_masse"])

axis_labels = {
    "st_lum": "Stellar Luminosity",
    "pl_rade": "Planet Radius (in Earth Unit)",
    "pl_masse": "Planet Mass (in Earth Unit)",
    "habitability": "Habitability",
}
habitability_fig = px.scatter(
    exo_masse_non,
    x="st_lum",
    y="pl_rade",
    color="habitability",
    size="pl_masse",
    hover_data=["pl_name"],
    color_discrete_sequence=["salmon", "blue"],
    title="Relationship of Radius, Luminosity and Mass(in Earth Unit) in Determining Planet Habitability",
    labels=axis_labels,
)

# Centre the title and make the figure taller.
habitability_fig.update_layout(
    title=dict(y=0.9, x=0.5, xanchor='center', yanchor='top'),
    height=600,
)
habitability_fig.show()

In [None]:
def apply_rocky(row):
    """Return True when a row passes the radius test or the m*sin(i) test.

    The radius test is ``0.5 < pl_rade <= 1.6``; the mass test is
    ``0.1 < pl_msinie <= 3``. The mass is only looked at when the radius test
    fails. NaN values fail the test they take part in.
    """
    if (row["pl_rade"] > 0.5) & (row["pl_rade"] <= 1.6):
        return True
    return bool((row["pl_msinie"] > 0.1) & (row["pl_msinie"] <= 3))

# Flag each planet with apply_rocky, show how many pass, then list them.
exo_merge["rocky"] = exo_merge.apply(apply_rocky, axis=1)
rocky_counts = exo_merge.groupby(["rocky"]).size().to_frame()
display(rocky_counts)
exo_merge.loc[exo_merge["rocky"] == True]

In [None]:
# Planets flagged habitable, counted per host star.
df2 = exo_merge.query("habitability == True")
df2["hostname"].value_counts()

In [None]:
def visualize_habitable_characteristics(df, columns):
    """Plot one histogram per column for the rows flagged as habitable.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with a ``habitability`` column plus every column in ``columns``.
    columns : list of str
        Columns to plot. Histograms are laid out on a two-column grid, filled
        left to right and then top to bottom.

    Raises
    ------
    ValueError
        If ``columns`` is empty.

    Notes
    -----
    Subplot titles are assigned left-to-right, top-to-bottom, so traces are
    placed in that same order; filling column by column put the 2nd and 3rd
    histograms under each other's titles. The row count is rounded up so an
    odd number of columns gets its own row, and a single column no longer
    ends up with a zero-row grid.
    """
    if not columns:
        raise ValueError("columns must contain at least one column name")

    habitable_rows = df.query("habitability == True")
    cols = 2
    rows = -(-len(columns) // cols)  # ceiling division

    fig = make_subplots(cols=cols, rows=rows,
                        subplot_titles=list(columns))

    for position, column in enumerate(columns):
        fig.add_trace(go.Histogram(name=column, x=habitable_rows[column]),
                      row=position // cols + 1, col=position % cols + 1)

    fig.update_layout(height=600, width=800, title_text="Habitable Exoplanets Characteristics")

    fig.show()

visualize_habitable_characteristics(exo_merge, ["pl_masse", "pl_rade", "pl_insol", "pl_eqt"])

In [None]:
# Keep only the planets flagged habitable; print how many each host star has.
habitable_df = exo_merge.query("habitability == True")
hosts_per_star = habitable_df["hostname"].value_counts()
print(hosts_per_star)
habitable_df

In [None]:
# HD catalogue names of the host stars of the habitable-flagged planets
# (one entry per planet row, so duplicates and missing names are possible).
star_list = habitable_df["hd_name"].tolist()

In [None]:
# Load the stellar table; the first 30 lines of the file are skipped
# (presumably a metadata header -- TODO confirm against data/star_df.csv).
star_df = pd.read_csv("data/star_df.csv", skiprows=30)
star_df

In [None]:
# Show which stellar properties are available in the star table.
star_df.columns

To investigate the properties of the host stars further, we can focus on the columns listed above.

In [None]:
database_star_list = exo_merge["hd_name"].tolist()

# Keep the habitable host stars whose HD name also occurs in the merged
# planet table (order and duplicates of star_list are preserved).
present_star = [star for star in star_list if star in database_star_list]

star_df_filtered = star_df[star_df["hd_name"].isin(present_star)]

print(star_df_filtered.columns)
# NOTE(review): this displays the unfiltered star_df; star_df_filtered may
# have been the intended output -- confirm.
star_df

In [None]:
hd_list = exo_merge.hd_name.unique().tolist()
hip_list = exo_merge.hip_name.unique().tolist()
# Drop missing names with a filter. Popping from the list while enumerating
# it shifts the remaining items and skips the element after each removal.
hd_hip_list = [name for name in hd_list + hip_list if pd.notna(name)]

# Set membership is O(1) per lookup; the list scan was O(n) per star.
known_names = set(hd_hip_list)

# For every star keep the first of (HD name, HIP name) that is known to the
# planet table. One entry per matching star, in star_df order.
existing_list = []
for hd_name, hip_name in zip(star_df["hd_name"], star_df["hip_name"]):
    if hd_name in known_names:
        existing_list.append(hd_name)
    elif hip_name in known_names:
        existing_list.append(hip_name)

print(len(existing_list))

In [None]:
_existing_names = set(existing_list)

def habitable(row):
    """Return True when the star's HD name or HIP name is in ``existing_list``.

    The previous expression ``(hd or hip) in existing_list`` evaluated
    ``hd or hip`` first, which is ``hd`` whenever ``hd`` is truthy (any
    non-empty string, and also NaN), so the HIP name was almost never checked.
    """
    return row["hd_name"] in _existing_names or row["hip_name"] in _existing_names

# NOTE(review): existing_list is built from every star in the merged planet
# table, not only from the habitable-flagged ones (star_list) -- confirm that
# "habitability" is the right name for this flag.
star_df["habitability"] = star_df.apply(habitable, axis=1)
star_df

In [None]:
# Scatter of st_vmag against st_bmv, coloured by the habitability flag.
vmag_bmv = px.scatter(
    star_df,
    x="st_vmag",
    y="st_bmv",
    color="habitability",
    color_discrete_sequence=["salmon", "blue"],
    title="Brightness vs. Star Color based on Habitability",
)
vmag_bmv.show()

In the figure above, I plot the `brightness of the host star` against the `color of the host star` to see whether either has an impact on habitability.

Description of the variables, retrieved from [NASA Exoplanet Database](https://exoplanetarchive.ipac.caltech.edu/docs/API_mission_stars.html).
- `st_vmag`: Brightness of the host star as measured using the V band, in units of magnitudes
- `st_bmv`: Color of the star as measured by the difference between B and V bands

In [None]:
# Strip plot of st_ppnum for each value of the habitability flag.
num_planets = px.strip(
    star_df,
    x="habitability",
    y="st_ppnum",
    width=600,
    height=400,
    title="Exoplanet Habitability Based on the Number of Planets in System",
)
num_planets.show()

In [None]:
# Strip plot of st_dist for each value of the habitability flag.
# Note: this reuses (and overwrites) the `num_planets` name from the cell above.
num_planets = px.strip(
    star_df,
    x="habitability",
    y="st_dist",
    width=600,
    height=600,
    color="habitability",
    color_discrete_sequence=["salmon", "blue"],
    title="Exoplanet Habitability Based on the Distance to the System in parsecs",
)

# The title is long, so shrink its font.
num_planets.update_layout(title=dict(font=dict(size=15)))

num_planets.show()