## Habitable Planet Data Cleaning

In [2]:
import pandas as pd
import numpy as np

In [32]:
# Load the raw planet table and replace the CSV's own header with short
# snake_case names. The names are applied positionally, so the order of this
# list must match the column order in planet.csv.
planet_columns = [
    'planet_name',
    'num_stars',
    'controv_flag',
    'orbital_period',
    'planet_semi-major_axis',
    'planet_radius',
    'planet_mass',
    'eccentricity',
    'insolation_flux',
    'equi_temp',
    'spectral_type',
    'stellar_temp',
    'stellar_radius',
    'stellar_mass',
    'distance',
]
k_data = pd.read_csv("planet.csv")
k_data.columns = planet_columns

In [54]:
# Preview the first five rows of the renamed raw table.
k_data.head(n=5)

Unnamed: 0,planet_name,num_stars,controv_flag,orbital_period,planet_semi-major_axis,planet_radius,planet_mass,eccentricity,insolation_flux,equi_temp,spectral_type,stellar_temp,stellar_radius,stellar_mass,distance
0,11 Com b,2,0,,1.21,,5434.7,,,,,,,2.6,93.1846
1,11 Com b,2,0,326.03,1.29,,6165.6,0.231,,,G8 III,4742.0,19.0,2.7,93.1846
2,11 Com b,2,0,323.21,1.178,,4914.89849,0.238,,,G8 III,4874.0,13.76,2.09,93.1846
3,11 UMi b,1,0,516.21997,1.53,,4684.8142,0.08,,,,4213.0,29.79,2.78,125.321
4,11 UMi b,1,0,,1.51,,3432.4,,,,,,,1.7,125.321


In [56]:
# Total rows in the raw table.
# len() counts every row; Series.count() only counts non-null values, so it
# would under-report if any planet_name were missing.
len(k_data)

35896

In [57]:
# Number of distinct planets: unique planet_name values in the raw table
# (missing names are not counted by nunique()).
planet_names = k_data['planet_name']
planet_names.nunique()

5602

In [44]:
# Collapse the repeated rows of each planet into a single row holding the mean
# of every numeric column. Non-numeric columns are left out of the result
# because of numeric_only=True.
# as_index=False keeps planet_name as an ordinary column and gives the result
# a fresh 0..n-1 index, so no separate reset_index() is needed.
planet_groups = k_data.groupby('planet_name', as_index=False)
avg_data = planet_groups.mean(numeric_only=True)
avg_data.tail()

Unnamed: 0,planet_name,num_stars,controv_flag,orbital_period,planet_semi-major_axis,planet_radius,planet_mass,eccentricity,insolation_flux,equi_temp,stellar_temp,stellar_radius,stellar_mass,distance
5597,ups And b,2.0,0.0,4.616229,0.058895,,219.960419,0.0302,,,6137.82,1.615,1.25,13.4054
5598,ups And c,2.0,0.0,240.728533,0.827712,,1275.328758,0.238933,,,6117.093333,1.615,1.2625,13.4054
5599,ups And d,2.0,0.0,1285.346167,2.528382,,1616.24259,0.281117,,,6117.093333,1.615,1.2625,13.4054
5600,ups Leo b,1.0,0.0,385.2,1.18,,162.09249,0.32,,,4836.0,11.22,1.48,52.5973
5601,xi Aql b,1.0,0.0,136.86,0.628333,,716.301647,0.0295,,,4810.5,11.43,1.78,56.1858


In [47]:
# Report how many NaN values each column of the averaged table contains.
# The per-column counts are computed in one pass and then printed one by one.
nan_counts = avg_data.isna().sum()
for column, nan_count in nan_counts.items():
    print(f"Number of NaN values in '{column}': {nan_count}")

Number of NaN values in 'planet_name': 0
Number of NaN values in 'num_stars': 0
Number of NaN values in 'controv_flag': 0
Number of NaN values in 'orbital_period': 249
Number of NaN values in 'planet_semi-major_axis': 296
Number of NaN values in 'planet_radius': 1361
Number of NaN values in 'planet_mass': 2907
Number of NaN values in 'eccentricity': 777
Number of NaN values in 'insolation_flux': 1710
Number of NaN values in 'equi_temp': 1462
Number of NaN values in 'stellar_temp': 392
Number of NaN values in 'stellar_radius': 535
Number of NaN values in 'stellar_mass': 28
Number of NaN values in 'distance': 118


In [51]:
# Remove planets that carry a controversy flag.
# avg_data holds per-planet means, so a planet flagged in only some of its
# source rows ends up with a fractional controv_flag (e.g. 0.5). A `!= 1` test
# would let those through, so keep only planets whose averaged flag is exactly 0.
# A missing flag is treated as "not flagged", matching what `!= 1` did for NaN.
is_uncontroversial = avg_data['controv_flag'].fillna(0).eq(0)
# Chained reset_index returns a new frame with a clean 0..n-1 index instead of
# mutating the filtered frame in place.
avg_data_filtered = avg_data.loc[is_uncontroversial].reset_index(drop=True)
# Sanity check: should be 1 (only the value 0 remains).
avg_data_filtered['controv_flag'].nunique()

1

In [59]:
# Working dataset for the rest of the project.
# Take an independent copy: a plain assignment would only alias
# avg_data_filtered, so any later in-place edit to proj_data would silently
# change the filtered frame too and make re-running cells give different results.
proj_data = avg_data_filtered.copy()
proj_data.head()

Unnamed: 0,planet_name,num_stars,controv_flag,orbital_period,planet_semi-major_axis,planet_radius,planet_mass,eccentricity,insolation_flux,equi_temp,stellar_temp,stellar_radius,stellar_mass,distance
0,11 Com b,2.0,0.0,324.62,1.226,,5505.066163,0.2345,,,4808.0,16.38,2.463333,93.1846
1,11 UMi b,1.0,0.0,516.219985,1.526667,,3818.094733,0.08,,,4276.5,26.935,2.093333,125.321
2,14 And b,1.0,0.0,186.3,0.761667,,1224.550433,0.0,,,4850.5,11.275,1.726667,75.4392
3,14 Her b,1.0,0.0,1766.378417,2.81475,,1642.383591,0.362925,,,5296.985,0.976667,0.927143,17.9323
4,16 Cyg B b,3.0,0.0,799.375,1.662833,,533.514528,0.676033,,,5728.594,1.14,1.016,21.1397
