In [1]:
# Colab-only helper module for moving files between the browser and the runtime.
from google.colab import files
# Opens the browser file picker; the raw NASA Exoplanet Archive CSV chosen here
# is what the loading cell below expects at
# "/content/NASA Exoplanet Archive_Raw.csv". The call's return value is kept
# in `uploaded`.
uploaded = files.upload()

Saving NASA Exoplanet Archive_Raw.csv to NASA Exoplanet Archive_Raw.csv


In [4]:
import pandas as pd

# --- 1. Load the raw exoplanet dataset ---
# The CSV 'NASA Exoplanet Archive_Raw.csv' must already be uploaded to the
# Colab runtime (see the upload cell above).
data_path = "/content/NASA Exoplanet Archive_Raw.csv"
print(f"Loading data from: {data_path}")

try:
    # low_memory=False makes pandas infer each column's dtype from the whole
    # file in one pass instead of chunk by chunk, which avoids the
    # "mixed types" DtypeWarning this wide (100+ column) file otherwise emits.
    df_raw = pd.read_csv(data_path, low_memory=False)
    print(f"Successfully loaded {len(df_raw)} rows and {len(df_raw.columns)} columns from the raw file.")
except FileNotFoundError:
    print(f"Error: The file '{data_path}' was not found. Please upload it to your Colab session.")
    # Re-raise instead of calling exit(): inside a notebook exit() shuts down /
    # restarts the kernel and wipes all state, whereas an exception simply
    # stops this cell (and a "Run all") with a clear traceback.
    raise

# --- 2. Define the list of essential columns to keep ---
# The 12 columns needed by the habitability scoring metric: the primary
# features plus the parameters required for the fallback calculations.
# Units follow the NASA Exoplanet Archive column definitions.
essential_columns = [
    'pl_name',          # Planet name (for identification)
    'pl_rade',          # Planet Radius (Earth radii)
    'pl_masse',         # Planet Mass (Earth masses)
    'pl_eqt',           # Planet Equilibrium Temperature (Kelvin) - Used as surface temp
    'pl_insol',         # Insolation Flux (in units of Earth's flux, NOT W/m²)
    'pl_orbper',        # Planet Orbital Period (days) - For flux fallback
    'pl_orbsmax',       # Planet Semi-Major Axis (AU) - For flux fallback
    'st_spectype',      # Stellar Spectral Type - For M-dwarf identification
    'st_mass',          # Stellar Mass (Solar masses) - For flux fallback
    'st_lum',           # Stellar Luminosity as log10(L / L_sun), NOT linear - For flux fallback
    'st_rad',           # Stellar Radius (Solar radii) - For stellar luminosity fallback
    'st_teff'           # Stellar Effective Temperature (Kelvin) - For stellar luminosity fallback & M-dwarf
]

# Fail early with an explicit message if the export is missing any required
# column, rather than relying on the less readable KeyError from the
# selection below.
missing_columns = [col for col in essential_columns if col not in df_raw.columns]
if missing_columns:
    raise KeyError(
        f"The raw file '{data_path}' is missing required column(s): {missing_columns}"
    )

# --- 3. Filter the DataFrame to keep ONLY the essential columns ---
# Every column that is NOT in essential_columns is dropped. .copy() detaches
# the result from df_raw so later edits cannot trigger SettingWithCopyWarning.
df_filtered = df_raw[essential_columns].copy()

print(f"\nFiltered DataFrame now has {len(df_filtered.columns)} essential columns and {len(df_filtered)} rows.")

# --- 4. Display the first few rows of the filtered data (optional) ---
print("\nFirst 5 rows of the filtered DataFrame:")
print(df_filtered.head())

# --- 5. Save the filtered DataFrame ---
# The later deduplication cells and the scoring script read this file.
output_path = "/content/NASA_Exoplanet_Archive_Cleaned.csv"
df_filtered.to_csv(output_path, index=False)
print(f"\nCleaned data with only essential columns saved to: {output_path}")

print("\nYour dataset is now perfectly prepared for the Top 100 scoring script using 'NASA_Exoplanet_Archive_Cleaned.csv'.")

Loading data from: /content/NASA Exoplanet Archive_Raw.csv


  df_raw = pd.read_csv(data_path)


Successfully loaded 38749 rows and 121 columns from the raw file.

Filtered DataFrame now has 12 essential columns and 38749 rows.

First 5 rows of the filtered DataFrame:
    pl_name  pl_rade  pl_masse  pl_eqt  pl_insol  pl_orbper  pl_orbsmax  \
0  11 Com b      NaN       NaN     NaN       NaN     323.21       1.178   
1  11 Com b      NaN       NaN     NaN       NaN        NaN       1.210   
2  11 Com b      NaN       NaN     NaN       NaN     326.03       1.290   
3  11 UMi b      NaN       NaN     NaN       NaN        NaN       1.510   
4  11 UMi b      NaN       NaN     NaN       NaN     516.22       1.540   

  st_spectype  st_mass   st_lum  st_rad  st_teff  
0      G8 III     2.09  1.97823   13.76   4874.0  
1         NaN     2.60      NaN     NaN      NaN  
2      G8 III     2.70  2.24300   19.00   4742.0  
3         NaN     1.70      NaN     NaN      NaN  
4      K4 III     1.80      NaN   24.08   4340.0  

Cleaned data with only essential columns saved to: /content/NASA_Exoplanet_Archive_Cleaned.csv

In [5]:
import pandas as pd

# --- 1. Define the path to the cleaned dataset ---
# Expects the column-filtering cell to have already written
# 'NASA_Exoplanet_Archive_Cleaned.csv'.
data_path = "/content/NASA_Exoplanet_Archive_Cleaned.csv"
print(f"Loading data from: {data_path}")

try:
    df_cleaned = pd.read_csv(data_path)
    print(f"Successfully loaded {len(df_cleaned)} rows and {len(df_cleaned.columns)} columns.")
except FileNotFoundError:
    print(f"Error: The file '{data_path}' was not found. Please ensure it was created by the previous step.")
    # Re-raise instead of calling exit(): inside a notebook exit() shuts down /
    # restarts the kernel and wipes all state, whereas an exception simply
    # stops this cell (and a "Run all") with a clear traceback.
    raise

# --- 2. Remove duplicate rows ---
# With no `subset`, drop_duplicates compares ALL columns, so only rows that
# are identical in every kept column are removed; keep='first' retains the
# first occurrence of each identical group.
initial_rows = len(df_cleaned)
df_deduplicated = df_cleaned.drop_duplicates(keep='first').copy()
rows_after_deduplication = len(df_deduplicated)

print(f"\nInitial number of rows: {initial_rows}")
print(f"Number of rows after removing duplicates: {rows_after_deduplication}")
print(f"Number of duplicate rows removed: {initial_rows - rows_after_deduplication}")

# --- 3. Overwrite the original 'NASA_Exoplanet_Archive_Cleaned.csv' file ---
# NOTE: this writes back to the same path that was just read, so the
# pre-deduplication file is replaced. Re-running the cell is harmless
# (a second pass finds no further exact duplicates).
df_deduplicated.to_csv(data_path, index=False)
print(f"\nDeduplicated data saved back to: {data_path}")

print("\nYour 'NASA_Exoplanet_Archive_Cleaned.csv' file has been updated and now contains only unique rows, ready for scoring.")

Loading data from: /content/NASA_Exoplanet_Archive_Cleaned.csv
Successfully loaded 38749 rows and 12 columns.

Initial number of rows: 38749
Number of rows after removing duplicates: 33272
Number of duplicate rows removed: 5477

Deduplicated data saved back to: /content/NASA_Exoplanet_Archive_Cleaned.csv

Your 'NASA_Exoplanet_Archive_Cleaned.csv' file has been updated and now contains only unique rows, ready for scoring.


In [6]:
import pandas as pd

# --- 1. Define the path to the cleaned dataset ---
# Expects 'NASA_Exoplanet_Archive_Cleaned.csv' to have been produced by the
# earlier column-filtering / deduplication cells.
data_path = "/content/NASA_Exoplanet_Archive_Cleaned.csv"
print(f"Loading data from: {data_path}")

try:
    df_cleaned = pd.read_csv(data_path)
    print(f"Successfully loaded {len(df_cleaned)} rows and {len(df_cleaned.columns)} columns.")
except FileNotFoundError:
    print(f"Error: The file '{data_path}' was not found. Please ensure it was created by the previous step.")
    # Re-raise instead of calling exit(): inside a notebook exit() shuts down /
    # restarts the kernel and wipes all state, whereas an exception simply
    # stops this cell (and a "Run all") with a clear traceback.
    raise

# --- 2. Remove duplicate planets based on 'pl_name' ---
# The file holds several measurement rows per planet, and many of them are
# mostly empty (e.g. the first '11 Com b' row has no radius, mass, equilibrium
# temperature or insolation). Keeping simply the first row in file order would
# therefore throw away better-populated rows for the same planet.
# Instead, keep the single row per planet that has the MOST non-null values:
#   1. count the non-null cells in every row,
#   2. order rows by that count, descending, with a stable sort so ties keep
#      their original file order,
#   3. keep the first (most complete) row for each 'pl_name',
#   4. restore the original row order with sort_index().
# Each kept row is still one self-consistent measurement set; values from
# different rows are never mixed.
initial_rows = len(df_cleaned)
completeness_order = (
    df_cleaned.notna()
    .sum(axis=1)
    .sort_values(ascending=False, kind='mergesort')  # mergesort is stable
    .index
)
df_deduplicated_by_name = (
    df_cleaned.loc[completeness_order]
    .drop_duplicates(subset=['pl_name'], keep='first')
    .sort_index()
    .copy()
)
rows_after_deduplication = len(df_deduplicated_by_name)

print(f"\nInitial number of rows: {initial_rows}")
print(f"Number of rows after removing duplicate planet names: {rows_after_deduplication}")
print(f"Number of duplicate planet names removed: {initial_rows - rows_after_deduplication}")

# --- 3. Overwrite the original 'NASA_Exoplanet_Archive_Cleaned.csv' file ---
# NOTE: this writes back to the same path that was just read, so the
# multi-row-per-planet version of the file is replaced by the
# one-row-per-planet version.
df_deduplicated_by_name.to_csv(data_path, index=False)
print(f"\nDeduplicated data saved back to: {data_path}")

print("\nYour 'NASA_Exoplanet_Archive_Cleaned.csv' file has been updated and now contains only unique planet entries by name, ready for scoring.")

Loading data from: /content/NASA_Exoplanet_Archive_Cleaned.csv
Successfully loaded 33272 rows and 12 columns.

Initial number of rows: 33272
Number of rows after removing duplicate planet names: 5983
Number of duplicate planet names removed: 27289

Deduplicated data saved back to: /content/NASA_Exoplanet_Archive_Cleaned.csv

Your 'NASA_Exoplanet_Archive_Cleaned.csv' file has been updated and now contains only unique planet entries by name, ready for scoring.


In [7]:
from google.colab import files

# Path of the CSV to send to the browser. It is the file written by the
# preprocessing / deduplication cells earlier in this notebook.
file_to_download = "/content/NASA_Exoplanet_Archive_Cleaned.csv"

print(f"Attempting to download '{file_to_download}'...")

try:
    # Ask Colab to start a browser download of the file.
    files.download(file_to_download)
except FileNotFoundError:
    # The cleaned CSV has not been created in this runtime.
    print(f"Error: The file '{file_to_download}' was not found in your Colab environment.")
    print("Please ensure you have run the previous preprocessing and deduplication steps successfully.")
except Exception as download_error:
    # Any other failure is reported rather than raised.
    print(f"An error occurred during download: {download_error}")
else:
    # Reached only when files.download() returned without raising.
    print(f"'{file_to_download}' download initiated successfully.")
    print("Check your browser's download bar or designated downloads folder.")



Attempting to download '/content/NASA_Exoplanet_Archive_Cleaned.csv'...


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

'/content/NASA_Exoplanet_Archive_Cleaned.csv' download initiated successfully.
Check your browser's download bar or designated downloads folder.
