In [1]:
# Dependencies
import pandas as pd
from pathlib import Path

In [3]:
# File To Load
planet_data_to_load = Path("PS_2025.01.28_17.46.07.csv")

# Read Data
planetData = pd.read_csv(planet_data_to_load, skiprows=31)

# Display First Few Rows
planetData.head()

Unnamed: 0,pl_name,sy_snum,sy_pnum,discoverymethod,disc_year,disc_facility,pl_controv_flag,pl_orbper,pl_orbpererr1,pl_orbpererr2,...,pl_radeerr2,pl_radelim,pl_bmasse,pl_bmasseerr1,pl_bmasseerr2,pl_bmasselim,sy_dist,sy_disterr1,sy_disterr2,releasedate
0,11 Com b,2,1,Radial Velocity,2007,Xinglong Station,0,326.03,0.32,-0.32,...,,,6165.6,476.7,-476.7,0.0,93.1846,1.9238,-1.9238,2014-05-14
1,11 Com b,2,1,Radial Velocity,2007,Xinglong Station,0,,,,...,,,5434.7,540.3,-413.2,0.0,93.1846,1.9238,-1.9238,2014-07-23
2,11 Com b,2,1,Radial Velocity,2007,Xinglong Station,0,323.21,0.06,-0.05,...,,,4914.89849,39.09289,-39.72855,0.0,93.1846,1.9238,-1.9238,2023-09-19
3,11 UMi b,1,1,Radial Velocity,2009,Thueringer Landessternwarte Tautenburg,0,516.21997,3.2,-3.2,...,,,4684.8142,794.575,-794.575,0.0,125.321,1.9765,-1.9765,2018-09-06
4,11 UMi b,1,1,Radial Velocity,2009,Thueringer Landessternwarte Tautenburg,0,516.22,3.25,-3.25,...,,,3337.07,785.01,-785.01,0.0,125.321,1.9765,-1.9765,2014-05-14


In [10]:
# Drops columns we don't need
planetSort = planetData.drop(['pl_orbpererr1', 'pl_orbpererr2', 'pl_radeerr1', 'pl_radeerr2', 'pl_bmasseerr1', 'pl_bmasseerr2', 'sy_disterr1', 'sy_disterr2', 'pl_orbperlim', 'pl_radelim', 'pl_bmasselim', 'pl_controv_flag'], axis=1)
planetSort.head()

Unnamed: 0,pl_name,sy_snum,sy_pnum,discoverymethod,disc_year,disc_facility,pl_orbper,pl_rade,pl_bmasse,sy_dist,releasedate
0,11 Com b,2,1,Radial Velocity,2007,Xinglong Station,326.03,,6165.6,93.1846,2014-05-14
1,11 Com b,2,1,Radial Velocity,2007,Xinglong Station,,,5434.7,93.1846,2014-07-23
2,11 Com b,2,1,Radial Velocity,2007,Xinglong Station,323.21,,4914.89849,93.1846,2023-09-19
3,11 UMi b,1,1,Radial Velocity,2009,Thueringer Landessternwarte Tautenburg,516.21997,,4684.8142,125.321,2018-09-06
4,11 UMi b,1,1,Radial Velocity,2009,Thueringer Landessternwarte Tautenburg,516.22,,3337.07,125.321,2014-05-14


In [12]:
# Renames columns to make them easier to understand
planetSort.columns = ['Planet Name', 'Number of Stars', 'Number of Planets', 'Discovery Method', 'Discovery Year', 'Discovery Facility', 'Orbital Period (days)', 'Planet Radius (Earth Radius)', 'Planet Mass', 'Distance From Earth', 'Release Date']
planetSort.head()

Unnamed: 0,Planet Name,Number of Stars,Number of Planets,Discovery Method,Discovery Year,Discovery Facility,Orbital Period (days),Planet Radius (Earth Radius),Planet Mass,Distance From Earth,Release Date
0,11 Com b,2,1,Radial Velocity,2007,Xinglong Station,326.03,,6165.6,93.1846,2014-05-14
1,11 Com b,2,1,Radial Velocity,2007,Xinglong Station,,,5434.7,93.1846,2014-07-23
2,11 Com b,2,1,Radial Velocity,2007,Xinglong Station,323.21,,4914.89849,93.1846,2023-09-19
3,11 UMi b,1,1,Radial Velocity,2009,Thueringer Landessternwarte Tautenburg,516.21997,,4684.8142,125.321,2018-09-06
4,11 UMi b,1,1,Radial Velocity,2009,Thueringer Landessternwarte Tautenburg,516.22,,3337.07,125.321,2014-05-14


In [33]:
# Converts the last column to 'Datetime' data type, then sorts the rows so that no duplicate entries appear
planetSort['Release Date'] = pd.to_datetime(planetSort['Release Date'], format='mixed')
recentDates = planetSort.groupby('Planet Name')['Release Date'].max()
planetCleanedFinal = planetSort.merge(recentDates, on=['Planet Name', 'Release Date'])
planetCleanedFinal.drop_duplicates(subset='Planet Name', inplace=True, keep="last")
planetCleanedFinal.head()

Unnamed: 0,Planet Name,Number of Stars,Number of Planets,Discovery Method,Discovery Year,Discovery Facility,Orbital Period (days),Planet Radius (Earth Radius),Planet Mass,Distance From Earth,Release Date
0,11 Com b,2,1,Radial Velocity,2007,Xinglong Station,323.21,,4914.89849,93.1846,2023-09-19
1,11 UMi b,1,1,Radial Velocity,2009,Thueringer Landessternwarte Tautenburg,516.21997,,4684.8142,125.321,2018-09-06
2,14 And b,1,1,Radial Velocity,2008,Okayama Astrophysical Observatory,186.76,,1131.1513,75.4392,2023-09-19
3,14 Her b,1,2,Radial Velocity,2002,W. M. Keck Observatory,1765.0389,,2559.47216,17.9323,2023-06-12
4,16 Cyg B b,3,1,Radial Velocity,1996,Multiple Observatories,799.45,,556.83537,21.1397,2021-09-20
5,17 Sco b,1,1,Radial Velocity,2020,Lick Observatory,578.38,,1373.01872,124.953,2021-10-25
6,18 Del b,2,1,Radial Velocity,2008,Okayama Astrophysical Observatory,982.85,,2926.24614,76.222,2023-09-19
9,1RXS J160929.1-210524 b,1,1,Imaging,2008,Gemini Observatory,,,3000.0,139.135,2015-04-01
10,24 Boo b,1,1,Radial Velocity,2018,Okayama Astrophysical Observatory,30.33,,280.64248,95.9863,2023-09-19
11,24 Sex b,1,2,Radial Velocity,2010,Lick Observatory,452.8,,632.46,72.0691,2014-05-14


In [37]:
planetCleanedFinal.to_csv('exoplanetDataClean.csv', index=False)