In [13]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import miceforest as mf
import sklearn
from sklearn.model_selection import train_test_split


In [14]:
# Read the labelled dataset: 'hazardous' is the target, every other column is a feature.
data = pd.read_csv('train.csv')
Y = data['hazardous']
X = data.drop(columns=['hazardous'])

# Hold out 20% of the rows; the split is stratified on the target and seeded.
X_train, X_test, y_train, y_test = train_test_split(
    X, Y, test_size=0.2, random_state=42, stratify=Y
)

# Show (features, target) shapes for the full data, the training part and the test part.
for features_part, target_part in ((X, Y), (X_train, y_train), (X_test, y_test)):
    print(features_part.shape, target_part.shape)

(4034, 23) (4034,)
(3227, 23) (3227,)
(807, 23) (807,)


In [15]:
def print_missing(data):
    """Print a per-column missing-value report for ``data``.

    The report has one row per column of ``data`` holding the number of null
    entries, the share of null entries in percent and the column dtype. Rows
    are ordered from the highest to the lowest percentage.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame to inspect; it is not modified.

    Returns
    -------
    int
        Always ``0``.
    """
    null_mask = data.isnull()

    # One row per column of `data`, most incomplete columns first.
    report = pd.DataFrame({
        'Missing Count': null_mask.sum(),
        'Percentage (%)': null_mask.mean() * 100,
        'dtype': data.dtypes,
    })
    report = report.sort_values(by='Percentage (%)', ascending=False)

    print(report)

    return 0

In [16]:
# Columns removed from both feature sets before looking at missingness.
# NOTE(review): these look like unit-converted duplicates of columns that are kept - confirm.
columns_to_drop_ = [
    'miss_dist_miles',
    'miss_dist_kilometers',
    'miss_dist_lunar',
    'relative_velocity_km_per_hr',
    'relative_velocity_km_per_sec',
]

# X_train / X_test are reassigned from themselves, so a second run of this cell
# sees frames that no longer contain these columns. errors='ignore' turns that
# re-run into a no-op instead of a KeyError.
X_train = X_train.drop(columns=columns_to_drop_, errors='ignore')
X_test = X_test.drop(columns=columns_to_drop_, errors='ignore')

missing_counts_1 = X_train.isnull().sum()
missing_percentage_1 = (X_train.isnull().mean() * 100)

# Combine into a summary table for easy viewing, most incomplete columns first
missing_summary = pd.DataFrame({
    'Missing Count': missing_counts_1,
    'Percentage (%)': missing_percentage_1,
    'dtype': X_train.dtypes
}).sort_values(by='Percentage (%)', ascending=False)

print(missing_summary)

                             Missing Count  Percentage (%)    dtype
orbit_uncertainity                    1261       39.076542   object
jupiter_tisserand_invariant           1236       38.301828  float64
epoch_osculation                      1102       34.149365  float64
perihelion_time                       1090       33.777502  float64
approach_month                        1090       33.777502  float64
mean_motion                           1083       33.560583  float64
epoch_date_close_approach              885       27.424853  float64
semi_major_axis                        843       26.123334  float64
perihelion_arg                         803       24.883793  float64
asc_node_longitude                     768       23.799194  float64
mean_anomaly                           643       19.925628  float64
miles_per_hour                         598       18.531143  float64
approach_year                          589       18.252247  float64
aphelion_dist                          586      

In [17]:
# Physical constants (SI units) used by the Kepler's-third-law conversions below.
AU = 1.496e11      # metres in one astronomical unit
G = 6.67430e-11    # gravitational constant, m^3 kg^-1 s^-2
M_SUN = 1.989e30   # mass of the Sun, kg
MPH_to_KPS = 2236.94  # NOTE(review): not referenced in this cell; looks like miles-per-hour per km/s - confirm


def _nullify_negatives(da, columns):
    """Replace negative entries of the listed columns of ``da`` with NaN, in place.

    Columns that are not present in ``da`` are skipped. One line is printed
    for every column in which at least one value was replaced.
    """
    for col in columns:
        if col in da.columns:
            neg_mask = da[col] < 0
            count = neg_mask.sum()
            if count > 0:
                da.loc[neg_mask, col] = np.nan
                print(f"Set {count} negative values to NaN in {col}")


def imputations(input_df):
    """Fill orbital columns from each other using closed-form relations.

    Works on a copy of ``input_df`` and performs, in order:

    1. ``orbit_uncertainity``: missing entries become the string ``'Unknown'``.
    2. Negative values in ``mean_motion``, ``semi_major_axis``,
       ``aphelion_dist`` and ``eccentricity`` (when present) are set to NaN
       *before* anything is derived from them, so an invalid value is never
       used as the source of an imputed one.
    3. Kepler's third law, ``n = sqrt(G * M_SUN / a**3)``: missing
       ``mean_motion`` is computed from ``semi_major_axis`` and vice versa.
       The code treats ``semi_major_axis`` as AU (it is multiplied by ``AU``)
       and ``mean_motion`` as degrees per day (rad/s * 180/pi * 86400).
    4. ``eccentricity = aphelion_dist / semi_major_axis - 1`` wherever both
       inputs are available; the column is created if it does not exist and
       existing values on those rows are overwritten.
    5. Missing ``aphelion_dist`` is filled with
       ``semi_major_axis * (1 + eccentricity)`` where both are known.
    6. The negative-value check is repeated so derived values are covered too.

    Only strictly positive ``semi_major_axis`` / ``mean_motion`` values are
    used as inputs: a zero would otherwise produce an infinite result that the
    negative-value check cannot catch.

    Parameters
    ----------
    input_df : pandas.DataFrame
        Must contain ``orbit_uncertainity``, ``mean_motion``,
        ``semi_major_axis`` and ``aphelion_dist``. It is not modified.

    Returns
    -------
    pandas.DataFrame
        The imputed copy.
    """
    # 1. Work on a copy so the caller's frame is left untouched
    da = input_df.copy()

    # 2. Fill categorical missing values
    da['orbit_uncertainity'] = da['orbit_uncertainity'].fillna('Unknown')

    cols_to_check = ['mean_motion', 'semi_major_axis', 'aphelion_dist', 'eccentricity']

    # Drop impossible values first so they cannot seed the derivations below.
    _nullify_negatives(da, cols_to_check)

    # --- Kepler's 3rd Law Imputations ---

    # Fill mean_motion (comparisons with NaN are False, so `> 0` also implies "not missing")
    m1 = da['mean_motion'].isna() & (da['semi_major_axis'] > 0)
    if m1.any():
        a_m = da.loc[m1, 'semi_major_axis'] * AU
        n_rad_per_sec = np.sqrt(G * M_SUN / a_m**3)
        da.loc[m1, 'mean_motion'] = n_rad_per_sec * (180/np.pi) * 86400
        print(f"Filled {m1.sum()} mean_motion from semi_major_axis")

    # Fill semi_major_axis
    m2 = da['semi_major_axis'].isna() & (da['mean_motion'] > 0)
    if m2.any():
        n_rad_per_sec = da.loc[m2, 'mean_motion'] * (np.pi/180) / 86400
        a_m = (G * M_SUN / n_rad_per_sec**2)**(1/3)
        da.loc[m2, 'semi_major_axis'] = a_m / AU
        print(f"Filled {m2.sum()} semi_major_axis from mean_motion")

    # --- Eccentricity and Aphelion ---

    m3 = da['aphelion_dist'].notna() & (da['semi_major_axis'] > 0)
    if m3.any():
        # Ensure column exists before calculation
        if 'eccentricity' not in da.columns:
            da['eccentricity'] = np.nan
        da.loc[m3, 'eccentricity'] = (da.loc[m3, 'aphelion_dist'] /
                                      da.loc[m3, 'semi_major_axis']) - 1
        print(f"Calculated {m3.sum()} eccentricity values")

    if 'eccentricity' in da.columns:
        m4 = (da['aphelion_dist'].isna() &
              da['semi_major_axis'].notna() &
              da['eccentricity'].notna())
        if m4.any():
            da.loc[m4, 'aphelion_dist'] = (da.loc[m4, 'semi_major_axis'] *
                                          (1 + da.loc[m4, 'eccentricity']))
            print(f"Filled {m4.sum()} aphelion_dist")

    # Derived values (e.g. eccentricity from inconsistent inputs) can be negative too.
    _nullify_negatives(da, cols_to_check)

    return da

# Apply the formula-based imputations to the training features.
z = imputations(X_train)

# Trailing ';' keeps the helper's constant 0 return value out of the cell output.
print_missing(z);

Filled 792 mean_motion from semi_major_axis
Filled 552 semi_major_axis from mean_motion
Calculated 2402 eccentricity values
                             Missing Count  Percentage (%)    dtype
jupiter_tisserand_invariant           1236       38.301828  float64
epoch_osculation                      1102       34.149365  float64
perihelion_time                       1090       33.777502  float64
approach_month                        1090       33.777502  float64
epoch_date_close_approach              885       27.424853  float64
eccentricity                           825       25.565541  float64
perihelion_arg                         803       24.883793  float64
asc_node_longitude                     768       23.799194  float64
mean_anomaly                           643       19.925628  float64
miles_per_hour                         598       18.531143  float64
approach_year                          589       18.252247  float64
aphelion_dist                          586       18.159281  

0

In [18]:
def fill_temporal_features_v2(da):
    """Cross-fill the close-approach timestamp and its year/month/day parts.

    ``epoch_date_close_approach`` is interpreted as milliseconds since the
    Unix epoch. Two passes are made over a copy of ``da``:

    1. Rows that have a timestamp get any missing ``approach_year``,
       ``approach_month`` or ``approach_day`` extracted from it.
    2. Rows that lack a timestamp but have all three components get the
       timestamp rebuilt from them.

    Component combinations that are not a real calendar date (for example
    30 February) cannot be converted; their timestamp is left as NaN so a
    later fallback can handle it, instead of being filled with a meaningless
    number.

    Parameters
    ----------
    da : pandas.DataFrame
        Must contain ``epoch_date_close_approach``, ``approach_year``,
        ``approach_month`` and ``approach_day``. It is not modified.

    Returns
    -------
    pandas.DataFrame
        The cross-filled copy.
    """
    da = da.copy()

    # Strategy 1: Extract from timestamp wherever timestamp exists
    has_ts = da['epoch_date_close_approach'].notna()

    if has_ts.sum() > 0:
        # Convert all timestamps to datetime (unparseable values become NaT)
        all_dates = pd.to_datetime(da['epoch_date_close_approach'],
                                   unit='ms', errors='coerce')

        # Same rule for each component: fill only where it is missing
        for column, part in (('approach_year', 'year'),
                             ('approach_month', 'month'),
                             ('approach_day', 'day')):
            to_fill = has_ts & da[column].isna()
            da.loc[to_fill, column] = getattr(all_dates[to_fill].dt, part)

        print(f"Extracted components from {has_ts.sum()} timestamps")

    # Strategy 2: Create timestamp from components
    has_components = (da['approach_year'].notna() &
                     da['approach_month'].notna() &
                     da['approach_day'].notna())
    needs_ts = da['epoch_date_close_approach'].isna() & has_components

    if needs_ts.sum() > 0:
        # Build datetime; impossible dates are coerced to NaT
        temp_df = pd.DataFrame({
            'year': da.loc[needs_ts, 'approach_year'].astype(int),
            'month': da.loc[needs_ts, 'approach_month'].astype(int),
            'day': da.loc[needs_ts, 'approach_day'].astype(int)
        })
        new_dates = pd.to_datetime(temp_df, errors='coerce')

        # Epoch arithmetic instead of `.astype(int) / 10**6`: NaT becomes NaN
        # (not the int64 sentinel), and the result does not depend on the
        # platform's default integer width or on the datetime resolution.
        epoch_ms = (new_dates - pd.Timestamp("1970-01-01")) / pd.Timedelta(milliseconds=1)
        da.loc[needs_ts, 'epoch_date_close_approach'] = epoch_ms

        created = int(epoch_ms.notna().sum())
        print(f"Created {created} timestamps from components")

        invalid = int(needs_ts.sum()) - created
        if invalid:
            print(f"Left {invalid} timestamps missing: components do not form a valid date")

    return da

# Cross-fill the close-approach timestamp and its year/month/day components.
z = fill_temporal_features_v2(z)
# Trailing ';' keeps the helper's constant 0 return value out of the cell output.
print_missing(z);

Extracted components from 2342 timestamps
Created 415 timestamps from components
                             Missing Count  Percentage (%)    dtype
jupiter_tisserand_invariant           1236       38.301828  float64
epoch_osculation                      1102       34.149365  float64
perihelion_time                       1090       33.777502  float64
eccentricity                           825       25.565541  float64
perihelion_arg                         803       24.883793  float64
asc_node_longitude                     768       23.799194  float64
mean_anomaly                           643       19.925628  float64
miles_per_hour                         598       18.531143  float64
aphelion_dist                          586       18.159281  float64
epoch_date_close_approach              470       14.564611  float64
miss_dist_astronomical                 433       13.418035  float64
orbital_period                         381       11.806632   object
approach_month                     

0

In [19]:
def simple_temporal_imputation(da):
    """
    Fill remaining temporal features with simple statistical imputation.

    Each statistic is computed from the observed values of the same column
    of ``da``:

    - approach_month: mode (most common month)
    - approach_day: median
    - approach_year: median
    - epoch_osculation: median

    A column without any observed value has nothing to compute a statistic
    from; it is left unchanged and a notice is printed instead of raising.

    Parameters
    ----------
    da : pandas.DataFrame
        Must contain the four columns listed above. It is not modified.

    Returns
    -------
    pandas.DataFrame
        The imputed copy.
    """
    da = da.copy()

    def most_common(series):
        # Series.mode() is empty when every value is missing
        modes = series.mode()
        return modes.iloc[0] if len(modes) else np.nan

    def as_whole_number(value):
        return f"{int(value)}"

    def with_two_decimals(value):
        return f"{value:.2f}"

    # (column, statistic name used in the log line, statistic, log formatter)
    plan = (
        ('approach_month', 'mode', most_common, as_whole_number),
        ('approach_day', 'median', pd.Series.median, as_whole_number),
        ('approach_year', 'median', pd.Series.median, as_whole_number),
        ('epoch_osculation', 'median', pd.Series.median, with_two_decimals),
    )

    for column, stat_name, statistic, render in plan:
        count_before = da[column].isna().sum()
        if count_before == 0:
            continue

        fill_value = statistic(da[column])
        if pd.isna(fill_value):
            print(f"Skipped {column}: no observed values to compute the {stat_name} from")
            continue

        da[column] = da[column].fillna(fill_value)
        print(f"âœ“ Filled {count_before} {column} values with {stat_name}: {render(fill_value)}")

    print(f"\nâœ… Temporal features imputation complete!")

    return da
def sync_timestamps_final(da):
    """
    Create timestamps for any remaining rows that have complete components.
    Call this AFTER simple_temporal_imputation.

    ``epoch_date_close_approach`` is written as milliseconds since the Unix
    epoch, built from ``approach_year`` / ``approach_month`` /
    ``approach_day``. Rows whose components do not form a real calendar date
    (for example an imputed month of 2 combined with day 30) keep a missing
    timestamp, so a later fallback can still detect and fill them.

    Works on a copy of ``da``. Any exception raised while building the
    timestamps is reported with a printed warning and the copy is returned
    as it stands at that point.
    """
    da = da.copy()

    # Find rows with components but missing timestamp
    has_components = (da['approach_year'].notna() &
                     da['approach_month'].notna() &
                     da['approach_day'].notna())
    needs_ts = da['epoch_date_close_approach'].isna() & has_components

    if needs_ts.sum() > 0:
        try:
            temp_df = pd.DataFrame({
                'year': da.loc[needs_ts, 'approach_year'].astype(int),
                'month': da.loc[needs_ts, 'approach_month'].astype(int),
                'day': da.loc[needs_ts, 'approach_day'].astype(int)
            })

            # Impossible dates are coerced to NaT
            new_dates = pd.to_datetime(temp_df, errors='coerce')

            # Epoch arithmetic instead of `.astype(int) / 10**6`: NaT becomes
            # NaN (not the int64 sentinel), and the result does not depend on
            # the platform's default integer width or the datetime resolution.
            epoch_ms = (new_dates - pd.Timestamp("1970-01-01")) / pd.Timedelta(milliseconds=1)
            da.loc[needs_ts, 'epoch_date_close_approach'] = epoch_ms

            created = int(epoch_ms.notna().sum())
            print(f"âœ“ Created {created} timestamps from imputed components")

            invalid = int(needs_ts.sum()) - created
            if invalid:
                print(f"Left {invalid} timestamps missing: components do not form a valid date")
        except Exception as e:
            # Deliberately best-effort: report and carry on with what we have
            print(f"âš  Warning: Could not create some timestamps - {e}")

    return da

# Statistical fallback for the temporal columns that cross-filling could not complete.
z_v3 = simple_temporal_imputation(z)

# Count what is still missing in each temporal column.
temporal_cols = [
    'approach_month',
    'approach_day',
    'approach_year',
    'epoch_osculation',
    'epoch_date_close_approach',
]
remaining_missing = z_v3[temporal_cols].isna().sum()
print("\nRemaining missing in temporal features:")
print(remaining_missing)

âœ“ Filled 316 approach_month values with mode: 2
âœ“ Filled 120 approach_day values with median: 15
âœ“ Filled 153 approach_year values with median: 2008
âœ“ Filled 1102 epoch_osculation values with median: 2458000.50

âœ… Temporal features imputation complete!

Remaining missing in temporal features:
approach_month                 0
approach_day                   0
approach_year                  0
epoch_osculation               0
epoch_date_close_approach    470
dtype: int64


In [20]:
def fill_remaining_timestamps(da):
    """
    Median fallback for ``epoch_date_close_approach``.

    Returns a copy of ``da`` in which every missing
    ``epoch_date_close_approach`` value is replaced by the median of that
    column; one summary line is printed when anything was missing. Intended
    as the last step after sync_timestamps_final.
    """
    result = da.copy()
    timestamps = result['epoch_date_close_approach']

    n_missing = timestamps.isna().sum()
    if n_missing == 0:
        # Nothing to do: hand back the untouched copy
        return result

    fallback = timestamps.median()
    result['epoch_date_close_approach'] = timestamps.fillna(fallback)
    print(f"âœ“ Filled {n_missing} epoch_date_close_approach values with median: {fallback:.2f}")

    return result

# Rebuild timestamps from the (now complete) components first, then fall back
# to the column median for anything that is still missing.
z_v4 = fill_remaining_timestamps(sync_timestamps_final(z_v3))

# Final verification: per-column count of missing values in the temporal features
banner_rule = "=" * 60
final_temporal_cols = [
    'epoch_date_close_approach',
    'approach_month',
    'approach_day',
    'approach_year',
    'epoch_osculation',
]
print("\n" + banner_rule)
print("ðŸŽ¯ FINAL TEMPORAL FEATURES STATUS")
print(banner_rule)
print(z_v4[final_temporal_cols].isna().sum())
print("\nâœ… Should be all zeros!")

âœ“ Created 470 timestamps from imputed components

ðŸŽ¯ FINAL TEMPORAL FEATURES STATUS
epoch_date_close_approach    0
approach_month               0
approach_day                 0
approach_year                0
epoch_osculation             0
dtype: int64

âœ… Should be all zeros!
