# PRE-PROCESSING PIPELINE

In [1]:
import pandas as pd
import shutil
import os
from sklearn.preprocessing import RobustScaler
from IPython.display import display 

  from pandas.core import (


##  DATA LOADING AND INITIAL INSPECTION - CSV FROM KEPLER MISSION DATA

In [2]:
# Read the Kepler CSV export; comment="#" makes pandas skip the '#'-prefixed header lines
df = pd.read_csv(filepath_or_buffer="kepler.csv", comment="#")
# Render the loaded frame as a rich table (long frames are shown truncated by the notebook)
display(df)

Unnamed: 0,rowid,kepid,kepoi_name,kepler_name,koi_disposition,koi_vet_stat,koi_vet_date,koi_pdisposition,koi_score,koi_fpflag_nt,...,koi_dicco_mdec,koi_dicco_mdec_err,koi_dicco_msky,koi_dicco_msky_err,koi_dikco_mra,koi_dikco_mra_err,koi_dikco_mdec,koi_dikco_mdec_err,koi_dikco_msky,koi_dikco_msky_err
0,1,10797460,K00752.01,Kepler-227 b,CONFIRMED,Done,2018-08-16,CANDIDATE,1.000,0,...,0.200,0.160,0.200,0.170,0.080,0.130,0.310,0.170,0.320,0.160
1,2,10797460,K00752.02,Kepler-227 c,CONFIRMED,Done,2018-08-16,CANDIDATE,0.969,0,...,0.000,0.480,0.390,0.360,0.490,0.340,0.120,0.730,0.500,0.450
2,3,10811496,K00753.01,,CANDIDATE,Done,2018-08-16,CANDIDATE,0.000,0,...,-0.034,0.070,0.042,0.072,0.002,0.071,-0.027,0.074,0.027,0.074
3,4,10848459,K00754.01,,FALSE POSITIVE,Done,2018-08-16,FALSE POSITIVE,0.000,0,...,0.147,0.078,0.289,0.079,-0.257,0.072,0.099,0.077,0.276,0.076
4,5,10854555,K00755.01,Kepler-664 b,CONFIRMED,Done,2018-08-16,CANDIDATE,1.000,0,...,-0.090,0.180,0.100,0.140,0.070,0.180,0.020,0.160,0.070,0.200
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
9559,9560,10090151,K07985.01,,FALSE POSITIVE,Done,2018-08-16,FALSE POSITIVE,0.000,0,...,-1.757,0.068,2.763,0.074,2.344,0.072,-1.756,0.068,2.929,0.072
9560,9561,10128825,K07986.01,,CANDIDATE,Done,2018-08-16,CANDIDATE,0.497,0,...,-0.250,0.490,0.780,0.460,0.500,0.400,-0.180,0.470,0.530,0.470
9561,9562,10147276,K07987.01,,FALSE POSITIVE,Done,2018-08-16,FALSE POSITIVE,0.021,0,...,-3.650,0.260,5.000,0.220,3.380,0.160,-3.890,0.260,5.160,0.220
9562,9563,10155286,K07988.01,,CANDIDATE,Done,2018-08-16,CANDIDATE,0.092,0,...,1.320,0.670,1.690,0.530,1.450,0.110,1.370,0.660,2.000,0.460


In [3]:
# Show the count of missing (null) values for each column.
# display() is needed here: a bare expression is only rendered when it is the
# last statement of a cell, and this one is followed by an assignment.
display(df.isnull().sum())

# Identify columns with the 'object' data type (typically strings).
# The variable name is kept unchanged because the feature-selection cell reads it.
object_typer_columns = df.select_dtypes(include="object").columns

## FEATURE SELECTION AND CLEANING

In [4]:
# Build the list of 'object' (string) columns to drop, keeping 'koi_disposition' (the target)
columns_to_drop = [col for col in object_typer_columns if col != "koi_disposition"]

# Drop the non-target object columns.
# errors="ignore" keeps the cell re-runnable: on a second run these columns are
# already gone and a plain drop() would raise KeyError.
df = df.drop(columns=columns_to_drop, errors="ignore")

print("Removed columns:", columns_to_drop)
print("Remaining columns:", df.columns)

# Drop columns where ALL values are NaN: they carry no information and have no
# mean to impute with, so they would otherwise stay entirely NaN.
df = df.dropna(how='all', axis=1)

# Impute missing values (NaN) in numerical columns with the mean of that column.
# Passing a Series of per-column means to fillna() fills only the columns named
# in it, so non-numeric columns (the target) are left untouched.
numeric_columns = df.select_dtypes(include=['float64', 'int64']).columns
df = df.fillna(df[numeric_columns].mean())

# Verify that the missing values have been handled (shown explicitly, because a
# bare expression in the middle of a cell produces no output)
display(df.isnull().sum())

# Drop identification columns that are not useful for training the model
# (errors="ignore" for the same re-run safety as above)
df = df.drop(columns=["rowid", "kepid"], errors="ignore")


Removed columns: ['kepoi_name', 'kepler_name', 'koi_vet_stat', 'koi_vet_date', 'koi_pdisposition', 'koi_disp_prov', 'koi_comment', 'koi_fittype', 'koi_limbdark_mod', 'koi_parm_prov', 'koi_tce_delivname', 'koi_quarters', 'koi_trans_mod', 'koi_datalink_dvr', 'koi_datalink_dvs', 'koi_sparprov']
Remaining columns: Index(['rowid', 'kepid', 'koi_disposition', 'koi_score', 'koi_fpflag_nt',
       'koi_fpflag_ss', 'koi_fpflag_co', 'koi_fpflag_ec', 'koi_period',
       'koi_period_err1',
       ...
       'koi_dicco_mdec', 'koi_dicco_mdec_err', 'koi_dicco_msky',
       'koi_dicco_msky_err', 'koi_dikco_mra', 'koi_dikco_mra_err',
       'koi_dikco_mdec', 'koi_dikco_mdec_err', 'koi_dikco_msky',
       'koi_dikco_msky_err'],
      dtype='object', length=125)


## SCALING AND NORMALIZATION (RobustScaler) 

In [5]:
# Separate features (X) and target (y)
X = df.drop(columns=["koi_disposition"])
y = df["koi_disposition"]

# Initialize the RobustScaler (good for handling outliers)
scaler = RobustScaler()

# Fit the scaler on the features and wrap the resulting NumPy array back into a
# DataFrame. Column names AND the original index are preserved: without
# index=X.index the frame would get a fresh 0..n-1 index and the target assigned
# below would be aligned against the wrong rows whenever df's index is not the
# default one.
X_scaled = pd.DataFrame(scaler.fit_transform(X), columns=X.columns, index=X.index)

# Build the normalized frame as a COPY so that attaching the target does not
# also add a 'koi_disposition' column to X_scaled (features only).
df_normalizado = X_scaled.copy()
# Reattach the original target column 'koi_disposition' to the normalized features
df_normalizado["koi_disposition"] = y

# Show the head of the normalized DataFrame (display() is required because this
# is not the last expression of the cell)
display(df_normalizado.head())

# Descriptive statistics of the original (unscaled) DataFrame, for comparison
df.describe()


Unnamed: 0,koi_score,koi_fpflag_nt,koi_fpflag_ss,koi_fpflag_co,koi_fpflag_ec,koi_period,koi_period_err1,koi_period_err2,koi_time0bk,koi_time0bk_err1,...,koi_dicco_mdec,koi_dicco_mdec_err,koi_dicco_msky,koi_dicco_msky_err,koi_dikco_mra,koi_dikco_mra_err,koi_dikco_mdec,koi_dikco_mdec_err,koi_dikco_msky,koi_dikco_msky_err
count,9564.0,9564.0,9564.0,9564.0,9564.0,9564.0,9564.0,9564.0,9564.0,9564.0,...,9564.0,9564.0,9564.0,9564.0,9564.0,9564.0,9564.0,9564.0,9564.0,9564.0
mean,0.480829,0.208595,0.232748,0.197512,0.120033,75.671358,0.002148,-0.002148,166.183251,0.009933,...,-0.04542,0.446229,1.866561,0.48993,-0.024244,0.425337,-0.076749,0.436684,1.812566,0.47606
std,0.437658,4.76729,0.422605,0.398142,0.325018,1334.744046,0.008038,0.008038,67.91896,0.022542,...,2.491654,0.551795,2.893625,0.625831,2.310197,0.583929,2.476481,0.550362,2.896008,0.627991
min,0.0,0.0,0.0,0.0,0.0,0.241843,0.0,-0.1725,120.515914,9e-06,...,-75.9,0.067,0.0,0.067,-27.8,0.067,-76.6,0.067,0.0,0.067
25%,0.0,0.0,0.0,0.0,0.0,2.733684,6e-06,-0.000422,132.761718,0.001338,...,-0.31,0.11,0.19,0.11,-0.27,0.093,-0.33475,0.097,0.22775,0.1
50%,0.480829,0.0,0.0,0.0,0.0,9.752831,4.2e-05,-4.2e-05,137.224595,0.00458,...,-0.01,0.31,0.72,0.35,-0.024,0.28,-0.043,0.3,0.68,0.33
75%,0.995,0.0,0.0,0.0,0.0,40.715178,0.000422,-6e-06,170.694603,0.01,...,0.26,0.58,1.97,0.65,0.2475,0.57,0.26,0.57,1.812566,0.63
max,1.0,465.0,1.0,1.0,1.0,129995.7784,0.1725,0.0,1472.522306,0.569,...,27.5,22.0,88.6,32.0,46.57,33.0,34.0,22.0,89.6,32.0


## TARGET ENCODING

In [6]:
# Define the mapping for converting string labels to numerical labels
map_disposition = {
    "FALSE POSITIVE": 0,
    "CANDIDATE": 1,
    "CONFIRMED": 2
}

# Encode only while the column still holds the raw labels. Series.map() returns
# NaN for every value that is not a key of the mapping, so re-running this cell
# on the already-encoded integers would wipe out the whole target column.
if not pd.api.types.is_numeric_dtype(df["koi_disposition"]):
    # Fail loudly on labels the mapping does not cover instead of silently
    # turning them into NaN targets.
    unknown_labels = sorted(
        set(df["koi_disposition"].dropna().unique()) - set(map_disposition),
        key=str,
    )
    if unknown_labels:
        raise ValueError(f"Unmapped koi_disposition labels: {unknown_labels}")

    # Apply the mapping to the target column
    df["koi_disposition"] = df["koi_disposition"].map(map_disposition)

# Display the preprocessed DataFrame with the encoded target
display(df)

Unnamed: 0,koi_disposition,koi_score,koi_fpflag_nt,koi_fpflag_ss,koi_fpflag_co,koi_fpflag_ec,koi_period,koi_period_err1,koi_period_err2,koi_time0bk,...,koi_dicco_mdec,koi_dicco_mdec_err,koi_dicco_msky,koi_dicco_msky_err,koi_dikco_mra,koi_dikco_mra_err,koi_dikco_mdec,koi_dikco_mdec_err,koi_dikco_msky,koi_dikco_msky_err
0,2,1.000,0,0,0,0,9.488036,2.775000e-05,-2.775000e-05,170.538750,...,0.200,0.160,0.200,0.170,0.080,0.130,0.310,0.170,0.320,0.160
1,2,0.969,0,0,0,0,54.418383,2.479000e-04,-2.479000e-04,162.513840,...,0.000,0.480,0.390,0.360,0.490,0.340,0.120,0.730,0.500,0.450
2,1,0.000,0,0,0,0,19.899140,1.494000e-05,-1.494000e-05,175.850252,...,-0.034,0.070,0.042,0.072,0.002,0.071,-0.027,0.074,0.027,0.074
3,0,0.000,0,1,0,0,1.736952,2.630000e-07,-2.630000e-07,170.307565,...,0.147,0.078,0.289,0.079,-0.257,0.072,0.099,0.077,0.276,0.076
4,2,1.000,0,0,0,0,2.525592,3.761000e-06,-3.761000e-06,171.595550,...,-0.090,0.180,0.100,0.140,0.070,0.180,0.020,0.160,0.070,0.200
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
9559,0,0.000,0,1,1,0,0.527699,1.160000e-07,-1.160000e-07,131.705093,...,-1.757,0.068,2.763,0.074,2.344,0.072,-1.756,0.068,2.929,0.072
9560,1,0.497,0,0,0,0,1.739849,1.780000e-05,-1.780000e-05,133.001270,...,-0.250,0.490,0.780,0.460,0.500,0.400,-0.180,0.470,0.530,0.470
9561,0,0.021,0,0,1,0,0.681402,2.434000e-06,-2.434000e-06,132.181750,...,-3.650,0.260,5.000,0.220,3.380,0.160,-3.890,0.260,5.160,0.220
9562,1,0.092,0,0,0,0,333.486169,4.235000e-03,-4.235000e-03,153.615010,...,1.320,0.670,1.690,0.530,1.450,0.110,1.370,0.660,2.000,0.460


## FILE EXPORT AND MOVEMENT

In [7]:

# 1. BUILD THE FRAME TO EXPORT

# The file is named 'normalized_data.csv', so it must contain the RobustScaler
# output held in df_normalizado; df itself still holds the UNSCALED features.
# The target is taken from df, where it has been encoded to integers, and the
# columns are put back in df's order so the CSV layout stays the same.
df_export = df_normalizado.assign(koi_disposition=df["koi_disposition"])[df.columns]

# 2. DEFINE FILE AND DIRECTORY NAMES

# Name of the exported file (note the .csv extension)
file_to_move = "normalized_data.csv"

# The target subdirectory name
target_directory = "processed"

# Full destination path: directory name joined with the file name
destination_path = os.path.join(target_directory, file_to_move)

# 3. CREATE DIRECTORY IF IT DOES NOT EXIST
if not os.path.isdir(target_directory):
    # os.makedirs creates all intermediate directories if they don't exist
    os.makedirs(target_directory, exist_ok=True)
    print(f"Directory '{target_directory}' created.")

# 4. WRITE THE CSV DIRECTLY TO ITS FINAL LOCATION
# Writing straight to the destination avoids the temporary copy in the current
# directory and the separate move step. Errors are allowed to propagate so that
# a failed export stops the notebook instead of being reduced to a printed line.
df_export.to_csv(destination_path, index=False)
print(f"File '{file_to_move}' successfully saved to '{target_directory}/'.")

Directory 'processed' created.
File 'normalized_data.csv' successfully moved to 'processed/'.
