<a href="https://colab.research.google.com/github/ashleenorville/ashleenorville/blob/main/Korea-ggumim.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [11]:
# Import necessary libraries
import os
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler

# Locations used by the pipeline: raw input, log of rejected rows, cleaned output
output_folder = "/content/outputs"
input_file = "/content/Korea-ggumim.csv"
garbage_file = "/content/garbage.csv"
output_file = output_folder + "/cleaned_data.csv"

# Module-level accumulator that the cleaning functions append discarded rows to;
# it starts out empty.
garbage = pd.DataFrame()


In [3]:
# Function to load data
def load_data(filepath):
    """Read a CSV file into a DataFrame and report its shape.

    Args:
        filepath: Path of the CSV file to read.

    Returns:
        The loaded ``pandas.DataFrame``.

    Raises:
        FileNotFoundError: If ``filepath`` does not exist. A message is printed
            first and the original exception is re-raised.
    """
    try:
        # low_memory=False makes pandas infer each column's dtype from the whole
        # file instead of chunk by chunk, which avoids the mixed-dtype warning
        # that chunked parsing emits on large files.
        df = pd.read_csv(filepath, low_memory=False)
    except FileNotFoundError:
        print(f"Error: File not found at {filepath}")
        raise
    print(f"Data loaded successfully with shape {df.shape}")
    return df


In [4]:
# Function to handle missing values
def handle_missing_values(df):
    """Drop every row that contains at least one missing value.

    The dropped rows are appended to the module-level ``garbage`` frame. The
    input frame is left untouched; a new filtered frame is returned.

    Args:
        df: Frame to clean.

    Returns:
        A new DataFrame holding only the rows of ``df`` with no missing values.
    """
    global garbage
    # Compute the row mask once and reuse it for both the log and the result.
    has_missing = df.isnull().any(axis=1)
    garbage = pd.concat([garbage, df[has_missing]])  # Store rows with missing values
    df = df[~has_missing]
    print(f"Missing values handled. Remaining rows: {df.shape[0]}")
    return df


In [5]:
# Function to remove duplicates
def remove_duplicates(df):
    """Remove repeated rows, keeping the first occurrence of each.

    The repeated rows are appended to the module-level ``garbage`` frame. The
    input frame is left untouched; a new filtered frame is returned.

    Args:
        df: Frame to de-duplicate.

    Returns:
        A new DataFrame without the repeated rows.
    """
    global garbage
    # duplicated() marks every repeat after the first occurrence, so ~mask is
    # the same selection drop_duplicates() would keep; scan the frame only once.
    is_repeat = df.duplicated()
    garbage = pd.concat([garbage, df[is_repeat]])  # Store duplicate rows
    df = df[~is_repeat]
    print(f"Duplicates removed. Remaining rows: {df.shape[0]}")
    return df


In [6]:
# Function to convert data types
def convert_data_types(df):
    """Parse the ``date`` column to datetimes and drop rows that fail to parse.

    Frames without a ``date`` column are returned as-is. Rejected rows are
    appended to the module-level ``garbage`` frame with their original
    (unparsed) ``date`` value, so the log shows what was actually rejected.
    The input frame is not modified.

    Args:
        df: Frame to convert.

    Returns:
        ``df`` itself when there is no ``date`` column or parsing fails
        outright; otherwise a new DataFrame whose ``date`` column is datetime
        typed and which contains only the rows with a parseable date.
    """
    global garbage
    if 'date' not in df.columns:
        return df

    try:
        # errors='coerce' turns unparseable entries into NaT instead of raising.
        parsed = pd.to_datetime(df['date'], errors='coerce')
    except Exception as e:
        # Best-effort step: report the problem and hand back the frame unchanged.
        print(f"Error while converting date column: {e}")
        return df

    unparseable = parsed.isna()
    garbage = pd.concat([garbage, df[unparseable]])  # Store invalid date rows
    # assign() builds a new frame, so the caller's column is never overwritten.
    df = df.assign(date=parsed)[~unparseable]
    print("Date conversion completed.")
    return df


In [7]:
# Function to remove outliers
def remove_outliers(df, iqr_multiplier=1.5):
    """Drop rows whose numeric values fall outside the IQR fences.

    For each numeric column in turn, rows outside
    ``[Q1 - iqr_multiplier * IQR, Q3 + iqr_multiplier * IQR]`` are removed.
    Columns are processed sequentially, so each column's quartiles are computed
    on the rows that survived the previous columns.

    Every removed row is appended to the module-level ``garbage`` frame. That
    includes rows with a missing value in a numeric column: they cannot satisfy
    the range check and are therefore dropped as well.

    Args:
        df: Frame to filter. It is not modified.
        iqr_multiplier: Fence width as a multiple of the IQR (default 1.5).

    Returns:
        A new DataFrame containing only the rows inside the fences of every
        numeric column.
    """
    global garbage
    numerical_cols = df.select_dtypes(include=[np.number]).columns.tolist()
    removed = []
    for col in numerical_cols:
        q1 = df[col].quantile(0.25)
        q3 = df[col].quantile(0.75)
        reach = iqr_multiplier * (q3 - q1)
        # between() is inclusive on both ends. Missing values are mapped to
        # False so that the complement below captures every dropped row.
        keep = df[col].between(q1 - reach, q3 + reach).fillna(False).astype(bool)
        removed.append(df[~keep])
        df = df[keep]  # Keep non-outliers
    # One concat for all columns instead of re-copying garbage per column.
    garbage = pd.concat([garbage, *removed])  # Store outliers
    print(f"Outliers removed. Remaining rows: {df.shape[0]}")
    return df


In [8]:
# Function to standardize data
def standardize_data(df):
    """Scale every numeric column with ``StandardScaler``.

    Args:
        df: Frame to scale. It is not modified.

    Returns:
        A copy of ``df`` with its numeric columns replaced by the scaler's
        output, or ``df`` itself when there is nothing to scale (no numeric
        columns, or no rows).
    """
    numerical_cols = df.select_dtypes(include=[np.number]).columns.tolist()
    if not numerical_cols or df.empty:
        # Fitting the scaler on zero columns or zero rows raises, so skip it.
        print("No numerical data to standardize.")
        return df

    # Work on a copy: df is typically a filtered slice of another frame, and
    # assigning into it directly warns and would also alter the caller's data.
    df = df.copy()
    scaler = StandardScaler()
    df[numerical_cols] = scaler.fit_transform(df[numerical_cols])
    print("Data standardized for numerical columns.")
    return df


In [9]:
# Function to handle invalid data
def handle_invalid_data(df, valid_categories=('A', 'B', 'C')):
    """Keep only rows whose ``category`` value is in the allowed set.

    Frames without a ``category`` column are returned as-is. Rejected rows
    (including rows with a missing category) are appended to the module-level
    ``garbage`` frame.

    Args:
        df: Frame to filter. It is not modified.
        valid_categories: List-like of accepted ``category`` values. Defaults
            to ``('A', 'B', 'C')``.

    Returns:
        ``df`` itself when there is no ``category`` column, otherwise a new
        DataFrame restricted to rows with an accepted category.
    """
    global garbage
    if 'category' not in df.columns:
        return df

    # Build the membership mask once and use it for both the log and the result.
    is_valid = df['category'].isin(valid_categories)
    garbage = pd.concat([garbage, df[~is_valid]])  # Store invalid categories
    df = df[is_valid]  # Keep only valid categories
    print("Invalid data handled for 'category' column.")
    return df


In [12]:
# Function to save data
def save_data(df, output_filepath, garbage_filepath):
    """Write the cleaned frame and the de-duplicated ``garbage`` frame to CSV.

    Parent folders of both target paths are created when missing. A path with
    no directory component (a bare file name) is written to the current working
    directory.

    Args:
        df: Cleaned frame to write to ``output_filepath``.
        output_filepath: Destination CSV for the cleaned data.
        garbage_filepath: Destination CSV for the module-level ``garbage`` frame.
    """
    global garbage
    # Ensure the destination folders exist. dirname() is '' for a bare file
    # name, and os.makedirs('') raises, so only create non-empty parents.
    for target in (output_filepath, garbage_filepath):
        parent = os.path.dirname(target)
        if parent:
            os.makedirs(parent, exist_ok=True)

    # Save the garbage data
    garbage = garbage.drop_duplicates()  # Avoid storing duplicates in garbage
    garbage.to_csv(garbage_filepath, index=False)
    print(f"Removed data saved to: {garbage_filepath}")

    # Save the cleaned data
    df.to_csv(output_filepath, index=False)
    print(f"Cleaned data saved to: {output_filepath}")


In [13]:
# Main function to execute all steps
def main():
    """Run the full pipeline: load, clean, then save the results.

    Reads ``input_file``, applies each cleaning step in order, and writes the
    cleaned data to ``output_file`` and the discarded rows to ``garbage_file``.
    """
    global garbage
    # Start from an empty log so that re-running this cell does not carry over
    # rows discarded by an earlier run.
    garbage = pd.DataFrame()

    # Step 1: Load data
    df = load_data(input_file)

    # Step 2: Clean data
    df = handle_missing_values(df)
    df = remove_duplicates(df)
    df = convert_data_types(df)
    df = remove_outliers(df)
    df = handle_invalid_data(df)
    # Standardize last, so the scaling statistics are computed only from rows
    # that actually remain in the cleaned output.
    df = standardize_data(df)

    # Step 3: Save results
    save_data(df, output_file, garbage_file)

# Run the main function
if __name__ == "__main__":
    main()


  df = pd.read_csv(filepath)


Data loaded successfully with shape (1301500, 11)
Missing values handled. Remaining rows: 9
Duplicates removed. Remaining rows: 9
Outliers removed. Remaining rows: 8
Data standardized for numerical columns.
Removed data saved to: /content/garbage.csv
Cleaned data saved to: /content/outputs/cleaned_data.csv
