In [25]:
from google.colab import drive
drive.mount('/content/drive')



Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [57]:
project_folder = '/content/drive/MyDrive/AutoClean'


In [58]:
import os

project_folder = '/content/drive/MyDrive/AutoClean'

# Create necessary directories within Google Drive if not already present
os.makedirs(f'{project_folder}/input', exist_ok=True)
os.makedirs(f'{project_folder}/output', exist_ok=True)
os.makedirs(f'{project_folder}/src', exist_ok=True)
os.makedirs(f'{project_folder}/logs', exist_ok=True)


In [65]:
from google.colab import files
import pandas as pd

uploaded = files.upload()

for file_name in uploaded.keys():
    file_path = f"/content/{file_name}"

    data = pd.read_csv(file_path)

    print(f"File uploaded successfully: {file_name}")


Saving synthetic_data_large_test.csv to synthetic_data_large_test (1).csv
File uploaded successfully: synthetic_data_large_test (1).csv


In [66]:
import pandas as pd
import numpy as np
data = pd.read_csv(file_name)

data.head() #printing the first few lines to check if the data has been uploaded correctly

Unnamed: 0,feature_0,feature_1,feature_2,feature_3,feature_4,feature_5,feature_6,feature_7,feature_8,feature_9,feature_10,feature_11,Name,Category,Target
0,0.496714,-0.138264,,1.52303,-0.234153,-0.234137,,0.767435,-0.469474,0.54256,,-0.46573,Eve,A,1
1,0.241962,-1.91328,-1.724918,,-1.012831,0.314247,-0.908024,-1.412304,1.465649,-0.225776,0.067528,-1.424748,Alice,C,1
2,-0.544383,0.110923,-1.150994,0.375698,-0.600639,-0.291694,-0.601707,1.852278,-0.013497,-1.057711,0.822545,-1.220844,Eve,C,1
3,,-1.95967,-1.328186,0.196861,0.738467,0.171368,-0.115648,-0.301104,-1.478522,,-0.460639,1.057122,Charlie,A,0
4,0.343618,-1.76304,0.324084,-0.385082,-0.676922,0.611676,1.031,,-0.839218,-0.309212,0.331263,,David,A,1


In [67]:
missing_values = data.isnull().sum()
print(missing_values)

missing_percentage = data.isnull().mean() * 100 #the percentage of the file that has missing values

# creating a threshold for missing values to chose whether to drop or replace
threshold = int(input("enter the threshold based on your requirements: "))  # You can adjust this threshold based on your criteria

# iterating through columns to identify and handle the missing data
for col in data.columns:
  if missing_percentage[col] > threshold:
    print(f"Column '{col}' has more than {threshold}% missing values. Dropping rows...")
    data = data.dropna(subset=[col])  # drops rows where that column has NaN values
  else:
    #applying condition to handle numeric and non-numeric values
    if data[col].dtype in ['float64', 'int64']:  # if the column is numeric we will insert the mean
        print(f"Column '{col}' has less than {threshold}% missing values. Filling with mean...")
        data[col] = data[col].fillna(data[col].mean())
    else:
        print(f"Column '{col}' is non-numeric. Dropping rows with missing values...")
        data = data.dropna(subset=[col])  #drop rows that have non-numeric values for their columns

# present cleaned data
print("\nCleaned DataFrame:")
print(data)


feature_0      997
feature_1     1016
feature_2     1006
feature_3     1017
feature_4     1000
feature_5     1021
feature_6     1021
feature_7      968
feature_8     1011
feature_9     1025
feature_10    1044
feature_11     979
Name             0
Category         0
Target           0
dtype: int64
enter the threshold based on your requirements: 10
Column 'feature_0' has less than 10% missing values. Filling with mean...
Column 'feature_1' has more than 10% missing values. Dropping rows...
Column 'feature_2' has more than 10% missing values. Dropping rows...
Column 'feature_3' has more than 10% missing values. Dropping rows...
Column 'feature_4' has less than 10% missing values. Filling with mean...
Column 'feature_5' has more than 10% missing values. Dropping rows...
Column 'feature_6' has more than 10% missing values. Dropping rows...
Column 'feature_7' has less than 10% missing values. Filling with mean...
Column 'feature_8' has more than 10% missing values. Dropping rows...
Column 'f

In [68]:
before_dropping = data.shape[0]
data = data.drop_duplicates()
after_dropping = data.shape[0]
duplicates_removed = before_dropping - after_dropping

print(f"Removed {duplicates_removed} duplicate rows.")


Removed 0 duplicate rows.


In [69]:
# catch only numerical outliers
numeric_data = data.select_dtypes(include=np.number)

Q1 = numeric_data.quantile(0.25)
Q3 = numeric_data.quantile(0.75)
IQR = Q3 - Q1

lower_bound = Q1 - 1.5 * IQR
upper_bound = Q3 + 1.5 * IQR

outliers = (numeric_data < lower_bound) | (numeric_data > upper_bound) #returns boolean dataframe

print("Outliers detected in the following columns:")
print(outliers.any())

data_no_outliers = data.copy()
for col in numeric_data.columns:
    data_no_outliers[col] = np.where(
        data_no_outliers[col] < lower_bound[col],
        lower_bound[col],
        data_no_outliers[col]
    )
    data_no_outliers[col] = np.where(
        data_no_outliers[col] > upper_bound[col],
        upper_bound[col],
        data_no_outliers[col]
    )

print("Outliers capped successfully.")


Outliers detected in the following columns:
feature_0      True
feature_1      True
feature_2      True
feature_3      True
feature_4      True
feature_5      True
feature_6      True
feature_7      True
feature_8      True
feature_9      True
feature_10     True
feature_11     True
Target        False
dtype: bool
Outliers capped successfully.


In [70]:
# Standardising numerical data, mean = 0; std deviation = 1
from sklearn.preprocessing import StandardScaler
import os

base_name = os.path.splitext(os.path.basename(file_path))[0]

standardised_file_name = f"{base_name}_standardised.csv"
output_folder = "/content/drive/My Drive/AutoClean/output"
output_standardised_path = f"{output_folder}/{standardised_file_name}"

scaler = StandardScaler()
data_standardised = data.copy()
data_standardised[data.select_dtypes(include=np.number).columns] = scaler.fit_transform(data.select_dtypes(include=np.number))
print("Data standardised: mean = 0, std = 1")

data_standardised.to_csv(output_standardised_path, index=False)
print(f"Standardised file saved as: {standardised_file_name}")


Data standardised: mean = 0, std = 1
Standardised file saved as: synthetic_data_large_test (1)_standardised.csv


In [71]:
from sklearn.preprocessing import MinMaxScaler
import os

min_max_scaler = MinMaxScaler()

#Normalize only numeric columns
data_min_max_normalised = data.copy()
data_min_max_normalised[data.select_dtypes(include=np.number).columns] = min_max_scaler.fit_transform(
    data.select_dtypes(include=np.number)
)

base_name = os.path.splitext(os.path.basename(file_path))[0]
min_max_normalised_file_name = f"{base_name}_min_max_normalised.csv"
output_folder = "/content/drive/My Drive/AutoClean/output"
output_min_max_normalised_path = f"{output_folder}/{min_max_normalised_file_name}"

data_min_max_normalised.to_csv(output_min_max_normalised_path, index=False)
print(f"Min-Max Normalized file saved to: {output_min_max_normalised_path}")


Min-Max Normalized file saved to: /content/drive/My Drive/AutoClean/output/synthetic_data_large_test (1)_min_max_normalised.csv


In [73]:
import os

folder_name = "AutoClean"

folder_path = f"/content/drive/My Drive/{folder_name}"

if not os.path.exists(folder_path):
    os.makedirs(folder_path)
    print(f"Folder '{folder_name}' created successfully.")
else:
    print(f"Folder '{folder_name}' already exists.")

cleaned_file_name = f"{base_name}_cleaned.csv"
output_path = os.path.join(folder_path, cleaned_file_name)

print(f"Saving cleaned file to: {output_path}")

data.to_csv(output_path, index=False)

if os.path.exists(output_path):
    print(f"File saved successfully: {output_path}")
else:
    print(f"Failed to save file at {output_path}")


Folder 'AutoClean' already exists.
Saving cleaned file to: /content/drive/My Drive/AutoClean/synthetic_data_large_test (1)_cleaned.csv
File saved successfully: /content/drive/My Drive/AutoClean/synthetic_data_large_test (1)_cleaned.csv


In [75]:
import datetime

# Create a timestamp for log file name
timestamp = datetime.datetime.now().strftime("%Y-%m-%d_%H-%M-%S")
log_file_name = f"log_{base_name}_{timestamp}.txt"
log_path = f"{log_folder}/{log_file_name}"

# Save some simple log information — you can add more if needed
with open(log_path, 'w') as log_file:
    log_file.write(f"Log for file: {base_name}\n")
    log_file.write(f"Timestamp: {timestamp}\n")
    log_file.write(f"Missing value handling: Done\n")
    log_file.write(f"Outlier detection: Done\n")
    log_file.write(f"Duplicate rows removed: {duplicates_removed}\n")
    log_file.write(f"File saved at: {output_path}\n")

print(f"Log saved to: {log_path}")


Log saved to: /content/drive/My Drive/AutoClean/logs/log_synthetic_data_large_test (1)_2025-04-20_01-14-50.txt
