In [1]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

/kaggle/input/student-data/student_data.csv


# **AI Data Cleaning Agent**

In [2]:
!pip install pandas scikit-learn --quiet

import pandas as pd
import numpy as np
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import LabelEncoder
import warnings
warnings.filterwarnings("ignore")



In [3]:
class DataLoaderAgent:
    def load(self, file_path):
        print(f"üìÇ Loading dataset: {file_path}")
        return pd.read_csv(file_path)


In [4]:
class MissingValueAgent:
    def clean(self, df):
        print("üßπ Handling missing values...")
        imputer = SimpleImputer(strategy='most_frequent')
        df[df.columns] = imputer.fit_transform(df)
        return df


In [5]:
class DuplicateAgent:
    def clean(self, df):
        duplicates = df.duplicated().sum()
        print(f"üîÅ Removing {duplicates} duplicate rows...")
        return df.drop_duplicates(), duplicates


In [6]:
class OutlierAgent:
    def detect(self, df):
        print("üìä Detecting outliers (IQR Method)...")
        outliers = {}
        numeric_cols = df.select_dtypes(include=[np.number]).columns

        for col in numeric_cols:
            Q1 = df[col].quantile(0.25)
            Q3 = df[col].quantile(0.75)
            IQR = Q3 - Q1
            mask = (df[col] < (Q1 - 1.5 * IQR)) | (df[col] > (Q3 + 1.5 * IQR))
            outliers[col] = mask.sum()

        return outliers


In [7]:
class SummaryAgent:
    def generate(self, df, missing_before, duplicates_removed, outliers):
        print("üìÑ Generating summary report...")
        return {
            "Rows after cleaning": len(df),
            "Missing values (before)": missing_before.to_dict(),
            "Duplicates removed": duplicates_removed,
            "Outliers detected": outliers
        }


In [8]:
class AIDataCleaningAgent:
    def __init__(self):
        self.loader = DataLoaderAgent()
        self.missing_agent = MissingValueAgent()
        self.duplicate_agent = DuplicateAgent()
        self.outlier_agent = OutlierAgent()
        self.summary_agent = SummaryAgent()

    def run(self, file_path):
        df = self.loader.load(file_path)

        missing_before = df.isnull().sum()

        df = self.missing_agent.clean(df)
        df, duplicates_removed = self.duplicate_agent.clean(df)
        outliers = self.outlier_agent.detect(df)
        summary = self.summary_agent.generate(df, missing_before, duplicates_removed, outliers)

        # ‚úÖ Fix: save in Kaggle working directory
        import os
        file_name = os.path.basename(file_path)  # get only the file name
        output_path = os.path.join("/kaggle/working", "cleaned_" + file_name)
        df.to_csv(output_path, index=False)
        print(f"\n‚úÖ Cleaned file saved as {output_path}")

        return df, summary



In [9]:
# Create the agent
agent = AIDataCleaningAgent()

# Full Kaggle path to your CSV
input_file = "/kaggle/input/student-data/student_data.csv"

# Run the agent (this will also save the cleaned CSV automatically)
df_cleaned, summary = agent.run(input_file)

# Show the cleaned DataFrame and summary


print("\n--- Summary Report ---")
print(summary)

print("\n--- Cleaned DataFrame ---")
print(df_cleaned)




üìÇ Loading dataset: /kaggle/input/student-data/student_data.csv
üßπ Handling missing values...
üîÅ Removing 0 duplicate rows...
üìä Detecting outliers (IQR Method)...
üìÑ Generating summary report...

‚úÖ Cleaned file saved as /kaggle/working/cleaned_student_data.csv

--- Summary Report ---
{'Rows after cleaning': 6, 'Missing values (before)': {'StudentID': 0, 'Name': 0, 'Age': 1, 'Score': 1, 'Grade': 1}, 'Duplicates removed': 0, 'Outliers detected': {}}

--- Cleaned DataFrame ---
  StudentID     Name   Age Score Grade
0       101    Alice  22.0  85.0     A
1       102      Bob  22.0  78.0     B
2       103  Charlie  21.0  85.0     C
3       104    David  23.0  92.0     A
4       105      Eve  22.0  88.0     A
5       106    Alice  22.0  85.0     A


## Project Summary: AI Data Cleaning Agent

This project demonstrates an AI-powered data cleaning agent designed to automate the preprocessing of CSV datasets. The agent performs the following tasks:

Loads any CSV dataset into a pandas DataFrame.

Handles missing values automatically by filling or imputing appropriate values.

Removes duplicate rows to ensure data integrity.

Detects outliers using the IQR (Interquartile Range) method.

Generates a summary report detailing missing values, duplicates removed, and outliers detected.

Saves the cleaned dataset automatically to a CSV file, ready for further analysis or machine learning workflows.

This approach demonstrates the practical use of AI agents in data preprocessing, making datasets ready for analysis without manual intervention. The project is generalizable to any CSV dataset, showcasing an automated workflow that can be reused in real-world data projects.