In [3]:
# from google.colab import files
# uploaded = files.upload()

# ===============================
# Assignment 2: Before vs After Cleaning Report
# ===============================

import pandas as pd

def print_report(title, frame):
    """Print a summary report for a DataFrame.

    The report consists of the heading, the frame's shape, the per-column
    count of missing values, descriptive statistics for every column and a
    separator line of 80 '=' characters.

    Args:
        title: Heading line printed before the report.
        frame: The pandas DataFrame to summarise.
    """
    print(title)
    print("Shape:", frame.shape)
    print("\nMissing Values:\n", frame.isnull().sum())
    print("\nDescriptive Statistics:\n", frame.describe(include='all'))
    print("=" * 80)


def remove_iqr_outliers(frame, column="Amount", k=1.5):
    """Return the rows of ``frame`` whose ``column`` value is not an outlier.

    A value is treated as an outlier when it lies below ``Q1 - k * IQR`` or
    above ``Q3 + k * IQR``, where Q1/Q3 are the 25th/75th percentiles of the
    column and ``IQR = Q3 - Q1``.

    Args:
        frame: The pandas DataFrame to filter.
        column: Name of the numeric column to test.
        k: Multiplier applied to the IQR to position the fences.

    Returns:
        A DataFrame with the outlier rows dropped; the surviving rows keep
        their original index labels.
    """
    q1 = frame[column].quantile(0.25)
    q3 = frame[column].quantile(0.75)
    iqr = q3 - q1
    lower_fence = q1 - k * iqr
    upper_fence = q3 + k * iqr
    is_outlier = (frame[column] < lower_fence) | (frame[column] > upper_fence)
    return frame[~is_outlier]


def clean_dataset(frame, amount_column="Amount", iqr_multiplier=1.5):
    """Apply the three cleaning steps and return the cleaned DataFrame.

    Steps, in order:
      1. Drop fully duplicated rows.
      2. Replace every missing value with 0.
      3. Drop rows whose ``amount_column`` value is an IQR outlier.

    Args:
        frame: The raw pandas DataFrame; it is not modified.
        amount_column: Column used for the outlier filter.
        iqr_multiplier: Fence multiplier passed to ``remove_iqr_outliers``.

    Returns:
        The cleaned DataFrame.
    """
    # 1. Remove duplicates
    cleaned = frame.drop_duplicates()

    # 2. Handle missing values. NOTE: this fills *every* column with 0,
    # which is a no-op when the data has no missing values.
    cleaned = cleaned.fillna(0)

    # 3. Outlier treatment using IQR. The quartiles are computed after the
    # two steps above, so duplicates and filled values affect the fences.
    return remove_iqr_outliers(cleaned, column=amount_column, k=iqr_multiplier)


# Guarded so the helpers above can be imported without touching the
# filesystem. In a notebook cell or when run as a script __name__ is
# "__main__", so the statements below execute and `df` stays a global.
if __name__ == "__main__":
    # Load dataset
    df = pd.read_csv("creditcard.csv")

    # -------------------------
    # BEFORE CLEANING REPORT
    # -------------------------
    print_report("📌 BEFORE CLEANING REPORT", df)

    # -------------------------
    # CLEANING STEPS
    # -------------------------
    df = clean_dataset(df)

    # -------------------------
    # AFTER CLEANING REPORT
    # -------------------------
    print_report("📌 AFTER CLEANING REPORT", df)

    # -------------------------
    # SAVE CLEANED DATASET
    # -------------------------
    df.to_csv("creditcard_cleaned.csv", index=False)
    print("✅ Cleaned dataset saved as 'creditcard_cleaned.csv'")




📌 BEFORE CLEANING REPORT
Shape: (284807, 31)

Missing Values:
 Time      0
V1        0
V2        0
V3        0
V4        0
V5        0
V6        0
V7        0
V8        0
V9        0
V10       0
V11       0
V12       0
V13       0
V14       0
V15       0
V16       0
V17       0
V18       0
V19       0
V20       0
V21       0
V22       0
V23       0
V24       0
V25       0
V26       0
V27       0
V28       0
Amount    0
Class     0
dtype: int64

Descriptive Statistics:
                 Time            V1            V2            V3            V4  \
count  284807.000000  2.848070e+05  2.848070e+05  2.848070e+05  2.848070e+05   
mean    94813.859575  1.168375e-15  3.416908e-16 -1.379537e-15  2.074095e-15   
std     47488.145955  1.958696e+00  1.651309e+00  1.516255e+00  1.415869e+00   
min         0.000000 -5.640751e+01 -7.271573e+01 -4.832559e+01 -5.683171e+00   
25%     54201.500000 -9.203734e-01 -5.985499e-01 -8.903648e-01 -8.486401e-01   
50%     84692.000000  1.810880e-02  6.548556e-