In [12]:
import pandas as pd

# Load the dataset
file_path = r"cleaned_customer_shopping_data.csv"
df = pd.read_csv(file_path)


In [14]:
# Display first few rows to inspect the dataset
print("Initial Dataset Preview:")
print(df.head())


Initial Dataset Preview:
  invoice_no customer_id  gender  age  category  quantity    price  \
0    I138884     C241288  Female   28  Clothing         5  1500.40   
1    I317333     C111565    Male   21     Shoes         3  1800.51   
2    I127801     C266599    Male   20  Clothing         1   300.08   
3    I173702     C988172  Female   66     Shoes         5  3000.85   
4    I337046     C189076  Female   53     Books         4    60.60   

  payment_method invoice_date   shopping_mall  
0    Credit Card   2022-08-05          Kanyon  
1     Debit Card   2021-12-12  Forum Istanbul  
2           Cash   2021-11-09       Metrocity  
3    Credit Card   2021-05-16    Metropol Avm  
4           Cash   2021-10-24          Kanyon  


In [16]:
# Convert 'invoice_date' to datetime format
df['invoice_date'] = pd.to_datetime(df['invoice_date'], errors='coerce', dayfirst=True)


In [18]:
# Remove duplicate rows
df.drop_duplicates(inplace=True)


In [20]:
# Standardize text columns
df['gender'] = df['gender'].str.title().str.strip()
df['payment_method'] = df['payment_method'].str.title().str.strip()
df['shopping_mall'] = df['shopping_mall'].str.title().str.strip()


In [22]:
# Handle outliers: Removing extreme values in 'age', 'quantity', and 'price'
df = df[(df['age'].between(18, 80)) & (df['quantity'] > 0) & (df['price'] > 0)]

In [26]:
# Check if the dataset contains age values below 18 (non-adults)
age_analysis = df[df['age'] < 18]
# Display the results
if not age_analysis.empty:
    print(f"The dataset contains {age_analysis.shape[0]} records where age is below 18.")
    display(age_analysis.head())  # Display a sample of non-adult records
else:
    print("The dataset is adult-focused (no records with age below 18).")



The dataset is adult-focused (no records with age below 18).


In [28]:
df.to_csv("cleaned_customer_shopping_data.csv", index=False)
print("Cleaned dataset saved successfully in the current directory.")


Cleaned dataset saved successfully in the current directory.


In [30]:
# Display cleaned dataset summary
print("\nData Cleaning Completed!")
print("Total Rows After Cleaning:", len(df))
print("Missing Values After Cleaning:\n", df.isnull().sum())
print("Data Types After Cleaning:\n", df.dtypes)
print("\nCleaned Dataset Preview:")
print(df.head())


Data Cleaning Completed!
Total Rows After Cleaning: 99457
Missing Values After Cleaning:
 invoice_no            0
customer_id           0
gender                0
age                   0
category              0
quantity              0
price                 0
payment_method        0
invoice_date      59428
shopping_mall         0
dtype: int64
Data Types After Cleaning:
 invoice_no                object
customer_id               object
gender                    object
age                        int64
category                  object
quantity                   int64
price                    float64
payment_method            object
invoice_date      datetime64[ns]
shopping_mall             object
dtype: object

Cleaned Dataset Preview:
  invoice_no customer_id  gender  age  category  quantity    price  \
0    I138884     C241288  Female   28  Clothing         5  1500.40   
1    I317333     C111565    Male   21     Shoes         3  1800.51   
2    I127801     C266599    Male   20  Clothing 