In [None]:
import pandas as pd
import numpy as np
from datetime import datetime

In [None]:
# Load the raw campaign CSV; the path is resolved relative to the notebook's working directory.
df = pd.read_csv(
    "../Ressources/Data/Camp_Market.csv",
    sep=",",
)

In [None]:
# Print the current pandas display limits: rows first, then columns.
for option_name in ("display.max_rows", "display.max_columns"):
    print(pd.get_option(option_name))

In [None]:
# Step 1: parse the column to datetimes; errors='coerce' turns unparseable values into NaT.
# NOTE(review): no explicit format/dayfirst is given, so how ambiguous day/month
# values are read depends on pandas' inference — confirm against the raw file.
df['Dt_Customer'] = pd.to_datetime(df['Dt_Customer'], errors='coerce')
# Step 2: store the dates back as 'dd/mm/YYYY' strings.
df['Dt_Customer'] = df['Dt_Customer'].dt.strftime('%d/%m/%Y')

In [None]:
# Columns compared when deciding whether two rows are duplicates of each other.
cols_to_check = [
    'Year_Birth',
    'Education',
    'Marital_Status',
    'Income',
    'Kidhome',
    'Teenhome',
    'Dt_Customer',
    'Recency',
    # Mnt* columns
    'MntWines',
    'MntFruits',
    'MntMeatProducts',
    'MntFishProducts',
    'MntSweetProducts',
    'MntGoldProds',
    # Num* columns
    'NumDealsPurchases',
    'NumWebPurchases',
    'NumCatalogPurchases',
    'NumStorePurchases',
    'NumWebVisitsMonth',
]

# True for every row that repeats an earlier row on all of the columns above.
duplicate_mask = df.duplicated(subset=cols_to_check, keep='first')

# Keep only the first occurrence of each record and renumber the index from 0.
df = df.loc[~duplicate_mask].reset_index(drop=True)

# Write the de-duplicated frame to the cleaned CSV (row index omitted).
df.to_csv("../Ressources/Data/Camp_Market_Cleaned.csv", index=False)

In [None]:
# Collapse the marital-status labels into two buckets; labels not listed are left untouched.
marital_status_groups = {
    'Alone': 'Single',
    'Absurd': 'Single',
    'YOLO': 'Single',
    'Divorced': 'Single',
    'Widow': 'Single',
    'Married': 'Couple',
    'Together': 'Couple',
}
df['Marital_Status'] = df['Marital_Status'].replace(marital_status_groups)

# Collapse the education labels into three levels; labels not listed are left untouched.
education_groups = {
    'Basic': 'Undergrad',
    '2n Cycle': 'Undergrad',
    'Graduation': 'Graduate',
    'Master': 'Postgrad',
    'PhD': 'Postgrad',
}
df['Education'] = df['Education'].replace(education_groups)

# Columns summed row-wise to produce 'Spent'.
spend_columns = [
    'MntWines', 'MntFruits', 'MntMeatProducts',
    'MntFishProducts', 'MntSweetProducts', 'MntGoldProds',
]

df = (
    df
    .rename(columns={'Dt_Customer': 'Customer_For', 'Recency': 'Last_Purchase'})
    .assign(
        Childrens=lambda frame: frame['Kidhome'] + frame['Teenhome'],
        Spent=lambda frame: frame[spend_columns].sum(axis=1),
        # Children count plus 1 for 'Single' or 2 for 'Couple'; any other
        # status label is not in the map and yields a missing value.
        Family_Size=lambda frame: (
            frame['Childrens']
            + frame['Marital_Status'].map({'Single': 1, 'Couple': 2})
        ),
        # Age is measured against the year in which the notebook is executed,
        # so the values differ between runs made in different years.
        Age=lambda frame: datetime.now().year - frame['Year_Birth'],
    )
    # The two source columns are no longer needed once the totals exist.
    .drop(columns=['Kidhome', 'Teenhome'])
)


In [None]:
# Boolean mask of the rows whose Income is missing at this point in the notebook.
missing_idx = df['Income'].isnull()

def impute_income(row, df):
    """Return the income for ``row``, estimating it when it is missing.

    Parameters
    ----------
    row : pandas.Series
        A single record; its 'Income', 'Year_Birth', 'Education' and
        'Marital_Status' entries are read.
    df : pandas.DataFrame
        Frame used as the lookup table; the same four columns are read.

    Returns
    -------
    The row's own 'Income' when it is present. Otherwise the rounded mean
    'Income' of the rows in ``df`` that share the row's 'Year_Birth',
    'Education' and 'Marital_Status' and have a known income, or ``np.nan``
    when no such row exists.
    """
    own_income = row['Income']
    if pd.notna(own_income):
        return own_income

    # Start from "income is known" and narrow down one key column at a time.
    same_profile = df['Income'].notna()
    for key in ('Year_Birth', 'Education', 'Marital_Status'):
        same_profile = same_profile & (df[key] == row[key])

    peer_incomes = df.loc[same_profile, 'Income']
    if len(peer_incomes) == 0:
        return np.nan
    return round(peer_incomes.mean())

# Run impute_income on every row, passing the frame itself as the lookup table,
# then store the result as nullable integers so values that stay missing are kept as <NA>.
df['Income'] = df.apply(impute_income, axis=1, args=(df,)).astype('Int64')


In [None]:
# Show the rows flagged in missing_idx whose Income is now populated.
display(df.loc[missing_idx & df['Income'].notna()])

In [None]:
# Render the current working frame for visual inspection.
display(df)

In [None]:
# Persist the working frame to the cleaned CSV; the row index is not written.
df.to_csv(path_or_buf="../Ressources/Data/Camp_Market_Cleaned.csv", index=False)