In [11]:
import pandas as pd
import numpy as np
from dateutil import parser

# Simulated data: a small, deliberately messy sample (stray whitespace,
# inconsistent casing, an impossible age, missing values, malformed dates).
np.random.seed(42)
raw_columns = {
    'Name': [' Ana ', 'Luis', 'carlos', 'ANA', np.nan, 'Luis'],
    'Age': [25, 200, 35, np.nan, 29, 25],
    'Income': [3000, 4500, np.nan, 3200, 3100, 3000],
    'Sex': ['F', 'M', 'Male', 'female', 'F', 'M'],
    'Date': ['2022-01-01', '2021-13-01', '2022/02/30', '01-05-2022', None, '2022-01-01'],
}
data = pd.DataFrame(raw_columns)
print(data)
print()

# Work on a copy so the raw frame stays available for comparison.
data_c = data.copy()

# Basic cleaning
# Names: trim surrounding whitespace, use title case, label missing entries.
tidy_names = data_c['Name'].str.strip().str.title()
data_c['Name'] = tidy_names.fillna('Unknown')

# Ages outside 0..120 are treated as missing; every missing age is then
# replaced by the median of the remaining ages and the column cast to int.
plausible_age = data_c['Age'].between(0, 120)
data_c['Age'] = data_c['Age'].where(plausible_age)
age_median = data_c['Age'].median()
data_c['Age'] = data_c['Age'].fillna(age_median).astype(int)

# Missing incomes are imputed with the column mean.
income_mean = data_c['Income'].mean()
data_c['Income'] = data_c['Income'].fillna(income_mean)

# Normalize the Sex column
# Recognised spellings (lower-case, without surrounding whitespace or a
# trailing period) for each canonical code.
_FEMALE_LABELS = frozenset(
    {'f', 'fem', 'female', 'femenino', 'femenina', 'woman', 'mujer'}
)
_MALE_LABELS = frozenset(
    {'m', 'masc', 'male', 'masculino', 'man', 'hombre'}
)


def clean_sex(g):
    """Map a free-form sex label to one of 'F', 'M' or 'Otro'.

    Matching is case-insensitive and ignores surrounding whitespace and a
    trailing period. Labels are compared as whole words against explicit
    sets rather than by substring, so values that merely contain the
    letter "f" or "m" (e.g. 'Woman', 'Mujer', 'Prefer not to say') are no
    longer misclassified.

    Args:
        g: The raw cell value; may be a string, None, NaN or anything else.

    Returns:
        'F' for a recognised female label, 'M' for a recognised male label,
        and 'Otro' for non-strings and any unrecognised text.
    """
    if not isinstance(g, str):
        return 'Otro'

    label = g.strip().lower().rstrip('.')
    if label in _FEMALE_LABELS:
        return 'F'
    if label in _MALE_LABELS:
        return 'M'
    return 'Otro'
# Recode every Sex entry element-wise through clean_sex.
data_c['Sex'] = data_c['Sex'].map(clean_sex)

# Parse and clean the Date column
from datetime import datetime

def try_parse_date(value):
    """Parse a single raw date value, returning ``pd.NaT`` when it cannot be read.

    The value is first matched against a fixed list of explicit formats, in
    order; the first format that parses wins. If none match, dateutil's
    fuzzy parser is tried as a last resort.

    Args:
        value: A scalar date representation (typically a string) or a
            missing value (None / NaN / NaT).

    Returns:
        A ``pd.Timestamp`` on success, otherwise ``pd.NaT``.
    """
    if pd.isnull(value):
        return pd.NaT

    # Order matters for ambiguous inputs: day-first ("%d-%m-%Y") is tried
    # before month-first ("%m-%d-%Y"), so '01-05-2022' is read as 1 May 2022.
    # The last entry lets year-day-month strings such as '2021-13-01' parse
    # as 13 January 2021.
    known_formats = (
        "%Y-%m-%d",
        "%Y/%m/%d",
        "%d-%m-%Y",
        "%m-%d-%Y",
        "%d/%m/%Y",
        "%Y-%d-%m",
    )
    for fmt in known_formats:
        try:
            return pd.to_datetime(value, format=fmt, errors='raise')
        except (ValueError, TypeError, OverflowError):
            # Format mismatch or impossible calendar date: try the next one.
            continue

    # Imported here so the fallback does not depend on the generic
    # module-level name `parser` still being bound to dateutil's parser.
    from dateutil import parser as date_parser

    try:
        # Wrap in pd.Timestamp so both code paths return the same type.
        return pd.Timestamp(date_parser.parse(str(value), fuzzy=True))
    except (ValueError, TypeError, OverflowError):
        return pd.NaT

# Keep the raw date text in Date_Original and replace Date with the parsed
# value produced by try_parse_date.
raw_dates = data_c['Date']
data_c = data_c.assign(
    Date_Original=raw_dates,
    Date=raw_dates.apply(try_parse_date),
)

# Remove rows that are exact duplicates across all columns.
data_c = data_c.drop_duplicates()

# Derived feature: income per year of age.
data_c = data_c.assign(IncomeAge=data_c['Income'].div(data_c['Age']))

# One-hot encode Sex, dropping the first level to avoid a redundant column.
data_encoded = pd.get_dummies(data_c, columns=['Sex'], drop_first=True)

print("\n=== Clean Data ===")
print(data_encoded)


     Name    Age  Income     Sex        Date
0    Ana    25.0  3000.0       F  2022-01-01
1    Luis  200.0  4500.0       M  2021-13-01
2  carlos   35.0     NaN    Male  2022/02/30
3     ANA    NaN  3200.0  female  01-05-2022
4     NaN   29.0  3100.0       F        None
5    Luis   25.0  3000.0       M  2022-01-01


=== Clean Data ===
      Name  Age  Income       Date Date_Original   IncomeAge  Sex_M
0      Ana   25  3000.0 2022-01-01    2022-01-01  120.000000  False
1     Luis   27  4500.0 2021-01-13    2021-13-01  166.666667   True
2   Carlos   35  3360.0        NaT    2022/02/30   96.000000   True
3      Ana   27  3200.0 2022-05-01    01-05-2022  118.518519  False
4  Unknown   29  3100.0        NaT          None  106.896552  False
5     Luis   25  3000.0 2022-01-01    2022-01-01  120.000000   True
