In [75]:
import pandas as pd
import numpy as np

# Demo of a small pandas cleaning pipeline: build a deliberately messy
# DataFrame (padded strings, a duplicated row, missing values, currency
# suffixes) and normalise it step by step.

# Simulated raw data set
data = {
    'Name': ['  John Doe', 'Jane Smith', '   Mary Johnson', 'Jane Smith', np.nan, 'Luke Skywalker'],
    'Age': [28, 34, np.nan, 34, 40, 25],
    'Gender': ['male', 'female', 'female', 'female', 'male', 'male'],
    'Income': ['3000$', '4500$', '3500$', '4500$', '5000$', '3200$'],
    'City': ['New York', 'London', '   Paris', 'London', 'Berlin', 'Paris'],
}

# Create the DataFrame
df = pd.DataFrame(data)

# Display the original DataFrame
print("Datos originales:")
print(df)

# Strip surrounding whitespace in the text columns. This runs before the
# de-duplication so that rows differing only by padding compare equal.
# Missing names (NaN) pass through the .str accessor unchanged.
df['Name'] = df['Name'].str.strip()
df['City'] = df['City'].str.strip()

# Remove fully duplicated rows (the original index labels are kept)
df = df.drop_duplicates()

# Format the 'Income' column: drop the dollar sign and convert to integer.
# A literal (non-regex) replacement is used so that '$' needs no escaping;
# writing '\$' in a normal string literal is an invalid escape sequence.
df['Income'] = df['Income'].str.replace('$', '', regex=False).astype(int)

# Fill missing ages with the mean of the remaining (de-duplicated) rows and
# convert to integer. NOTE: astype(int) truncates the fractional part, so a
# mean of 31.75 is stored as 31.
df['Age'] = df['Age'].fillna(df['Age'].mean()).astype(int)

# Rename columns to make them more readable
df = df.rename(columns={'Name': 'FullName', 'Gender': 'Sex'})

# Display the clean DataFrame
print("\nDatos después de la limpieza:")
print(df)


Datos originales:
              Name   Age  Gender Income      City
0         John Doe  28.0    male  3000$  New York
1       Jane Smith  34.0  female  4500$    London
2     Mary Johnson   NaN  female  3500$     Paris
3       Jane Smith  34.0  female  4500$    London
4              NaN  40.0    male  5000$    Berlin
5   Luke Skywalker  25.0    male  3200$     Paris

Datos después de la limpieza:
         FullName  Age     Sex  Income      City
0        John Doe   28    male    3000  New York
1      Jane Smith   34  female    4500    London
2    Mary Johnson   31  female    3500     Paris
4             NaN   40    male    5000    Berlin
5  Luke Skywalker   25    male    3200     Paris
