# In [None]:



# Import libraries
import pandas as pd
import matplotlib.pyplot as plt

# 1️⃣ Load data from the comma-separated file (read_csv's default delimiter)
Customers_df = pd.read_csv("Customers.csv")

# 2️⃣ Print column types before cleaning
print("Before:")
print(Customers_df.dtypes)


# 3️⃣ Convert columns to proper data types
# errors="coerce" turns unparseable numeric values into missing values, and the
# nullable "Int64" dtype keeps the column integer-typed despite those gaps.
Customers_df["ID"] = pd.to_numeric(Customers_df["ID"], errors="coerce").astype("Int64")
Customers_df["Name"] = Customers_df["Name"].astype("string")
Customers_df["Age"] = pd.to_numeric(Customers_df["Age"], errors="coerce").astype("Int64")
Customers_df["City"] = Customers_df["City"].astype("string")

# 4️⃣ Print column types after cleaning
print("After:")
print(Customers_df.dtypes)
print()

# 5️⃣ Fill missing Age values with the mean age
# The mean skips missing entries, but is itself missing when no age could be
# parsed at all; int() would raise on that, so only fill when a mean exists.
raw_mean_age = Customers_df["Age"].mean()
if pd.isna(raw_mean_age):
    mean_age = None
    print("Warning: no valid Age values found; missing ages were left unfilled")
else:
    mean_age = int(raw_mean_age)  # Truncate so the fill value fits the Int64 column
    Customers_df["Age"] = Customers_df["Age"].fillna(mean_age)

# 6️⃣ Display first 5 rows
print(Customers_df.head())
print()

# 7️⃣ Normalize Name and City: trim surrounding whitespace and lowercase
Customers_df["Name"] = Customers_df["Name"].str.strip().str.lower()
Customers_df["City"] = Customers_df["City"].str.strip().str.lower()

# 8️⃣ Correct misspelled names
# Canonical spelling -> known misspellings. Variants are written in lowercase
# because names are lowercased above before the lookup, so a mixed-case variant
# could never match. Canonical spellings need no entry of their own: values
# that are not listed as a variant are left unchanged by replace().
name_variants = {
    "ali": ("aly",),
    "omar": ("omaar",),
    "tamer": ("tmr", "tamr"),
    "youssef": ("yocef", "youcef", "yousef"),
    "mohamed": ("mohammad", "muhammad", "mo7amed"),
    "mahmoud": ("m7moud",),
    "sameer": ("smr",),
    "salma": ("slma", "saalma"),
    "heba": ("hba", "hibah"),
    "noura": ("nora", "noora"),
    "fatma": ("fatima", "ftoom"),
    "ahmed": ("ahmad", "a7med", "ahmd"),
    "karim": ("krim", "krym", "karrem"),
    "mai": ("may", "maai"),
    "yara": ("yarah", "yra"),
}

# Flatten to the misspelling -> canonical lookup that replace() expects.
name_corrections = {
    variant: canonical
    for canonical, variants in name_variants.items()
    for variant in variants
}

# Whole-value replacement, then capitalize the first letter of each name.
Customers_df["Name"] = Customers_df["Name"].replace(name_corrections).str.capitalize()

# 9️⃣ Tidy the City column in one pass: trim whitespace, capitalize the first
# letter, turn empty strings into missing values, then label every missing
# city as "unknown".
Customers_df["City"] = (
    Customers_df["City"]
    .str.strip()
    .str.capitalize()
    .replace("", pd.NA)
    .fillna("unknown")
)

# 1️⃣0️⃣ Show how many customers fall in each city
print(Customers_df["City"].value_counts())

# 1️⃣1️⃣ Show the last rows of the cleaned frame
print(Customers_df.tail())

# 1️⃣2️⃣ Histogram of customer ages (10 bins)
plt.hist(Customers_df["Age"], bins=10)
plt.gca().set(title="Age Distribution", xlabel="Age", ylabel="Frequency")
plt.show()
print()

# 1️⃣3️⃣ Bar chart of the 10 most frequent cities
Customers_df["City"].value_counts().head(10).plot.bar()
plt.gca().set(title="Top 10 Cities", xlabel="City", ylabel="Count")
plt.show()
print()

# 1️⃣4️⃣ Mean age within each city
avg_age_per_city = Customers_df.groupby("City")["Age"].mean()
print(avg_age_per_city)

# 1️⃣5️⃣ Bar chart of the per-city mean age, with slanted tick labels
avg_age_per_city.plot.bar(figsize=(10, 5))
plt.gca().set(title="Average Age per City", xlabel="City", ylabel="Average Age")
plt.xticks(rotation=45)
plt.tight_layout()
plt.show()

# 1️⃣6️⃣ Write the cleaned data to a comma-separated file (to_csv's default
# delimiter), leaving out the row index
Customers_df.to_csv("Clean_Customers.csv", index=False)

# 1️⃣7️⃣ Load the saved file back and show its last 20 rows
Customers = pd.read_csv("Clean_Customers.csv")
print(Customers.iloc[-20:])
