# 📝 Detect & Fix Invalid Data Formats in Pandas

## **1️⃣ Load Sample Data**

In [None]:

import pandas as pd

# Sample DataFrame with deliberately malformed values in every column.
# All columns must have the same number of entries (7 here); otherwise
# pd.DataFrame raises "ValueError: All arrays must be of the same length".
data = {
    'fare_amount': ['12.50', '15.00', 'invalid', '-10', '25.75', 'free', '30.00'],
    'trip_distance': ['2.5', 'invalid', '5.0', '-3', '10', '7.2', 'unknown'],
    'email': [
        'test@example.com', 'invalid-email', 'user@domain', 'hello@gmail.com',
        'first.last@company.org', 'no-at-sign.example.com', 'rider@taxi.co',
    ],
}
df = pd.DataFrame(data)

# Display the dataset
print("Original Data:")
display(df)
    

## **2️⃣ Detect Invalid Rows**

In [None]:

# A valid amount is an unsigned decimal number such as "12" or "12.50".
# Free text ("invalid", "free") and negative values ("-10") both fail the
# pattern, so both are reported as invalid.
NUMERIC_PATTERN = r'^\d+(\.\d+)?$'

# Minimal e-mail shape: local part, "@", domain, and a dot-separated suffix of
# at least two letters.
EMAIL_PATTERN = r'^[a-zA-Z0-9._%+-]+@[a-zA-Z0-9.-]+\.[a-zA-Z]{2,}$'

# Detect invalid fare_amount values (astype(str) lets the check also run on
# columns that already hold numbers or missing values)
invalid_fares = df[~df['fare_amount'].astype(str).str.match(NUMERIC_PATTERN)]

# Detect invalid trip_distance values
invalid_distances = df[~df['trip_distance'].astype(str).str.match(NUMERIC_PATTERN)]

# Detect invalid email format. na=False treats a missing email as "no match",
# so it is reported as invalid instead of making the `~` negation fail on a
# NaN/None entry (e.g. when this cell is re-run after the cleaning step).
invalid_emails = df[~df['email'].str.match(EMAIL_PATTERN, na=False)]

# Count invalid rows
print(f"Number of invalid fares: {len(invalid_fares)}")
print(f"Number of invalid distances: {len(invalid_distances)}")
print(f"Number of invalid emails: {len(invalid_emails)}")

# Display invalid rows
print("Invalid Fares:")
display(invalid_fares)

print("Invalid Distances:")
display(invalid_distances)

print("Invalid Emails:")
display(invalid_emails)
    

## **3️⃣ Fix Invalid Data**

In [None]:

# Clean the numeric columns: fare_amount and trip_distance.
for column in ('fare_amount', 'trip_distance'):
    # Unparseable text ('invalid', 'free', 'unknown') becomes NaN.
    numeric = pd.to_numeric(df[column], errors='coerce')

    # Negative amounts parse fine but are flagged as invalid by the detection
    # step; blank them out too so they do not skew the mean used below.
    numeric = numeric.mask(numeric < 0)

    # Fill missing values with the mean of the remaining valid values.
    # The result is assigned back to the column: calling
    # df[column].fillna(..., inplace=True) is chained assignment, which is
    # deprecated and has no effect on df under pandas Copy-on-Write.
    df[column] = numeric.fillna(numeric.mean())

# Replace invalid emails with a missing value. na=False keeps entries that are
# already missing from breaking the `~` negation; they simply stay missing.
EMAIL_PATTERN = r'^[a-zA-Z0-9._%+-]+@[a-zA-Z0-9.-]+\.[a-zA-Z]{2,}$'
is_valid_email = df['email'].str.match(EMAIL_PATTERN, na=False)
df.loc[~is_valid_email, 'email'] = None

# Display cleaned dataset
print("Cleaned Data:")
display(df)
    