# **REMOVING DUPLICATES**

#### IMPORT REQUIRED LIBRARIES

In [None]:
import pandas as pd

#### LOAD THE DATASET

In [None]:
# Load the dataset
# The survey CSV (the file name indicates it contains duplicate rows) is
# fetched over HTTP, so this cell needs network access.
file_path = "https://cf-courses-data.s3.us.cloud-object-storage.appdomain.cloud/VYPrOu0Vs3I0hKLLjiPGrA/survey-data-with-duplicate.csv"

# pandas reads directly from the URL and returns a DataFrame
df = pd.read_csv(file_path)

# Only reached when read_csv did not raise
print("Dataset successfully loaded!")

In [None]:
# Preview the top of the frame (first 5 rows); as the last expression in the
# cell, the result is rendered as the cell output.
df.head(n=5)

#### IDENTIFYING DUPLICATE ROWS

In [None]:
# --- FILTER ONLY DUPLICATED ---

# 'ResponseId' is expected to be unique per response, so a repeated value marks
# a duplicate row. Build the boolean mask once and reuse it for both the filter
# and the count. With the default keep='first', the first occurrence of each
# ResponseId is not flagged, only the later repeats are.
duplicate_mask = df.duplicated(subset=['ResponseId'])
duplicated = df[duplicate_mask]

# Count number of duplicated rows (True counts as 1 when summed)
print(f"Number of duplicates found: {duplicate_mask.sum()} rows")

# Check first 5 rows
duplicated.head()

#### REMOVING DUPLICATE ROWS

In [None]:
# Drop every row whose 'ResponseId' repeats an earlier one, keeping the first
# occurrence. inplace=True mutates `df` itself, so later cells see the
# deduplicated frame under the same name.
df.drop_duplicates(subset='ResponseId', keep='first', inplace=True)

# Report the (rows, columns) shape after deduplication
print(f"Duplicates removed. New Shape: {df.shape}")

#### HANDLING MISSING VALUES

In [None]:
# Per-column tally of missing entries (isnull is an alias of isna)
missing_data = df.isnull().sum()

# Show only the columns that actually have something missing
missing_report = missing_data.loc[missing_data.gt(0)].to_string()
print(f"Number of missing data:\n{missing_report}")

In [None]:
# Find the most frequent value of the 'EdLevel' column.
# mode() returns a Series (there can be ties); label 0 is its first entry.
edlevel_modes = df['EdLevel'].mode()
most_freqv = edlevel_modes[0]
print(f"The most frequent education level: {most_freqv}")

# Replace every missing 'EdLevel' entry with that most frequent value
df['EdLevel'] = df['EdLevel'].fillna(value=most_freqv)

# Verify the fix: the remaining count should be 0
edlevel_missing_after = df['EdLevel'].isna().sum()
print(f"Missing value in 'EdLevel' after imputation: {edlevel_missing_after}")

#### Normalizing Compensation Data

For compensation/salaries, we almost always use the median (the middle value) instead of the mean (the average).
* **Why?** Salaries are usually "skewed": a few people earning huge amounts (e.g., > $2,000,000) pull the **average** up too high, making it inaccurate for a *typical* person. The **median** is safer because it is barely affected by these *outliers*.

In [None]:
# Column to impute: yearly compensation, as it stands before filling
comp_before = df['ConvertedCompYearly']

# How many compensation entries are missing before imputation
miss_comp = comp_before.isna().sum()
print(f"Number of missing compensation values: {miss_comp}")

# Median of the column (pandas skips NaN by default when computing it)
comp_median = comp_before.median()
print(f"The median yearly compensation is ${comp_median}")

# Replace each missing entry with the median
df['ConvertedCompYearly'] = comp_before.fillna(value=comp_median)

# Verify the fix: report how many entries are still missing (expected 0)
comp_missing_after = df['ConvertedCompYearly'].isna().sum()
print(f"Missing values after imputation: {comp_missing_after}")