In [None]:
import pandas as pd
import matplotlib.pyplot as plt

## **Load the dataset into a dataframe**

#### **Read Data**

In [None]:
# Location of the survey CSV (the URL is split across two literals for readability only)
file_path = (
    "https://cf-courses-data.s3.us.cloud-object-storage.appdomain.cloud/"
    "VYPrOu0Vs3I0hKLLjiPGrA/survey-data-with-duplicate.csv"
)

# Read the remote CSV into the working dataframe used by the cells below
df = pd.read_csv(file_path)

#### Display the first 5 rows

In [None]:
# Preview the top five rows of the dataframe as the cell's output
df.head(n=5)

#### Check all data types

In [None]:
# Print every column name next to its dtype, one pair per line
for column, types in df.dtypes.items():
    print(f"{column} ---> {types}")

### **Identify and Analyze Duplicates**

#### Identify Duplicate Rows

In [None]:
# Tally rows flagged as full-row repeats of an earlier row (True) versus not flagged (False)
duplicate_flags = df.duplicated()
duplicate_flags.value_counts()

In [None]:
# Keep only the rows flagged as repeats of an earlier row, then preview the first five
duplicate_mask = df.duplicated()
duplicated_row = df.loc[duplicate_mask]
duplicated_row.head()

#### Analyze Characteristics of Duplicates

In [None]:
# Flag every row whose ('MainBranch', 'Employment', 'RemoteWork') combination
# occurs more than once; keep=False marks all members of each repeated group,
# not just the later occurrences.
profile_columns = ['MainBranch', 'Employment', 'RemoteWork']
shares_profile = df.duplicated(subset=profile_columns, keep=False)
sub_duplicates = df[shares_profile]

print(f"Number of duplicated rows found: {len(sub_duplicates)}")
sub_duplicates.head()

In [None]:
# Count how many rows share each (MainBranch, Employment, RemoteWork) combination,
# largest groups first.
# dropna=False keeps combinations that contain NaN: DataFrame.duplicated() treats
# missing values as equal, so such rows are present in sub_duplicates, while
# groupby() would drop their keys by default and leave them out of this summary.
sub_duplicates_count = (
    sub_duplicates
    .groupby(['MainBranch', 'Employment', 'RemoteWork'], dropna=False)
    .size()
    .reset_index(name='Count')
    .sort_values('Count', ascending=False)
)

sub_duplicates_count

In [None]:
# Show the five most frequent 'Country' values among the subset duplicates
country_counts = sub_duplicates['Country'].value_counts()
print(country_counts.head())

#### Visualize Duplicates Distribution

In [None]:
import plotly.express as px

# --- Bar Chart: Top 10 Countries ---

# Count rows per country and keep the 10 largest counts.
# The column names are set explicitly ('Country', 'count') because the labels
# produced by value_counts().reset_index() differ between pandas 1.x and 2.x;
# relying on the defaults breaks the 'Country'/'count' references below on 1.x.
top_countries = (
    sub_duplicates['Country']
    .value_counts()
    .nlargest(10)
    .rename_axis('Country')
    .reset_index(name='count')
)

# Shorten long country names so the axis labels stay readable
short_cname = {
    'United States of America': 'USA',
    'United Kingdom of Great Britain and Northern Ireland': 'UK'
}
top_countries['Country'] = top_countries['Country'].replace(short_cname)

# Horizontal bar plot, one bar per country, labelled with its count
fig1 = px.bar(top_countries, x='count', y='Country', orientation='h',
              title="Top 10 Countries by Duplicate Count", text='count', color='Country')
fig1.update_layout(yaxis={'categoryorder': 'total ascending'}, showlegend=False)  # sort bars
fig1.show()


# --- Pie Chart: Employment ---
# 1. Count rows per employment status, with explicit column names
counts = (
    sub_duplicates['Employment']
    .value_counts()
    .rename_axis('Employment')
    .reset_index(name='Count')
)

# 2. Keep categories at or above the threshold; pool the rest into "Other"
threshold = 1000  # Adjust this number based on your data!
main_categories = counts[counts['Count'] >= threshold]
others = counts[counts['Count'] < threshold]

if others.empty:
    # Nothing falls below the threshold: skip the zero-valued "Other" slice
    final_counts = main_categories.reset_index(drop=True)
else:
    other_row = pd.DataFrame([{'Employment': 'Other', 'Count': others['Count'].sum()}])
    # ignore_index avoids a repeated index label on the appended row
    final_counts = pd.concat([main_categories, other_row], ignore_index=True)

# 3. Plot the grouped version as a donut chart
fig2 = px.pie(final_counts, values='Count', names='Employment',
              title='Employment Status (Grouped)',
              hole=0.3)
fig2.update_layout(showlegend=False)
fig2.show()

#### Strategic Removal of Duplicates

In [None]:
# Count repeated values in 'ResponseId', the column expected to identify each response
if 'ResponseId' in df.columns:
    print("Duplicates based on Response ID:", df.duplicated(subset=['ResponseId']).sum())
else:
    # The column is absent, so no check was run; report that instead of
    # claiming that no duplicates were found.
    print("Column 'ResponseId' not found; duplicate check skipped")

In [None]:
# Remove repeated responses, keeping the first row seen for each 'ResponseId'.
# The result is assigned back rather than mutated with inplace=True, and the
# number of rows dropped is reported so the removal can be checked at a glance.
rows_before = len(df)
df = df.drop_duplicates(subset=['ResponseId'])

print(f"Rows removed: {rows_before - len(df)}")
print(f"Duplicates removed. New Shape: {df.shape}")

# Documentation of Duplicate Handling

## 1. Identification of Duplicates
The duplicate identification process was conducted in two stages:

* **Complete Row Duplicates:**
    * **Method:** Scanned for rows where *every single column* was identical.
    * **Result:** 20 duplicates found.
    * **Analysis:** These were deemed to be data entry errors or accidental double-submissions.

* **Subset Duplicates (MainBranch, Employment, RemoteWork):**
    * **Method:** Scanned for duplicates based only on job profile columns.
    * **Result:** Over 65,000 matches found.
    * **Analysis:** These were determined to be *false positives*. They represent distinct individuals who happen to share the same job characteristics (e.g., Full-time Developers working remotely).

## 2. Removal Strategy and Reasoning
**Decision:** Duplicates were removed based on the `ResponseId` column.

**Reasoning:**
* **Why not the Job Subset?** Removing duplicates based on `['MainBranch', 'Employment', 'RemoteWork']` would have deleted valid data from thousands of unique users, severely biasing the demographic analysis.
* **Why `ResponseId`?** The `ResponseId` column serves as a unique primary key. If an ID appears twice, it confirms that the same response exists multiple times in the dataset. Removing these ensures data integrity without losing valid responses from similar-looking users.