# **REMOVING DUPLICATES**

#### IMPORT REQUIRED LIBRARIES

In [1]:
import pandas as pd

#### LOAD THE DATASET

In [11]:
# Source CSV for this exercise (the file name suggests it ships with duplicate rows).
file_path = (
    "https://cf-courses-data.s3.us.cloud-object-storage.appdomain.cloud/"
    "VYPrOu0Vs3I0hKLLjiPGrA/survey-data-with-duplicate.csv"
)

# Read the remote CSV into the DataFrame that the cells below operate on.
df = pd.read_csv(filepath_or_buffer=file_path)

print("Dataset suceesfully loaded!")

Dataset suceesfully loaded!


In [12]:
# Preview the top of the frame to sanity-check the load (5 rows is the default).
df.head(n=5)

Unnamed: 0,ResponseId,MainBranch,Age,Employment,RemoteWork,Check,CodingActivities,EdLevel,LearnCode,LearnCodeOnline,...,JobSatPoints_6,JobSatPoints_7,JobSatPoints_8,JobSatPoints_9,JobSatPoints_10,JobSatPoints_11,SurveyLength,SurveyEase,ConvertedCompYearly,JobSat
0,1,I am a developer by profession,Under 18 years old,"Employed, full-time",Remote,Apples,Hobby,Primary/elementary school,Books / Physical media,,...,,,,,,,,,,
1,2,I am a developer by profession,35-44 years old,"Employed, full-time",Remote,Apples,Hobby;Contribute to open-source projects;Other...,"Bachelor’s degree (B.A., B.S., B.Eng., etc.)",Books / Physical media;Colleague;On the job tr...,Technical documentation;Blogs;Books;Written Tu...,...,0.0,0.0,0.0,0.0,0.0,0.0,,,,
2,3,I am a developer by profession,45-54 years old,"Employed, full-time",Remote,Apples,Hobby;Contribute to open-source projects;Other...,"Master’s degree (M.A., M.S., M.Eng., MBA, etc.)",Books / Physical media;Colleague;On the job tr...,Technical documentation;Blogs;Books;Written Tu...,...,,,,,,,Appropriate in length,Easy,,
3,4,I am learning to code,18-24 years old,"Student, full-time",,Apples,,Some college/university study without earning ...,"Other online resources (e.g., videos, blogs, f...",Stack Overflow;How-to videos;Interactive tutorial,...,,,,,,,Too long,Easy,,
4,5,I am a developer by profession,18-24 years old,"Student, full-time",,Apples,,"Secondary school (e.g. American high school, G...","Other online resources (e.g., videos, blogs, f...",Technical documentation;Blogs;Written Tutorial...,...,,,,,,,Too short,Easy,,


#### IDENTIFYING DUPLICATE ROWS

In [23]:
# --- KEEP ONLY THE DUPLICATED ROWS ---

# 'ResponseId' is expected to be unique per response, so a repeated id marks a
# duplicate. With the default keep='first', the first occurrence is not flagged.
duplicated = df.loc[df.duplicated(subset=['ResponseId'])]

# Every row in `duplicated` was flagged, so its length is the duplicate count
print(f"Number of duplicates found: {len(duplicated)} rows")

# Show the first 5 flagged rows
duplicated.head()

Number of duplicates found: 20 rows


Unnamed: 0,ResponseId,MainBranch,Age,Employment,RemoteWork,Check,CodingActivities,EdLevel,LearnCode,LearnCodeOnline,...,JobSatPoints_6,JobSatPoints_7,JobSatPoints_8,JobSatPoints_9,JobSatPoints_10,JobSatPoints_11,SurveyLength,SurveyEase,ConvertedCompYearly,JobSat
65437,1,I am a developer by profession,Under 18 years old,"Employed, full-time",Remote,Apples,Hobby,Primary/elementary school,Books / Physical media,,...,,,,,,,,,,
65438,2,I am a developer by profession,35-44 years old,"Employed, full-time",Remote,Apples,Hobby;Contribute to open-source projects;Other...,"Bachelor’s degree (B.A., B.S., B.Eng., etc.)",Books / Physical media;Colleague;On the job tr...,Technical documentation;Blogs;Books;Written Tu...,...,0.0,0.0,0.0,0.0,0.0,0.0,,,,
65439,3,I am a developer by profession,45-54 years old,"Employed, full-time",Remote,Apples,Hobby;Contribute to open-source projects;Other...,"Master’s degree (M.A., M.S., M.Eng., MBA, etc.)",Books / Physical media;Colleague;On the job tr...,Technical documentation;Blogs;Books;Written Tu...,...,,,,,,,Appropriate in length,Easy,,
65440,4,I am learning to code,18-24 years old,"Student, full-time",,Apples,,Some college/university study without earning ...,"Other online resources (e.g., videos, blogs, f...",Stack Overflow;How-to videos;Interactive tutorial,...,,,,,,,Too long,Easy,,
65441,5,I am a developer by profession,18-24 years old,"Student, full-time",,Apples,,"Secondary school (e.g. American high school, G...","Other online resources (e.g., videos, blogs, f...",Technical documentation;Blogs;Written Tutorial...,...,,,,,,,Too short,Easy,,


#### REMOVING DUPLICATE ROWS

In [25]:
# Keep the first row seen for each ResponseId and drop the later repeats.
# `df` is rebound to the returned frame instead of using inplace=True, so the
# transformation is an explicit assignment rather than a hidden mutation.
df = df.drop_duplicates(subset=['ResponseId'])

# Validate the transform: every ResponseId must now appear exactly once.
assert df['ResponseId'].is_unique, "ResponseId still contains duplicates"

print(f"Duplicates removed. New Shape: {df.shape}")

Duplicates removed. New Shape: (65437, 114)


#### HANDLING MISSING VALUES

In [37]:
# Count missing values for all columns
missing_data = df.isna().sum()

print(f"Number of missing data:\n{missing_data[missing_data > 0].to_string()}")

Number of missing data:
RemoteWork                        10631
CodingActivities                  10971
EdLevel                            4653
LearnCode                          4949
LearnCodeOnline                   16200
TechDoc                           24540
YearsCode                          5568
YearsCodePro                      13827
DevType                            5992
OrgSize                           17957
PurchaseInfluence                 18031
BuyNewTool                        20256
BuildvsBuy                        22079
TechEndorse                       21769
Country                            6507
Currency                          18753
CompTotal                         31697
LanguageHaveWorkedWith             5692
LanguageWantToWorkWith             9685
LanguageAdmired                   14565
DatabaseHaveWorkedWith            15183
DatabaseWantToWorkWith            22879
DatabaseAdmired                   26880
PlatformHaveWorkedWith            23071
PlatformWantToWo

In [52]:
# Find the most common value of the 'EdLevel' column.
# .mode() returns a Series (several values can tie), so take the entry at label 0.
most_freqv = df['EdLevel'].mode().loc[0]
print(f"The most frequent education level: {most_freqv}")

# Impute: replace every missing 'EdLevel' entry with that most common level
df['EdLevel'] = df['EdLevel'].fillna(value=most_freqv)

# Sanity check: no gaps should remain, so this prints 0
print(f"Missing value in 'EdLevel' after imputation: {df['EdLevel'].isnull().sum()}")

The most frequent education level: Bachelor’s degree (B.A., B.S., B.Eng., etc.)
Missing value in 'EdLevel' after imputation: 0


#### Normalizing Compensation Data

For compensation/salaries, we almost always use the median (the middle value) instead of the mean (the average).
* **Why?** Salaries are usually *skewed*. A few people earning huge amounts (e.g. more than $2,000,000) pull the **average** up too high, making it inaccurate for a *typical* person. The **median** is safer because it is barely affected by these *outliers*.

In [57]:
# How many rows have no yearly compensation recorded?
miss_comp = df['ConvertedCompYearly'].isnull().sum()
print(f"Number of missing compensation values: {miss_comp}")

# Median of the recorded compensation values
comp_median = df['ConvertedCompYearly'].median()
print(f"The median yearly compensation is ${comp_median}")

# Impute the gaps with the median.
# NOTE(review): per the recorded output, 42002 of the 65437 rows are filled here,
# so most of this column ends up equal to the median; confirm that is acceptable
# for any downstream analysis of compensation.
df['ConvertedCompYearly'] = df['ConvertedCompYearly'].fillna(value=comp_median)

# Confirm nothing is left missing
print(f"Missing values after imputation: {df['ConvertedCompYearly'].isnull().sum()}")

Number of missing compensation values: 42002
The median yearly compensation is $65000.0
Missing values after imputation: 0
