In [None]:
import pandas as pd
import re

# Load the merged GitHub repository dataset.
df = pd.read_csv("github_repos_merged.csv")

# Drop the timestamp / popularity columns that are not wanted in the output.
df = df.drop(columns=["updated_at", "pushed_at", "stargazers_count",
                      "watchers_count", "forks_count", "open_issues_count"])

# Treat an empty topics list ('[]', ignoring surrounding whitespace) as missing.
# The column is deliberately NOT cast with .astype(str) first: that cast can
# turn genuinely missing cells into the literal string "nan", which isnull()
# no longer counts as missing and which would be written to the CSV as text.
# Only string cells are inspected; every other value is left untouched.
topics_is_empty_list = (
    df["topics"]
    .apply(lambda value: isinstance(value, str) and value.strip() == "[]")
    .astype(bool)
)
df["topics"] = df["topics"].mask(topics_is_empty_list)

# Size of the dataset before de-duplication: row count and distinct ids.
total_rows = len(df)
unique_ids = df["id"].nunique()

print(f"Total Rows Before: {total_rows}")
print(f"Unique IDs Before: {unique_ids}")

# Per-column missing-value counts, plus the same figure as a percentage of
# all rows. The counts are computed once and reused for both columns.
missing_counts = df.isnull().sum()
missing_values_percent = (missing_counts / len(df)) * 100
missing_summary = pd.concat(
    {"Count": missing_counts, "Percentage": missing_values_percent},
    axis=1,
)

print("Missing Values before:\n", missing_summary)

# Keep only the first row seen for each repository id.
df = df.drop_duplicates(subset="id", keep="first")

print("---------------------------------------------------------------------")

# Size of the dataset after de-duplication: row count and distinct ids.
total_rows_after = len(df)
unique_ids_after = df["id"].nunique()

print(f"Total Rows after duplicates removed: {total_rows_after}")
print(f"Unique IDs after duplicates removed: {unique_ids_after}")

# Per-column missing-value counts and percentages on the de-duplicated frame.
missing_counts_after = df.isnull().sum()
missing_values_percent = (missing_counts_after / len(df)) * 100
missing_summary_after = pd.concat(
    {"Count": missing_counts_after, "Percentage": missing_values_percent},
    axis=1,
)

print("Missing Values after duplicates removed:\n", missing_summary_after)

print("---------------------------------------------------------------------")

# Save the de-duplicated dataset to a new CSV file, without the row index.
output_path = "github_repos_removed_duplicates.csv"
df.to_csv(output_path, index=False)

# Build the confirmation from output_path so the message always names the
# file that was actually written.
print(f"Cleaned dataset saved as '{output_path}'")

Total Rows Before: 705872
Unique IDs Before: 384841
Missing Values before:
               Count  Percentage
id                0    0.000000
name              0    0.000000
full_name         0    0.000000
html_url          0    0.000000
description  386376   54.737403
created_at        0    0.000000
size              0    0.000000
language     232791   32.979209
topics       658320   93.263368
---------------------------------------------------------------------
Total Rows after duplicates removed: 384841
Unique IDs after duplicates removed: 384841
Missing Values after duplicates removed:
               Count  Percentage
id                0    0.000000
name              0    0.000000
full_name         0    0.000000
html_url          0    0.000000
description  211313   54.909170
created_at        0    0.000000
size              0    0.000000
language     125737   32.672454
topics       359402   93.389738
---------------------------------------------------------------------
✅ Cleaned data