In [5]:
import pandas as pd
import re

# Load dataset
df = pd.read_csv("github_repos_removed_duplicates.csv")

# Count total rows
total_rows = df.shape[0]

# Count unique IDs
unique_ids = df["id"].nunique()

print(f"Total Rows Before: {total_rows}")
print(f"Unique IDs Before: {unique_ids}")

# Check missing values with percentages
missing_values_percent = (df.isnull().sum() / len(df)) * 100

print("Missing Values before:\n", 
      pd.concat([df.isnull().sum(), missing_values_percent], axis=1, keys=['Count', 'Percentage']))

print("-----------------------------------------------------------------------------------")

# Define size brackets (in KB)
bins = [0, 10, 50, 100, 500, 1000, 5000, 10000, df["size"].max()]
labels = ["0-10", "10-50", "50-100", "100-500", "500-1000", "1K-5K", "5K-10K", "10K+"]

# Create a new column for size brackets
df["size_bracket"] = pd.cut(df["size"], bins=bins, labels=labels, right=False)

# Group by size bracket and calculate statistics
summary = df.groupby("size_bracket").agg(
    total_repos=("id", "count"),
    missing_description=("description", lambda x: x.isna().mean() * 100),
    missing_topics=("topics", lambda x: x.isna().mean() * 100),
    missing_language=("language", lambda x: x.isna().mean() * 100),
).reset_index()

# Print summary
print(summary)

# Remove repositories smaller than 100 KB
df = df[df["size"] >= 100]

# Save the filtered dataset
df.to_csv("github_repos_over_100kb.csv", index=False)


Total Rows Before: 384841
Unique IDs Before: 384841
Missing Values before:
               Count  Percentage
id                0    0.000000
name              0    0.000000
full_name         0    0.000000
html_url          0    0.000000
description  211313   54.909170
created_at        0    0.000000
size              0    0.000000
language     125737   32.672454
topics       359402   93.389738
-----------------------------------------------------------------------------------


  summary = df.groupby("size_bracket").agg(


  size_bracket  total_repos  missing_description  missing_topics  \
0         0-10       139692            63.270624       97.353463   
1        10-50        69982            50.161470       93.632648   
2       50-100        31336            44.195175       92.867628   
3      100-500        49298            50.020285       90.060449   
4     500-1000        17289            52.096709       89.108682   
5        1K-5K        35350            52.758133       89.442716   
6       5K-10K        12703            53.837676       89.860663   
7         10K+        29190            50.774238       88.872902   

   missing_language  
0         57.169344  
1         22.041382  
2         26.094588  
3         12.600917  
4         15.998612  
5         18.016973  
6         18.310635  
7         15.758822  
