### Size of the GitHub repository

In [1]:
import subprocess
import os
import pandas as pd

# Display settings: show the full content of every cell (no column-width
# truncation) when DataFrames are rendered.
pd.options.display.max_colwidth = None

In [2]:
# URL of the repository under analysis. The local checkout is expected in a
# directory named after the URL's last segment with ".git" removed.
repo_url = 'https://github.com/guillermo-navas-palencia/optbinning.git'
repo_name = repo_url.split('/')[-1].replace('.git', '')

# Column layout of the per-file table. Declared explicitly so the DataFrame
# keeps its schema (and the group-bys below keep working) even when no file
# was found.
FILE_STATS_COLUMNS = ["Directory", "File Name", "Line Count", "Category"]


def _categorize(relative_path):
    """Return the category label for a path relative to the repository root.

    NOTE(review): the Git check is a plain substring match, so besides the
    contents of ``.git/`` it also labels paths such as ``.gitignore`` or
    ``.github/...`` as "Git-related" -- confirm that this is intended.
    """
    if ".git" in relative_path:
        return "Git-related"
    if relative_path.endswith('.py'):
        return "Python"
    if relative_path.endswith('.ipynb'):
        return "Jupyter Notebook"
    return "Other"


def _count_lines(file_path):
    """Count the lines of ``file_path`` read as UTF-8 text.

    Undecodable bytes are ignored, so binary files (images, Git pack files,
    ...) are counted the same way; for those the result is only the number of
    newline-delimited chunks, not a meaningful line count.
    """
    with open(file_path, 'r', encoding='utf-8', errors='ignore') as f:
        return sum(1 for _ in f)


def collect_file_stats(repo_dir):
    """Walk ``repo_dir`` and build one record per readable file.

    Parameters
    ----------
    repo_dir : str
        Root directory of the local repository checkout.

    Returns
    -------
    list of dict
        One dict per file with the keys "Directory" (relative to
        ``repo_dir``, "." for the root), "File Name", "Line Count" and
        "Category". Empty when ``repo_dir`` does not exist or holds no files.

    Files that cannot be opened or read are reported on stdout and skipped.
    """
    records = []
    for root, _dirs, files in os.walk(repo_dir):
        directory_name = os.path.relpath(root, repo_dir)
        for file in files:
            file_path = os.path.join(root, file)
            try:
                line_count = _count_lines(file_path)
            except OSError as e:
                # Best effort: one unreadable file must not abort the scan.
                print(f"Error reading {file_path}: {e}")
                continue
            records.append({
                "Directory": directory_name,
                "File Name": file,
                "Line Count": line_count,
                # Categorise on the repo-relative path so the location of the
                # checkout itself cannot influence the label.
                "Category": _categorize(os.path.relpath(file_path, repo_dir)),
            })
    return records


# os.walk silently yields nothing for a missing directory; say so explicitly
# instead of presenting an empty table without explanation.
if not os.path.isdir(repo_name):
    print(f"Repository directory '{repo_name}' not found; clone {repo_url} first.")

# List holding the per-file information
files_data = collect_file_stats(repo_name)

# Create a pandas DataFrame with a fixed schema (valid even when empty)
df = pd.DataFrame(files_data, columns=FILE_STATS_COLUMNS).astype({"Line Count": "int64"})

# Number of files and total line count per category
df.groupby(['Category']).agg({'File Name': ['count'],
                              'Line Count': ['sum']}).reset_index()

Unnamed: 0_level_0,Category,File Name,Line Count
Unnamed: 0_level_1,Unnamed: 1_level_1,count,sum
0,Git-related,28,86819
1,Jupyter Notebook,20,36464
2,Other,70,20318
3,Python,100,31174


In [5]:
# Directory size: for every directory, the number of files it holds directly
# ("count") and the combined line count of those files ("Line Count sum").
# Named aggregation yields the flat column names directly.
kala = (
    df.groupby('Directory')
    .agg(**{
        'count': ('File Name', 'count'),
        'Line Count sum': ('Line Count', 'sum'),
    })
    .reset_index()
)

# Ten directories with the most lines; the final reset_index() keeps the
# position in `kala` as an "index" column next to the new 0..9 ranking.
kala.sort_values(by='Line Count sum', ascending=False).iloc[:10].reset_index()

Unnamed: 0,index,Directory,count,Line Count sum
0,7,.git\objects\pack,2,85726
1,15,doc\source\tutorials,21,36516
2,17,optbinning\binning,23,11661
3,13,doc\source\_images,10,9945
4,28,tests\results,27,6010
5,25,tests,18,4653
6,19,optbinning\binning\multidimensional,10,4102
7,18,optbinning\binning\distributed,9,2567
8,20,optbinning\binning\piecewise,8,2520
9,26,tests\data,2,2253


In [4]:
# Largest files: the 20 rows with the highest line count. reset_index() keeps
# each file's original row label from `df` as an "index" column.
df.sort_values(by='Line Count', ascending=False).iloc[:20].reset_index()

Unnamed: 0,index,Directory,File Name,Line Count,Category
0,30,.git\objects\pack,pack-75c2c9b8daa2b9bdb4a453998b20a9ac78ae3ecf.pack,84714,Git-related
1,84,doc\source\_images,binning_data_stream.gif,8737,Other
2,63,doc\source\tutorials,tutorial_binning_2d.ipynb,4964,Jupyter Notebook
3,59,doc\source\tutorials,tutorial_binary.ipynb,3794,Jupyter Notebook
4,62,doc\source\tutorials,tutorial_binary_under_uncertainty.ipynb,3762,Jupyter Notebook
5,68,doc\source\tutorials,tutorial_continuous.ipynb,2882,Jupyter Notebook
6,70,doc\source\tutorials,tutorial_counterfactual_binary_target.ipynb,2358,Jupyter Notebook
7,77,doc\source\tutorials,tutorial_scorecard_monitoring.ipynb,2112,Jupyter Notebook
8,104,optbinning\binning,binning_statistics.py,2089,Python
9,73,doc\source\tutorials,tutorial_piecewise_binary.ipynb,2044,Jupyter Notebook
