In [1]:
import os
import git
import re
import tqdm
import time
from IPython.display import display, clear_output
import sys
import shutil

In [2]:
repo_list = ['https://github.com/matplotlib/matplotlib.git',
             'https://github.com/scikit-learn/scikit-learn.git',
             'https://github.com/numpy/numpy.git',
             'https://github.com/pandas-dev/pandas.git',
             'https://github.com/django/django.git',
             'https://github.com/scipy/scipy.git',
             'https://github.com/pallets/flask.git',
             'https://github.com/psf/requests.git']

In [3]:
class ProgressBar(git.RemoteProgress):

    def __init__(self) -> None:
        super().__init__()

        # create tqdm object
        self.bar = tqdm.tqdm()

        # record start time
        self.start_time = time.time()
        
        # create status_printer function
        self.printer = self.bar.status_printer(sys.stdout)

    # update gets called when git fetch progress changes
    def update(self, op_code, cur_count, max_count=None, message=""):

        elapsed_time = time.time() - self.start_time
        
        progress = self.bar.format_meter(n=cur_count, total=max_count, elapsed=elapsed_time)
        
        self.printer(progress)

In [4]:
pattern = re.compile('[a-z]+')

cleaned_python = open('python.txt', 'w')

# loop over repository urls
for i in range(len(repo_list)):

    # create a repository to clone into
    temp_repo = git.Repo.init(os.path.join(os.getcwd(), 'temp'))

    origin = temp_repo.create_remote('origin', repo_list[i])

    print('Fetching ' + repo_list[i] + '...')

    origin.fetch(progress=ProgressBar())
    
    temp_repo.create_head('main', origin.refs.main)
    temp_repo.heads.main.set_tracking_branch(origin.refs.main)
    temp_repo.heads.main.checkout()

    origin.pull()

    for root, dirs, files in os.walk('temp'):
    
        for file in files:
            name_ext = file.split('.')

            if len(name_ext)==1:
                pass

            elif name_ext[1] == 'py':
                f = open(os.path.join(root, file), 'r')

                for line in f:
                    cleaned_python.write(' '.join(pattern.findall(line.lower())) + '\n')

                f.close()

    # clean up repository
    shutil.rmtree('temp')

    pass

cleaned_python.close()

Fetching https://github.com/matplotlib/matplotlib.git...


0it [00:00, ?it/s]

100%|██████████| 209241.0/209241.0 [03:20<00:00, 1042.43it/s]

0it [03:21, ?it/s]


Fetching https://github.com/scikit-learn/scikit-learn.git...


0it [00:00, ?it/s]

100%|██████████| 171697.0/171697.0 [01:15<00:00, 2264.48it/s]

0it [01:16, ?it/s]


Fetching https://github.com/numpy/numpy.git...


0it [00:00, ?it/s]

100%|██████████| 185493.0/185493.0 [00:56<00:00, 3309.26it/s]

0it [00:56, ?it/s]


Fetching https://github.com/pandas-dev/pandas.git...


0it [00:00, ?it/s]

100%|██████████| 284082.0/284082.0 [01:35<00:00, 2962.35it/s]

0it [01:36, ?it/s]


Fetching https://github.com/django/django.git...


0it [00:00, ?it/s]

100%|██████████| 350135.0/350135.0 [00:41<00:00, 8425.55it/s] 

0it [00:43, ?it/s]


Fetching https://github.com/scipy/scipy.git...


0it [00:00, ?it/s]

100%|██████████| 164116.0/164116.0 [01:08<00:00, 2392.71it/s]

0it [01:09, ?it/s]


Fetching https://github.com/pallets/flask.git...


0it [00:00, ?it/s]

100%|██████████| 15583.0/15583.0 [00:03<00:00, 4073.71it/s]

0it [00:03, ?it/s]


Fetching https://github.com/psf/requests.git...


0it [00:00, ?it/s]

100%|██████████| 16551.0/16551.0 [00:02<00:00, 5895.08it/s]

0it [00:02, ?it/s]
