## Python Source Code Scraping

In [1]:
import os
import git
import re
import tqdm
import time
from IPython.display import display, clear_output
import sys
import shutil
import ast
import numpy as np
import pandas as pd

Create a list of eight popular python package repositories (matplotlib, sklearn, numpy, pandas, django, scipy, flask, and requests):

In [2]:
repo_urls = ['https://github.com/matplotlib/matplotlib.git',
             'https://github.com/scikit-learn/scikit-learn.git',
             'https://github.com/numpy/numpy.git',
             'https://github.com/pandas-dev/pandas.git',
             'https://github.com/django/django.git',
             'https://github.com/scipy/scipy.git',
             'https://github.com/pallets/flask.git',
             'https://github.com/psf/requests.git']

Define a subclass of GitPython's RemoteProgress to track progress when fetching sources:

In [3]:
class ProgressBar(git.RemoteProgress):

    def __init__(self) -> None:
        super().__init__()

        # create tqdm object
        self.bar = tqdm.tqdm()

        # record start time
        self.start_time = time.time()
        
        # create status_printer function
        self.printer = self.bar.status_printer(sys.stdout)

    # update gets called when git fetch progress changes
    def update(self, op_code, cur_count, max_count=None, message=""):

        elapsed_time = time.time() - self.start_time
        
        progress = self.bar.format_meter(n=cur_count, total=max_count, elapsed=elapsed_time)
        
        self.printer(progress)

In [4]:
def scrape_py(py, out_file, ast_dict, ast_total):
    '''
    takes two filestreams
    '''

    token_re = re.compile(r'[a-z_]+')
    line_comment_re = re.compile(r'[ \t]*#.*')
    block_comment_re = re.compile(r'(\'\'\')|(\"\"\")')

    code = py.read()
    try:
        ast_tree = ast.parse(code)

        for node in ast.walk(ast_tree):
            if isinstance(node, ast.Name):
                ast_total += 1
                node_id = node.id.lower()
                if node_id in ast_dict:
                    ast_dict[node_id] += 1
                else:
                    ast_dict[node_id] = 1
    
    except SyntaxError:
        pass

    py.seek(0)

    # boolean to track whether we are in a block comment
    in_block_comment = False

    for line in py:

        # check for block comment start/end
        block_matches = block_comment_re.findall(line)

        # if only one match, begin/end multiline comment
        if len(block_matches) == 1:
            in_block_comment = not in_block_comment  
        
        # if two matches, ignore the line
        elif len(block_matches) == 2:
            in_block_comment = False
        
        elif not in_block_comment:

            # remove line/inline comments
            no_comments = line_comment_re.sub('', line)

            # get lowercase alphabetic characters
            clean_line = ' '.join(token_re.findall(no_comments.lower()))+'\n'

            # check for empty line
            if clean_line != '\n':
                out_file.write(clean_line)
    
    return ast_total

Loop over the list of repositories. For each one, create an empty local git repository, clone the remote repository into the local one, then scrape all the python files:

In [5]:
training_data = open('python.txt', 'w')
identifiers = {}
id_total = 0

# loop over repository urls
for i in range(len(repo_urls)):

    # create empty repository to clone into
    temp_repo = git.Repo.init(os.path.join(os.getcwd(), 'temp'))

    # add repo url as remote origin
    origin = temp_repo.create_remote('origin', repo_urls[i])

    # fetch remote objects
    print('Fetching ' + repo_urls[i] + '...')
    origin.fetch(progress=ProgressBar())
    
    # set up local branch to track remote
    temp_repo.create_head('main', origin.refs.main)
    temp_repo.heads.main.set_tracking_branch(origin.refs.main)
    
    # checkout local branch and pull
    temp_repo.heads.main.checkout()
    origin.pull()

    # loop over subdirectories repo directory
    for root, dirs, files in os.walk('temp'):
        
        # loop over files in subdirectory
        for file in files:

            name_ext = file.split('.')

            # ignore files with no extension
            if len(name_ext)==1:
                pass
            
            # check for '.py' files
            elif name_ext[1] == 'py':

                py = open(os.path.join(root, file), 'r')

                id_total = scrape_py(py, training_data, identifiers, id_total)

                py.close()

    # delete repository
    shutil.rmtree('temp')

training_data.close()

Fetching https://github.com/matplotlib/matplotlib.git...


0it [00:00, ?it/s]

100%|██████████| 209264.0/209264.0 [03:51<00:00, 902.96it/s] 

0it [03:52, ?it/s]


Fetching https://github.com/scikit-learn/scikit-learn.git...


0it [00:00, ?it/s]

100%|██████████| 171766.0/171766.0 [01:47<00:00, 1603.77it/s]

0it [01:47, ?it/s]


Fetching https://github.com/numpy/numpy.git...


0it [00:00, ?it/s]

100%|██████████| 185582.0/185582.0 [01:03<00:00, 2911.29it/s]

0it [01:04, ?it/s]


Fetching https://github.com/pandas-dev/pandas.git...


0it [00:00, ?it/s]

100%|██████████| 284310.0/284310.0 [00:52<00:00, 5379.93it/s]

0it [00:53, ?it/s]


Fetching https://github.com/django/django.git...


0it [00:00, ?it/s]

100%|██████████| 350214.0/350214.0 [02:59<00:00, 1955.90it/s]

0it [03:00, ?it/s]


Fetching https://github.com/scipy/scipy.git...


0it [00:00, ?it/s]

100%|██████████| 164178.0/164178.0 [00:27<00:00, 5991.13it/s]

0it [00:28, ?it/s]


Fetching https://github.com/pallets/flask.git...


0it [00:00, ?it/s]

100%|██████████| 15591.0/15591.0 [00:04<00:00, 3470.16it/s]

0it [00:04, ?it/s]


Fetching https://github.com/psf/requests.git...


0it [00:00, ?it/s]

100%|██████████| 16525.0/16525.0 [00:03<00:00, 5130.92it/s]

0it [00:03, ?it/s]


In [22]:
arr = np.asarray(list(identifiers.items()))
count = np.array(arr[:,1], dtype=int).reshape((arr.shape[0], 1))
rate = count/id_total


df = pd.DataFrame()
df['identifier'] = arr[:,0]
df['total'] = count
df['rate'] = rate

df.sort_values('total', ascending=False, inplace=True)
df.reset_index(inplace=True)
df.drop('index', axis=1, inplace=True)
df.to_csv('identifier_frequency.csv')