## Python Source Code Scraping

In [1]:
import os
import git
import re
import tqdm
import time
from IPython.display import display, clear_output
import sys
import shutil
import ast
import numpy as np
import pandas as pd

In [2]:
def clean_snippet(snippet, line_breaks=True):
    token_re = re.compile(r'[a-z_]+')
    line_comment_re = re.compile(r'[ \t]*#.*')
    block_comment_re = re.compile(r'(\'\'\')|(\"\"\")')

    output = ''

    lines = snippet.split('\n')
    # boolean to track whether we are in a block comment
    in_block_comment = False

    for line in lines:

        # check for block comment start/end
        block_matches = block_comment_re.findall(line)

        # if only one match, begin/end multiline comment
        if len(block_matches) == 1:
            in_block_comment = not in_block_comment  
        
        # if two matches, ignore the line
        elif len(block_matches) == 2:
            in_block_comment = False
        
        elif not in_block_comment:

            # remove line/inline comments
            no_comments = line_comment_re.sub('', line)

            # get lowercase alphabetic characters
            clean_line = ' '.join(token_re.findall(no_comments.lower()))

            if line_breaks:
                clean_line += '\n'
            else:
                clean_line += ' '

            # check for empty line
            if clean_line != '\n':
                output += clean_line
    return output

In [3]:
py = open('example.py', 'r')
code = py.read()
py.close()

ast_tree = ast.parse(code)

for node in ast.walk(ast_tree):
    if isinstance(node, ast.FunctionDef):
        print(clean_snippet(ast.unparse(node), line_breaks=False))

def has_flag self flagname import tempfile with tempfile namedtemporaryfile w suffix cpp as f f write int main int argc char argv return try self compile f name extra_postargs flagname except exception as exc if type exc __name__ compileerror raise return false return true 
def update_matplotlibrc path template_lines path read_text encoding utf splitlines true backend_line_idx idx for idx line in enumerate template_lines if template_lines backend_line_idx path write_text join template_lines encoding utf 
def finalize_options self cppflags os getenv cppflags if cppflags and coverage in cppflags self build_temp build self distribution ext_modules ext for package in good_packages for ext in package get_extensions super finalize_options 
def add_optimization_flags self env os environ copy if sys platform win return env enable_lto setupext config getboolean libs enable_lto fallback none  def prepare_flags name enable_lto if name in os environ if fno lto in os environ name if enable_lto is t

Create a list of popular Python packages:
- matplotlib
- sklearn
- numpy
- pandas
- django
- scipy
- flask
- requests


In [4]:
#"""
repo_urls = ['https://github.com/matplotlib/matplotlib.git',
             'https://github.com/scikit-learn/scikit-learn.git',
             'https://github.com/numpy/numpy.git',
             'https://github.com/pandas-dev/pandas.git',
             'https://github.com/django/django.git',
             'https://github.com/scipy/scipy.git',
             'https://github.com/pallets/flask.git',
             'https://github.com/psf/requests.git']
#"""

#repo_urls = ['https://github.com/numpy/numpy.git']

Define a subclass of GitPython's RemoteProgress to track progress when fetching sources:

In [5]:
class ProgressBar(git.RemoteProgress):

    def __init__(self) -> None:
        super().__init__()

        # create tqdm object
        self.bar = tqdm.tqdm()

        # record start time
        self.start_time = time.time()
        
        # create status_printer function
        self.printer = self.bar.status_printer(sys.stdout)

    # update gets called when git fetch progress changes
    def update(self, op_code, cur_count, max_count=None, message=""):

        elapsed_time = time.time() - self.start_time
        
        progress = self.bar.format_meter(n=cur_count, total=max_count, elapsed=elapsed_time)
        
        self.printer(progress)

Define a function to read a python file and
1. count the occurences of identifier names using ASTs
2. generate a text file to train Word2Vec on

The function removes/ignores line and block comments so that the output more closely represents the logical structure of the code

In [6]:
def scrape_py(py, out_file, ast_dict, ast_total, line_breaks=True):
    '''
    takes two filestreams
    '''

    token_re = re.compile(r'[a-z_]+')
    line_comment_re = re.compile(r'[ \t]*#.*')
    block_comment_re = re.compile(r'(\'\'\')|(\"\"\")')

    code = py.read()
    try:
        ast_tree = ast.parse(code)

        for node in ast.walk(ast_tree):
            
            if isinstance(node, ast.Name):
                ast_total += 1
                node_id = node.id.lower()
                if node_id in ast_dict:
                    ast_dict[node_id] += 1
                else:
                    ast_dict[node_id] = 1
            
            elif isinstance(node, ast.FunctionDef):
                out_file.write(clean_snippet(ast.unparse(node), line_breaks) + '\n')

    except SyntaxError:
        pass
    
    return ast_total

Loop over the list of repositories. For each one, create an empty local git repository, clone the remote repository into the local one, then scrape all the python files:

In [7]:
training = open('python_line_breaks.txt', 'w')
training_no_line_breaks = open('python_no_breaks.txt', 'w')
identifiers = {}
id_total = 0

# loop over repository urls
for i in range(len(repo_urls)):

    # create empty repository to clone into
    temp_repo = git.Repo.init(os.path.join(os.getcwd(), 'temp'))

    # add repo url as remote origin
    origin = temp_repo.create_remote('origin', repo_urls[i])

    # fetch remote objects
    print('Fetching ' + repo_urls[i] + '...')
    origin.fetch(progress=ProgressBar())
    
    # set up local branch to track remote
    temp_repo.create_head('main', origin.refs.main)
    temp_repo.heads.main.set_tracking_branch(origin.refs.main)
    
    # checkout local branch and pull
    temp_repo.heads.main.checkout()
    origin.pull()

    # loop over subdirectories repo directory
    for root, dirs, files in os.walk('temp'):
        
        # loop over files in subdirectory
        for file in files:

            name_ext = file.split('.')

            # ignore files with no extension
            if len(name_ext)==1:
                pass
            
            # check for '.py' files
            elif name_ext[1] == 'py':

                py = open(os.path.join(root, file), 'r')

                id_total = scrape_py(py, training, identifiers, id_total, line_breaks=True)
                py.seek(0)
                scrape_py(py, training_no_line_breaks, identifiers, id_total, line_breaks=False)

                py.close()

    # delete repository
    shutil.rmtree('temp')

training.close()
training_no_line_breaks.close()

Fetching https://github.com/matplotlib/matplotlib.git...


0it [00:00, ?it/s]

100%|██████████| 211860.0/211860.0 [00:43<00:00, 4834.66it/s]

0it [00:44, ?it/s]


Fetching https://github.com/scikit-learn/scikit-learn.git...


0it [00:00, ?it/s]

100%|██████████| 174071.0/174071.0 [01:46<00:00, 1629.74it/s]

0it [01:47, ?it/s]


Fetching https://github.com/numpy/numpy.git...


0it [00:00, ?it/s]

100%|██████████| 187608.0/187608.0 [01:30<00:00, 2078.25it/s]

0it [01:31, ?it/s]


Fetching https://github.com/pandas-dev/pandas.git...


0it [00:00, ?it/s]

100%|██████████| 290126.0/290126.0 [04:05<00:00, 1183.57it/s]

0it [04:06, ?it/s]


Fetching https://github.com/django/django.git...


0it [00:00, ?it/s]

100%|██████████| 352058.0/352058.0 [01:30<00:00, 3908.40it/s]

0it [01:31, ?it/s]


Fetching https://github.com/scipy/scipy.git...


0it [00:00, ?it/s]

100%|██████████| 165584.0/165584.0 [00:34<00:00, 4793.12it/s]

0it [00:35, ?it/s]


Fetching https://github.com/pallets/flask.git...


0it [00:00, ?it/s]

100%|██████████| 15716.0/15716.0 [00:01<00:00, 9718.59it/s] 

0it [00:01, ?it/s]


Fetching https://github.com/psf/requests.git...


0it [00:00, ?it/s]

100%|██████████| 16558.0/16558.0 [00:01<00:00, 9373.60it/s] 

0it [00:01, ?it/s]


Output the identifier name data as a CSV file:

In [8]:
arr = np.asarray(list(identifiers.items()))
count = np.array(arr[:,1], dtype=int).reshape((arr.shape[0], 1))
rate = count/id_total


df = pd.DataFrame()
df['identifier'] = arr[:,0]
df['total'] = count
df['rate'] = rate

df.sort_values('total', ascending=False, inplace=True)
df.reset_index(inplace=True)
df.drop('index', axis=1, inplace=True)
df.to_csv('identifier_frequency.csv')