In [5]:
from git import Repo
import os
import re
import string
import gensim
import ast
from collections import Counter

In [3]:
# specify the list of repository urls to crawl
repository_list = ['https://github.com/matplotlib/matplotlib.git',
                    'https://github.com/scikit-learn/scikit-learn.git',
                    'https://github.com/numpy/numpy.git',
                    'https://github.com/scipy/scipy.git',
                    'https://github.com/pallets/flask.git',
                    'https://github.com/psf/requests.git',
                    'https://github.com/scrapy/scrapy']

# specify the path of the output text file
output_file = "python_files.txt"

# loop through all repository urls
for url in repository_list:
    # clone the repository to a temporary directory
    repo_dir = os.path.join(os.getcwd(), "temp")
    Repo.clone_from(url, repo_dir)

    # loop through all Python files in the repository
    for root, dirs, files in os.walk(repo_dir):
        for file in files:
            if file.endswith(".py"):
                # get the contents of the Python file
                file_path = os.path.join(root, file)
                with open(file_path, "r", encoding='utf-8') as f:
                    file_content = f.read()
                # write the contents of the Python file to the output file
                with open(output_file, "a", encoding='utf-8') as f:
                    f.write(f"Repository: {url}\nFile: {file_path}\n\n{file_content}\n\n{'-'*50}\n\n")

    # delete the temporary directory
    os.system(f"rm -rf {repo_dir}")

In [15]:
# Count the number of lines of code in this file
num_lines = 0

with open('python_files.txt', 'r', encoding='utf-8') as file:
    lines = file.readlines()
    num_lines = len(lines)

print(f"Lines: {num_lines}")

Lines: 1439996


In [7]:
# Creates a function that counts the number of words in output file
def tokenize(path):

    token_list = []

    # count the number of tokens in the file
    with open(path, "r", encoding='utf-8') as f:
        file_content = f.read()

    tokens = re.findall(r'\b\w+\b', file_content)
    token_list.append(tokens)
    num_tokens = len(tokens)
    print(f"Number of tokens: {num_tokens}")

    return token_list

tokenized_list = tokenize("python_files.txt")


Number of tokens: 5978486


In [8]:
# Train your Gensim Word2Vec model with the tokenized lines of code
model = gensim.models.Word2Vec(
    window=10,
    min_count=2
)

model.build_vocab(tokenized_list)

model.train(tokenized_list, total_examples=model.corpus_count, epochs=model.epochs)


(50000, 29892430)

In [9]:
#Explored the trained model by examining the closest_words to “for”
model.wv.most_similar('for')

[('to', 0.9962059855461121),
 ('not', 0.9961488842964172),
 ('f', 0.996028482913971),
 ('in', 0.9957643151283264),
 ('the', 0.9957113862037659),
 ('name', 0.9956578612327576),
 ('return', 0.9954971075057983),
 ('if', 0.9953508377075195),
 ('sphinx', 0.9949526190757751),
 ('this', 0.9948042035102844)]

In [10]:
#Explored the trained model by examining the closest_words to “if”
model.wv.most_similar('if')

[('in', 0.9966590404510498),
 ('to', 0.9966531991958618),
 ('f', 0.9965929388999939),
 ('return', 0.9965319633483887),
 ('app', 0.9964976906776428),
 ('not', 0.9964142441749573),
 ('name', 0.9961705207824707),
 ('the', 0.9958152174949646),
 ('path', 0.9957109689712524),
 ('as', 0.9956626892089844)]

In [11]:
#Examining the popular identifier names like "math" and "numpy" and the similarity between them
model.wv.similarity('math','numpy')

0.32239005

## Extension

In [12]:
import ast
from collections import Counter

# specify the path of the Python file to be analyzed
file_path = "python_files.txt"

# read the file and get the abstract syntax tree
with open(file_path, "r", encoding='utf-8') as f:
    code = f.read()

# get a list of all identifiers in the ast tree
identifiers = []
for line in code.splitlines():
    try:
        # get the abstract syntax tree of the file
        ast_tree = ast.parse(line)
        for node in ast.walk(ast_tree):
            if isinstance(node, ast.Name):
                identifiers.append(node.id)
    except SyntaxError:
        continue

# count the frequency of each identifier
identifier_counts = Counter(identifiers)

# print the 50 most common identifiers
for identifier, count in identifier_counts.most_common(50):
    print(f"{identifier}: {count}")

np: 4721
plt: 3864
ax: 2673
fig: 1659
X: 1301
y: 1206
x: 1059
axs: 766
def_gen: 709
ax1: 539
random_st: 504
ax2: 462
print: 441
t: 375
A: 316
Y: 283
rng: 280
D_arr_0p5: 271
__all__: 268
X_train: 249
n_samples: 240
Z: 238
len: 226
X_test: 219
D_arr_like_0p5: 217
iris: 215
data: 206
clf: 190
i8: 188
s: 184
re: 180
time: 179
y_train: 175
i: 159
b_: 157
int: 156
__name__: 152
y_test: 142
i4: 138
z: 135
_: 128
xx: 128
N: 128
ax3: 127
float: 127
os: 126
b: 126
a: 120
ax0: 118
c: 116


In [17]:
# get the 50 most common identifiers
most_common_identifiers = [identifier for identifier, count in identifier_counts.most_common(50)]

# calculate the similarity between each pair of the 50 most common identifiers
similarities = {}
for i, identifier1 in enumerate(most_common_identifiers):
    for identifier2 in most_common_identifiers[i+1:]:
        similarity = model.wv.similarity(identifier1, identifier2)
        similarities[(identifier1, identifier2)] = similarity

# sort the pairs of identifiers by their similarity score
sorted_pairs = sorted(similarities.items(), key=lambda x: x[1], reverse=True)

# print the 10 most similar pairs of identifiers
print("Most similar pairs of identifiers:")
for pair, similarity in sorted_pairs[:10]:
    print(f"{pair[0]} and {pair[1]}: {similarity:.2f}")

# print the 10 most dissimilar pairs of identifiers
print("\nMost dissimilar pairs of identifiers:")
for pair, similarity in sorted_pairs[-10:]:
    print(f"{pair[0]} and {pair[1]}: {similarity:.2f}")


Most similar pairs of identifiers:
os and a: 0.99
print and os: 0.98
print and a: 0.98
t and a: 0.97
t and os: 0.97
data and os: 0.97
data and a: 0.97
print and t: 0.97
print and data: 0.96
t and data: 0.96

Most dissimilar pairs of identifiers:
x and xx: -0.23
X_test and ax0: -0.24
iris and b: -0.24
ax1 and random_st: -0.24
Y and y_train: -0.24
plt and X_test: -0.24
fig and b_: -0.25
X_test and iris: -0.27
A and i4: -0.31
plt and y_train: -0.37
