In [None]:
# Install the OpenReview client (imported as `openreview` in the next cell) and
# pipreqs (invoked further down as a shell command to generate requirements.txt
# files for cloned repositories).
!pip install openreview-py
!pip install pipreqs

In [None]:
%matplotlib inline

import os
import re
import subprocess
import sys
from random import choice
from urllib.parse import urlparse

import matplotlib.pyplot as plt
import openreview
import pandas as pd
import requests
from wordcloud import WordCloud

In [None]:
client = openreview.Client(baseurl="https://openreview.net")

# Index every note returned for the blind-submission invitation by its id so a
# decision can be mapped back to its paper through the decision's `forum`.
blind_notes = dict(
    (note.id, note)
    for note in openreview.tools.iterget_notes(
        client,
        invitation="ICLR.cc/2020/Conference/-/Blind_Submission",
        details="original",
    )
)

# All notes posted under any paper's Decision invitation.
all_decision_notes = openreview.tools.iterget_notes(
    client, invitation="ICLR.cc/2020/Conference/Paper.*/-/Decision"
)

# Keep the submission behind every decision whose text contains "Accept".
accepted_submissions = list(
    blind_notes[decision.forum]
    for decision in all_decision_notes
    if "Accept" in decision.content["decision"]
)

len(accepted_submissions)

In [None]:
# Collect the "code" entry of every accepted submission that has one.
# A submission without that key is an expected case, not an error, so it is
# filtered out explicitly instead of being swallowed by a bare `except:` (which
# would also have hidden genuine bugs such as a misspelled attribute).
code_links = [
    note.content["code"] for note in accepted_submissions if "code" in note.content
]
code_present = len(code_links)

# One summary line instead of one "Unexpected error" line per missing link.
print("Submissions without a code link:", len(accepted_submissions) - code_present)

In [None]:
# Number of accepted submissions whose content has a "code" entry.
code_present

In [None]:
# Spot check: split one randomly chosen code link into its URL components.
urlparse(choice(code_links))

In [None]:
# One-column table ("links") holding the collected code links.
code_links_df = pd.DataFrame({"links": code_links})

In [None]:
# Extract the host part of every link. Host names are case-insensitive and
# "www.github.com" serves the same site as "github.com", so both are normalised
# here; otherwise such links are counted as separate domains and silently
# dropped by the `domains == "github.com"` filters used further down.
code_links_df["domains"] = code_links_df.links.apply(
    lambda x: urlparse(x).netloc.lower()
).str.replace(r"^www\.", "", regex=True)

In [None]:
# How many code links point at each domain, most frequent first.
code_links_df.domains.value_counts()

In [None]:
# Not read by any visible cell; kept so the notebook namespace is unchanged.
temp_link = ""


def clean_github_link(link):
    """Turn a GitHub project link into a URL that ``git clone`` accepts.

    Parameters
    ----------
    link : str
        Link as entered by the paper authors. It may carry surrounding
        whitespace, a trailing slash, a query/fragment, or point below the
        repository root (``.../tree/master/src``).

    Returns
    -------
    str
        ``<scheme>://<host>/<owner>/<repo>.git`` when the link has a host and at
        least an owner and a repository segment. Otherwise the stripped link
        (without trailing slashes) with ``.git`` appended unless already there.
    """
    link = link.strip()
    parsed = urlparse(link)
    segments = [segment for segment in parsed.path.split("/") if segment]

    if parsed.netloc and len(segments) >= 2:
        owner, repo = segments[0], segments[1]
        # Avoid producing "repo.git.git" for links that already end in ".git".
        if repo.endswith(".git"):
            repo = repo[: -len(".git")]
        # Rebuilding from owner/repo drops trailing slashes, sub-paths, queries
        # and fragments, none of which belong in a clone URL.
        return "{}://{}/{}/{}.git".format(parsed.scheme, parsed.netloc, owner, repo)

    # Not recognisable as host/owner/repo: fall back to the plain suffix rule.
    link = link.rstrip("/")
    return link if link.endswith(".git") else link + ".git"


# Clone URLs for every link whose domain is exactly "github.com".
github_repo_links = (
    code_links_df.loc[code_links_df["domains"] == "github.com", "links"]
    .apply(clean_github_link)
    .values
)

In [None]:
# takes about 24 minutes to download
# The links are author-supplied text, so they are passed to git as a single
# argv element instead of being interpolated into a shell command line, where
# characters such as ";" or "$(...)" would be executed.
clone_failures = []
for link in github_repo_links:
    result = subprocess.run(
        # "--" stops git from treating a link that starts with "-" as an option.
        ["git", "clone", "--depth", "1", "--quiet", "--", str(link)],
        stdin=subprocess.DEVNULL,
        capture_output=True,
        text=True,
    )
    if result.returncode != 0:
        # Keep going: one unreachable repository should not abort the whole run.
        clone_failures.append(link)
        print("git clone failed for", link, "-", result.stderr.strip())

In [None]:
# Ten most frequent first path segments (the account name in a
# github.com/<account>/<repo> link) among the github.com links.
(
    code_links_df.loc[code_links_df["domains"] == "github.com", "links"]
    .apply(lambda url: urlparse(url).path.split("/")[1])
    .value_counts()
    .head(10)
)

In [None]:
root = "."
# Names of the entries directly under `root` that are directories.
dirlist = list(
    filter(lambda item: os.path.isdir(os.path.join(root, item)), os.listdir(root))
)
print(dirlist)

In [None]:
# Exclude the folders that are not cloned repositories (presumably the
# environment's own default folders — confirm). Filtering instead of
# list.remove() keeps the cell re-runnable: remove() raises ValueError when the
# entry is already gone or was never there.
dirlist = [name for name in dirlist if name not in (".config", "sample_data")]

In [None]:
# Number of directory names currently in dirlist.
len(dirlist)

In [None]:
# takes about 10 minutes to run
for repo in dirlist:
    path = "/content/" + repo
    if os.path.exists(path + "/requirements.txt"):
        pass
    else:
        !pipreqs $path

In [None]:
# Count the directories that now contain a requirements.txt and those that do
# not. Paths are resolved against `root` (where `dirlist` was listed from)
# instead of a hard-coded "/content/" prefix.
has_req_cnt = sum(
    os.path.exists(os.path.join(root, repo, "requirements.txt")) for repo in dirlist
)
no_req_cnt = len(dirlist) - has_req_cnt

In [None]:
# (directories with a requirements.txt, directories without one)
has_req_cnt, no_req_cnt

In [None]:
# Peek at one requirements file (the fifth directory) to see its line format.
# The path is resolved against `root`, where `dirlist` was listed from, instead
# of a hard-coded "/content/" prefix.
with open(os.path.join(root, dirlist[4], "requirements.txt"), "r") as f:
    tools = f.readlines()

In [None]:
# Read every directory's requirements.txt into two parallel lists: the
# directory name and its requirement lines joined with "," and lower-cased.
all_repo_names = []
all_tool_names = []
for repo in dirlist:
    # Resolve against `root` (where `dirlist` came from), not a fixed "/content/".
    requirements_path = os.path.join(root, repo, "requirements.txt")
    try:
        with open(requirements_path, "r") as f:
            tools = f.readlines()
    except (OSError, UnicodeDecodeError) as err:
        # Only the failures reading a file can cause are skipped (missing or
        # unreadable file, undecodable bytes); anything else is a real bug and
        # is allowed to surface instead of being hidden by a bare `except:`.
        print("Unexpected error for ", repo, type(err))
        continue

    all_repo_names.append(repo)
    all_tool_names.append(",".join(tools).lower())

In [None]:
# One row per directory: its name and its comma-joined, lower-cased
# requirements text.
all_tools = pd.DataFrame(
    {"all_repo_names": all_repo_names, "all_tool_names": all_tool_names}
)
all_tools.head()

In [None]:
# Save the table to all_tools.csv in the working directory, without the index.
all_tools.to_csv("all_tools.csv", index=False)

In [None]:
# Reload the saved table. keep_default_na=False stops pandas from turning empty
# fields (a directory with an empty requirements.txt) or names such as "NA" /
# "null" into NaN, which would break the string handling in later cells.
all_tools = pd.read_csv("all_tools.csv", keep_default_na=False)

In [None]:
# Preview the first rows of the table.
all_tools.head()

In [None]:
# (rows, columns) of the table.
all_tools.shape

In [None]:
def cleaner(tool_list):
    """Reduce requirement specifiers to a comma-separated string of bare names.

    Parameters
    ----------
    tool_list : iterable of str
        Pieces of a requirements file, such as ``"numpy==1.18.1"`` or
        ``"torch>=1.2"``. A non-iterable value (for example the NaN pandas uses
        for a missing entry, or None) is treated as "no requirements".

    Returns
    -------
    str
        The leading run of word characters (letters, digits, underscore) of
        every piece that starts with one, joined with ",". Pieces that start
        with anything else, and entries that are not strings, are skipped.
        Returns "" when nothing qualifies.
    """
    try:
        pieces = iter(tool_list)
    except TypeError:
        # Missing value instead of a list: nothing to clean.
        return ""

    names = []
    for tool in pieces:
        if not isinstance(tool, str):
            continue
        # NOTE(review): a name is cut at the first non-word character, so
        # "scikit-learn" becomes "scikit" — confirm this is intended.
        match = re.match(r"\w+", tool)
        if match:
            names.append(match.group(0))
    return ",".join(names)


# Split each row's requirement text on "," and pass the pieces to cleaner().
all_tools["all_tool_names_cleaned"] = (
    all_tools["all_tool_names"].str.split(",").apply(cleaner)
)

In [None]:
# Preview the table, now including the all_tool_names_cleaned column.
all_tools.head()

In [None]:
# Number of rows whose cleaned tool string contains "torch". This is a pattern
# (substring) match, so names such as "torchvision" count as well.
all_tools.all_tool_names_cleaned.str.contains("torch").sum()

In [None]:
def give_score(tool_name, offset=0):
    """Print the count and percentage of rows that mention ``tool_name``.

    Reads the module-level ``all_tools`` table. ``tool_name`` is handed to
    ``str.contains`` on the ``all_tool_names_cleaned`` column, ``offset`` is
    added to the resulting count, and the percentage is taken over all rows of
    the table and rounded to four decimals. Nothing is returned.
    """
    total_rows = all_tools.shape[0]
    num = all_tools.all_tool_names_cleaned.str.contains(tool_name).sum() + offset
    percentage = round((num / total_rows) * 100, 4)
    message = "Count of {} is {} and total usage is {}%".format(
        tool_name, num, percentage
    )
    print(message)

In [None]:
# Compare how often torch, tensorflow and keras are mentioned.
give_score("torch")
print()
# NOTE(review): offset=12 adds 12 to the tensorflow count; the reason for this
# manual correction is not shown in this notebook — confirm.
give_score("tensorflow", offset=12)
print()
give_score("keras")

In [None]:
# Count and share of rows whose cleaned tool string contains "transformers".
give_score("transformers")

In [None]:
# Count and share of rows whose cleaned tool string contains "tensorboard".
give_score("tensorboard")

In [None]:
# Count and share of rows whose cleaned tool string contains "gym".
give_score("gym")

In [None]:
# Count and share of rows whose cleaned tool string contains "networkx".
give_score("networkx")

In [None]:
# Shape of the array of distinct tool names across all rows, i.e. how many
# different names occur.
all_tools.all_tool_names_cleaned.str.split(",", expand=True).stack().unique().shape

In [None]:
# The 50 most frequently listed tool names with their occurrence counts.
all_tools.all_tool_names_cleaned.str.split(",", expand=True).stack().value_counts()[:50]

In [None]:
# Bar chart of the ten most frequently listed tool names.
(
    all_tools["all_tool_names_cleaned"]
    .str.split(",", expand=True)
    .stack()
    .value_counts()
    .iloc[:10]
    .plot.bar()
)

In [None]:
# Word cloud of tool names, sized by how often each one is listed.
all_tool_string = ",".join(all_tools.all_tool_names_cleaned)

# collocations=False: by default WordCloud also counts pairs of adjacent words
# (bigrams). In a flat list of package names that merges neighbouring tools
# into artificial two-word entries and distorts the per-tool frequencies.
wordcloud = WordCloud(background_color="white", max_words=100, collocations=False)
wordcloud.generate(all_tool_string)

plt.figure(figsize=(10, 20))
plt.imshow(wordcloud)
plt.axis("off")
plt.show()