In [24]:
import pandas as pd 
import os
import requests
import re
import math
from collections import Counter
from bs4 import BeautifulSoup
from typing import List

In [42]:
######################## HELPER FUNCTIONS ##################

def text_to_vector(text):
    """Return a bag-of-words ``Counter`` for ``text``.

    Tokens are the maximal runs of regex "word" characters found in ``text``;
    the result maps each distinct token to the number of times it occurs.
    Matching is case-sensitive and no normalisation is applied.
    """
    return Counter(re.findall(r"\w+", text))

def get_cosine(vec1, vec2):
    """Cosine similarity between two term-count mappings.

    ``vec1`` and ``vec2`` map terms to numeric counts (e.g. the ``Counter``
    objects produced by ``text_to_vector``). The result is the dot product
    over the terms present in both mappings divided by the product of the
    two Euclidean norms. When either mapping has zero norm (for example, it
    is empty) the similarity is defined as ``0.0``.
    """
    shared_terms = vec1.keys() & vec2.keys()
    dot_product = sum(vec1[term] * vec2[term] for term in shared_terms)

    squares1 = sum(count ** 2 for count in vec1.values())
    squares2 = sum(count ** 2 for count in vec2.values())
    magnitude = math.sqrt(squares1) * math.sqrt(squares2)

    # Guard against division by zero for empty / all-zero vectors.
    if magnitude:
        return float(dot_product) / magnitude
    return 0.0

def possible_replacement_api(missing_api:str, answer_body: str) -> str:
    """Pick a candidate replacement for ``missing_api`` out of an answer's HTML.

    The text of every ``<code>`` element in ``answer_body`` is treated as a
    candidate. Each candidate is compared with ``missing_api`` using the
    cosine similarity of their word-count vectors, and the candidate with the
    LOWEST similarity is returned.

    Args:
        missing_api: The API string being looked up.
        answer_body: HTML body of an answer.

    Returns:
        The text of the selected ``<code>`` element. If there is exactly one
        ``<code>`` element its text is returned without scoring. If there is
        none, an empty string is returned.
    """
    soup = BeautifulSoup(answer_body, "html.parser")
    candidate_apis = [link.get_text() for link in soup.select("code")]

    if not candidate_apis:
        # An answer without any <code> element used to hit candidate_apis[0]
        # and raise IndexError, which aborted the caller's DataFrame.apply.
        return ""
    if len(candidate_apis) == 1:
        return candidate_apis[0]

    missing_api_vector = text_to_vector(missing_api)
    # Duplicate snippets collapse onto a single dict key; insertion order is
    # kept, so ties resolve to the snippet that appears first in the answer.
    cosine_dict = {
        candidate: get_cosine(missing_api_vector, text_to_vector(candidate))
        for candidate in candidate_apis
    }
    # NOTE(review): min() selects the snippet LEAST similar to missing_api --
    # presumably because the most similar snippet is the missing API itself
    # rather than its replacement. Confirm this is the intended heuristic.
    return min(cosine_dict, key=cosine_dict.get)

In [43]:
def getStackQuestions(missing_api:str, timeout: float = 10.0) -> pd.DataFrame:
    """Search Stack Overflow for answered questions mentioning ``missing_api``.

    Queries the Stack Exchange ``search/advanced`` endpoint for questions
    whose body matches ``missing_api`` (sorted by votes, descending). For each
    question flagged ``is_answered`` it fetches that question's answers
    (sorted by votes, descending) and keeps the first one.

    Args:
        missing_api: The API string to search question bodies for.
        timeout: Seconds to wait on each HTTP request before giving up.

    Returns:
        A DataFrame with one row per retained question. On top of the fields
        returned by the API, each row has ``answer_score`` and ``answer_body``
        (from the first answer) and ``possible_replacement`` (the result of
        ``possible_replacement_api`` on that answer body). If no question
        qualifies, an empty DataFrame is returned.

    Raises:
        requests.HTTPError: If either endpoint responds with an error status.
        requests.RequestException: On connection failures or timeouts.
    """
    question_url = "https://api.stackexchange.com/2.3/search/advanced"
    answer_url_template = "https://api.stackexchange.com/2.3/questions/{}/answers"
    # NOTE(review): the "filter" values are opaque server-side field
    # selections; presumably they include the body/score fields read below.
    question_params = {
        "body" : missing_api,
        "sort": "votes",
        "order": "desc",
        "site": "stackoverflow",
        "filter" : "!0WAfAKLVhyg2Bjytoa)ZVCaM5"
    }
    answer_params = {
        "order": "desc",
        "sort": "votes",
        "site": "stackoverflow",
        "filter": "!3ubsrEfVBpYHFpKQ5"
    }

    # Let requests build the query string so that values such as the search
    # term and the filter ids are URL-encoded instead of pasted in verbatim.
    question_response = requests.get(question_url, params=question_params, timeout=timeout)
    question_response.raise_for_status()
    questions = question_response.json().get("items", [])

    # Keep only answered questions, each annotated with its first answer.
    answered_questions = []
    for question in questions:
        if not question.get("is_answered"):
            continue
        answer_response = requests.get(
            answer_url_template.format(question["question_id"]),
            params=answer_params,
            timeout=timeout,
        )
        answer_response.raise_for_status()
        answers = answer_response.json().get("items", [])
        if not answers:
            # Nothing to annotate with; indexing [0] here used to raise.
            continue
        top_answer = answers[0]
        question["answer_score"] = top_answer["score"]
        question["answer_body"] = top_answer["body"]
        answered_questions.append(question)

    df = pd.json_normalize(answered_questions)
    if df.empty:
        # No rows means no "answer_body" column, so the apply below would
        # fail with KeyError; hand back the empty frame instead.
        return df
    df["possible_replacement"] = df["answer_body"].apply(
        lambda body: possible_replacement_api(missing_api, body)
    )
    return df

In [44]:
# Look up answered Stack Overflow questions whose body mentions this API
# string; note that this call performs live HTTP requests.
df = getStackQuestions("sklearn.externals.joblib")

In [45]:
# Render the resulting frame as this cell's output.
df

Unnamed: 0,is_answered,view_count,answer_count,score,last_activity_date,creation_date,question_id,link,title,body,answer_score,answer_body,accepted_answer_id,possible_replacement
0,True,16175,4,6,1645303395,1557755770,56113916,https://stackoverflow.com/questions/56113916/c...,Cannot import Sklearn from sklearn.externals.j...,<p>I am a beginner and I just started with mac...,10,<p>I had the same problem.\nI have replaced</p...,,import joblib
1,True,726,1,3,1638282046,1638242427,70163883,https://stackoverflow.com/questions/70163883/g...,Google Colab ModuleNotFoundError: No module na...,<p>My Initial import looks like this and this ...,5,<p>For the second part you can do this to fix ...,70163925.0,# Libraries to help with reading and manipulat...
2,True,545,1,0,1531388579,1524115667,49913330,https://stackoverflow.com/questions/49913330/e...,Export KNN best estimator from GridSearchCV to...,<p>I tried to save <strong>KNN</strong> model ...,1,<p>According to documentation:</p>\n\n<pre><co...,,"n_neighbors : int, optional (default = 5)\n\n ..."
