In [1]:
import pandas as pd 
import requests
import re
import math
from collections import Counter
from bs4 import BeautifulSoup
from tqdm import tqdm

In [3]:
######################## HELPER FUNCTIONS ##################################

def text_to_vector(text):
    WORD = re.compile(r"\w+")
    words = WORD.findall(text)
    return Counter(words)

def get_cosine(vec1, vec2):
    intersection = set(vec1.keys() & vec2.keys())
    numerator = sum([vec1[x] * vec2[x] for x in intersection])

    sum1 = sum([vec1[x] ** 2 for x in list(vec1.keys())])
    sum2 = sum([vec2[x] ** 2 for x in list(vec2.keys())])
    denominator = math.sqrt(sum1) * math.sqrt(sum2)

    if not denominator:
        return 0.0
    else:
        return float(numerator) / denominator

def possible_replacement_api(missing_api:str, answer_body: str) -> str:
    # done via cosine similarity method
    soup = BeautifulSoup(answer_body, "html.parser")
    candidate_apis = [link.get_text() for link in soup.select("code")]
    missing_api_vector = text_to_vector(missing_api)
    cosine_dict = {}
    if len(candidate_apis) > 1:
        for candidate in candidate_apis:
            candidate_vector = text_to_vector(candidate)
            cosine_similarity = get_cosine(missing_api_vector, candidate_vector)
            cosine_dict[candidate] = cosine_similarity
        return min(cosine_dict, key=cosine_dict.get)
    elif len(candidate_apis) == 1:
        return candidate_apis[0]
    

In [4]:
def getStackQuestions(missing_api:str) -> pd.DataFrame:
    questionQueryURL = "https://api.stackexchange.com/2.3/search/advanced?"
    answerQueryURL = "https://api.stackexchange.com/2.3/questions/{}/answers?"
    questionfilter_dict = {
        "body" : missing_api,
        "sort": "votes",
        "order": "desc",
        "site": "stackoverflow",
        "filter" : "!0WAfAKLVhyg2Bjytoa)ZVCaM5"
    }
    questionAnswersFilter = {
        "order": "desc",
        "sort": "votes",
        "site": "stackoverflow",
        "filter": "!3ubsrEfVBpYHFpKQ5"
    }
    for key, value in questionfilter_dict.items():
        questionQueryURL += "{}={}&".format(key, value)
    ## slice the last element away
    questionQueryURL = questionQueryURL[:-1]
    # for the answers portion
    for key, value in questionAnswersFilter.items():
        answerQueryURL += "{}={}&".format(key, value)
    answerQueryURL = answerQueryURL[:-1]
    # get request
    r = requests.get(questionQueryURL)
    data = r.json()
    questionArr = data["items"]
    # filter away the not answered question
    filteredArr = []
    if len(questionArr) == 0:
        return []
    
    # accept top 1 answers
    for question in questionArr:
        if question["is_answered"]:
            answerURL = answerQueryURL.format(question["question_id"])
            response = requests.get(answerURL)
            answerData = response.json()["items"][0]
            question["answer_score"] = answerData["score"]
            question["answer_body"] = answerData["body"]
            filteredArr.append(question)
    
    df = pd.json_normalize(filteredArr)
    df["possible_replacement"] = df["answer_body"].apply(lambda x: possible_replacement_api(missing_api,x))
    return df

get 5 replacements

In [5]:
df = getStackQuestions("sklearn.externals.joblib")
df

Unnamed: 0,is_answered,view_count,answer_count,score,last_activity_date,creation_date,question_id,link,title,body,answer_score,answer_body,accepted_answer_id,possible_replacement
0,True,16215,4,6,1645303395,1557755770,56113916,https://stackoverflow.com/questions/56113916/c...,Cannot import Sklearn from sklearn.externals.j...,<p>I am a beginner and I just started with mac...,10,<p>I had the same problem.\nI have replaced</p...,,import joblib
1,True,753,1,3,1638282046,1638242427,70163883,https://stackoverflow.com/questions/70163883/g...,Google Colab ModuleNotFoundError: No module na...,<p>My Initial import looks like this and this ...,5,<p>For the second part you can do this to fix ...,70163925.0,# Libraries to help with reading and manipulat...
2,True,545,1,0,1531388579,1524115667,49913330,https://stackoverflow.com/questions/49913330/e...,Export KNN best estimator from GridSearchCV to...,<p>I tried to save <strong>KNN</strong> model ...,1,<p>According to documentation:</p>\n\n<pre><co...,,"n_neighbors : int, optional (default = 5)\n\n ..."


Test out on sklearn data

In [6]:
scipy_df = pd.read_csv("Labeling - scipy.csv")
scipy_df

Unnamed: 0,deprecated API,replacement API,official documentation (web)?,available in rel notes,REPLACEMENT_FOUND,CORRECT replacement ?,Proposed replacement,official documentation (code)?,Stack Overflow?,"Remarks (why missing, anything interesting,What if there is more than 1 replacement?)",To label
0,scipy.rand,numpy.random.rand,0,1.0,1.0,0.0,['scipy.fft'],1.0,0,function still exist in newest ver. just depre...,
1,scipy.diag,numpy.diag,0.5? placeholder,1.0,1.0,0.0,['scipy.fft'],1.0,0,function still exist in newest ver. just depre...,
2,scipy.interpolate.splmake,,0,0.0,1.0,0.0,['splmake/spleval'],1.0,0,no replacement is mentioned in the official do...,
3,scipy.interpolate.spltopp,,0,0.0,0.0,0.0,,1.0,0,"stackoverflow: spltopp, 1 result, no replaceme...",
4,scipy.interpolate.spleval,,0,0.0,0.0,0.0,,1.0,0,"stackoverflow: spleval, 13 result, no replacem...",
...,...,...,...,...,...,...,...,...,...,...,...
102,scipy.linalg.expm2,scipy.linalg.expm,,1.0,1.0,1.0,['scipy.linalg.expm'],0.0,,,
103,scipy.linalg.expm3,scipy.linalg.expm,,1.0,1.0,1.0,['scipy.linalg.expm'],0.0,,,
104,scipy.stats.oneway,scipy.stats.f_oneway,,1.0,1.0,1.0,['scipy.stats.f_oneway'],0.0,,,
105,scipy.stats.glm,scipy.stats.ttest_ind,,1.0,1.0,1.0,['scipy.stats.ttest_ind'],0.0,,,


In [7]:
scipy_depre_apis = scipy_df["deprecated API"].to_list()
scipy_depre_apis

['scipy.rand',
 'scipy.diag',
 'scipy.interpolate.splmake',
 'scipy.interpolate.spltopp',
 'scipy.interpolate.spleval',
 'scipy.interpolate.spline',
 'scipy.interpolate.interpolate_wrapper',
 'scipy.misc.bytescale',
 'scipy.misc.fromimage',
 'scipy.misc.imfilter',
 'scipy.misc.imread',
 'scipy.misc.imresize',
 'scipy.misc.imrotate',
 'scipy.misc.imsave',
 'scipy.misc.imshow',
 'scipy.misc.toimage',
 'scipy.misc.comb',
 'scipy.misc.factorial',
 'scipy.misc.factorial2',
 'scipy.misc.factorialk',
 'scipy.misc.logsumexp',
 'scipy.misc.pade',
 'scipy.misc.info',
 'scipy.misc.source',
 'scipy.misc.who',
 'scipy.special.hyp2f0',
 'scipy.special.hyp1f2',
 'scipy.special.hyp3f0',
 'scipy.signal.windows.slepian',
 'scipy.stats.itemfreq',
 'scipy.spatial.distance.matching',
 'scipy.spatial.distance.wminkowski',
 'scipy.special.errprint',
 'scipy.special.sph_in',
 'scipy.special.sph_jn',
 'scipy.special.sph_kn',
 'scipy.special.sph_yn',
 'scipy.special.sph_jnyn',
 'scipy.special.sph_inkn',
 'scipy

In [8]:
# filter out parameter and function deprecated apis
scipy_depre_function_apis = []
scipy_depre_param_apis = []
for api in scipy_depre_apis:
    if ":" in api:
        scipy_depre_param_apis.append(api)
    else:
        scipy_depre_function_apis.append(api)
print("Length of function deprec api:", len(scipy_depre_function_apis))
print("Length of param depre api:", len(scipy_depre_param_apis))

Length of function deprec api: 87
Length of param depre api: 20


In [9]:
def getStackQuestionsv2(missing_api:str, top_only:bool) -> pd.DataFrame:
    questionQueryURL = "https://api.stackexchange.com/2.3/search/advanced?"
    answerQueryURL = "https://api.stackexchange.com/2.3/questions/{}/answers?"
    questionfilter_dict = {
        "body" : missing_api,
        "sort": "votes",
        "order": "desc",
        "site": "stackoverflow",
        "filter" : "!0WAfAKLVhyg2Bjytoa)ZVCaM5"
    }
    questionAnswersFilter = {
        "order": "desc",
        "sort": "votes",
        "site": "stackoverflow",
        "filter": "!3ubsrEfVBpYHFpKQ5"
    }
    for key, value in questionfilter_dict.items():
        questionQueryURL += "{}={}&".format(key, value)
    ## slice the last element away
    questionQueryURL = questionQueryURL[:-1]
    # for the answers portion
    for key, value in questionAnswersFilter.items():
        answerQueryURL += "{}={}&".format(key, value)
    answerQueryURL = answerQueryURL[:-1]
    # get request
    r = requests.get(questionQueryURL)
    data = r.json()
    questionArr = data["items"]
    # filter away the not answered question
    filteredArr = []
    # return empty string if there is no related question found
    if len(questionArr) == 0:
        return ""
    else:
        # accept top 1 answers
        for question in tqdm(questionArr):
            if question["is_answered"]:
                answerURL = answerQueryURL.format(question["question_id"])
                response = requests.get(answerURL)
                if len(response.json()["items"]) == 0:
                    continue
                answerData = response.json()["items"][0]
                question["answer_score"] = answerData["score"]
                question["answer_body"] = answerData["body"]
                filteredArr.append(question)
    
        df = pd.json_normalize(filteredArr)
        df["possible_replacement"] = df["answer_body"].apply(lambda x: possible_replacement_api(missing_api,x))
        if top_only:
            top_candidate = list(df["possible_replacement"])[0]
            return top_candidate
        else:
            return df

In [10]:
getStackQuestionsv2("sklearn.externals.joblib", True)

100%|██████████| 6/6 [00:03<00:00,  1.95it/s]


'import joblib'

In [45]:
# intialise replacement dictionary
scipy_replacement_dict = {}
for api in scipy_depre_function_apis:
    scipy_replacement_dict[api] = getStackQuestionsv2(api, True)
scipy_replacement_dict

100%|██████████| 4/4 [00:03<00:00,  1.28it/s]
100%|██████████| 1/1 [00:01<00:00,  1.03s/it]
100%|██████████| 16/16 [00:12<00:00,  1.27it/s]
100%|██████████| 6/6 [00:04<00:00,  1.44it/s]
100%|██████████| 1/1 [00:00<?, ?it/s]


KeyError: 'answer_body'

In [46]:
scipy_replacement_dict

{'scipy.rand': '',
 'scipy.diag': '',
 'scipy.interpolate.splmake': '',
 'scipy.interpolate.spltopp': '',
 'scipy.interpolate.spleval': '',
 'scipy.interpolate.spline': 'import shapely.geometry as shgeo\nline = vstack( (x,y) ).T\nline = shgeo.LineString( line )\nsurrounding_polygon = line.buffer( 10,cap_style=3 ) # 10=Dist\n',
 'scipy.interpolate.interpolate_wrapper': '',
 'scipy.misc.bytescale': '',
 'scipy.misc.fromimage': 'cdist',
 'scipy.misc.imfilter': '',
 'scipy.misc.imread': 'multiprocessing.Pool',
 'scipy.misc.imresize': 'import skimage\ndata_new = skimage.transform.resize(data_old, [new_shape_x, new_shape_z], order = 0)\n'}