In [2]:
import os
import re
import ast
import pandas as pd
from tqdm.notebook import tqdm
tqdm.pandas()

In [3]:
def get_api_path(api_full_path_name):
    lib_name = api_full_path_name.split('.')[0].strip()
    lib_path = f'data/{lib_name}/'
    folders = [x for x in os.listdir(lib_path) if 
               not x.endswith('.tar.gz') and not x.endswith('zip') and x.startswith(lib_name)]
    #remove leading 'lib_name' from folder path
    api_path = '/'.join(api_full_path_name.split('.')[:-1])
    api_name = api_full_path_name.split('.')[-1]
    
    # list all ver_path with api_path
    dir_paths = [x[0] for x in os.walk(os.path.join(lib_path)) if x[0].endswith(api_path)]

    # find py files that defines the api
    api_path = []
    for path in dir_paths:
        grep_result = !grep -r 'def {api_name}' {path}/*
        api_path.extend([x.split(':')[0] for x in grep_result])
    
#     if unable to find definition of function, look for funtion by full name 
    if not api_path:
        grep_result = !grep -r '{api_full_path_name}' {path}/*
        api_path.extend([x.split(':')[0] for x in grep_result])
        
    # try searching without the lib name    
    if not api_path:
        partial_name = '.'.join(api_full_path_name.split('.')[1:])
        grep_result = !grep -r '{partial_name}' {path}/*
        api_path.extend([x.split(':')[0] for x in grep_result])
    
    # global search. file_path do not need to end with the api_path
    if not api_path:
        for path in folders:
            grep_result = !grep -r 'def {api_name}' {path}/*
            api_path.extend([x.split(':')[0] for x in grep_result])
        
    return api_path, api_name



In [4]:
# look into each of them for deprecation 

def parse_deprecation_declarator(api_path, api_name):
    deprecation_declarator_messages = []
    for path in api_path:
        try:
            code = open(path, 'r').read()
        except:
            continue
        code = code.replace('\n', '').replace('\r', '')
        
        messages = re.findall('@*deprecate*\((.*?)\)', code)
        if not messages:
            messages = re.findall('.warn*\((.*?)\)', code)
        deprecation_declarator_messages.extend(messages)
        
    api_name_hits = [x for x in deprecation_declarator_messages if api_name in x]
    if not api_name_hits:
        return set(deprecation_declarator_messages)
    return set(api_name_hits)

def test_declarator_parse(api_full_path_name):
    api_path, api_name = get_api_path(api_full_path_name)
    return parse_deprecation_declarator(api_path, api_name)

test_declarator_parse('numpy.ma.MaskedArray.mini')

set()

In [5]:
def select_replacements_from_messages(api_name, messages):
    replacements = set()
    api_name = api_name.lower()
    messages = [ ' '.join(x.lower().replace('"','').split()) for x in messages]
    #using double quotes
    for message in messages:
        if api_name in message:
#             for lib in re.findall('`(.*?)`',message):
#                 replacements.add(lib)
            for lib in re.findall('``(.*?)``',message): 
                replacements.add(lib)
    
    #pattern: use <> instead
    if not replacements:
        for message in messages:
            if api_name in message:
                for lib in re.findall('use (.*?) instead',message.replace('`', '')): 
                    replacements.add(lib)
    
    
    #pattern: "use instead <> " 
    if not replacements:
        for message in messages:
            if api_name in message:
                for lib in re.findall('use instead (.*) ',message.replace('`', '')): 
                    replacements.add(lib)
                    
    #pattern: "use <> " 
    if not replacements:
        for message in messages:
            if api_name in message:
                for lib in re.findall('use (.*?) ',message.replace('`', '')): 
                    replacements.add(lib)
                    
    #pattern: use <> <end>
    if not replacements:
        for message in messages:
            if api_name in message:
                for lib in re.findall('use (.*)',message.replace('`', '')): 
                    replacements.add(lib)
                    
    #pattern: in favor of
    if not replacements:
        for message in messages:
            if api_name in message:
                for lib in re.findall('in favor of (.*?) ',message.replace('`', '')): 
                    replacements.add(lib)
                for lib in re.findall('in favor of (.*?).',message.replace('`', '')): 
                    replacements.add(lib)
                    
    #pattern ' <> instead'
    if not replacements:
        for message in messages:
            if api_name in message:
                if ' instead' in message:
                    words = message.replace('`', '').split(' ')
                    for i in range(len(words)):
                        if 'instead' in words[i]:
                            replacements.add(words[i-1])
                            
    #get any dot seperated words names if api_name is mentioned
    if not replacements:
        for message in messages:
            if api_name in message:
                words = message.replace('`', '').split(' ')
                for word in words:
                    if '.' in word.strip('.'):
                        replacements.add(word)
    
    
    return replacements


def test_declarator_select(api_full_path):
    api_name = api_full_path.split('.').pop()
    messages = test_declarator_parse(api_full_path)
    candidates = select_replacements_from_messages(api_name, messages)
    return list(candidates)

In [6]:
lib = 'numpy'
df = pd.read_csv(f'./data/{lib}.csv')

In [7]:
df

Unnamed: 0,deprecated API,replacement API
0,numpy.distutils.exec_command,subprocess.Popen
1,numpy.alen,numpy.len
2,numpy.core.typeNA,numpy.sctypeDict
3,numpy.core.sctypeNA,numpy.sctypeDict
4,numpy.testing.utils,numpy.testing
5,numpy.testing.decorators,numpy.testing
6,numpy.testing.nosetester,numpy.testing
7,numpy.testing.noseclasses,numpy.testing
8,numpy.FloatFormat,numpy.FloatingFormat
9,numpy.LongFloatFormat,numpy.FloatingFormat


In [8]:

df = df[~df['deprecated API'].str.contains(':')]
print('Total number of applicable APIs', len(df))
df_with_replacements = df[~((df['replacement API'].isna()) |
                            (df['replacement API']=='') |
                            (df['replacement API']==' ') |
                            (df['replacement API']=='no') |
                            (df['replacement API'].str.contains(' ')) |
                            (df['replacement API'].str.contains('-')))
                         ]
print('Total number of WITH APIs replacements ', len(df_with_replacements))

df_with_replacements['extracted_text'] = df_with_replacements[
    'deprecated API'].progress_apply(test_declarator_parse)

Total number of applicable APIs 44
Total number of WITH APIs replacements  44


  0%|          | 0/44 [00:00<?, ?it/s]

In [9]:
df_with_replacements['PROPOSED_REPLACEMENT'] = df_with_replacements.progress_apply(
    lambda x: select_replacements_from_messages(x['deprecated API'].split('.').pop(), x.extracted_text)
    , axis=1)
df_with_replacements.to_csv(f'./results/{lib}-results.csv', index=False)

  0%|          | 0/44 [00:00<?, ?it/s]