In [88]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import spacy
from tqdm import tqdm
from collections import Counter

# Register tqdm's progress-bar variants (e.g. `progress_apply`) on pandas objects.
tqdm.pandas()

## Definitions

1. [ ] Define input and output
 - Input: a sequence of code tokens, where the token to predict is replaced by `<UNK>`
 - Output: a list of predicted tokens, sorted in descending order of probability
1. [ ] Define evaluation metric
 - Should be rank-aware, e.g. MRR, MAP, NDCG.
 - The target token is split into subtokens and the overlap between the predicted token and the target token at subtoken level is evaluated. This can be done with an F1 score.
    - For example, if `transformSearchResponse` is the target token, its subtokens are `transform`, `search` and `response`. If the predicted token is `modifySearchResponse`, then the overlap is 2 subtokens out of 3.
    - [ ] Should we account for the order of the subtokens? Most probably, yes.
    ```
    Precision = TP / (TP + FP) = #overlapping-predicted / (#overlapping-predicted + #nonoverlapping-predicted)
    Recall = TP / (TP + FN) = #overlapping-predicted / (#overlapping-predicted + #nonoverlapping-required)
    F1 = 2 * P * R / (P + R)
    ```
 - The F1 approach is inspired by SQuAD and "Suggesting accurate method and class names" by Allamanis et al.

## Execution Tasks

1. [ ] Gather Data
1. [ ] Analyze Data
1. [ ] Implement an algorithm
1. [ ] Create an evaluation loop
1. [ ] Expose parameters of the algorithm
1. [ ] Make experiments
1. [ ] Document the experiments - Hypothesis, Data, Setup, Evaluation, Algorithm, Experiments, Conclusion, Further Steps



## Discussion
it isn't as simple as just averaging some word embeddings
it is easy to average the embeddings if you need to predict a single word
but i have to predict an unknown number of subtokens

well, why don't we try to predict one subtoken at a time
but what type should it be?
well, we can split all tokens into subtokens and count the PoS occurrences
so that we know a few patterns upfront
and use these patterns to fill in the subtokens with averaged word embeddings
similar to the context we have, but filtered according to the PoS tag
this is an interesting idea, because we do suggestion at the subtoken level
i like this idea and we can try it

of course the other idea is to use source code embeddings directly
but they have to be learned on our source code base
and this is a separate problem
which involves learning a model
which is what we try to avoid with our simplistic baseline
we are just exploring ideas
a baseline without learning might be a failure
but the ideas we try and pick up during the design and experimentation are what is most valuable



In [14]:
def split_by_camel_case(token):
    """Split a camelCase identifier into its subtokens.

    A new subtoken starts wherever a lowercase character is immediately
    followed by an uppercase one, e.g. ``'transformSearchResponse'`` becomes
    ``['transform', 'Search', 'Response']``. Case is preserved.

    Args:
        token: Identifier string, or an iterable of subtokens that has
            already been split.

    Returns:
        list: The subtokens in order. A non-string input is returned as a
        list of its items; an empty string yields an empty list.
    """
    if not isinstance(token, str):
        # Already split by the caller; keep the items untouched.
        return list(token)

    subtokens = []
    for char in token:
        starts_new_subtoken = (
            not subtokens
            or (subtokens[-1][-1].islower() and char.isupper())
        )
        if starts_new_subtoken:
            subtokens.append(char)
        else:
            subtokens[-1] += char
    return subtokens


def get_subtokens(token):
    """Return the lowercase subtokens of ``token``.

    Lowercasing makes the comparison case-insensitive, so the leading
    ``search`` of ``searchResponse`` matches the ``Search`` inside
    ``transformSearchResponse``.

    Args:
        token: Identifier string, or an iterable of subtoken strings.

    Returns:
        list: Lowercase subtokens in their original order.
    """
    return [subtoken.lower() for subtoken in split_by_camel_case(token)]


def compute_f1(target_token, predicted_token):
    """Compute the subtoken-level F1 score between target and prediction.

    Precision is the share of predicted subtokens that also occur in the
    target; recall is the share of target subtokens that were predicted.
    Subtoken order is not taken into account.

    Args:
        target_token: Expected identifier (string or iterable of subtokens).
        predicted_token: Predicted identifier (string or iterable of
            subtokens).

    Returns:
        float: F1 in ``[0.0, 1.0]``. ``0.0`` is returned when the two share
        no subtoken, which includes the case where either side is empty.
    """
    target_subtokens = get_subtokens(target_token)
    predicted_subtokens = get_subtokens(predicted_token)

    # Multiset intersection: a repeated subtoken only counts as often as it
    # occurs on both sides.
    overlapping = Counter(target_subtokens) & Counter(predicted_subtokens)
    overlapping_count = sum(overlapping.values())

    # Without any overlap (or with an empty side) the formulas below would
    # divide by zero, so the score is defined as 0.0.
    if overlapping_count == 0:
        return 0.0

    precision = overlapping_count / len(predicted_subtokens)
    recall = overlapping_count / len(target_subtokens)
    return (2.0 * precision * recall) / (precision + recall)

In [15]:
# Sanity check with pre-split subtokens: `search` and `response` overlap, so
# precision = 2/4, recall = 2/3 and F1 = 4/7.
compute_f1(['transform', 'search', 'response'], ['modify', 'search', 'response', 'data'])

0.5714285714285715

In [64]:
# Read the semicolon-delimited CSV; the `id` column holds the identifier
# names that are split into subtokens further down.
df = pd.read_csv('../data/method-names/elastic-search-clean.csv', delimiter=';')
df

Unnamed: 0,file,id,type
0,/buildSrc/src/test/java/org/elasticsearch/grad...,testInvalidBlockQuote,void
1,/buildSrc/src/test/java/org/elasticsearch/grad...,testSimpleBlockQuote,void
2,/buildSrc/src/test/java/org/elasticsearch/grad...,testMultipleBlockQuotes,void
3,/buildSrc/src/test/java/org/elasticsearch/grad...,testEscapingInBlockQuote,void
4,/buildSrc/src/test/java/org/elasticsearch/grad...,testIsDocWriteRequest,void
...,...,...,...
115630,/plugins/repository-s3/src/main/java/org/elast...,refine,S3ClientSettings
115631,/plugins/repository-s3/src/main/java/org/elast...,load,"Map<String, S3ClientSettings>"
115632,/plugins/repository-s3/src/main/java/org/elast...,loadCredentials,S3BasicCredentials
115633,/plugins/repository-s3/src/main/java/org/elast...,getClientSettings,S3ClientSettings


In [72]:
def camel_case_split(str):
    """Split a camelCase identifier into words, preserving case.

    A new word starts wherever a lowercase character is immediately followed
    by an uppercase one; runs of capitals stay together, e.g.
    ``'parseOSRelease'`` becomes ``['parse', 'OSRelease']``.

    Args:
        str: Identifier to split. The name shadows the built-in ``str`` and
            is kept only so existing keyword callers keep working.

    Returns:
        list: The words in order; an empty list for an empty identifier.
    """
    words = []
    for char in str:
        # The `words and` guard covers the first character and, with it, the
        # empty-string input that used to raise IndexError.
        if words and not (words[-1][-1].islower() and char.isupper()):
            words[-1] += char
        else:
            words.append(char)
    return words

In [68]:
# Scratch check: `str.lower()` also lowercases a trailing run of capitals.
'asdfSDA'.lower()

'asdfsda'

In [75]:
# Split every identifier on camelCase boundaries, then lowercase each part.
# The chain is parenthesised instead of using backslash continuations, so it
# no longer relies on a trailing `\` being followed by a blank line.
df_id = (
    df['id']
    .apply(camel_case_split)
    .apply(lambda identifier: [word.lower() for word in identifier])
)

df_id

0              [test, invalid, block, quote]
1               [test, simple, block, quote]
2            [test, multiple, block, quotes]
3         [test, escaping, in, block, quote]
4            [test, is, doc, write, request]
                         ...                
115630                              [refine]
115631                                [load]
115632                   [load, credentials]
115633               [get, client, settings]
115634                              [equals]
Name: id, Length: 115635, dtype: object

In [31]:
# One-time setup: download the small English spaCy model that is loaded below.
# NOTE(review): `!python` is resolved through the shell PATH and may not be the
# kernel's interpreter - confirm the model is installed into the kernel's environment.
!python -m spacy download en_core_web_sm

[38;5;2m✔ Download and installation successful[0m
You can now load the model via spacy.load('en_core_web_sm')


In [79]:
# Smoke-test the English spaCy pipeline on a sample sentence.
import en_core_web_sm

nlp = en_core_web_sm.load()
doc = nlp("Apple is looking at buying U.K. startup for $1 billion")

# One line per token: text, lemma, coarse PoS, fine-grained tag, dependency
# label, word shape, is-alphabetic flag and is-stop-word flag.
for token in doc:
    token_attributes = (
        token.text, token.lemma_, token.pos_, token.tag_, token.dep_,
        token.shape_, token.is_alpha, token.is_stop,
    )
    print(*token_attributes)

Apple Apple PROPN NNP nsubj Xxxxx True False
is be AUX VBZ aux xx True True
looking look VERB VBG ROOT xxxx True False
at at ADP IN prep xx True True
buying buy VERB VBG pcomp xxxx True False
U.K. U.K. PROPN NNP compound X.X. False False
startup startup NOUN NN dobj xxxx True False
for for ADP IN prep xxx True True
$ $ SYM $ quantmod $ False False
1 1 NUM CD compound d False False
billion billion NUM CD pobj xxxx True False


In [89]:
# Preview on the first 10 identifiers: join the subtokens into a phrase, run
# it through spaCy and keep (text, fine-grained tag) pairs.
df_id.head(10).progress_apply(
    lambda subtokens: [
        (token.text, token.tag_) for token in nlp(' '.join(subtokens))
    ]
)

100%|██████████| 10/10 [00:00<00:00, 122.97it/s]


0    [(test, NNP), (invalid, JJ), (block, NN), (quo...
1    [(test, NNP), (simple, JJ), (block, NN), (quot...
2    [(test, NN), (multiple, JJ), (block, NN), (quo...
3    [(test, NN), (escaping, VBG), (in, IN), (block...
4    [(test, NN), (is, VBZ), (doc, VBN), (write, NN...
5              [(test, NN), (match, NN), (source, NN)]
6    [(test, NNP), (parse, NNP), (os, NNP), (releas...
7    [(test, NNP), (remove, VB), (trailing, VBG), (...
8          [(test, NN), (remove, VB), (comments, NNS)]
9    [(test, NN), (derive, VB), (i, PRP), (d, MD), ...
Name: id, dtype: object