In [67]:
from transformers import (
    BartTokenizerFast,
    DataCollatorForSeq2Seq,
    keras_callbacks,
    TFAutoModelForSeq2SeqLM,
)
import tensorflow as tf
from datasets import Dataset
import pandas as pd
import numpy as np
from tqdm import tqdm
from huggingface_hub import notebook_login
from datasets import concatenate_datasets
from question_answering.constants import constants
from question_answering.utils import core_qa_utils, generative_qa_utils
from question_answering.paths import generative_qa_paths
from question_answering.keras_callbacks.time_measure_callback import TimeMeasureCallback

In [78]:
df_train, df_val, df_test = core_qa_utils.load_datasets_from_csv(
    generative_qa_paths.python_dataset_dir
)

df_train = pd.concat([df_train, df_val], ignore_index=True)

train_dataset, test_dataset = core_qa_utils.convert_dataframes_to_datasets(
    [df_train, df_test]
)

In [79]:
df_test

Unnamed: 0,index,questions,answers,code,original_code
0,0,How does the code add a user in the given buck...,as an owner,def add bucket default owner bucket name user ...,def add_bucket_default_owner bucket_name user_...
1,1,Does the code add a user in the given buckets ...,Yes,def add bucket default owner bucket name user ...,def add_bucket_default_owner bucket_name user_...
2,2,Where does the code add a user as an owner ?,in the given buckets default object access con...,def add bucket default owner bucket name user ...,def add_bucket_default_owner bucket_name user_...
3,3,Does the code create a simple sdist tarball at...,Yes,def make trivial sdist dist path setup py setu...,def make_trivial_sdist dist_path setup_py setu...
4,4,How does the code create a simple sdist tarball ?,at dist_path,def make trivial sdist dist path setup py setu...,def make_trivial_sdist dist_path setup_py setu...
...,...,...,...,...,...
2495,2495,What do the sequence contain ?,only bases,def all bases valid seq valid bases ['a' 'A' '...,def all_bases_valid seq valid_bases ['a' 'A' '...
2496,2496,What created this variable ?,the brick,def get brick var return get annotation var Brick,def get_brick var return get_annotation var Brick
2497,2497,What does the code retrieve ?,the brick that created this variable,def get brick var return get annotation var Brick,def get_brick var return get_annotation var Brick
2498,2498,What does approximate joint diagonalization im...,same results as the matlab implementation by p...,def test ajd n times n channels 10 3 seed np r...,def test_ajd n_times n_channels 10 3 seed np r...


In [97]:
df22 = df_test[df_test['questions'].str.startswith('Where')].reset_index(drop=True).drop(columns=['index']).sample(7)
df22

Unnamed: 0,questions,answers,code,original_code
84,Where did a window of length window_length be ...,on data,def check window params data window length if ...,def _check_window_params data window_length if...
24,Where does the url of a backend [ instance ] r...,in the dev_appserver,def get dev url backend instance None return '...,def _get_dev_url backend instance None return ...
30,Where did cursor position set ?,in console,@lazyobjectdef Set Console Cursor Position scc...,@lazyobjectdef SetConsoleCursorPosition sccp c...
35,Where did metadata set ?,on all instances of a host,@utils arg 'host' metavar '<host>' help ' Name...,@utils arg 'host' metavar '<host>' help _ 'Nam...
44,Where is the process name running ?,on node,def is process running node name command ['pid...,def is_process_running node name command ['pid...
105,Where do user disk usage recalculate quickly ?,in postgres,def pgcalc sa session id dryrun False sql calc...,def pgcalc sa_session id dryrun False sql_calc...
78,Where do virtual interfaces list ?,on a server cli example,def virtual interface list provider names **kw...,def virtual_interface_list provider names **kw...


In [96]:
df22.iloc[1]['questions']

'Where does the code create a nvp logical router ?'

In [47]:
questions = df_test['questions']
questions = [question.lower() for question in questions]
questions

['how does the code add a user in the given buckets default object access control list ?',
 'does the code add a user in the given buckets default object access control list as an owner ?',
 'where does the code add a user as an owner ?',
 'does the code create a simple sdist tarball at dist_path ?',
 'how does the code create a simple sdist tarball ?',
 'what does the code create at dist_path ?',
 'where does le dirs set ?',
 'does le dirs set in parent_dir ?',
 'what sets in parent_dir ?',
 'does the code get the key for a location in a policy file ?',
 'what does the code get ?',
 'does the code remove an user from an object like ?',
 'how does the code remove an user from an object ?',
 'what does the code ensure ?',
 'when are all associated messagecategories deleted ?',
 'does the code ensure ?',
 'does the code run a command on the operation system ?',
 'what does the code run ?',
 'does none mean ?',
 'what does none mean ?',
 'what returns along an axis ?',
 'does the index of

In [48]:
whats = list(filter(lambda x: x.startswith('what'), questions))
a = len(whats)

In [49]:
# whys = list(filter(lambda x: x.startswith('why'), questions))
# b = len(whys)

In [50]:
wheres = list(filter(lambda x: x.startswith('where'), questions))
c = len(wheres)

In [51]:
hows = list(filter(lambda x: x.startswith('how'), questions))
d = len(hows)

In [52]:
forwhats = list(filter(lambda x: x.startswith('for what'), questions))
e = len(forwhats)

In [53]:
dos = list(filter(lambda x: x.startswith('do') or x.startswith('does') or x.startswith('did') or x.startswith('will') or x.startswith('is') or x.startswith('are')
                  or x.startswith('ca') or x.startswith('can'), questions))
f = len(dos)

In [54]:
rows_with_closed_questions = df_test[
    (df_test['answers'] == 'Yes') | 
    (df_test['answers'] == 'No')].reset_index(drop=True)
closed = [question.lower() for question in rows_with_closed_questions['questions']]
f = len(closed)

In [55]:
whens = list(filter(lambda x: x.startswith('when'), questions))
g = len(whens)

In [56]:
# inwhichs = list(filter(lambda x: x.startswith('in which'), questions))
# h = len(inwhichs)

In [57]:
# byhows = list(filter(lambda x: x.startswith('by how'), questions))
# i = len(byhows)

In [58]:
# tills = list(filter(lambda x: x.startswith('till'), questions))
# j = len(tills)

In [59]:
a, c, d, e, f, g

(1851, 123, 200, 79, 137, 79)

In [60]:
(a + c + d + e + f + g), len(questions)

(2469, 2500)

In [61]:
used = whats + wheres + hows + forwhats + closed + whens

In [62]:
# not_used = list(set(questions).difference(used))
not_used = [item for item in questions if item not in used]
len(not_used)

31

In [63]:
not_used

['in which direction do entry move to another ?',
 'why do get carry using urlencode ?',
 'by how much did yaml string quote ?',
 'till when do this nt effect the text colored_text = color_text colored_text = color_text ?',
 'by how much does a stub module nt alter system state ?',
 'why is this kept in a separate module ?',
 'till when do taskqueue tasks execute ?',
 'in which direction does a sorting function bubble results from the current locale ?',
 'in which direction do changes pull to a local one ?',
 'in which direction does the code reset current head ?',
 'in which direction do all objects start with gl ?',
 'why is this better than get_sum ?',
 'in which direction does the code carve vertical or horizontal seams ?',
 'in which direction does this encode a unicode string ?',
 'in which direction does the code create a track object ?',
 'in which direction do tars dir ?',
 'in which direction did asset key create by calling delete_asset method of assets module ?',
 'who marks

In [20]:
2966+452

3418

In [21]:
seen = set()
repeated = set()
for l in used:
  for i in set(l):
    if i in seen:
      repeated.add(i)
    else:
      seen.add(i)

In [22]:
len(list(repeated))

60