In [42]:
import logging
from datetime import datetime
from transformers import AutoTokenizer, TFAutoModel, logging as transformers_logging
import tensorflow as tf
import os
import numpy as np
from tqdm import tqdm
from typing import Dict, Literal, List, Iterator
from keras import Model, Sequential, callbacks
from keras.layers import Dense, Input, Concatenate, Dot
from keras.losses import Loss
from keras.utils import losses_utils
from keras.metrics import BinaryAccuracy, Precision, Recall
from keras.optimizers import Adam
from mongo_db_client import MongoDbClient
import more_itertools
import random

In [43]:
# Silence TensorFlow / Transformers log output so the notebook cells stay readable.
# NOTE(review): `tensorflow` is imported in the previous cell, i.e. before
# TF_CPP_MIN_LOG_LEVEL is set here — confirm the variable still takes effect.
os.environ['TF_CPP_MIN_LOG_LEVEL'] = '3'
# Python-side TensorFlow logger: only errors.
tf.get_logger().setLevel(logging.ERROR)
tf.autograph.set_verbosity(0)
tf.compat.v1.logging.set_verbosity(tf.compat.v1.logging.ERROR)
# Hugging Face transformers: only errors.
transformers_logging.set_verbosity_error()

# Utils

In [None]:
# Seed Python's global RNG so every shuffle in this notebook is repeatable.
random.seed(42)

def shuffle(list: List) -> List:
  """Return a shuffled copy of ``list`` and leave the input untouched.

  Args:
    list: The items to shuffle. Any iterable is accepted (list, tuple,
      generator, ...); the parameter name shadows the builtin and is kept
      only for keyword-argument compatibility.

  Returns:
    A new ``list`` object holding the same items in random order, drawn from
    Python's global ``random`` state.
  """
  # `list` is shadowed by the parameter, so copy via unpacking instead of
  # calling list(); this also works for inputs that have no `.copy()` method.
  shuffled_list = [*list]
  random.shuffle(shuffled_list)
  return shuffled_list

## Generate embeddings from sentences

In [None]:
class EmbeddingGenerator:
  """Builds L2-normalised sentence embeddings for code/comment token pairs.

  Code tokens and comment tokens are embedded with two separate Hugging Face
  models. Token states are mean-pooled over the positions selected by the
  attention mask and the pooled vector is L2-normalised.
  """

  def __init__(self, code_embedding_model="microsoft/codebert-base", comment_embedding_model="bert-large-uncased") -> None:
    """Store the model names; nothing is downloaded or loaded here.

    Args:
      code_embedding_model: Hugging Face model name used for code tokens.
      comment_embedding_model: Hugging Face model name used for comment tokens.
    """
    self.embedding_max_length = 256
    self.comment_embedding_model = comment_embedding_model
    self.code_embedding_model = code_embedding_model
    # model name -> (tokenizer, model); filled lazily by __get_encoder.
    self._encoders = {}

  def from_pairs(self, pairs: List["MongoDbPairDoc"], batch_size=100) -> Iterator:
    """Lazily embed ``pairs`` in batches.

    Args:
      pairs: Documents exposing ``'code_tokens'`` and ``'comment_tokens'``
        (each an iterable of strings).
      batch_size: Number of pairs embedded per yielded batch.

    Yields:
      ``(codes_embeddings, comments_embeddings)`` for each batch, in input
      order; row ``i`` of each tensor belongs to the ``i``-th pair of the batch.
    """
    for batch_pairs in more_itertools.chunked(pairs, batch_size):
      codes = [self.__pre_process_tokens(pair['code_tokens']) for pair in batch_pairs]
      comments = [self.__pre_process_tokens(pair['comment_tokens']) for pair in batch_pairs]

      # Encoders are loaded on first use and then reused, instead of calling
      # from_pretrained() again for every single batch.
      code_tokenizer, code_model = self.__get_encoder(self.code_embedding_model)
      codes_embeddings = self.from_sentences(
        sentences=codes,
        model=code_model,
        tokenizer=code_tokenizer,
      )
      comment_tokenizer, comment_model = self.__get_encoder(self.comment_embedding_model)
      comments_embeddings = self.from_sentences(
        sentences=comments,
        model=comment_model,
        tokenizer=comment_tokenizer,
      )

      yield (codes_embeddings, comments_embeddings)

  def from_sentences(self, sentences: List[str], tokenizer, model):
    """Embed ``sentences`` with the given tokenizer/model pair.

    Args:
      sentences: Plain-text sentences to embed.
      tokenizer: Callable tokenizer; invoked with padding to
        ``self.embedding_max_length``, truncation and TensorFlow tensors.
      model: Model called with the tokenizer output; its
        ``last_hidden_state`` is pooled.

    Returns:
      A tensor with one L2-normalised (along axis 1) row per sentence.
    """
    encoded_input = tokenizer(
      sentences,
      padding='max_length',
      max_length=self.embedding_max_length,
      truncation=True,
      return_tensors='tf',
    )
    model_output = model(**encoded_input, return_dict=True)

    embeddings = self.__mean_pooling(model_output, encoded_input['attention_mask'])
    return tf.math.l2_normalize(embeddings, axis=1)

  def __get_encoder(self, model_name):
    """Return the cached ``(tokenizer, model)`` for ``model_name``, loading it once."""
    if model_name not in self._encoders:
      self._encoders[model_name] = (
        AutoTokenizer.from_pretrained(model_name),
        TFAutoModel.from_pretrained(model_name),
      )
    return self._encoders[model_name]

  def __mean_pooling(self, model_output, attention_mask):
    """Average the token states over the positions where the mask is set."""
    token_embeddings = model_output.last_hidden_state
    # Repeat the mask along the hidden dimension so it matches token_embeddings.
    input_mask_expanded = tf.cast(
      tf.tile(tf.expand_dims(attention_mask, -1), [1, 1, token_embeddings.shape[-1]]),
      tf.float32,
    )
    masked_sum = tf.math.reduce_sum(token_embeddings * input_mask_expanded, 1)
    # Clamp the denominator so an all-zero mask cannot divide by zero.
    mask_total = tf.math.maximum(tf.math.reduce_sum(input_mask_expanded, 1), 1e-9)
    return masked_sum / mask_total

  def __pre_process_tokens(self, tokens) -> str:
    """Join ``tokens`` with single spaces, collapsing newlines and repeated whitespace."""
    # split() with no separator already treats '\n' as whitespace.
    return ' '.join(' '.join(tokens).split())

# Create an embedding dataset

In [None]:
class EmbeddingDataset:
  """On-disk store of pair embeddings: one ``<pair _id>.npy`` file per pair.

  Each file holds the code embedding followed by the comment embedding.
  """

  def __init__(self, dataset_dir='../datasets/embeddings/') -> None:
    """Remember the storage directory and create the embedding generator.

    Args:
      dataset_dir: Directory the ``.npy`` files are written to and read from.
    """
    self.dataset_dir = dataset_dir
    self.embedding_generator = EmbeddingGenerator()

  def save(self, pairs: List["MongoDbPairDoc"]):
    """Embed and store every pair that has no ``.npy`` file yet.

    Args:
      pairs: Documents exposing ``'_id'`` plus whatever
        ``embedding_generator.from_pairs`` reads from them.
    """
    # os.listdir() below would raise on a fresh checkout without this.
    os.makedirs(self.dataset_dir, exist_ok=True)
    stored_pairs_ids = {
      os.path.splitext(file_name)[0]
      for file_name in os.listdir(self.dataset_dir)
      if file_name.endswith('.npy')
    }
    pairs_to_save = [pair for pair in pairs if str(pair['_id']) not in stored_pairs_ids]
    pairs_to_save_count = len(pairs_to_save)
    remaining_pairs = iter(pairs_to_save)

    with tqdm(total=pairs_to_save_count, desc=f"Saving {pairs_to_save_count} pairs into embedding dataset") as progress_bar:
      for code_embeddings, comment_embeddings in self.embedding_generator.from_pairs(pairs_to_save):
        # Batches arrive in input order. `remaining_pairs` must be the LAST
        # zip argument: zip stops as soon as the embeddings run out, before
        # pulling (and losing) the first pair of the next batch.
        for code_embedding, comment_embedding, pair in zip(code_embeddings, comment_embeddings, remaining_pairs):
          np.save(
            os.path.join(self.dataset_dir, f'{pair["_id"]}.npy'),
            self._pack_embeddings(code_embedding.numpy(), comment_embedding.numpy()),
          )
          progress_bar.update(1)

  def get(self, pair_id: str):
    """Load the stored ``[code_embedding, comment_embedding]`` of ``pair_id``.

    Raises:
      FileNotFoundError: If no file exists for ``pair_id``.
    """
    # NOTE(review): allow_pickle=True is needed for object arrays but will
    # execute pickled code; only load files this notebook wrote itself.
    return np.load(os.path.join(self.dataset_dir, f'{pair_id}.npy'), allow_pickle=True)

  def validate(self, pairs: List["MongoDbPairDoc"]):
    """Spot-check the store against freshly generated embeddings.

    One pair is picked at random; its stored embeddings must be exactly equal
    to the regenerated embeddings at that index and at no other index.

    Args:
      pairs: Between 1 and 100 already-stored pairs (only the first generated
        batch is inspected).

    Returns:
      True when the randomly chosen pair is the single exact match.

    Raises:
      ValueError: If ``pairs`` is empty or longer than 100.
    """
    pairs_len = len(pairs)
    if pairs_len == 0:
      raise ValueError("The pairs list must not be empty")
    if pairs_len > 100:
      raise ValueError("The pairs length should be <= 100")

    random_index = random.randint(0, pairs_len - 1)
    code_embeddings, comment_embeddings = next(self.embedding_generator.from_pairs(pairs))
    store_code_emb, store_comment_emb = self.get(str(pairs[random_index]["_id"]))

    correct_indexes = [
      index
      for index, (code_emb, comment_emb) in enumerate(zip(code_embeddings, comment_embeddings))
      if np.array_equal(code_emb, store_code_emb) and np.array_equal(comment_emb, store_comment_emb)
    ]
    return correct_indexes == [random_index]

  @staticmethod
  def _pack_embeddings(code_embedding, comment_embedding):
    """Combine both embeddings into the single array written to disk."""
    if code_embedding.shape == comment_embedding.shape:
      return np.stack([code_embedding, comment_embedding])
    # Different lengths: NumPy >= 1.24 refuses to build a ragged array
    # implicitly, so create the object array explicitly.
    packed = np.empty(2, dtype=object)
    packed[0] = code_embedding
    packed[1] = comment_embedding
    return packed

In [None]:
# Maximum number of pairs pulled from each partition; used as the `.limit(...)`
# of the MongoDB queries in the cells below.
train_samples_count = 10000
test_samples_count = 2000
valid_samples_count = 2000
# Shared handles reused by the following cells.
# NOTE(review): MongoDbClient() presumably opens a database connection here — confirm.
db_client = MongoDbClient()
# Uses the default dataset_dir ('../datasets/embeddings/') and builds its own EmbeddingGenerator.
embedding_dataset = EmbeddingDataset()

In [None]:
def create_tf_dataset(pairs_ids: List[str]) -> tf.data.Dataset:
  """Stream the stored embeddings of ``pairs_ids`` as a ``tf.data.Dataset``.

  Args:
    pairs_ids: Pair ids whose embeddings were already written by the
      module-level ``embedding_dataset``.

  Returns:
    A dataset whose elements are dicts with the float32 vectors
    ``"code_embedding"`` and ``"comment_embedding"``, in ``pairs_ids`` order.
  """
  def dataset_generator():
    for pair_id in pairs_ids:
      code_embedding, comment_embedding = embedding_dataset.get(pair_id)

      yield {
        "code_embedding": code_embedding,
        "comment_embedding": comment_embedding,
      }

  # `output_types` is deprecated; `output_signature` also declares the rank
  # (a 1-D vector of unspecified length), which downstream layers can rely on.
  embedding_spec = tf.TensorSpec(shape=(None,), dtype=tf.float32)
  return tf.data.Dataset.from_generator(dataset_generator, output_signature={
    "code_embedding": embedding_spec,
    "comment_embedding": embedding_spec,
  })

In [None]:
# Embed and store the Python pairs of every partition, capped at the
# configured sample count (pairs that already have a file are skipped by save()).
partition_sample_counts = (
  ("train", train_samples_count),
  ("test", test_samples_count),
  ("valid", valid_samples_count),
)
for partition_name, samples_count in partition_sample_counts:
  partition_cursor = db_client.get_pairs_collection().find({ "partition": partition_name, "language": "python" }).limit(samples_count)
  embedding_dataset.save(list(partition_cursor))

In [None]:
# Flip to True to spot-check the stored embeddings (regenerates 10 pairs per partition).
should_validate_embedding_dataset = False

if should_validate_embedding_dataset:
  # All three partitions are validated first, in this order; results are printed afterwards.
  is_train_correct, is_test_correct, is_valid_correct = [
    embedding_dataset.validate(
      list(db_client.get_pairs_collection().find({ "partition": partition_name, "language": "python" }).limit(10))
    )
    for partition_name in ("train", "test", "valid")
  ]
  validation_report = (
    ("train", is_train_correct),
    ("test", is_test_correct),
    ("valid", is_valid_correct),
  )
  for partition_name, is_correct in validation_report:
    print(f'is {partition_name} dataset correct? {is_correct}')

# Training the model

In [None]:
# Supported depths of the hidden stack.
NumDenseLayers = Literal[2, 4, 8]
# Shape of ONE embedding input (batch dimension excluded). The trailing comma
# matters: `(1024)` is the int 1024, not a shape tuple.
# NOTE(review): both model inputs use this shape — confirm the code and the
# comment embedding models really produce vectors of the same length.
input_shape = (1024,) # TODO: Use variables
hidden_layer_activation = 'tanh'
output_activation = 'sigmoid'
# Hidden-layer templates for each supported depth.
# NOTE(review): these are layer INSTANCES created once at cell execution; any
# code that uses them directly in several models shares their weights.
dense_layers: Dict[NumDenseLayers, List] = {
  2: [
    Dense(100, activation=hidden_layer_activation),
    Dense(50, activation=hidden_layer_activation),
  ],
  4: [
    Dense(400, activation=hidden_layer_activation),
    Dense(200, activation=hidden_layer_activation),
    Dense(100, activation=hidden_layer_activation),
    Dense(50, activation=hidden_layer_activation),
  ],
  8: [
    Dense(800, activation=hidden_layer_activation),
    Dense(600, activation=hidden_layer_activation),
    Dense(500, activation=hidden_layer_activation),
    Dense(400, activation=hidden_layer_activation),
    Dense(300, activation=hidden_layer_activation),
    Dense(200, activation=hidden_layer_activation),
    Dense(100, activation=hidden_layer_activation),
    Dense(50, activation=hidden_layer_activation),
  ],
}
# Not referenced by the cells shown in this notebook.
dropout_rate=0.1

In [None]:
class ConstrastiveLoss(Loss):
  """Contrastive loss over a single score per sample.

  Per element the loss is
  ``(1 - y_true) * y_pred**2 + y_true * max(margin - y_pred, 0)**2``.
  """

  def __init__(self, reduction=losses_utils.ReductionV2.AUTO, name="constrastive_loss", margin=1):
    """Create the loss.

    Args:
      reduction: Reduction applied by the Keras ``Loss`` base class.
      name: Name of the loss instance.
      margin: Margin used in the ``y_true == 1`` term.
    """
    self.margin = margin
    # Keywords instead of positionals: independent of the base-class argument order.
    super().__init__(reduction=reduction, name=name)

  def call(self, y_true, y_pred):
    """Compute the loss of each sample.

    Args:
      y_true: Labels (0 or 1), broadcastable to ``y_pred``.
      y_pred: Predicted scores.

    Returns:
      The element-wise loss averaged over the last axis; the configured
      ``reduction`` is then applied by the base class. With the default
      reduction and one score per sample this yields the mean over the batch.
    """
    y_pred = tf.convert_to_tensor(y_pred)
    # Integer labels would otherwise fail to multiply with float predictions.
    y_true = tf.cast(y_true, y_pred.dtype)

    square_pred = tf.math.square(y_pred)
    margin_square = tf.math.square(tf.math.maximum(self.margin - y_pred, 0))
    per_element_loss = (1 - y_true) * square_pred + y_true * margin_square
    return tf.math.reduce_mean(per_element_loss, axis=-1)

  def get_config(self):
    """Return the config including ``margin`` so the loss can be re-created."""
    config = super().get_config()
    config["margin"] = self.margin
    return config

def build_model(num_hidden_layers: NumDenseLayers):
  """Build and compile the dense comparator of a code/comment embedding pair.

  The two embeddings are concatenated, passed through the hidden stack
  configured in ``dense_layers`` and reduced to a single output unit.

  Args:
    num_hidden_layers: Key into ``dense_layers`` (2, 4 or 8).

  Returns:
    A compiled Keras ``Model`` named ``"embedding_comparator"`` with the inputs
    ``"code_embedding"`` and ``"comment_embedding"``.

  Raises:
    KeyError: If ``num_hidden_layers`` is not a key of ``dense_layers``.
  """
  code_input = Input(
    shape=input_shape,
    name="code_embedding",
  )
  comment_input = Input(
    shape=input_shape,
    name="comment_embedding",
  )

  concatenated_inputs = Concatenate()([code_input, comment_input])
  # The entries of `dense_layers` are module-level layer instances. Re-create
  # them from their config so every call (e.g. re-running the cell) gets fresh,
  # untrained weights instead of sharing the layers of a previous model.
  hidden_stack = [
    type(template).from_config(template.get_config())
    for template in dense_layers[num_hidden_layers]
  ]
  hidden_layers = Sequential(hidden_stack, name="hidden_layers")(concatenated_inputs)
  output = Dense(1, activation=output_activation, name="output")(hidden_layers)
  model = Model(
    inputs=[code_input, comment_input],
    outputs=output,
    name="embedding_comparator"
  )

  model.compile(
    optimizer=Adam(),
    loss=ConstrastiveLoss(),
    metrics=[
      BinaryAccuracy(),
      Precision(name="precision"),
      Recall(name="recall"),
      # f1_score, # TODO: Reactivate
    ],
  )

  return model

In [None]:
# Comparator with the 2-layer hidden stack, logging to a time-stamped TensorBoard run.
embedding_comparator = build_model(num_hidden_layers=2)
run_timestamp = datetime.now().strftime('%Y%m%d-%H%M%S')
tensor_board_callback = callbacks.TensorBoard(log_dir=f"../logs/scalars/{run_timestamp}")

# Ids (as strings) of the Python pairs used for training and validation.
train_cursor = db_client.get_pairs_collection().find({ "partition": "train", "language": "python" }).limit(train_samples_count)
train_pairs = [str(pair_doc['_id']) for pair_doc in train_cursor]
valid_cursor = db_client.get_pairs_collection().find({ "partition": "valid", "language": "python" }).limit(valid_samples_count)
valid_pairs = [str(pair_doc['_id']) for pair_doc in valid_cursor]

# def map_to_train(sample, target):
#     return (sample, target)

# positive_pairs = create_tf_dataset(train_pairs).map(lambda sample: map_to_train(sample, 1))
# negative_pairs = positive_pairs.shuffle(buffer_size=int(train_samples_count * 0.4))
# pairs_dataset = tf.data.Dataset.choose_from_datasets([positive_pairs, negative_pairs], [0, 1])
# valid_pairs = create_tf_dataset(valid_pairs).map(lambda sample: map_to_train(sample, 1)).batch(100)

# results = embedding_comparator.fit(
#     pairs_dataset,
#     validation_data=valid_pairs,
#     epochs=10,
#     callbacks=[tensor_board_callback],
# )

In [None]:
def set_sample_target(sample, target: int):
  """Return a copy of ``sample`` whose ``'target'`` entry is ``target``.

  Args:
    sample: Mapping of feature name to value.
    target: Label to attach (replaces any existing ``'target'``).

  Returns:
    A new dict; ``sample`` itself is left unmodified so the same source sample
    can be labelled differently by several pipelines.
  """
  return {**sample, 'target': target}

# Label the stored pairs with target 0, then derive a second stream with
# target 1 that is shuffled through a buffer of 20% of the train sample count.
train_dataset = create_tf_dataset(train_pairs).map(lambda sample: set_sample_target(sample, 0))
negative_pairs_ds = (
  train_dataset
  .map(lambda sample: set_sample_target(sample, 1))
  .shuffle(buffer_size=int(train_samples_count * 0.2))
)

# Count the positions at which both streams deliver exactly the same pair.
is_equal_count = 0
for positive, negative in zip(train_dataset, negative_pairs_ds):
  same_code = np.array_equal(positive['code_embedding'], negative['code_embedding'])
  if same_code and np.array_equal(positive['comment_embedding'], negative['comment_embedding']):
    is_equal_count += 1

print(f'equal pairs: {is_equal_count}')


In [None]:
# Print the labels of the first 10 samples of a 0-labelled and a 1-labelled stream.
# The original cell read an undefined name `x`, so it failed on a fresh kernel.
# NOTE(review): the stored train embeddings are assumed to be the intended
# source — confirm.
sample_dataset = create_tf_dataset(train_pairs)
x_with_target = sample_dataset.map(lambda sample: set_sample_target(sample, 0))
neg_x = sample_dataset.map(lambda sample: set_sample_target(sample, 1))
for positive, negative in zip(x_with_target.take(10), neg_x.take(10)):
    print(f"positive label: {positive['target']} negative target: {negative['target']}" )


In [None]:
import tensorflow as tf

# Toy experiment: interleave a dataset with a shuffled, negated copy of itself.
# The choice stream holds 20 values (0 and 1, ten times each), shuffled through a buffer of 5.
choice_dataset = tf.data.Dataset.range(2).repeat(10).shuffle(5)
dataset = tf.data.Dataset.range(10)
# Same ten values, shuffled through a buffer of 5 and negated.
negative_ds = dataset.shuffle(5).map(lambda i: -i)

# Each choice value selects which dataset supplies the next element
# (0 -> dataset, 1 -> negative_ds).
all_data = tf.data.Dataset.choose_from_datasets([dataset, negative_ds], choice_dataset)

for i in all_data:
  print(i)

In [None]:
def euclidean_distance(vects):
  """Euclidean distance between two (batches of) vectors.

  Args:
    vects: A pair ``[x, y]`` of tensors with the same shape; the last axis is
      the feature axis.

  Returns:
    The distance along the last axis: a scalar for two 1-D vectors, one
    distance per row for batched input. The squared sum is clamped to the
    Keras epsilon before the square root.
  """
  x, y = vects
  # axis=-1 (not 0): for batched input, axis 0 would sum across samples.
  sum_square = tf.math.reduce_sum(tf.math.square(x - y), axis=-1)
  return tf.math.sqrt(tf.math.maximum(sum_square, tf.keras.backend.epsilon()))

def build_siamese_model():
  """Build and compile the similarity-based comparator.

  The normalised dot product of the ``"code_embedding"`` and
  ``"comment_embedding"`` inputs feeds a single sigmoid unit.

  Returns:
    A compiled Keras ``Model`` named ``"siamese_model"``.
  """
  embedding_inputs = [
    Input(shape=input_shape, name=input_name)
    for input_name in ("code_embedding", "comment_embedding")
  ]
  similarity_score = Dot(normalize=True, axes=1)(embedding_inputs)

  # normal_layer = BatchNormalization()(concatenated_inputs)
  prediction = Dense(1, activation="sigmoid")(similarity_score)

  siamese_model = Model(inputs=embedding_inputs, outputs=prediction, name="siamese_model")
  siamese_model.compile(
    optimizer=Adam(),
    loss=ConstrastiveLoss(),
    metrics=[
      BinaryAccuracy(),
      Precision(name="precision"),
      Recall(name="recall"),
      # f1_score, # TODO: Reactivate
    ],
  )
  return siamese_model

In [None]:
# siamese_model = build_siamese_model()
# siamese_model.fit(
#     pairs_dataset,
#     validation_data=valid_pairs,
#     epochs=10,
#     callbacks=[tensor_board_callback],
# )

In [None]:
# test_pairs = [str(pair['_id']) for pair in list(db_client.get_pairs_collection().find({ "partition": "train", "language": "python" }).limit(test_samples_count))]
# test_dataset = create_tf_dataset(test_pairs, for_training=False).batch(100)

# predicts = siamese_model.predict(test_dataset)
# predicts

In [None]:
import matplotlib.pyplot as plt

def plot_embedding_sample(sample):
  """Plot the code and the comment embedding of one sample side by side.

  Args:
    sample: Mapping with the sequences ``'code_embedding'`` and
      ``'comment_embedding'``.

  Returns:
    The matplotlib figure: code (blue) on the left, comment (orange) on the
    right, with a figure-level legend.
  """
  fig, (code_axs, comment_axs) = plt.subplots(ncols=2, nrows=1)

  panels = (
    (code_axs, 'code_embedding', "code", "blue"),
    (comment_axs, 'comment_embedding', "comment", "orange"),
  )
  for axis, sample_key, label, color in panels:
    axis.plot(sample[sample_key], label=label, color=color)

  fig.legend()
  return fig

In [None]:
# Embed one Python training pair for the plot in the next cell.
pair = MongoDbClient().get_pairs_collection().find_one({"language": "python", "partition": "train" })
if pair is None:
    raise ValueError("Not found")

# from_pairs() is a generator of (codes, comments) batches. Unpacking the
# generator itself fails for a single pair ("not enough values to unpack"),
# so take its first — and here only — batch.
code, comment = next(EmbeddingGenerator().from_pairs([pair]))

In [None]:
# Row 0 of each batch is the single pair embedded in the previous cell.
embedding_sample = {
    "code_embedding": code.numpy()[0],
    "comment_embedding": comment.numpy()[0],
}
fig = plot_embedding_sample(embedding_sample)
fig.show()

In [25]:
import os

import orjson
def create_tf_dataset_from_csnet(partition: str) -> "tf.data.Dataset":
  """Read the first gzipped JSONL shard of a CodeSearchNet Python partition.

  Args:
    partition: Partition directory name (the notebook uses ``'train'``).

  Returns:
    A ``tf.data.TextLineDataset`` yielding one raw line per element from
    ``python_<partition>_0.jsonl.gz`` inside the partition directory.
  """
  dataset_dir = os.path.join('../datasets/temp_python/python', 'final', 'jsonl', partition)
  # To read every shard of the partition instead:
  # file_names = [os.path.join(dataset_dir, file_name) for file_name in os.listdir(dataset_dir) if file_name.endswith('.jsonl.gz')]
  # The shard name follows the partition; it used to be hard-coded to the
  # train shard, which does not exist in the other partition directories.
  file_names = [os.path.join(dataset_dir, f'python_{partition}_0.jsonl.gz')]
  return tf.data.TextLineDataset(
    filenames=file_names,
    compression_type='GZIP',
    num_parallel_reads=tf.data.AUTOTUNE,
  )

expected = {"repo": "ageitgey/face_recognition", "path": "examples/face_recognition_knn.py", "func_name": "train", "original_string": "def train(train_dir, model_save_path=None, n_neighbors=None, knn_algo='ball_tree', verbose=False):\n    \"\"\"\n    Trains a k-nearest neighbors classifier for face recognition.\n\n    :param train_dir: directory that contains a sub-directory for each known person, with its name.\n\n     (View in source code to see train_dir example tree structure)\n\n     Structure:\n        <train_dir>/\n        \u251c\u2500\u2500 <person1>/\n        \u2502   \u251c\u2500\u2500 <somename1>.jpeg\n        \u2502   \u251c\u2500\u2500 <somename2>.jpeg\n        \u2502   \u251c\u2500\u2500 ...\n        \u251c\u2500\u2500 <person2>/\n        \u2502   \u251c\u2500\u2500 <somename1>.jpeg\n        \u2502   \u2514\u2500\u2500 <somename2>.jpeg\n        \u2514\u2500\u2500 ...\n\n    :param model_save_path: (optional) path to save model on disk\n    :param n_neighbors: (optional) number of neighbors to weigh in classification. 
Chosen automatically if not specified\n    :param knn_algo: (optional) underlying data structure to support knn.default is ball_tree\n    :param verbose: verbosity of training\n    :return: returns knn classifier that was trained on the given data.\n    \"\"\"\n    X = []\n    y = []\n\n    # Loop through each person in the training set\n    for class_dir in os.listdir(train_dir):\n        if not os.path.isdir(os.path.join(train_dir, class_dir)):\n            continue\n\n        # Loop through each training image for the current person\n        for img_path in image_files_in_folder(os.path.join(train_dir, class_dir)):\n            image = face_recognition.load_image_file(img_path)\n            face_bounding_boxes = face_recognition.face_locations(image)\n\n            if len(face_bounding_boxes) != 1:\n                # If there are no people (or too many people) in a training image, skip the image.\n                if verbose:\n                    print(\"Image {} not suitable for training: {}\".format(img_path, \"Didn't find a face\" if len(face_bounding_boxes) < 1 else \"Found more than one face\"))\n            else:\n                # Add face encoding for current image to the training set\n                X.append(face_recognition.face_encodings(image, known_face_locations=face_bounding_boxes)[0])\n                y.append(class_dir)\n\n    # Determine how many neighbors to use for weighting in the KNN classifier\n    if n_neighbors is None:\n        n_neighbors = int(round(math.sqrt(len(X))))\n        if verbose:\n            print(\"Chose n_neighbors automatically:\", n_neighbors)\n\n    # Create and train the KNN classifier\n    knn_clf = neighbors.KNeighborsClassifier(n_neighbors=n_neighbors, algorithm=knn_algo, weights='distance')\n    knn_clf.fit(X, y)\n\n    # Save the trained KNN classifier\n    if model_save_path is not None:\n        with open(model_save_path, 'wb') as f:\n            pickle.dump(knn_clf, f)\n\n    return knn_clf", "language": 
"python", "code": "def train(train_dir, model_save_path=None, n_neighbors=None, knn_algo='ball_tree', verbose=False):\n    \"\"\"\n    Trains a k-nearest neighbors classifier for face recognition.\n\n    :param train_dir: directory that contains a sub-directory for each known person, with its name.\n\n     (View in source code to see train_dir example tree structure)\n\n     Structure:\n        <train_dir>/\n        \u251c\u2500\u2500 <person1>/\n        \u2502   \u251c\u2500\u2500 <somename1>.jpeg\n        \u2502   \u251c\u2500\u2500 <somename2>.jpeg\n        \u2502   \u251c\u2500\u2500 ...\n        \u251c\u2500\u2500 <person2>/\n        \u2502   \u251c\u2500\u2500 <somename1>.jpeg\n        \u2502   \u2514\u2500\u2500 <somename2>.jpeg\n        \u2514\u2500\u2500 ...\n\n    :param model_save_path: (optional) path to save model on disk\n    :param n_neighbors: (optional) number of neighbors to weigh in classification. Chosen automatically if not specified\n    :param knn_algo: (optional) underlying data structure to support knn.default is ball_tree\n    :param verbose: verbosity of training\n    :return: returns knn classifier that was trained on the given data.\n    \"\"\"\n    X = []\n    y = []\n\n    # Loop through each person in the training set\n    for class_dir in os.listdir(train_dir):\n        if not os.path.isdir(os.path.join(train_dir, class_dir)):\n            continue\n\n        # Loop through each training image for the current person\n        for img_path in image_files_in_folder(os.path.join(train_dir, class_dir)):\n            image = face_recognition.load_image_file(img_path)\n            face_bounding_boxes = face_recognition.face_locations(image)\n\n            if len(face_bounding_boxes) != 1:\n                # If there are no people (or too many people) in a training image, skip the image.\n                if verbose:\n                    print(\"Image {} not suitable for training: {}\".format(img_path, \"Didn't find a face\" if 
len(face_bounding_boxes) < 1 else \"Found more than one face\"))\n            else:\n                # Add face encoding for current image to the training set\n                X.append(face_recognition.face_encodings(image, known_face_locations=face_bounding_boxes)[0])\n                y.append(class_dir)\n\n    # Determine how many neighbors to use for weighting in the KNN classifier\n    if n_neighbors is None:\n        n_neighbors = int(round(math.sqrt(len(X))))\n        if verbose:\n            print(\"Chose n_neighbors automatically:\", n_neighbors)\n\n    # Create and train the KNN classifier\n    knn_clf = neighbors.KNeighborsClassifier(n_neighbors=n_neighbors, algorithm=knn_algo, weights='distance')\n    knn_clf.fit(X, y)\n\n    # Save the trained KNN classifier\n    if model_save_path is not None:\n        with open(model_save_path, 'wb') as f:\n            pickle.dump(knn_clf, f)\n\n    return knn_clf", "code_tokens": ["def", "train", "(", "train_dir", ",", "model_save_path", "=", "None", ",", "n_neighbors", "=", "None", ",", "knn_algo", "=", "'ball_tree'", ",", "verbose", "=", "False", ")", ":", "X", "=", "[", "]", "y", "=", "[", "]", "# Loop through each person in the training set", "for", "class_dir", "in", "os", ".", "listdir", "(", "train_dir", ")", ":", "if", "not", "os", ".", "path", ".", "isdir", "(", "os", ".", "path", ".", "join", "(", "train_dir", ",", "class_dir", ")", ")", ":", "continue", "# Loop through each training image for the current person", "for", "img_path", "in", "image_files_in_folder", "(", "os", ".", "path", ".", "join", "(", "train_dir", ",", "class_dir", ")", ")", ":", "image", "=", "face_recognition", ".", "load_image_file", "(", "img_path", ")", "face_bounding_boxes", "=", "face_recognition", ".", "face_locations", "(", "image", ")", "if", "len", "(", "face_bounding_boxes", ")", "!=", "1", ":", "# If there are no people (or too many people) in a training image, skip the image.", "if", "verbose", ":", "print", "(", "\"Image 
{} not suitable for training: {}\"", ".", "format", "(", "img_path", ",", "\"Didn't find a face\"", "if", "len", "(", "face_bounding_boxes", ")", "<", "1", "else", "\"Found more than one face\"", ")", ")", "else", ":", "# Add face encoding for current image to the training set", "X", ".", "append", "(", "face_recognition", ".", "face_encodings", "(", "image", ",", "known_face_locations", "=", "face_bounding_boxes", ")", "[", "0", "]", ")", "y", ".", "append", "(", "class_dir", ")", "# Determine how many neighbors to use for weighting in the KNN classifier", "if", "n_neighbors", "is", "None", ":", "n_neighbors", "=", "int", "(", "round", "(", "math", ".", "sqrt", "(", "len", "(", "X", ")", ")", ")", ")", "if", "verbose", ":", "print", "(", "\"Chose n_neighbors automatically:\"", ",", "n_neighbors", ")", "# Create and train the KNN classifier", "knn_clf", "=", "neighbors", ".", "KNeighborsClassifier", "(", "n_neighbors", "=", "n_neighbors", ",", "algorithm", "=", "knn_algo", ",", "weights", "=", "'distance'", ")", "knn_clf", ".", "fit", "(", "X", ",", "y", ")", "# Save the trained KNN classifier", "if", "model_save_path", "is", "not", "None", ":", "with", "open", "(", "model_save_path", ",", "'wb'", ")", "as", "f", ":", "pickle", ".", "dump", "(", "knn_clf", ",", "f", ")", "return", "knn_clf"], "docstring": "Trains a k-nearest neighbors classifier for face recognition.\n\n    :param train_dir: directory that contains a sub-directory for each known person, with its name.\n\n     (View in source code to see train_dir example tree structure)\n\n     Structure:\n        <train_dir>/\n        \u251c\u2500\u2500 <person1>/\n        \u2502   \u251c\u2500\u2500 <somename1>.jpeg\n        \u2502   \u251c\u2500\u2500 <somename2>.jpeg\n        \u2502   \u251c\u2500\u2500 ...\n        \u251c\u2500\u2500 <person2>/\n        \u2502   \u251c\u2500\u2500 <somename1>.jpeg\n        \u2502   \u2514\u2500\u2500 <somename2>.jpeg\n        \u2514\u2500\u2500 ...\n\n    :param 
model_save_path: (optional) path to save model on disk\n    :param n_neighbors: (optional) number of neighbors to weigh in classification. Chosen automatically if not specified\n    :param knn_algo: (optional) underlying data structure to support knn.default is ball_tree\n    :param verbose: verbosity of training\n    :return: returns knn classifier that was trained on the given data.", "docstring_tokens": ["Trains", "a", "k", "-", "nearest", "neighbors", "classifier", "for", "face", "recognition", "."], "sha": "c96b010c02f15e8eeb0f71308c641179ac1f19bb", "url": "https://github.com/ageitgey/face_recognition/blob/c96b010c02f15e8eeb0f71308c641179ac1f19bb/examples/face_recognition_knn.py#L46-L108", "partition": "train"}
# Iterate over the first train shard.
# NOTE(review): the loop prints one EMPTY line per record and never uses `i`
# or the `expected` record defined above — presumably an unfinished comparison
# of the parsed record against `expected`; confirm the intent.
for i in create_tf_dataset_from_csnet('train'):
  print()


In [28]:
from cs_net_parser import CSNetParser


# Scan the Python train partition and print the first sample with this URL.
target_url = 'https://github.com/mjirik/imcut/blob/1b38e7cd18a7a38fe683c1cabe1222fe5fa03aa3/imcut/graph.py#L586-L636'

cs_net_parser = CSNetParser()
for tensor in cs_net_parser.to_tf_dataset(partition='train', language='python'):
  sample = cs_net_parser.tensor_to_sample(tensor)
  if sample['url'] != target_url:
    continue
  print(sample)
  break

FileNotFoundError: [Errno 2] No such file or directory: '/Users/beto/Projects/embedding_comparator/src/datasets/temp_python/python/final/jsonl/train'