In [13]:
from datasets import load_dataset
from datasets import Dataset

In [2]:
issues_ds = load_dataset("lewtun/github-issues", split="train")
issues_ds



Dataset({
    features: ['url', 'repository_url', 'labels_url', 'comments_url', 'events_url', 'html_url', 'id', 'node_id', 'number', 'title', 'user', 'labels', 'state', 'locked', 'assignee', 'assignees', 'milestone', 'comments', 'created_at', 'updated_at', 'closed_at', 'author_association', 'active_lock_reason', 'pull_request', 'body', 'timeline_url', 'performed_via_github_app', 'is_pull_request'],
    num_rows: 3019
})

In [3]:
issues_ds = issues_ds.filter(
    lambda x: (x["is_pull_request"] == False and len(x["comments"]) > 0)
)
issues_ds

Dataset({
    features: ['url', 'repository_url', 'labels_url', 'comments_url', 'events_url', 'html_url', 'id', 'node_id', 'number', 'title', 'user', 'labels', 'state', 'locked', 'assignee', 'assignees', 'milestone', 'comments', 'created_at', 'updated_at', 'closed_at', 'author_association', 'active_lock_reason', 'pull_request', 'body', 'timeline_url', 'performed_via_github_app', 'is_pull_request'],
    num_rows: 808
})

In [4]:
columns = issues_ds.column_names
columns_to_keep = ["title", "body", "html_url", "comments"]
columns_to_remove = set(columns_to_keep).symmetric_difference(columns)
issues_ds = issues_ds.remove_columns(columns_to_remove)
issues_ds

Dataset({
    features: ['html_url', 'title', 'comments', 'body'],
    num_rows: 808
})

In [5]:
issues_ds[0]

{'html_url': 'https://github.com/huggingface/datasets/issues/2945',
 'title': 'Protect master branch',
 'comments': ['Cool, I think we can do both :)',
  '@lhoestq now the 2 are implemented.\r\n\r\nPlease note that for the the second protection, finally I have chosen to protect the master branch only from **merge commits** (see update comment above), so no need to disable/re-enable the protection on each release (direct commits, different from merge commits, can be pushed to the remote master branch; and eventually reverted without messing up the repo history).'],
 'body': 'After accidental merge commit (91c55355b634d0dc73350a7ddee1a6776dbbdd69) into `datasets` master branch, all commits present in the feature branch were permanently added to `datasets` master branch history, as e.g.:\r\n- 00cc036fea7c7745cfe722360036ed306796a3f2\r\n- 13ae8c98602bbad8197de3b9b425f4c78f582af1\r\n- ...\r\n\r\nI propose to protect our master branch, so that we avoid we can accidentally make this kind of m

In [7]:
len(issues_ds[0]["comments"])

2

In [8]:
issues_ds.set_format("pandas")
df = issues_ds[:]

In [11]:
df["comments"][0].tolist()

['Cool, I think we can do both :)',
 '@lhoestq now the 2 are implemented.\r\n\r\nPlease note that for the the second protection, finally I have chosen to protect the master branch only from **merge commits** (see update comment above), so no need to disable/re-enable the protection on each release (direct commits, different from merge commits, can be pushed to the remote master branch; and eventually reverted without messing up the repo history).']

In [12]:
comments_df = df.explode("comments", ignore_index=True)
comments_df.head(4)

Unnamed: 0,html_url,title,comments,body
0,https://github.com/huggingface/datasets/issues...,Protect master branch,"Cool, I think we can do both :)",After accidental merge commit (91c55355b634d0d...
1,https://github.com/huggingface/datasets/issues...,Protect master branch,@lhoestq now the 2 are implemented.\r\n\r\nPle...,After accidental merge commit (91c55355b634d0d...
2,https://github.com/huggingface/datasets/issues...,Backwards compatibility broken for cached data...,Hi ! I guess the caching mechanism should have...,## Describe the bug\r\nAfter upgrading to data...
3,https://github.com/huggingface/datasets/issues...,Backwards compatibility broken for cached data...,"If it's easy enough to implement, then yes ple...",## Describe the bug\r\nAfter upgrading to data...


In [18]:
comments_ds = Dataset.from_pandas(comments_df)
comments_ds

Dataset({
    features: ['html_url', 'title', 'comments', 'body'],
    num_rows: 2964
})

In [19]:
type(comments_ds[0]["comments"])

str

In [20]:
comments_ds = comments_ds.map(lambda x: {"comment_len": len(x["comments"].split())})
comments_ds = comments_ds.filter(lambda x: x["comment_len"] > 15)
comments_ds

Map:   0%|          | 0/2964 [00:00<?, ? examples/s]

Filter:   0%|          | 0/2964 [00:00<?, ? examples/s]

Dataset({
    features: ['html_url', 'title', 'comments', 'body', 'comment_len'],
    num_rows: 2175
})

In [22]:
def cat(x):
    return {
        "text": x["title"] + "\n" + x["body"] + "\n" + x["comments"]
    }

comments_ds = comments_ds.map(cat)
comments_ds

Map:   0%|          | 0/2175 [00:00<?, ? examples/s]

Dataset({
    features: ['html_url', 'title', 'comments', 'body', 'comment_len', 'text'],
    num_rows: 2175
})

In [23]:
from transformers import AutoTokenizer, AutoModel

In [25]:
model_ckpt = "sentence-transformers/multi-qa-mpnet-base-dot-v1"
tokenizer = AutoTokenizer.from_pretrained(model_ckpt)
model = AutoModel.from_pretrained(model_ckpt)

In [26]:
model

MPNetModel(
  (embeddings): MPNetEmbeddings(
    (word_embeddings): Embedding(30527, 768, padding_idx=1)
    (position_embeddings): Embedding(514, 768, padding_idx=1)
    (LayerNorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
    (dropout): Dropout(p=0.1, inplace=False)
  )
  (encoder): MPNetEncoder(
    (layer): ModuleList(
      (0-11): 12 x MPNetLayer(
        (attention): MPNetAttention(
          (attn): MPNetSelfAttention(
            (q): Linear(in_features=768, out_features=768, bias=True)
            (k): Linear(in_features=768, out_features=768, bias=True)
            (v): Linear(in_features=768, out_features=768, bias=True)
            (o): Linear(in_features=768, out_features=768, bias=True)
            (dropout): Dropout(p=0.1, inplace=False)
          )
          (LayerNorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
          (dropout): Dropout(p=0.1, inplace=False)
        )
        (intermediate): MPNetIntermediate(
          (dense): Linear(in_

In [29]:
comments_ds[0]["title"]

'Protect master branch'

In [35]:
input = tokenizer(comments_ds[0]["title"], padding=True, truncation=True, return_tensors="pt")
input

{'input_ids': tensor([[   0, 4051, 3044, 3593,    2]]), 'attention_mask': tensor([[1, 1, 1, 1, 1]])}

In [46]:
tokenizer.decode(input["input_ids"].squeeze())

'<s> protect master branch </s>'

In [40]:
output = model(**input)
output

BaseModelOutputWithPooling(last_hidden_state=tensor([[[-0.1652, -0.3874, -0.2802,  ..., -0.0648, -0.0930, -0.1708],
         [-0.0116,  0.0443, -0.2461,  ..., -0.1197, -0.0159, -0.1503],
         [-0.0918, -0.0714, -0.2280,  ..., -0.1481,  0.0234, -0.0936],
         [-0.0452, -0.2429, -0.2649,  ..., -0.1260,  0.0675, -0.0753],
         [-0.1162, -0.2402, -0.2831,  ..., -0.1805, -0.0239, -0.2220]]],
       grad_fn=<NativeLayerNormBackward0>), pooler_output=tensor([[-1.0269e-01,  1.5401e-01,  4.0680e-02, -2.4256e-01, -2.3250e-01,
         -1.0266e-01,  3.5946e-02,  9.6521e-02, -1.7180e-01, -6.3049e-02,
          3.8489e-02, -2.6652e-01,  1.3987e-01,  7.9973e-02, -4.9818e-02,
         -1.0918e-01,  1.6311e-03,  1.3704e-01,  5.5705e-02, -1.3963e-01,
         -8.5225e-03, -1.1314e-01, -3.4650e-02,  1.5067e-02,  1.8766e-02,
          1.9006e-01, -1.2701e-01,  1.5054e-02,  2.6048e-01,  1.7298e-02,
          1.8773e-02, -1.5541e-01,  1.0155e-01,  8.8125e-02,  4.5931e-02,
          1.4480e-01, 

In [42]:
output.last_hidden_state.shape

torch.Size([1, 5, 768])

In [43]:
output.pooler_output.shape

torch.Size([1, 768])

In [48]:
output.last_hidden_state[:, 0].shape

torch.Size([1, 768])