In [4]:
import pandas as pd
from datasets import Dataset, DatasetDict
from huggingface_hub import HfApi
from tqdm import tqdm

train_path = "/home/mkapadni/work/crscore_plus_plus/Comment_Generation/msg-train_all_classified.json"
valid_path = "/home/mkapadni/work/crscore_plus_plus/Comment_Generation/msg-valid.jsonl"

# Step 1: Convert dataset to the required format
def convert_to_chat_format(train_path, valid_path):
    """
    Converts the dataset into a list of dicts with 'content' and 'role' keys.
    """
    # Load datasets
    train_df = pd.read_json(train_path, lines=True)
    valid_df = pd.read_json(valid_path, lines=True)

    # Helper function to process each dataset
    def process_dataset(df):
        formatted_data = []
        for _, row in tqdm(df.iterrows(), total=len(df)):
            # First dict: user input
            user_dict = {"content": row["patch"], "role": "user"}
            # Second dict: assistant output
            assistant_dict = {"content": row["msg"], "role": "assistant"}
            # Add both to the list as a conversation pair
            formatted_data.append([user_dict, assistant_dict])
        return formatted_data

    # Process train and validation datasets
    train_data = process_dataset(train_df)
    valid_data = process_dataset(valid_df)

    return train_data, valid_data

# Step 2: Push to Hugging Face Hub
def push_to_hub(train_data, valid_data, repo_name, token):
    """
    Pushes the processed dataset to Hugging Face Hub.
    """
    # Convert lists of dicts to Hugging Face Dataset format
    train_dataset = Dataset.from_dict({"conversations": train_data})
    valid_dataset = Dataset.from_dict({"conversations": valid_data})

    # Combine into a DatasetDict
    dataset_dict = DatasetDict({
        "train": train_dataset,
        "validation": valid_dataset,
    })

    # Save locally before pushing (optional)
    dataset_dict.save_to_disk("./processed_dataset")

    # Push to Hugging Face Hub
    dataset_dict.push_to_hub(repo_name, token=token)



In [5]:
# File paths for your datasets
train_path = "/home/mkapadni/work/crscore_plus_plus/Comment_Generation/msg-train_all_classified.json"
valid_path = "/home/mkapadni/work/crscore_plus_plus/Comment_Generation/msg-valid.jsonl"

# Convert datasets to chat format
train_data, valid_data = convert_to_chat_format(train_path, valid_path)

# Hugging Face Hub details
repo_name = "conferencesubmissionmodel/crscore_SFT"  # Replace with your repo name on HF Hub
hf_token = "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"  # Replace with your personal access token


100%|██████████| 109274/109274 [00:05<00:00, 18914.85it/s]
100%|██████████| 10319/10319 [00:00<00:00, 21406.32it/s]


In [7]:
# Push dataset to the Hub
push_to_hub(train_data, valid_data, repo_name, hf_token)

Saving the dataset (0/1 shards):   0%|          | 0/109274 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/10319 [00:00<?, ? examples/s]

Uploading the dataset shards:   0%|          | 0/1 [00:00<?, ?it/s]

Creating parquet from Arrow format:   0%|          | 0/110 [00:00<?, ?ba/s]

Uploading the dataset shards:   0%|          | 0/1 [00:00<?, ?it/s]

Creating parquet from Arrow format:   0%|          | 0/11 [00:00<?, ?ba/s]

In [1]:
from transformers import AutoTokenizer, AutoModelForCausalLM

tokenizer = AutoTokenizer.from_pretrained("Qwen/Qwen2.5-3B-Instruct")

tokenizer_config.json:   0%|          | 0.00/7.30k [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/2.78M [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/1.67M [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/7.03M [00:00<?, ?B/s]

In [None]:
import pandas as pd
from datasets import Dataset, DatasetDict
from huggingface_hub import HfApi
from tqdm import tqdm

train_path = "/home/mkapadni/work/crscore_plus_plus/Comment_Generation/msg-train_all_classified.json"
train_df = pd.read_json(train_path, lines=True)

In [None]:
train_df.head()

In [None]:
# tokenize first 1000 samples and then give me avg token length

token_lens = []
for i in range(1000):
    token_lens.append(len(tokenizer(train_df["oldf"][i]+train_df["patch"][i])["input_ids"]))

In [3]:
# from unsloth import FastLanguageModel
# from transformers import AutoTokenizer
import torch
import pandas as pd
import numpy as np

valid_path = "/home/mkapadni/work/crscore_plus_plus/Comment_Generation/msg-valid.jsonl"

valid_df = pd.read_json(valid_path, lines=True)

In [4]:
valid_df.head()

Unnamed: 0,patch,y,oldf,idx,id,msg,proj,lang
0,"@@ -231,4 +231,8 @@ def setup_app(app):\n ...",1,# -*- coding: utf-8 -*-\n#\n# This file is par...,1,16014,Should we call it `is_list`?,inveniosoftware-invenio,py
1,"@@ -44,7 +44,7 @@ namespace OpenTelemetry.Trac...",1,"// <copyright file=""TracerProviderBuilderExten...",1,18299,"in the instrumentation example, should we use ...",open-telemetry-opentelemetry-dotnet,.cs
2,"@@ -25,7 +25,7 @@ from scapy.modules.six.moves...",1,## This file is part of Scapy\n## See http://w...,1,12313,Why this change ? Is it useful ?,secdev-scapy,py
3,"@@ -0,0 +1,4 @@\n+const titleNode = virtualNod...",1,,1,15216,"I know this is a nitpick, but don't we always ...",dequelabs-axe-core,js
4,"@@ -37,6 +37,11 @@ public class EMailValidator...",1,package edu.harvard.iq.dataverse;\n\nimport st...,1,37751,We should reformat this emails in the test to ...,IQSS-dataverse,java


In [5]:
print(valid_df['patch'][0])

@@ -231,4 +231,8 @@ def setup_app(app):
         )
         return rv
 
+    @app.template_test('list')
+    def _is_list(value):
+        return isinstance(value, list)
+
     return app


In [6]:
print(valid_df['oldf'][0])

# -*- coding: utf-8 -*-
#
# This file is part of Invenio.
# Copyright (C) 2012, 2013, 2014, 2015 CERN.
#
# Invenio is free software; you can redistribute it and/or
# modify it under the terms of the GNU General Public License as
# published by the Free Software Foundation; either version 2 of the
# License, or (at your option) any later version.
#
# Invenio is distributed in the hope that it will be useful, but
# WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
# General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with Invenio; if not, write to the Free Software Foundation, Inc.,
# 59 Temple Place, Suite 330, Boston, MA 02111-1307, USA.

"""Additional extensions and filters for jinja2 module."""

import re

from flask import _request_ctx_stack, g, request, url_for

from flask_login import current_user

from jinja2 import ChoiceLoader

from six

In [18]:
def get_patch_context(oldf, patch, n=3):
    """
    Display code patch with n lines of context before and after.
    
    Args:
        oldf (str): Original file content
        patch (str): Patch content in unified diff format
        n (int): Number of context lines before and after
        
    Returns:
        str: Integrated code with context
    """
    import re
    
    # Split file into lines
    oldf_lines = oldf.split('\n')
    
    # Extract line number from patch header
    match = re.search(r'@@ -(\d+)', patch)
    if not match:
        return "Could not determine patch location"
    
    # Get starting line number (1-based) and convert to 0-based index
    start_line = int(match.group(1))
    idx = start_line - 1
    
    # Calculate context ranges
    before_start = max(0, idx - n)
    after_start = idx
    after_end = min(len(oldf_lines), after_start + n)
    
    # Extract added lines from patch (remove '+' prefix)
    added_lines = []
    found_header = False
    for line in patch.split('\n'):
        if line.startswith('@@'):
            found_header = True
            continue
        if found_header:
            if line.startswith('+'):
                added_lines.append(line[1:])
    
    # Build result
    result = []
    
    # Lines before patch
    for i in range(before_start, idx):
        result.append(oldf_lines[i])
    
    # Patch content (integrated)
    result.extend(added_lines)
    
    # Lines after patch
    for i in range(after_start, after_end):
        result.append(oldf_lines[i])
    
    return "\n".join(result)

In [23]:
oldf = valid_df['oldf'][0]
patch = valid_df['patch'][0]
n = 5 # Number of lines before and after

result = get_patch_context(oldf, patch, n)
print(result)

        rv = append + u'&'.join(
            u'%s=%s' % (escape(key), escape(value))
            for key, value in d.iteritems(True)
            if value is not None and key not in filter
            # and not isinstance(value, Undefined)
    @app.template_test('list')
    def _is_list(value):
        return isinstance(value, list)

        )
        return rv

    return app

