In [None]:
import os
import pickle

import numpy as np
import pandas as pd
from datasets import DatasetDict
from datasets import load_dataset
from huggingface_hub import login
from tqdm.auto import tqdm



def calc_avg_line_length(text):
    """Return the mean number of characters per line of ``text``.

    Lines are delimited by ``'\\n'`` only; a trailing newline therefore
    contributes one extra empty line. An empty string counts as a single
    empty line and yields ``0.0``.
    """
    newline_count = text.count('\n')
    # Characters that belong to lines = everything except the separators;
    # the number of lines is always one more than the number of separators.
    return (len(text) - newline_count) / (newline_count + 1)

def calc_max_line_length(text):
    """Return the length of the longest ``'\\n'``-delimited line in ``text``.

    An empty string is treated as one empty line, so the result is ``0``.
    """
    longest = 0
    for line in text.split('\n'):
        width = len(line)
        if width > longest:
            longest = width
    return longest

def calc_alphanum_fraction(text):
    """Return the share of characters in ``text`` for which ``str.isalnum`` is true.

    Returns the integer ``0`` for empty input, otherwise a float in ``[0, 1]``.
    """
    total_chars = len(text)
    if total_chars == 0:
        return 0
    alnum_chars = len([ch for ch in text if ch.isalnum()])
    return alnum_chars / total_chars


def dataset_from_df(df):
    """Build the per-file statistics frame used for the Hub dataset.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain the columns ``'repo_name'`` and ``'text'``.

    Returns
    -------
    pandas.DataFrame
        A new frame (fresh 0..n-1 index) with the columns ``repo_name``,
        ``text``, ``avg_line_length``, ``max_line_length`` and
        ``alphnanum_fraction``. The last column name is misspelled; it is
        kept as-is because it is part of the produced dataset's schema.

    Raises
    ------
    KeyError
        If ``df`` lacks ``'repo_name'`` or ``'text'``.
    """
    # Pull the two columns out once instead of materialising a row Series
    # with ``df.iloc[i]`` on every iteration.
    repo_names = df['repo_name'].tolist()
    texts = df['text'].tolist()

    avg_line_lengths = []
    max_line_lengths = []
    alphanum_fractions = []
    for text in tqdm(texts):
        avg_line_lengths.append(calc_avg_line_length(text))
        max_line_lengths.append(calc_max_line_length(text))
        alphanum_fractions.append(calc_alphanum_fraction(text))

    return pd.DataFrame({
        'repo_name': repo_names,
        'text': texts,
        'avg_line_length': avg_line_lengths,
        'max_line_length': max_line_lengths,
        'alphnanum_fraction': alphanum_fractions,
    })


def split_dataset(hf_dataset, seed=None):
    """Split ``hf_dataset['train']`` into train / test / valid splits.

    The data is first split 70 % / 30 %. The held-out 30 % is then split
    again 70 % / 30 %: the larger part becomes ``'valid'`` and the smaller
    part becomes ``'test'``. Overall that is roughly 70 % train, 21 % valid
    and 9 % test (exact sizes depend on rounding).

    Parameters
    ----------
    hf_dataset : mapping with a ``'train'`` entry
        The ``'train'`` entry must provide ``train_test_split``.
    seed : int, optional
        Forwarded to both ``train_test_split`` calls so the split can be
        reproduced. The default ``None`` keeps the previous unseeded
        behaviour.

    Returns
    -------
    DatasetDict
        With the keys ``'train'``, ``'test'`` and ``'valid'``.
    """
    train_test_ds = hf_dataset['train'].train_test_split(test_size=0.3, seed=seed)
    # Second split works on the 30 % held out above.
    test_valid = train_test_ds['test'].train_test_split(test_size=0.3, seed=seed)
    train_test_valid_dataset = DatasetDict({
        'train': train_test_ds['train'],
        'test': test_valid['test'],
        'valid': test_valid['train']})
    return train_test_valid_dataset

def huggingface_dataset_from_df(df):
    """Convert ``df`` into a train/test/valid ``DatasetDict``.

    The statistics frame from ``dataset_from_df`` is pickled to
    ``hf_ds.pkl`` in the current working directory, loaded back through
    ``load_dataset("pandas", ...)`` and then split with ``split_dataset``.
    The pickle is only an intermediate hand-off: it is removed once loading
    has finished, and also when pickling or loading fails.

    Parameters
    ----------
    df : pandas.DataFrame
        Input accepted by ``dataset_from_df``.

    Returns
    -------
    DatasetDict
        The result of ``split_dataset``.
    """
    dataset = dataset_from_df(df)
    pickle_path = 'hf_ds.pkl'
    try:
        with open(pickle_path, 'wb') as f:
            pickle.dump(dataset, f)
        hf_dataset = load_dataset("pandas", data_files=pickle_path)
    finally:
        # Do not leave the temporary file behind if anything above raised.
        if os.path.exists(pickle_path):
            os.remove(pickle_path)
    return split_dataset(hf_dataset)


In [None]:
import pandas as pd

# Read the security CSV and expose its 'code' column as 'text', the column
# name that dataset_from_df reads.
file_name = './security.csv'
df = pd.read_csv(file_name).rename(columns={'code': 'text'})
df.head()

In [None]:
# Build the train/test/valid DatasetDict from whatever `df` currently holds
# (the ./security.csv frame when the notebook is run top to bottom).
hf_dataset = huggingface_dataset_from_df(df)
# The token is read from the environment; KeyError if HUGGINGFACE_TOKEN is unset.
hf_token = os.environ['HUGGINGFACE_TOKEN']
login(token=hf_token)
# Upload the DatasetDict to the Hub under this repo name.
hf_dataset.push_to_hub("Python-Security-Code-Dataset")




In [None]:
# NOTE(review): `security_df` is not assigned in any cell visible in this
# notebook, so this cell presumably raises NameError on a fresh kernel —
# confirm, then delete it or point it at the intended frame.
security_df

In [None]:
import pandas as pd

# Read the React CSV and expose its 'JS_files' column as 'text', the column
# name that dataset_from_df reads. Note that this rebinds `df`.
file_name = './react2.csv'
df = pd.read_csv(file_name).rename(columns={'JS_files': 'text'})

df.head()

In [None]:
# Column order shared by both processed CSVs.
PROCESSED_COLUMNS = ['repo_name', 'file_path', 'code', 'query']
# Fixed seed so the shuffled, saved CSVs are identical across runs.
SHUFFLE_SEED = 42


def _fill_drop_shuffle(frame):
    """Fill NaNs column-wise with the column's most common value, drop rows
    that still contain NaN, shuffle the rows and reset the index.

    Raises IndexError if a column has no non-NaN value at all (there is no
    "most common value" to fill with).
    """
    filled = frame.apply(lambda col: col.fillna(col.value_counts().index[0]))
    return (
        filled.dropna()
        .sample(frac=1, random_state=SHUFFLE_SEED)
        .reset_index(drop=True)
    )


# The cell that loads this same file for the Hub upload renames 'JS_files',
# so that is the raw source column; renaming only 'text' would leave no
# 'code' column and the column selection below would raise KeyError.
# 'text' stays in the mapping for CSVs that already use that name
# (rename ignores keys that are absent).
df_react = pd.read_csv('./react2.csv').rename(
    columns={'JS_files': 'code', 'text': 'code'}
)
df_react['query'] = 'react 18.0'
df_security = pd.read_csv('./security.csv')

df_react = _fill_drop_shuffle(df_react)
df_security = _fill_drop_shuffle(df_security)

print(df_react.shape)
print(df_security.shape)

# Put the columns in the same order in both frames.
df_react = df_react[PROCESSED_COLUMNS]
df_security = df_security[PROCESSED_COLUMNS]

# Save the dataframes back to CSV as processed data.
df_react.to_csv('./react_processed.csv', index=False)
df_security.to_csv('./security_processed.csv', index=False)

In [None]:
# Preview the first rows of the processed React frame.
df_react.head()

In [None]:
# Preview the first rows of the processed security frame.
df_security.head()

In [None]:
# Uses `df`, not `df_react`: when run top to bottom `df` is the ./react2.csv
# frame whose 'JS_files' column was renamed to 'text'.
hf_dataset = huggingface_dataset_from_df(df)
# The token is read from the environment; KeyError if HUGGINGFACE_TOKEN is unset.
hf_token = os.environ['HUGGINGFACE_TOKEN']
login(token=hf_token)
# Upload the DatasetDict to the Hub under this repo name.
hf_dataset.push_to_hub("Python-React-Code-Dataset")



In [None]:
import pandas as pd

filename = 'loss.csv'
loss_df = pd.read_csv(filename)
cols = loss_df.columns

# Keep only the step column(s) and the loss column(s); every column whose
# name ends in 'loss' is renamed to plain 'loss', everything else is dropped.
loss_cols = [name for name in cols if name.endswith('loss')]
kept_cols = [name for name in cols if name.endswith('loss') or name.endswith('Step')]
loss_df = loss_df[kept_cols].rename(columns={name: 'loss' for name in loss_cols})

loss_df

In [None]:
import matplotlib.pyplot as plt

# Span passed to pandas' exponentially weighted mean used for smoothing.
EMA_SPAN = 20

ema = loss_df['loss'].ewm(span=EMA_SPAN).mean()

# Explicit axes plus title, labels and legend so the figure can be read on
# its own and the raw and smoothed curves can be told apart.
fig, ax = plt.subplots()
ax.plot(loss_df['loss'], label='loss')
ax.plot(ema, label=f'EMA (span={EMA_SPAN})')
ax.set(title='Loss from loss.csv', xlabel='Row index', ylabel='loss')
ax.legend();

In [None]:
# Display the smoothed loss series computed in the plotting cell.
ema

# Merge LoRA Model with Base Model

## Step 0: Import packages

In [None]:
import os
from getpass import getpass

# Never paste the token into the notebook. Reuse HUGGINGFACE_TOKEN when it is
# already set (previously it was overwritten with an empty string) and
# otherwise prompt for it without echoing.
if not os.environ.get('HUGGINGFACE_TOKEN'):
    os.environ['HUGGINGFACE_TOKEN'] = getpass('Hugging Face access token: ')
hf_token = os.environ['HUGGINGFACE_TOKEN']

In [None]:
import torch
from peft import PeftConfig, PeftModel
from transformers import AutoTokenizer, AutoModelForCausalLM
from huggingface_hub import login
import os
# Authenticate with the Hub; raises KeyError if HUGGINGFACE_TOKEN is not set.
login(token=os.environ['HUGGINGFACE_TOKEN'])
# Hub repo id of the LoRA adapter that the following cells load and merge.
LORA_MODEL_ID = "MuhammedSaeed/LLMJS"


## Step 1: Load the LoRA Model and the Base Model

In [None]:
# The adapter config records which base checkpoint the adapter belongs to.
config = PeftConfig.from_pretrained(LORA_MODEL_ID)
base_model_id = config.base_model_name_or_path

# trust_remote_code=True allows model/tokenizer code shipped in the base
# repo to be executed while loading.
base_model = AutoModelForCausalLM.from_pretrained(
    base_model_id,
    trust_remote_code=True,
)
tokenizer = AutoTokenizer.from_pretrained(
    base_model_id,
    trust_remote_code=True,
)

# Attach the LoRA adapter to the base model.
lora_model = PeftModel.from_pretrained(base_model, LORA_MODEL_ID)

for loaded_model in (lora_model, base_model):
    print(loaded_model)


## Step 2: Merge and Unload the model

In [None]:
# Merge the adapter into the wrapped model and keep the returned model
# for the upload in Step 3.
merged_model = lora_model.merge_and_unload()
print(merged_model)

## Step 3: Push the model to the Hub

In [None]:
# Hub repo id that receives both the merged weights and the tokenizer.
MERGED_MODEL_ID = "ammarnasr/codegen2-1B-react"
# NOTE(review): `use_auth_token` appears to be deprecated in favour of `token`
# in recent transformers releases — confirm against the installed version.
merged_model.push_to_hub(MERGED_MODEL_ID, use_auth_token=True)
tokenizer.push_to_hub(MERGED_MODEL_ID, use_auth_token=True)

## Step 4: Finalizing the model for deployment
- Copy the config.json file from the base model to the merged model
- Copy the configuration*.py file from the base model to the merged model
- Copy the modeling*.py file from the base model to the merged model

## Step 5: Inferencing the model

In [1]:
import torch
import transformers
from transformers import AutoTokenizer, AutoModelForCausalLM
# Same repo id the merged model was pushed to in Step 3.
model_id = "ammarnasr/codegen2-1B-react"
# Load tokenizer and weights of the merged checkpoint; this rebinds `tokenizer`.
tokenizer = AutoTokenizer.from_pretrained(model_id)
model = AutoModelForCausalLM.from_pretrained(model_id)

tokenizer_config.json:   0%|          | 0.00/166k [00:00<?, ?B/s]

To support symlinks on Windows, you either need to activate Developer Mode or to run Python as an administrator. In order to see activate developer mode, see this article: https://docs.microsoft.com/en-us/windows/apps/get-started/enable-your-device-for-development


vocab.json:   0%|          | 0.00/798k [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/2.28M [00:00<?, ?B/s]

added_tokens.json:   0%|          | 0.00/21.8k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/441 [00:00<?, ?B/s]

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


config.json:   0%|          | 0.00/1.24k [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/4.13G [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/111 [00:00<?, ?B/s]

In [None]:
# Prompt the merged model with a function header and print its completion.
text = "def hello_world():"
encoded_prompt = tokenizer(text, return_tensors="pt")
input_ids = encoded_prompt.input_ids
# max_length is passed straight to generate(), as before.
generated_ids = model.generate(input_ids, max_length=128)
completion = tokenizer.decode(generated_ids[0], skip_special_tokens=True)
print(completion)
