<a href="https://colab.research.google.com/github/arnabksarkar/LLMFinetuning/blob/dev/automate_data_import.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:

# IMPORTANT: RUN THIS CELL IN ORDER TO IMPORT YOUR KAGGLE DATA SOURCES
# TO THE CORRECT LOCATION (/kaggle/input) IN YOUR NOTEBOOK,
# THEN FEEL FREE TO DELETE THIS CELL.
# NOTE: THIS NOTEBOOK ENVIRONMENT DIFFERS FROM KAGGLE'S PYTHON
# ENVIRONMENT SO THERE MAY BE MISSING LIBRARIES USED BY YOUR
# NOTEBOOK.

import os
import sys
from tempfile import NamedTemporaryFile
from urllib.request import urlopen
from urllib.parse import unquote, urlparse
from urllib.error import HTTPError
from zipfile import ZipFile
import tarfile
import shutil

# Number of bytes requested per read while streaming a download to disk.
CHUNK_SIZE = 40960
# Comma-separated list of "<directory>:<URL-encoded download URL>" entries; the
# directory part is created under KAGGLE_INPUT_PATH by the download loop.
# NOTE(review): the URL embeds X-Goog-Date=20241012T154755Z and
# X-Goog-Expires=259200, so this signed link has presumably expired — regenerate
# it before relying on this cell.
DATA_SOURCE_MAPPING = 'gemma/keras/gemma_instruct_2b_en/2:https%3A%2F%2Fstorage.googleapis.com%2Fkaggle-models-data%2F5388%2F11372%2Fbundle%2Farchive.tar.gz%3FX-Goog-Algorithm%3DGOOG4-RSA-SHA256%26X-Goog-Credential%3Dgcp-kaggle-com%2540kaggle-161607.iam.gserviceaccount.com%252F20241012%252Fauto%252Fstorage%252Fgoog4_request%26X-Goog-Date%3D20241012T154755Z%26X-Goog-Expires%3D259200%26X-Goog-SignedHeaders%3Dhost%26X-Goog-Signature%3D580f0d0d884b6dfb172aa9e519e6312f92cea0363a15fd5debcecc5a78a91852053f96a989f95c43ebf589780d8ff656f01d79a3780afe0e111b4e3b8d65075bb81c7037148d604e54c3b3f0988979892b617f1481e33129c97a2ab41b347316a834843f718cbf94f1cdf4015e1723659bd379f45585645435a49a09cc2a7013d57bbb2fc311339b3d9fd6f1883847bdc525be2985d539979524871984eac1427a808a29dc0058456899945202d98c669968a9e5063a29258a2ead2550c3db2acf868916c24849bed3a55879442736d84917556a5f13f5b8ccd7d28ed919641ffa4f88abbb8bf30f49101def7cd78f8549ff4d95cf752a1570c6b3d268ed9e8c'

# Local directories that stand in for Kaggle's input and working folders.
KAGGLE_INPUT_PATH='/kaggle/input'
KAGGLE_WORKING_PATH='/kaggle/working'
# NOTE(review): not referenced anywhere in the visible code.
KAGGLE_SYMLINK='kaggle'

!umount /kaggle/input/ 2> /dev/null
shutil.rmtree('/kaggle/input', ignore_errors=True)
os.makedirs(KAGGLE_INPUT_PATH, 0o777, exist_ok=True)
os.makedirs(KAGGLE_WORKING_PATH, 0o777, exist_ok=True)

try:
  os.symlink(KAGGLE_INPUT_PATH, os.path.join("..", 'input'), target_is_directory=True)
except FileExistsError:
  pass
try:
  os.symlink(KAGGLE_WORKING_PATH, os.path.join("..", 'working'), target_is_directory=True)
except FileExistsError:
  pass

# Download every "<directory>:<URL-encoded URL>" entry of DATA_SOURCE_MAPPING
# into a temporary file and unpack it under KAGGLE_INPUT_PATH/<directory>.
# Failures are reported and the loop moves on to the next entry.
for data_source_mapping in DATA_SOURCE_MAPPING.split(','):
    # Split on the first ':' only so a stray unencoded colon in the URL part
    # cannot break the unpacking into exactly two values.
    directory, download_url_encoded = data_source_mapping.split(':', 1)
    download_url = unquote(download_url_encoded)
    filename = urlparse(download_url).path
    destination_path = os.path.join(KAGGLE_INPUT_PATH, directory)
    try:
        with urlopen(download_url) as fileres, NamedTemporaryFile() as tfile:
            total_length = fileres.headers['content-length']
            print(f'Downloading {directory}, {total_length} bytes compressed')
            # The header can be missing (or zero); in that case the progress
            # bar stays empty instead of raising on int(None) / division by 0.
            total_bytes = int(total_length) if total_length and total_length.isdigit() else 0
            dl = 0
            data = fileres.read(CHUNK_SIZE)
            while len(data) > 0:
                dl += len(data)
                tfile.write(data)
                done = min(50, int(50 * dl / total_bytes)) if total_bytes > 0 else 0
                sys.stdout.write(f"\r[{'=' * done}{' ' * (50-done)}] {dl} bytes downloaded")
                sys.stdout.flush()
                data = fileres.read(CHUNK_SIZE)
            # The tar branch reopens the file by name through a second handle,
            # so buffered bytes must reach the disk before extraction starts.
            tfile.flush()
            if filename.endswith('.zip'):
                with ZipFile(tfile) as zfile:
                    zfile.extractall(destination_path)
            else:
                # Bind the archive to its own name; reusing `tarfile` here would
                # replace the imported module for the rest of the session.
                with tarfile.open(tfile.name) as tar_archive:
                    tar_archive.extractall(destination_path)
            print(f'\nDownloaded and uncompressed: {directory}')
    except HTTPError:
        # HTTPError is handled before OSError so the "expired" hint is shown
        # for HTTP-level failures.
        print(f'Failed to load (likely expired) {download_url} to path {destination_path}')
    except OSError:
        print(f'Failed to load {download_url} to path {destination_path}')

print('Data source import complete.')


## Q&A Generation Version

In [None]:
# from openai import OpenAI

import pandas as pd

# import openai

import os

import glob


In [None]:
# Load the `dotenv` IPython extension, then run its `%dotenv` line magic.
# NOTE(review): presumably this reads variables from a local .env file; the
# recorded output of this cell reports that no .env file was found — confirm
# whether the file is actually needed.
%load_ext dotenv

%dotenv

cannot find .env file


In [None]:
!pip install langchain langchain-google-vertexai



In [None]:
# Configure Keras/XLA through environment variables:
#   KERAS_BACKEND                  - "jax" here ("torch" or "tensorflow" also possible).
#   XLA_PYTHON_CLIENT_MEM_FRACTION - "1.00" to avoid memory fragmentation on the JAX backend.
os.environ.update(
    {
        "KERAS_BACKEND": "jax",
        "XLA_PYTHON_CLIENT_MEM_FRACTION": "1.00",
    }
)

# Import packages

Import Keras and KerasNLP.

**Note: if you are on an Apple Silicon (M-series) Mac, install the JAX libraries from conda-forge.**

In [None]:
import keras

import keras_nlp

In [None]:
from langchain_google_vertexai import GemmaLocalKaggle

In [None]:
# Load the local Gemma instruct model on the JAX backend and keep a handle to
# it; the handle is what gets passed as the `gemma_llm` argument of
# generate_question_and_answer.
gemma_llm = GemmaLocalKaggle(model_name="gemma_instruct_2b_en", keras_backend="jax")

normalizer.cc(51) LOG(INFO) precompiled_charsmap is empty. use identity normalization.


# Generate QA function

Create the function that generates a question and its answer from a text chunk.

In [None]:
def _strip_prompt_echo(prompt, response):
    """Return `response` without a leading copy of `prompt`, whitespace-trimmed."""
    if response.startswith(prompt):
        response = response[len(prompt):]
    return response.strip()


def generate_question_and_answer(text_chunk, gemma_llm, max_tokens=1024):
    """Generate one exam-style question about `text_chunk` and an answer to it.

    Two prompts are sent to the model: the first asks for a single question
    restricted to the context, the second asks for an answer to that question
    using only the context.

    Args:
        text_chunk: Context passage the question and answer must be based on.
        gemma_llm: Object exposing `invoke(prompt, max_tokens=...)` that
            returns the generated text as a string.
        max_tokens: Length limit forwarded to every `invoke` call.
            NOTE(review): a recorded run with a limit of 100 returned only the
            truncated prompt for both values, which suggests the limit covers
            prompt plus completion — confirm against the model wrapper's docs
            and size this budget accordingly.

    Returns:
        A `(question, answer)` tuple of strings, each with any echoed prompt
        prefix removed and surrounding whitespace stripped.
    """
    # Prompt asking for exactly one question grounded in the context.
    question_prompt = f'''You are a Professor writing an exam.

                        Using the provided context: '{text_chunk}',

                        formulate a single question that captures an important fact or insight from the context,

                        e.g. 'Who was Aristophanes?' or 'What are latifundia?'

                          or 'What is ostracism?' or 'Where did Xerxes cross the Hellespont?'

                          or 'When did the battle of Platea occur?' or 'Why did Christianity appeal to slaves?'

                          or 'How did Athens stop class warfare during the Periclean age?'.

                        Restrict the question to the context information provided.'''

    question_response = gemma_llm.invoke(question_prompt, max_tokens=max_tokens)
    # Drop the echoed prompt so only the generated question is embedded in the
    # answer prompt (and returned to the caller).
    question = _strip_prompt_echo(question_prompt, question_response)

    # Prompt asking for an answer to the generated question, context only.
    answer_prompt = f'''Given the context: '{text_chunk}',
            give a detailed, complete answer to the question: '{question}'.
            Use only the context to answer, do not give references.
            Simply answer the question without editorial comments.'''

    answer_response = gemma_llm.invoke(answer_prompt, max_tokens=max_tokens)
    answer = _strip_prompt_echo(answer_prompt, answer_response)

    return question, answer

In [None]:
# Sample context passage (geography of West Bengal) used to try out the
# question/answer generation below. The bracketed markers such as [1] and [2]
# are part of the text that is sent to the model.
text_context = '''West Bengal is on the eastern neck of India, stretching from the Himalayas in the north to the Bay of Bengal in the south.
                It lies between 85 degree 50 minutes and 89 degree 50 minutes east longitude, and 21 degrees 25 minutes and 27 degrees 13 minutes north latitude.[1]
                The state has a total area of 88,752 square kilometres (34,267 sq mi).[2] With Bangladesh on its eastern border, the state forms the ethno-linguistic region of Bengal. To its northeast lie the states of Assam and Sikkim and the country of Bhutan.
                To its southwest is the state of Odisha. To the west, it borders the states of Jharkhand and Bihar, and to the northwest, Nepal.
                The capital of the state is Kolkata, the third-largest urban agglomeration and the seventh-largest city in India.'''

In [None]:
# The function requires the model handle as its second argument; the call
# stays the cell's last expression so the (question, answer) tuple is displayed.
generate_question_and_answer(text_context, gemma_llm)


("You are a Professor writing an exam. \n\n                        Using the provided context: 'West Bengal is on the eastern neck of India, stretching from the Himalayas in the north to the Bay of Bengal in the south. \n                It lies between 85 degree 50 minutes and 89 degree 50 minutes east longitude, and 21 degrees 25 minutes and 27 degrees 13 minutes north latitude.[1] \n                The state has a",
 "Given the context: 'West Bengal is on the eastern neck of India, stretching from the Himalayas in the north to the Bay of Bengal in the south. \n                It lies between 85 degree 50 minutes and 89 degree 50 minutes east longitude, and 21 degrees 25 minutes and 27 degrees 13 minutes north latitude.[1] \n                The state has a total area of 88,752 square kilometres")