In [None]:
!pip install google-cloud-aiplatform


In [None]:
# Cloud region passed as `location` when the Vertex AI SDK is initialised.
REGION = 'us-central1'

In [None]:
import vertexai

In [None]:
# Initialise the Vertex AI SDK with the project, region and credentials
# used by this notebook.
# NOTE(review): PROJECT_ID and credentials are not defined in the visible
# cells -- confirm an earlier authentication cell creates them.
vertexai.init(
    project=PROJECT_ID,
    location=REGION,
    credentials=credentials,
)

In [None]:
from google.cloud import bigquery

In [None]:
# BigQuery client used for every query in this notebook.
# NOTE(review): PROJECT_ID and credentials are not defined in the visible
# cells -- confirm an earlier authentication cell creates them.
bq_client = bigquery.Client(
    project=PROJECT_ID,
    credentials=credentials,
)

In [None]:
# SQL that lists the name of every table in the public
# `bigquery-public-data.stackoverflow` dataset by reading its
# INFORMATION_SCHEMA.TABLES view.
QUERY_TABLES = """
SELECT
  table_name
FROM
  `bigquery-public-data.stackoverflow.INFORMATION_SCHEMA.TABLES`
"""

In [None]:
# Submit the table-listing SQL; rows are read when the job is iterated.
query_job = bq_client.query(query=QUERY_TABLES)

In [None]:
# Print every value of every returned row, one value per line.
cell_values = (
    cell
    for table_row in query_job
    for cell in table_row.values()
)
for cell in cell_values:
    print(cell)

In [None]:
# SQL that fetches all columns of three rows from `posts_questions`, used to
# inspect the table. There is no ORDER BY, so which three rows come back is
# not specified by the query.
INSPECT_QUERY = """
SELECT
    *
FROM
    `bigquery-public-data.stackoverflow.posts_questions`
LIMIT 3
"""

In [None]:
import pandas as pd

In [None]:
# Submit the three-row inspection query.
query_job = bq_client.query(query=INSPECT_QUERY)

In [None]:
# Wait for the job, convert the result to Arrow and then to a pandas DataFrame.
stack_overflow_df = (
    query_job
    .result()
    .to_arrow()
    .to_pandas()
)

# Display the fetched rows.
stack_overflow_df.head()

In [None]:
# SQL that selects every column of every row of `posts_questions` (no WHERE
# and no LIMIT). The cell that loads this result wraps the load in try/except
# and reports that the DataFrame is too large for memory.
QUERY_ALL = """
SELECT
    *
FROM
    `bigquery-public-data.stackoverflow.posts_questions` q
"""

In [None]:
# Submit the unrestricted full-table query.
query_job = bq_client.query(query=QUERY_ALL)

In [None]:
# Attempt to materialise the full-table result as a DataFrame. Any failure is
# caught and printed so the notebook keeps running.
# NOTE(review): every Exception is caught, so failures unrelated to memory
# are also reported with the "too large" message.
try:
    stack_overflow_df = (
        query_job
        .result()
        .to_arrow()
        .to_pandas()
    )
except Exception as load_error:
    print('The DataFrame is too large to load into memory.', load_error)

In [None]:
# SQL that builds the tuning dataset: each question in `posts_questions` is
# joined to its accepted answer in `posts_answers`
# (q.accepted_answer_id = a.id). Rows are kept when the question's tags match
# the regex "python" and the answer was created on or after 2020-01-01.
# Output columns:
#   input_text  - question title concatenated with question body
#   output_text - body of the accepted answer
# At most 10000 rows are returned.
# NOTE(review): CONCAT inserts no separator, so the title runs straight into
# the body -- confirm this is intended.
# NOTE(review): REGEXP_CONTAINS is a substring match, so any tag containing
# "python" qualifies (not only the exact tag) -- confirm this is intended.
# NOTE(review): LIMIT without ORDER BY means the query itself does not fix
# which 10000 rows are returned.
QUERY = """
SELECT
    CONCAT(q.title, q.body) as input_text,
    a.body AS output_text
FROM
    `bigquery-public-data.stackoverflow.posts_questions` q
JOIN
    `bigquery-public-data.stackoverflow.posts_answers` a
ON
    q.accepted_answer_id = a.id
WHERE
    q.accepted_answer_id IS NOT NULL AND
    REGEXP_CONTAINS(q.tags, "python") AND
    a.creation_date >= "2020-01-01"
LIMIT
    10000
"""

In [None]:
# Submit the question/accepted-answer query.
query_job = bq_client.query(query=QUERY)

In [None]:
# Load the query result into pandas (this may take some seconds to run).
stack_overflow_df = (
    query_job
    .result()
    .to_arrow()
    .to_pandas()
)

# Preview the first two rows.
stack_overflow_df.head(2)

In [None]:
# Instruction text prepended to every question to form the model prompt.
# Written as a plain (non-f) string because it has no placeholders, and as
# adjacent literals so the exact spaces and newlines are explicit instead of
# depending on backslash continuations inside a triple-quoted string.
INSTRUCTION_TEMPLATE = (
    "Please answer the following Stackoverflow question on Python. "
    "Answer it like you are a developer answering Stackoverflow questions.\n"
    "\n"
    "Stackoverflow question:\n"
)

In [None]:
# Build the prompt column: instruction template, a single space, then the
# question text from `input_text`.
stack_overflow_df['input_text_instruct'] = (
    (INSTRUCTION_TEMPLATE + ' ')
    + stack_overflow_df['input_text']
)

In [None]:
from sklearn.model_selection import train_test_split

In [None]:
# Hold out 20% of the rows for evaluation (test_size=0.2), leaving 80% for
# training. random_state is fixed at 42.
train, evaluation = train_test_split(
    stack_overflow_df, test_size=0.2, random_state=42
)

In [None]:
import datetime

In [None]:
# Timestamp embedded in the output filename. The format is YYYYMMDD-HHMMSS:
# it contains no ':' (which is not allowed in Windows filenames), sorts
# chronologically, and has second resolution so runs within the same hour
# do not overwrite each other's files.
date = datetime.datetime.now().strftime("%Y%m%d-%H%M%S")

In [None]:
# Columns exported for tuning: the instruction-prefixed prompt and the answer.
cols = [
    'input_text_instruct',
    'output_text',
]

# Serialise the training split as JSON Lines (one JSON record per line).
tune_jsonl = train[cols].to_json(lines=True, orient="records")

In [None]:
# Name of the JSONL file holding the training split. Kept on one line on
# purpose: a backslash continuation inside the string literal would embed the
# next line's leading indentation (a run of spaces) in the filename.
training_data_filename = f"tune_data_stack_overflow_python_qa-{date}.jsonl"

In [None]:
# Write the JSON Lines training data to the local file; the context manager
# closes the file when the block exits.
with open(training_data_filename, "w") as jsonl_file:
    jsonl_file.write(tune_jsonl)