In [1]:
# https://wikidocs.net/233348

In [2]:
# pip install langchain

In [3]:
# pip install -U langchain-ollama

In [4]:
# pip install langchain_teddynote
#  ollama create llama-3.1-70b-instruct-lorablated.Q4_K_M:latest -f Modelfile

In [5]:

import warnings
import langchain
from langchain_ollama.llms import OllamaLLM
from langchain_core.prompts.few_shot import FewShotPromptTemplate
from langchain_core.prompts import PromptTemplate
from langchain_core.output_parsers import StrOutputParser
from langchain_teddynote.messages import stream_response
from langchain_core.prompts import ChatPromptTemplate
import pandas as pd
import numpy as np
from tqdm import tqdm
import time

# Ignore DeprecationWarning messages so they do not clutter cell output.
warnings.filterwarnings("ignore", category=DeprecationWarning)
# https://wikidocs.net/233348

In [6]:
# Load the source table; the first CSV column is used as the DataFrame index.
# Later cells read its 'question', 'answer' and 'id' columns.
df = pd.read_csv(
    '../../data/part_one_q_output.csv',
    index_col=0,
)

In [None]:
# Peek at a single question.
# NOTE(review): `[0]` on a Series is a label lookup when the index is
# integer-typed — presumably the CSV index contains the label 0; confirm.
df['question'][0]

In [8]:
# Parser placed at the end of the chain to turn the model output into a string.
parser = StrOutputParser()

# Ollama-served model that produces every completion in this notebook.
llm = OllamaLLM(
    model="llama-3.1-70b-instruct-lorablated.Q4_K_M:latest",
)
# Alternative model: OllamaLLM(model="llama3.1:70b")


In [None]:
# Number of rows in the loaded table.
len(df)

In [10]:
# Frequency of each distinct 'answer' value, flattened into a regular frame
# so the label and its count are both ordinary columns.
answer_counts = df['answer'].value_counts()
df_plot = answer_counts.to_frame().reset_index()

In [None]:
# Spot check: count stored for the "Basic" difficulty label.
is_basic = df_plot['answer'] == 'Difficulty class : Basic'
df_plot.loc[is_basic, 'count'].values[0]

In [None]:
import matplotlib.pyplot as plt
import numpy as np

# Question count per difficulty label, in a fixed
# Basic -> Intermediate -> Advanced order (dicts preserve insertion order).
di_dict = {
    label: df_plot.loc[df_plot['answer'] == label, 'count'].values[0]
    for label in (
        'Difficulty class : Basic',
        'Difficulty class : Intermediate',
        'Difficulty class : Advanced',
    )
}

# One bar position per difficulty class; used by the plotting cell.
x = np.arange(3)
print(list(di_dict.values()))

In [None]:
# Bar chart of the number of questions in each difficulty class.
# Tick labels drop the shared 'Difficulty class : ' prefix for readability.
class_names = [name.replace('Difficulty class : ', '') for name in di_dict.keys()]

fig, ax = plt.subplots()
ax.bar(
    x,
    list(di_dict.values()),
    color=['gold', 'darkgreen', 'lightskyblue'],
    width=0.8,
    label=list(di_dict.keys()),
)
ax.set_xticks(x)
ax.set_xticklabels(class_names)

ax.set_xlabel('Difficulty Class')
ax.set_ylabel('# of Questions')
ax.set_title('Distribution of Questions Difficulty')

plt.show()


In [14]:
# For every difficulty label, the DataFrame index labels of its questions.
diff_idx = {
    label: df.index[(df['answer'] == label).to_numpy()].tolist()
    for label in di_dict
}

In [None]:
# Pick the few-shot examples: a small random sample of questions from each
# difficulty class, drawn without replacement.
#
# The sample is taken from `value` itself (the DataFrame index labels of the
# class). Sampling from np.arange(len(value)) would return positions inside
# the class list, which are not valid row labels: downstream cells use these
# ids with df.loc and subtract them from df.index, so positions would select
# rows unrelated to the intended class.
#
# A single seeded generator is created once, outside the loop, so the draw is
# reproducible without every class replaying the same random stream.
N_FEWSHOT_PER_CLASS = 2
rng = np.random.default_rng(1111)

diff_s_idx = {}
for key, value in diff_idx.items():
    # Report only the class size; the full index list is too long to print.
    print(key, len(value))
    # Never request more samples than the class contains.
    n_draw = min(N_FEWSHOT_PER_CLASS, len(value))
    diff_s_idx[f'{key}_sample_idx'] = rng.choice(value, size=n_draw, replace=False)

In [None]:
# Inspect the sampled ids stored for each difficulty class.
diff_s_idx

In [17]:
from itertools import chain

# Flatten the per-class samples into one list of few-shot question ids.
fewshot_q_id = list(chain(*diff_s_idx.values()))


In [18]:
# Evaluation set: every index label of df that is not a few-shot example.
# np.setdiff1d returns the sorted unique values.
eval_q_id = np.setdiff1d(df.index.tolist(), fewshot_q_id)

In [19]:
# One {"question", "answer"} dict per few-shot row, with both values cast to str.
examples = [
    {
        "question": str(df.loc[row_id, 'question']),
        "answer": str(df.loc[row_id, 'answer']),
    }
    for row_id in fewshot_q_id
]

In [None]:
# Template that renders a single worked example (question followed by answer).
example_prompt = PromptTemplate.from_template("Question:\n{question}\nAnswer:\n{answer}")

# Render the first example to check the formatting by eye.
first_example = examples[0]
print(example_prompt.format(**first_example))

In [22]:
# Few-shot prompt: each entry of `examples` is rendered with `example_prompt`,
# then the suffix appends the new question and an open "Answer:" slot.
prompt = FewShotPromptTemplate(
    example_prompt=example_prompt,
    examples=examples,
    input_variables=["question"],
    suffix="Question:\n{question}\nAnswer:",
)

In [None]:
# Run the few-shot chain on every evaluation question and collect the raw
# model responses in `eval_result` (columns: 'id', 'result').
#
# The chain does not depend on the question, so it is built once up front.
# NOTE: this name shadows itertools.chain imported in an earlier cell; it is
# kept as `chain` so any later cell that refers to it keeps working.
chain = prompt | llm | parser

# Collect plain records and build the frame once afterwards; growing a
# DataFrame with pd.concat inside the loop copies all previous rows each time.
records = []
for idx in tqdm(eval_q_id):
    question = df.loc[idx, 'question']

    # Invoke the chain
    response = chain.invoke({"question": question})
    records.append({'id': df.loc[idx, 'id'], 'result': response})

# `columns=` keeps the expected schema even when there is nothing to evaluate.
eval_result = pd.DataFrame(records, columns=['id', 'result'])

In [23]:
# Attach each model response to its source row; rows without a response
# are dropped because the default join is an inner join on 'id'.
result_df = df.merge(eval_result, on='id')

In [None]:
# Preview the merged frame (source columns plus the model 'result').
result_df.head()

In [25]:
# From the gold answer, keep the second piece after splitting on runs of
# ':' or '|' (for 'Difficulty class : Basic' that is ' Basic').
answer_pieces = result_df['answer'].str.split(r"[|:]+")
result_df['answer_diff_class'] = answer_pieces.map(lambda pieces: pieces[1])


In [26]:
# From the model response, keep the first piece before any run of ':' or '|'.
# NOTE(review): the gold answer takes piece [1] while the response takes
# piece [0] — presumably the response starts with the class name; confirm
# against actual model output.
result_pieces = result_df['result'].str.split(r"[|:]+")
result_df['result_diff_class'] = result_pieces.map(lambda pieces: pieces[0])


In [None]:
# Preview the frame with the newly extracted difficulty-class columns.
result_df.head()

In [None]:
# Display the result frame (notebook rendering truncates long frames).
result_df

In [46]:
# Trim surrounding whitespace from both extracted class strings.
for class_col in ('answer_diff_class', 'result_diff_class'):
    result_df[class_col] = result_df[class_col].str.strip()

In [47]:
# Reduce each class string to its first space-separated token and store it
# in a sibling column with a '_0' suffix.
for class_col in ('answer_diff_class', 'result_diff_class'):
    tokens = result_df[class_col].str.split(' ')
    result_df[f'{class_col}_0'] = tokens.map(lambda parts: parts[0])

In [48]:
# 1 where the predicted class token matches the gold class token, else 0.
is_match = result_df['answer_diff_class_0'] == result_df['result_diff_class_0']
result_df['equal_yn'] = is_match.astype(int)

In [None]:
# Accuracy in percent: matching rows divided by all evaluated rows.
n_correct = result_df['equal_yn'].sum()
n_total = len(result_df)
acc = (n_correct / n_total) * 100
print(acc)

In [None]:
# Rows where the predicted class differs from the gold class.
# NOTE(review): a cell only displays its last expression, so if this line
# and the trailing `df` share one cell, the mismatch table is never shown
# and `df` is rendered instead — confirm which output is intended.
result_df[result_df['equal_yn'] ==0]

df