# Data Preprocessing

In [14]:
import os

folder_path = "./dataset/raw"

txt_files = [f for f in os.listdir(folder_path) if f.endswith('.txt')]
file_contents = {}

for txt_file in txt_files:
    file_path = os.path.join(folder_path, txt_file)
    with open(file_path, 'r', encoding='utf-8') as f:
        file_contents[txt_file.replace('.txt', '')] = f.read()

In [15]:
# file_contents['lecture1']

In [16]:
import re

def clean_text(text):
    # 시간표시, 링크, 추임새, 공백 정리
    text = re.sub(r'\(\d{2}:\d{2}\)', '', text)
    text = re.sub(r'http\S+', '', text)
    text = re.sub(r'\b(all right|hey|um|uh)\b[.,]?', '', text, flags=re.IGNORECASE)
    text = re.sub(r'[^\S\n]+', ' ', text).strip()
    
    return text

def split_text_into_chucks(text, num_chunks=5):
    # 강의 텍스트를 5개의 chucks로 분리
    chunk_size = len(text) // num_chunks
    
    chunks = []
    
    for i in range(num_chunks):
        start = i * chunk_size
        end = start + chunk_size if i < num_chunks - 1 else len(text)
        chunks.append(text[start:end])
    
    return chunks

In [17]:
for key in file_contents:
    text_cleaned = clean_text(file_contents[key])
    text_chuncks = split_text_into_chucks(text_cleaned)
    file_contents[key] = text_chuncks

# file_contents['lecture1']

# Generate QA Pair with ChatGPT Assistants

In [70]:
import json
import os
import openai
import time
from dotenv import load_dotenv

load_dotenv()

openai.api_key = os.environ.get('OPENAI_KEY')
ASSISTANT_ID = os.environ.get('ASSISTANT_ID')

In [36]:
prompt_first = 'I will provide 5 consecutive lecture transcript parts. After each part, please respond with "Understood" if you have received and understood it. After I provide the fifth part, please generate a dataset of 30 Q&A pairs based on the entire transcript.'

prompt_last = """
    I have now provided all 5 parts of the transcript. Please create a 30 Q&A datasets focused on the principles of AI, relevant mathematics, and practical applications based on the lecture.
    
    Make sure the questions cover both theoretical knowledge and how the concepts can be applied in real-world scenarios.
    
    Format the response strictly in JSON, using the following structure:

    [
      {
        "Question": "",
        "Answer": ""
      },
      {
        "Question": "",
        "Answer": ""
      },
      ...
    ]
"""

In [45]:
def create_new_thread():
  thread = openai.beta.threads.create()
  return thread

def submit_message(assistant_id, thread_id, message):
  openai.beta.threads.messages.create(
      thread_id=thread_id,
      role='user',
      content=message
  )
  run = openai.beta.threads.runs.create(
      thread_id=thread_id,
      assistant_id=assistant_id
  )
  return run

def wait_on_run(run, thread):
  while run.status == "queued" or run.status == "in_progress":
    run = openai.beta.threads.runs.retrieve(
        thread_id=thread,
        run_id=run.id,
    )
    time.sleep(0.5)
  return run

def get_response(thread_id):
  return openai.beta.threads.messages.list(thread_id=thread_id, order="asc")

def print_message(response):
  for res in response:
    print(f"{res.role.upper()} >> {res.content[0].text.value}\n")

In [38]:
THREAD_ID = create_new_thread().id

In [46]:
run = submit_message(ASSISTANT_ID, THREAD_ID, prompt_first)
run1 = wait_on_run(run, THREAD_ID)

print_message(get_response(THREAD_ID).data[-2:])

USER >> I will provide 5 consecutive lecture transcript parts. After each part, please respond with "Understood" if you have received and understood it. After I provide the fifth part, please generate a dataset of 30 Q&A pairs based on the entire transcript.

ASSISTANT >> Understood. Please provide the first part of the lecture transcript.



In [47]:
lec10 = file_contents['lecture10']
for i in range(5):
  run = submit_message(ASSISTANT_ID, THREAD_ID, lec10[i])
  run1 = wait_on_run(run, THREAD_ID)

  print_message(get_response(THREAD_ID).data[-2:])

USER >> Stanford CS230: Deep Learning | Autumn 2018 | Lecture 10 - Chatbots / Closing Remarks - YouTube


Transcript:
 So hello everyone, and welcome for the last lecture of CS230 Deep Learning. So it's been ten weeks and you've been, you've been studying deep learning all around starting with fully connected networks, understanding how to boost these networks and make them better and then, using recurrent neural networks in the last part as- and convolutional neural networks in the fourth part, to build models for imaging and text and other applications.
 So today's class wrap-up and, the lecture might be, slightly, shorter than usual. But we're going to go over a small case study on conversational assistants to start with which is an industrial topic. we will do a small quiz competition with Menti, and the fastest person who has the best answer will win 400 hours of GPU credits on Amazon.
 [LAUGHTER] So you guys can, can start, can start working on it. we will see some class project 

USER >> o request information from another server.
 And in this case, it might be Axess because the platform we use to enroll in classes is Axess. And then, to retrieve information in order to help the user about their classes, we can probably call Explorecourses, assuming that these, these services have APIs, these services have APIs. Does that make sense? And now, the interesting part is that the enroll function might request some input that you have to identify.
 Those will be the slots, same for the inform function, okay? So, we could train a sequence classifier, either convolutional or record. And this, we're not going to go into the details, you've learned it in the sequence models class. How to detect the slots now. So in terms of data, it's going to look very similar to the previous one, but we will have a sequence to sequence problem now where the user utterance will be, a sequence of words and the slots tag will also be a sequence.
 So, for example, show me the Tuesday the 5t

USER >> his date augmentation makes sense? So these are common tricks you would see in- in various papers and this is an example of one of them. Okay. So we can label automatically when inserting and we can train a sequence-to-sequence model in order to fill in the slots. Okay.
 So let's go on Menti and start the competition, which is the- the most fun. Okay. So let's get back to- to -to where we were. We have a chatbot that is able to answer, for sure, I just enrolled you. The way it does that is that, it receives the user utterance. I want to enroll in CS106A, Winter 2019 to learn coding. It identifies the intent of the user using a sequence classifier, same type of network as you've built for the Emojify assignment.
 And then it also runs another algorithm, which will fill in the slots and here we have all the slots needed. We have the code for the class, we have the quarter and we have the year, the student ID is implicitly given. So we're able to enroll- to enroll the students by 

USER >> this dialogue and the task is for you to rate each of, the responses between one inappropriate, doesn't make sense, to five, 
 highly appropriate and interesting based on how appropriate the response is, to continue the conversation, three is neutral and, if two responses are equally appropriate, you should give them the same score and if you see a response that is not in English, please give it one score. So here's what happens, from a user perspective.
 You will have a conversation, you need to work on your English. Why do you say tha- that about me? well your English is very poor. So this is a conversation. And then the response 1 is, but English is my native language. Response 2 is, what are the reasons come to mind? Response 3 is, here is a funny fact.
 Go is the shortest complete sentence in the English language and then the fourth response is bye doggie [LAUGHTER]. So obviously you have to, you have to score, you have to score these, these responses according to what you

USER >> beans and I love my wife.
 I drink the coffee she makes but sometimes she burns the coffee beans. So I- I found this article on the Internet from a former student that had written an article on how they use machine learning to roast- to - to optimize the roasting of coffee beans, and so I forwarded it to Carol, She wasn't very happy about that. [LAUGHTER], 
 And I raised this as another [LAUGHTER] example of how, all of you, - you know, I would never have thought of applying machine learning to roasting coffee beans. It's just- I mean, you know, I like my coffee but it had never occurred to me to do that. But someone taking a machine learning class, like you guys are, go ahead and come up with a better way of roasting coffee beans using learning algorithms.
 and again, I think- I don't know if this particular person who wrote this blogpost was thinking of building a business out of it. I, I don't know. There might be a business there, there might not, or it might be just a fun 

In [48]:
run = submit_message(ASSISTANT_ID, THREAD_ID, prompt_last)
run1 = wait_on_run(run, THREAD_ID)

print_message(get_response(THREAD_ID).data[-2:])

USER >> 
    I have now provided all 5 parts of the transcript. Please create a 30 Q&A datasets focused on the principles of AI, relevant mathematics, and practical applications based on the lecture.
    
    Make sure the questions cover both theoretical knowledge and how the concepts can be applied in real-world scenarios.
    
    Format the response strictly in JSON, using the following structure:

    [
      {
        "Question": "",
        "Answer": ""
      },
      {
        "Question": "",
        "Answer": ""
      },
      ...
    ]


ASSISTANT >> ```json
[
  {
    "Question": "What does the acronym RNN stand for?",
    "Answer": "Recurrent Neural Network."
  },
  {
    "Question": "Why might convolutional networks work better than recurrent neural networks in some cases?",
    "Answer": "Because they can detect intent by scanning small number of words within the input sequence with trained filters."
  },
  {
    "Question": "What is the purpose of slots in conversational 

In [63]:
response = get_response(THREAD_ID).data[-1:]
json_string =response[0].content[0].text.value

clean_string = json_string.replace('```json', '').replace('```', '').strip()
json_data = json.loads(clean_string)
json_data

[{'Question': 'What does the acronym RNN stand for?',
  'Answer': 'Recurrent Neural Network.'},
 {'Question': 'Why might convolutional networks work better than recurrent neural networks in some cases?',
  'Answer': 'Because they can detect intent by scanning small number of words within the input sequence with trained filters.'},
 {'Question': 'What is the purpose of slots in conversational assistants?',
  'Answer': 'Slots gather multiple information from the user on a specific intent, such as class code, quarter, and year for course enrollment.'},
 {'Question': 'What are the two intents defined for the chatbot in the lecture?',
  'Answer': 'The intents are finding information about a course and enrolling in a class.'},
 {'Question': 'How can one train a model to detect user intent?',
  'Answer': 'By labeling the dataset with pairs of user utterances and their corresponding intent, then using either a recurrent neural network or a convolutional neural network.'},
 {'Question': 'What i

# Json to CSV

In [67]:
import csv

csv_file_path = './dataset/csv/lecture10_QnA.csv'

with open(csv_file_path, mode='w', newline='', encoding='utf-8') as file:
    writer = csv.DictWriter(file, fieldnames=["Question", "Answer"])
    
    writer.writeheader()
    
    for row in json_data:
        writer.writerow(row)