# Data Preprocessing

In [1]:
import os

folder_path = "./dataset/raw"

txt_files = [f for f in os.listdir(folder_path) if f.endswith('.txt')]
file_contents = {}

for txt_file in txt_files:
    file_path = os.path.join(folder_path, txt_file)
    with open(file_path, 'r', encoding='utf-8') as f:
        file_contents[txt_file.replace('.txt', '')] = f.read()

In [2]:
# file_contents['lecture1']

In [3]:
import re

def clean_text(text):
    # 시간표시, 링크, 추임새, 공백 정리
    text = re.sub(r'\(\d{2}:\d{2}\)', '', text)
    text = re.sub(r'http\S+', '', text)
    text = re.sub(r'\b(all right|hey|um|uh)\b[.,]?', '', text, flags=re.IGNORECASE)
    text = re.sub(r'[^\S\n]+', ' ', text).strip()
    
    return text

def split_text_into_chucks(text, num_chunks=5):
    # 강의 텍스트를 5개의 chucks로 분리
    chunk_size = len(text) // num_chunks
    
    chunks = []
    
    for i in range(num_chunks):
        start = i * chunk_size
        end = start + chunk_size if i < num_chunks - 1 else len(text)
        chunks.append(text[start:end])
    
    return chunks

In [4]:
for key in file_contents:
    text_cleaned = clean_text(file_contents[key])
    text_chuncks = split_text_into_chucks(text_cleaned)
    file_contents[key] = text_chuncks

# file_contents['lecture1']

# Generate QA Pair with ChatGPT Assistants

In [6]:
import json
import os
import openai
import time
from dotenv import load_dotenv

load_dotenv()

openai.api_key = os.environ.get('OPENAI_KEY')
ASSISTANT_ID = os.environ.get('ASSISTANT_ID')

In [7]:
def create_new_thread():
  thread = openai.beta.threads.create()
  return thread

def submit_message(assistant_id, thread_id, message):
  openai.beta.threads.messages.create(
      thread_id=thread_id,
      role='user',
      content=message
  )
  run = openai.beta.threads.runs.create(
      thread_id=thread_id,
      assistant_id=assistant_id
  )
  return run

def wait_on_run(run, thread):
  while run.status == "queued" or run.status == "in_progress":
    run = openai.beta.threads.runs.retrieve(
        thread_id=thread,
        run_id=run.id,
    )
    time.sleep(0.5)
  return run

def get_response(thread_id):
  return openai.beta.threads.messages.list(thread_id=thread_id, order="asc")

def print_message(response):
  for res in response:
    print(f"{res.role.upper()} >> {res.content[0].text.value}\n")

In [88]:
THREAD_ID = create_new_thread().id

In [89]:
def send_prompt(thread_id, assis_id, prompt):
    run = submit_message(assis_id, thread_id, prompt)
    run_complete = wait_on_run(run, thread_id)

    print_message(get_response(thread_id).data[-2:])

In [90]:
prompt_first = 'I will provide 5 consecutive lecture transcript parts. After each part, please respond with "Understood" if you have received and understood it. After I provide the fifth part, please generate a dataset of 30 Q&A pairs based on the entire transcript.'

prompt_last = """
    I have now provided all 5 parts of the transcript. Please create a 30 Q&A datasets focused on the principles of AI, relevant mathematics, and practical applications based on the lecture.
    
    Make sure the questions cover both theoretical knowledge and how the concepts can be applied in real-world scenarios.
    
    Format the response strictly in JSON, using the following structure:

    [
      {
        "Question": "",
        "Answer": ""
      },
      {
        "Question": "",
        "Answer": ""
      },
      ...
    ]
"""

In [91]:
send_prompt(THREAD_ID, ASSISTANT_ID, prompt_first)

USER >> I will provide 5 consecutive lecture transcript parts. After each part, please respond with "Understood" if you have received and understood it. After I provide the fifth part, please generate a dataset of 30 Q&A pairs based on the entire transcript.

ASSISTANT >> Understood



In [92]:
lec = file_contents['lecture1']
for i in range(5):
  send_prompt(THREAD_ID, ASSISTANT_ID, lec[i])

USER >> Stanford CS230: Deep Learning | Autumn 2018 | Lecture 1 - Class Introduction & Logistics, Andrew Ng - YouTube


Transcript:
 [NOISE] Okay. everyone. morning. Welcome to CS230, Deep Learning. so many of you know that, Deep Learning these days is the latest hardest area of computer science or AI. arguably, Deep Learning is the latest hardest area of, you know, all of human activity, maybe. but this is called CS230 Deep Learning where we hope that we can help you understand the state of the art and become experts at building and applying Deep Learning systems.
 unlike many Stanford classes, this class will be more interactive than, than others, because in this class we offer in the flipped classroom format where we'll ask you to watch a lot of the videos at home, a lot of the deeplearning.AI content hosted on Coursera, thus preserving the classroom in discussion section time for much deeper discussions.
 so to get started, let me, let me first, introduce our teaching team. So, the

USER >>  company. and then, did something similar, Baidu in China or its Chine- you know, which was headquartered in China which kind of helped Baidu go from al- also what was already a great company into today. You know, many people say China's is greatest AI company.
 and I think through work on many projects at Google, many projects at Baidu. And now leading Landing AI helping many companies on many projects and running around to different companies and see many different machine learning projects they have. I think I've been fortunate to learn a lot of lessons, not just about the technical aspects of machine learning, but about the practical know-how aspects of machine learning.
 And, if you, and- and I think that, what you can learn from, you know, the Internet or from a purely academic sources or from reading research papers is a lot of the technical aspects of machine learning and deep learning. But, there are a lot of other practical aspects of how to get these algorithms to wo

USER >> s.
 so, just wrap up with, two last thoughts. I think that, one of the things that excites me these days is I'm hoping, you know, I, I wanna share with you one of the lessons I learned, right? watching the rise of AI in multiple companies and spent lot of time thinking about, you know, what is it that makes a great AI company.
 And one of the lessons I learned, was really, hearing Jeff Bezos speak about what is it that makes for an internet company, right? And I think a lot of lessons, that we learned with the rise of the Internet will be useful, you know, and Internet was maybe one of the last major technological ways of disruption.
 And just as it has a great time to start working on the Internet maybe 20 years ago. I think today is a great time to start working on AI or deep learning. And so, is there a way to turn on the lights on this side as well? Do I- do I control that? [NOISE] Oh, thank you. Oh, thanks again. Great. So, so, I wanna show you one of the lessons I learned

USER >>  things, if you take any two of these classes at the same time. CS229 is machine learning, is the most mathematical of these classes and we go much more, CS229 goes much more into the mathematical derivations of the algorithms.
 CS229A is applied machine learning, is much less mathematical but spends a bit more time on the practical aspects. Is actually the easiest on ramp to machine learning as well as the least mathematical of these classes. CS230 is somewhere in between. It's, it's a bit more, is more mathematical than CS229A, less mathematical than CS230.
 But where CS230 focuses on is on deep learning, which is just one small subset of machine learning but it is the hardest subset of machine learning. Whereas there are a lot of other machine learning algorithms, from your PCA, K-means recommender systems, support vector machines that are also very useful, that I use, you know in my work quite frequently, that we don't teach in CS230 but then it's taught in CS229 and CS229A

ASSISTANT >> Understood

USER >>  are different is, because we wanna allow you to have late days, and Coursera was not built for late days, so we, we put the deadlines later on, on Coursera to allow you to submit even if you, you wanna use a late day. Does that make sense? Okay. So we are also using a, a kind of interactive, - this, this is gonna start course two.
 We, we will use, an interactive, tool that is called Mentimeter, to check in attendance in class and also, for you to answer some interactive questions. So, it's gonna start, next, next week. Sorry, not course two. regarding the grading formula, here it is. So, you have a small part on attendance, that is two percent of the final grade, eight percent on quizzes, 25 percent on programming assignments, and, big part on the midterm, and on the final projects.
 so this is posted on the website, if you want to check it. attendance is taken for in-class lectures for, 15 minutes TA meetings and for the TA sections on Friday. You ca

In [93]:
send_prompt(THREAD_ID, ASSISTANT_ID, prompt_last)

USER >> 
    I have now provided all 5 parts of the transcript. Please create a 30 Q&A datasets focused on the principles of AI, relevant mathematics, and practical applications based on the lecture.
    
    Make sure the questions cover both theoretical knowledge and how the concepts can be applied in real-world scenarios.
    
    Format the response strictly in JSON, using the following structure:

    [
      {
        "Question": "",
        "Answer": ""
      },
      {
        "Question": "",
        "Answer": ""
      },
      ...
    ]


ASSISTANT >> ```json
[
    {
        "Question": "What is the main reason deep learning has taken off recently?",
        "Answer": "The main reason is the scale of data and computation. The digitalization of society has generated vast amounts of data, and advancements in GPU computing have enabled training of very large neural networks."
    },
    {
        "Question": "Why is deep learning considered the 'hardest' area of machine learning?

In [94]:
response = get_response(THREAD_ID).data[-1:]
json_string = response[0].content[0].text.value

clean_string = json_string.replace('```json', '').replace('```', '').strip()
json_data = json.loads(clean_string)
json_data

[{'Question': 'What is the main reason deep learning has taken off recently?',
  'Answer': 'The main reason is the scale of data and computation. The digitalization of society has generated vast amounts of data, and advancements in GPU computing have enabled training of very large neural networks.'},
 {'Question': "Why is deep learning considered the 'hardest' area of machine learning?",
  'Answer': 'Deep learning is considered the hardest because it involves complex models like neural networks that can handle and extract patterns from large-scale data more effectively than traditional algorithms.'},
 {'Question': 'What is the flipped classroom format used in CS230?',
  'Answer': 'In the flipped classroom format, students watch video lectures and complete online exercises at home, reserving classroom time for deeper discussions and interactive learning sessions.'},
 {'Question': 'Who are the co-instructors of CS230?',
  'Answer': 'The co-instructors are Andrew Ng and Kian Katanforosh, 

# Json to CSV

In [87]:
import csv

csv_file_path = './dataset/csv/lecture2_QnA.csv'

with open(csv_file_path, mode='w', newline='', encoding='utf-8') as file:
    writer = csv.DictWriter(file, fieldnames=["Question", "Answer"])
    
    writer.writeheader()
    
    for row in json_data:
        writer.writerow(row)