In [None]:
import os
import re
from getpass import getpass
from time import sleep

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from google.colab import drive
from nltk.tokenize import TextTilingTokenizer
# Mount Google Drive under /content/drive so the notebook can read and write files there.
drive.mount("/content/drive", force_remount=True)

Mounted at /content/drive


### Navigate to the folder that contains your textbook

In [None]:
# Change the working directory so the relative path "textbook.csv" read below resolves here.
os.chdir('/content/drive/My Drive')

### Format of textbook.csv: each row holds the full text of one chapter, so the file is n x 1 with no header row. The first cell is the first chapter.

In [None]:

# Load the textbook; header=None because the first row is already data, not column names.
df = pd.read_csv("textbook.csv", header = None)
# Show how many rows were read.
print(len(df))

16


In [None]:
import nltk
# Fetch the NLTK "punkt" and "stopwords" resources
# (presumably required by the TextTiling segmentation used later; TODO confirm).
nltk.download('punkt')
nltk.download('stopwords')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [None]:
# Install (or upgrade) the Gemini client library quietly, then import it.
!pip install --quiet --upgrade google-generativeai
import google.generativeai as genai
from tqdm import tqdm
import os

### Insert Gemini API Key and select model

In [None]:
# Take the key from the GEMINI_API_KEY environment variable when it is set;
# otherwise prompt for it, so the secret is never stored in the saved notebook.
GEMINI_API_KEY = os.environ.get("GEMINI_API_KEY") or getpass("Gemini API key: ")
genai.configure(api_key=GEMINI_API_KEY)


# Model used for every generation call in this notebook.
model = genai.GenerativeModel("gemini-2.0-flash")

### In case the extraction does not run to completion (e.g., your notebook stops prematurely after 20 minutes), every output is saved as a .txt file in a directory; if a segment's .txt file already exists, it does not need to be regenerated.



In [None]:
# Directory that holds one cached label file per segment; created if it does not exist yet.
result_directory = "/content/drive/MyDrive/gemini_labels_KClist_new"
os.makedirs(result_directory, exist_ok=True)

### Feel free to edit the prompt as you see fit

In [None]:

def get_kc_label(segment, chapter_idx, segment_idx):
    """Ask Gemini for a short knowledge-component label and cache it on disk.

    The label is written to ``{chapter_idx}_{segment_idx}_kc.txt`` inside
    ``result_directory``. A segment whose file already holds a real label is
    skipped, so an interrupted run can be resumed. A file that only holds the
    failure placeholder ``"error"`` is treated as unprocessed and retried, so
    a transient API failure does not permanently block that segment.

    Args:
        segment: Text of the segment to label.
        chapter_idx: Index of the chapter; used in the cache file name.
        segment_idx: Index of the segment within its chapter; used in the
            cache file name.

    Returns:
        None. The outcome is the label file written to ``result_directory``.
    """
    error_label = "error"
    filename = os.path.join(result_directory, f"{chapter_idx}_{segment_idx}_kc.txt")

    if os.path.exists(filename):
        with open(filename, "r") as f:
            cached_label = f.read().strip()
        if cached_label != error_label:
            return  # Already processed
        # Otherwise the previous attempt failed: fall through and try again.

    print(f"Processing Chapter {chapter_idx}, Segment {segment_idx}...")

    prompt = f"Summarize in 1–4 words the knowledge component described in this text:\n\n\"\"\"\n{segment}\n\"\"\""

    sleep(5)  # Prevent rate limit

    try:
        response = model.generate_content(prompt)
        label = response.text.strip()
    except Exception as e:
        print(f"Error at Chapter {chapter_idx}, Segment {segment_idx}: {e}")
        # Keep the placeholder so downstream cells still see "error" for this
        # segment; the check above makes sure it is retried on the next run.
        label = error_label

    with open(filename, "w") as f:
        f.write(label)

In [None]:
## chunks text into appropriate kcs through topic segmentation
def process_all_chapters(df, tokenizer=None):
    """Segment every chapter and request a label for each segment.

    Args:
        df: DataFrame whose first column holds the chapter text, one chapter
            per row.
        tokenizer: Object with a ``tokenize(text)`` method that returns the
            segments of a chapter. Defaults to ``TextTilingTokenizer(w=50,
            k=170)``, the same hyperparameters the CSV-assembly cell uses, so
            the segment indices in the cached file names line up with it.

    Returns:
        None. Labels are written to disk by ``get_kc_label``.
    """
    if tokenizer is None:
        # Build the tokenizer here instead of relying on a global defined in a
        # later cell: on a fresh kernel that global does not exist yet and the
        # resulting NameError would be swallowed below, skipping every chapter.
        tokenizer = TextTilingTokenizer(w=50, k=170)

    for chapter_idx in range(len(df)):
        chapter_text = df.iloc[chapter_idx, 0]

        try:
            segments = tokenizer.tokenize(chapter_text)
        except Exception as e:
            # Best effort: report the chapter that could not be segmented and move on.
            print(f"Skipping Chapter {chapter_idx} due to error: {e}")
            continue

        for segment_idx, segment in enumerate(segments):
            get_kc_label(segment.strip(), chapter_idx, segment_idx)

In [None]:
# Label every segment of every chapter; segments that already have a label file are skipped.
process_all_chapters(df)

Processing Chapter 0, Segment 0...
Processing Chapter 0, Segment 1...
Processing Chapter 0, Segment 2...
Processing Chapter 0, Segment 3...
Processing Chapter 0, Segment 4...
Processing Chapter 0, Segment 5...
Processing Chapter 0, Segment 6...
Processing Chapter 1, Segment 0...
Processing Chapter 1, Segment 1...
Processing Chapter 1, Segment 2...
Processing Chapter 1, Segment 3...
Processing Chapter 1, Segment 4...
Processing Chapter 1, Segment 5...
Processing Chapter 1, Segment 6...
Processing Chapter 1, Segment 7...
Processing Chapter 2, Segment 0...
Processing Chapter 2, Segment 1...
Processing Chapter 2, Segment 2...
Processing Chapter 2, Segment 3...
Processing Chapter 2, Segment 4...
Processing Chapter 2, Segment 5...
Processing Chapter 2, Segment 6...
Processing Chapter 2, Segment 7...
Processing Chapter 2, Segment 8...
Processing Chapter 2, Segment 9...
Processing Chapter 3, Segment 0...
Processing Chapter 3, Segment 1...
Processing Chapter 3, Segment 2...
Processing Chapter 3

In [None]:
import os
import pandas as pd
from nltk.tokenize import TextTilingTokenizer
## w and k are the TextTiling hyperparameters that control how fine-grained the kcs are
tt = TextTilingTokenizer(w=50, k=170)
result_directory = "/content/drive/MyDrive/gemini_labels_KClist_new"

# One record per segment: chapter number, segment text and its cached label.
rows = []

for chapter_idx, chapter_text in enumerate(df.iloc[:, 0]):
    try:
        segments = tt.tokenize(chapter_text)
    except Exception as e:
        print(f"Skipping Chapter {chapter_idx} due to error: {e}")
        continue

    for segment_idx, segment in enumerate(segments):
        label_path = os.path.join(result_directory, f"{chapter_idx}_{segment_idx}_kc.txt")

        # Segments that were never labelled get the placeholder "missing".
        if not os.path.exists(label_path):
            kc_label = "missing"
        else:
            with open(label_path, "r") as f:
                kc_label = f.read().strip()

        rows.append(
            dict(
                chapter_num=chapter_idx,
                segment_text=segment.strip(),
                kc_label=kc_label,
            )
        )

# Write all records to a single CSV next to the label files
output_csv_path = os.path.join(result_directory, "all_chapter_segments_with_kc.csv")
pd.DataFrame(rows).to_csv(output_csv_path, index=False)

### This second run checks whether each chunk is relevant for writing code, and lets you filter out parts of the textbook that shouldn't be a kc (e.g. the history of the name "python")

In [None]:
from time import sleep

def is_relevant_to_solving(prompt_text):
    """Ask Gemini whether a piece of text directly helps with writing code.

    The reply is parsed by taking the first standalone ``0`` or ``1`` token.
    A plain substring test would misread replies such as "10" or
    "0 (not 1)", so digits that are part of a longer word or number are
    ignored.

    Args:
        prompt_text: The textbook segment to classify.

    Returns:
        1 if the model answered 1, 0 if it answered 0, and -1 when the reply
        contains no standalone 0/1 or the request raised an exception.
    """
    prompt = f"""
You are helping filter textbook content to assist students working on programming assignments.

Here is a piece of text. Only answer with 0 or 1. Is it directly helpful for writing a specific piece of code (e.g., solving a homework problem)?

- Answer **1** if the text contains a clear, concise explanation of a concept, coding principle, or logic that would directly help a student write or debug code for a specific problem.
- Answer **0** if the text provides general background, historical context, vague conceptual discussion, mismatched code/text, or anything not immediately actionable for solving a programming task.

Text:
\"\"\"{prompt_text}\"\"\"
"""
    try:
        sleep(5)  # Pause before every request to stay under the rate limit
        response = model.generate_content(prompt)
        answer = response.text.strip().lower()
    except Exception as e:
        print(f"Error: {e}")
        return -1

    # \b keeps "10" or "v1" from counting as an answer while still accepting
    # replies such as "1.", "**1**" or "Answer: 0".
    verdict = re.search(r"\b[01]\b", answer)
    if verdict is None:
        return -1  # unclear result

    print(verdict.group())
    return int(verdict.group())

# Load the labelled segments produced by the CSV-assembly step.
kc_csv_path = os.path.join(result_directory, "all_chapter_segments_with_kc.csv")
kc_df = pd.read_csv(kc_csv_path)

# Classify each segment in row order; the flags line up with kc_df's rows.
relevant_flags = []
for i, segment in kc_df['segment_text'].items():
    print(f"Checking relevance for row {i}...")
    relevant_flags.append(is_relevant_to_solving(segment))

kc_df['relevant'] = relevant_flags

# Save the frame with the new column under a separate file name.
relevance_csv_path = os.path.join(result_directory, "all_chapter_segments_with_kc_and_relevance.csv")
kc_df.to_csv(relevance_csv_path, index=False)
print("✅ Relevance column added and CSV saved.")


Checking relevance for row 0...
0
Checking relevance for row 1...
0
Checking relevance for row 2...
1
Checking relevance for row 3...
0
Checking relevance for row 4...
1
Checking relevance for row 5...
0
Checking relevance for row 6...
1
Checking relevance for row 7...
0
Checking relevance for row 8...
1
Checking relevance for row 9...
0
Checking relevance for row 10...
1
Checking relevance for row 11...
1
Checking relevance for row 12...
1
Checking relevance for row 13...
0
Checking relevance for row 14...
1
Checking relevance for row 15...
1
Checking relevance for row 16...
1
Checking relevance for row 17...
1
Checking relevance for row 18...
1
Checking relevance for row 19...
1
Checking relevance for row 20...
1
Checking relevance for row 21...
1
Checking relevance for row 22...
0
Checking relevance for row 23...
1
Checking relevance for row 24...
1
Checking relevance for row 25...
0
Checking relevance for row 26...
1
Checking relevance for row 27...
1
Checking relevance for row 28.

In [None]:
# Write a final CSV that keeps only the segments flagged as relevant (relevant == 1).
# NOTE: this rebinds `df`, which earlier held the textbook chapters; reload the
# textbook before re-running any earlier cell that reads `df`.

input_csv = os.path.join(result_directory, "all_chapter_segments_with_kc_and_relevance.csv")
df = pd.read_csv(input_csv)

is_relevant = df['relevant'].eq(1)
relevant_df = df.loc[is_relevant].copy()

output_csv = os.path.join(result_directory, "relevant_chapter_segments_only.csv")
relevant_df.to_csv(output_csv, index=False)

print(f"Saved filtered CSV with {len(relevant_df)} relevant segments to:\n{output_csv}")

✅ Saved filtered CSV with 150 relevant segments to:
/content/drive/MyDrive/gemini_labels_KClist_new/relevant_chapter_segments_only.csv
