# GPT Model for Question Answering

## Task

A given dataset showing the students' grades in a subject will require a recommender system that will generate ways to salvage their grade point averages.

1. If the student receives 75% as an overall average:
    - Recommend ways to increase the grade based on the **lowest** grades in their chapters
2. If the student receives anything below 75%:
    - Identify the **lowest** grade subjects
    - Recommend ways to increase the grade based on the lowest grades in the chapters of the subjects.

In [6]:
import pandas as pd
import os

# Define the folder containing the Excel files
folder_path = r"S:\AI NextGen CQI\AI_CQI\Datasets"  # Change this to your actual folder path

# Get all files in the folder
files = os.listdir(folder_path)

# Dictionary to store cleaned DataFrames
cleaned_data = {}

# Lists for generated questions & answers
question_bank = []
answer_bank = []

# Loop through all Excel files in the folder
for file in files:
    if file.endswith(".xlsx") or file.endswith(".xls"):  # Check if it's an Excel file
        file_path = os.path.join(folder_path, file)  # Full path to the file
        print(f"Processing: {file}")

        try:
            # Read the summary sheet (sheet 5)
            summary = pd.read_excel(file_path, sheet_name=5)

            # Data Cleaning: Drop unwanted columns & rows
            summary = summary.drop(['Technological University of the Philippines - Manila', 'Unnamed: 1'], axis=1).drop(list(range(0, 11)), axis=0)

            # Set first two rows as column headers (multi-level)
            summary.columns = summary.iloc[0] + " " + summary.iloc[1].fillna("")

            # Drop first two rows & keep first 8 columns
            summary = summary.iloc[2:, 0:8].reset_index(drop=True).dropna()

            # Clean column names
            summary.columns = summary.columns.str.strip().str.lower().str.replace(" ", "_")

            # Store cleaned summary in dictionary
            cleaned_data[file] = summary

            # Generate questions & answers for each student
            for student_id in summary['student_number']:
                # Calculate CLO Score (Lower than 60)
                CLO_Score = summary[summary['student_number'] == student_id][['prelim_grade', 'midterm_grade', 'final_grade']].mean(axis=1).values[0] * 0.3
                question_bank.append(f"Given the CLO of {CLO_Score:.2f}, what can you say about the student's performance in the course?")
                if CLO_Score >= 60:
                    answer_bank.append("The score is satisfied. The students may be able to improve the areas with their lowest scores to increase the overall CLO score.")
                else:
                    answer_bank.append("The score doesn't meet the requirement. The students should therefore improve their lowest scoring exams and those below the value of 60.")

                # Calculate CLO Score (Higher than 60)
                CLO_Score = CLO_Score + 60
                question_bank.append(f"Given the CLO of {CLO_Score:.2f}, what can you say about the student's performance in the course?")
                if CLO_Score >= 60:
                    answer_bank.append("The score is satisfied. The students may be able to improve the areas with their lowest scores to increase the overall CLO score.")
                else:
                    answer_bank.append("The score doesn't meet the requirement. The students should therefore improve their lowest scoring exams and those below the value of 60.")

        except Exception as e:
            print(f"Error processing {file}: {e}")

# Example: Display generated questions and answers
print("\nSample Questions and Answers:")
for i in range(min(5, len(question_bank))):  # Show only first 5
    print(f"Q: {question_bank[i]}")
    print(f"A: {answer_bank[i]}\n")

# Save questions & answers to a CSV file (optional)
qa_df = pd.DataFrame({"Question": question_bank, "Answer": answer_bank})
qa_df.to_csv("Generated_Questions_Answers.csv", index=False)
print("Questions & Answers saved to 'Generated_Questions_Answers.csv'.")


Processing: AdMath - 1.xlsx
Processing: AdMath - 10.xlsx
Processing: AdMath - 2.xlsx
Processing: AdMath - 3.xlsx
Processing: AdMath - 4.xlsx
Processing: AdMath - 5.xlsx
Processing: AdMath - 6.xlsx
Processing: AdMath - 7.xlsx
Processing: AdMath - 8.xlsx
Processing: AdMath - 9.xlsx
Processing: AdMath L - 1.xlsx
Processing: AdMath L - 10.xlsx
Processing: AdMath L - 2.xlsx
Processing: AdMath L - 3.xlsx
Processing: AdMath L - 4.xlsx
Processing: AdMath L - 5.xlsx
Processing: AdMath L - 6.xlsx
Processing: AdMath L - 7.xlsx
Processing: AdMath L - 8.xlsx
Processing: AdMath L - 9.xlsx
Processing: CAD - 1.xlsx
Processing: CAD - 10.xlsx
Processing: CAD - 2.xlsx
Processing: CAD - 3.xlsx
Processing: CAD - 4.xlsx
Processing: CAD - 5.xlsx
Processing: CAD - 6.xlsx
Processing: CAD - 7.xlsx
Processing: CAD - 8.xlsx
Processing: CAD - 9.xlsx
Processing: CIRC2 - 1.xlsx
Processing: CIRC2 - 10.xlsx
Processing: CIRC2 - 2.xlsx
Processing: CIRC2 - 3.xlsx
Processing: CIRC2 - 4.xlsx
Processing: CIRC2 - 5.xlsx
Proc

# Q and A Proper


In [7]:
display(question_bank[-2])
display(answer_bank[-2])

"Given the CLO of 13.85, what can you say about the student's performance in the course?"

"The score doesn't meet the requirement. The students should therefore improve their lowest scoring exams and those below the value of 60."

# Transformer for Question Answering

In [8]:
# The following code defines new question bank and answer bank different from the previously defined ones.
# General List
course_list_lec = [
    "Fundamentals of ECE", 
    "Engineering Data Analysis (Lec)",
    "Differential Calculus",
    "Computer Aided Drafting",
    "Fundamentals of ICT",
    "Calculus 2", 
    "Computer Programming 1",
    "Materials Science and Engineering",
    "Physics 2 (Lec)",  
    "ECE Laws, Contracts, Ethics, Standards, & Safety", 
    "Electronics 1 (Lec)",
    "Differential Equation", 
    "Engineering Economics", 
    "Computer Programming 2", 
    "Circuits 1 (Lec)", 
    "Advanced Engineering Mathematics (Lec)", 
    "Electronics 2 (Lec)", 
    "Electromagnetics", 
    "Communications 1 (Lec)", 
    "Engineering Management", 
    "Circuits 2 (Lec)", 
    "Digital Electronics 1 (Lec)", 
    "Electronics 3 (Lec)",
    "Communications 2 (Lec)",  
    "Signals, Spectra, and Signal Processing (Lec)", 
    "Feedback and Control Systems (Lec)", 
    "Digital Electronics 2 (Lec)", 
    "Communications 3 (Lec)", 
    "Communications 4 (Lec)",
    "Technopreneurship 101", 
    "Environmental Science and Engineering",
    "ECE Elective 1 (Lec)",  
    "ECE Elective 2 (Lec)", 


]

course_list_lab = [
    "Engineering Data Analysis (Lab)",
    "Physics 2 (Lab)", 
    "Electronics 1 (Lab)",
    "Electronics 2 (Lab)",
    "Electronics 3 (Lab)", 
    "Circuits 1 (Lab)",
    "Circuits 2 (Lab)", 
    "Advanced Engineering Mathematics (Lab)", 
    "Digital Electronics 1 (Lab)",
    "Digital Electronics 2 (Lab)",
    "Communications 1 (Lab)",
    "Communications 2 (Lab)",
    "Communications 3 (Lab)",
    "Communications 4 (Lab)", 
    "ECE Elective 1 (Lab)", 
    "ECE Elective 2 (Lab)",
    "Feedback and Control Systems (Lab)",    
    "Signals, Spectra, and Signal Processing (Lab)",
    "Computer Programming 1 (Lab)",
    "Computer Programming 2 (Lab)",
]

# CLO Listing for Lecture Courses
# MATHEMATICS SUBJECTS
EDA_CLO = {"CLO1": "Obtaining Data, Statistical Sampling, and Sampling Distributions", "CLO2": "Point Estimation of Parameters, Probability Distributions, and Statistical Intervals", "CLO3": "Hypothesis Testing, Regression and Correlation, and Design of Experiments"}
CALC1_CLO = {"CLO1": " Functions, Continuity, and Limits", "CLO2": "Derivatives and Its Applications, and Higher-Order Derivatives", "CLO3":"Parametric Equations and Partial Differentiation"}
CALC2_CLO = {"CLO1": "Integration Concept and Formulas, and Integration Techniques", "CLO2": "Improper Integrals", "CLO3": "Multiple Integration and Applications"}
DE_CLO =  {"CLO1": "First Order, First Degree ODE, and Its Applications", "CLO2": "Higher-Order ODE and Its Applications", "CLO3": " Laplace Transforms, Inverses, and Its Applications"}
ADMATH_CLO = {"CLO1": "Simultaneous Linear and Nonlinear Equations, Complex Numbers and Its Applications", "CLO2": "Power Series, Bessel, Legendre, Fourier Series, and Applications", "CLO3": "Ordinary and Partial Differential Equations"}
EMAGS_CL0 = {"CLO1": "Vector Analysis", "CLO2": "Directional Derivative, Gradient, Divergence, Curl, Integral Theorems", "CLO3": "Electric and Magnetic Fields, Dielectric and Magnetic Materials, Coupled and Magnetic Circuits, Time-varying Fields and Maxwell's Equation"}

# ELECTRONICS SUBJECTS
ELECS1_CLO = {"CLO1": "Diode and Voltage Multipliers", "CLO2": "JT and FET", "CLO3": " BJT and FET Small Signal Analysis" }
ELECS2_CLO = {"CLO1": " BJT and FET Frequency Response", "CLO2": " Cascade and Cascode Connections, Current Mirrors and Current Sources", "CLO3": "Differential Amplifiers, Operational Amplifiers, Feedback Systems, Ocillators, and Filters" }
ELECS3_CLO = {"CLO1": "SCR, UJT, PUT, TRIAC, DIAC, and other Thyristors, and Optoelectronic Devices", "CLO2": "Sensors and Transducers, and Interfacing Techniques", "CLO3": "PLC and Building Management Systems" }
CIRCUITS1_CLO = {"CLO1": "Resistive Network, Mesh and Node Equations", "CLO2": "Network Theorems", "CLO3": "Transient Analysis" }
CIRCUITS2_CLO = {"CLO1": "Impedance and Admittance, Resonace", "CLO2": " Power in AC Circuits, Solutions to AC Netwok Problems", "CLO3": " Two-Port Network Parameters and Transfer Function" }
DIGI1_CLO = {"CLO1": "Boolean Algebra and Logic Gates", "CLO2": "Minimization of Combinational Logic Circuits", "CLO3": "Algorithmic State Machine and Asynchronous Sequential Logic" }
DIGI2_CLO = {"CLO1": "Microprocessor Unit and Memory Subsystem", "CLO2": " I/O Subsystem and Introduction to Set Architecture and Assembly Programming", "CLO3": "Microcontrollers" }
CONSYS_CLO = {"CLO1": "Pole-Zero Determination, System Modeling and Transfer Function", "CLO2": " LTI Systems and Transient Response, Block Diagram, and Signal Flow Diagram", "CLO3": " Poles and Zeros, Root Locus, and Stability Analysis, Steady State Analysis and Frequency Response"}

# COMMUNICATIONS SUBJECTS
COMMS1_CLO = {"CLO1": " Introduction to Communication Systems, Noise and dB Calculations", "CLO2": "AM, SSB Techniques, FM, Radio Receivers", "CLO3": " Radiation and Propagation Waves, Pulse Modulation, Digital Modulation, and Broadband Communication System"}
COMMS2_CLO = {"CLO1": "Basic Information Theory, Error Detection, Digital Communication", "CLO2": "ASK, FSK, PSK, QAM", "CLO3": "Digital Transmission, Multiplexing, Frequency and Time Division Multiplexing" }
COMMS3_CLO = {"CLO1": "Introdution to Data Communications", "CLO2": " Category of Data Communications", "CLO3": "Configurations and Network Topology" }
COMMS4_CLO = {"CLO1": "Transmission Line, Matching Transmission Lines, and Smith Chart", "CLO2": "Radio Wave Propagation, Power Density, and Field Strength Calculations", "CLO3": "Antenna Systems, Wave Guides, and Fiber Optics" }
SIGNALS_CLO = {"CLO1": " Classification and Characteristics of Signals, Sampling Theorem and Aliasing", "CLO2": "Convolution, Correlation, Fourier Series and Transform, Z-Transform", "CLO3": " Filtering and Difference Equations for FIR and IIR Filters" }


# GENERAL ENGINEERING SUBJECTS AND GENERAL ECE SUBJECTS
PECEP1_CLO  = {"CLO1": " Algebra, Analytical and Solid Geometry, and Trigonometry", "CLO2": "Basic Electronic Circuits", "CLO3":"Basic Circuit Analysis"}
CAD_CLO = {"CLO1": "Introduction to CAD and Its Environment", "CLO2": " Snapping and Construction Elements", "CLO3":" Dimensioning, Plotting, and Inputting"}
PECEP2_CLO = {"CLO1": " Introduction to Web Development", "CLO2": " Front-end Development", "CLO3": "Back-end Development"}
CP1_CLO = {"CLO1": " Introduction to Computers (Hardware and Software)", "CLO2": "History and Evolution of Computers, Algorithms", "CLO3": "Introduction to Computer Programming"}
MATSCIE_CLO = {"CLO1": "Modern Materials, Atomic Structure, and Interatomic Bonding, Crystaaline and Non-crystalline Materials, Metals, and Alloys", "CLO2": "Ceramics, Polymer Structures and Properties, Composites", "CLO3": "Electrical Properties, Dielectric Behavior, Magnetic Properties, Optical Properties, Environmental and Societal Issues in Materials Science and Engineering"}
PHYSICS2_CLO = {"CLO1": " Thermodynamic, Atomic/Nuclear, and Condesed Matter", "CLO2": "Electricity, Magnetism, and Electromagnetic Induction, Inductance, and Alternating Circuits", "CLO3": "Optics"}
LAWS_CLO = {"CLO1":"Fundamentals of the Laws, Obligations, and Contracts, Regulation of ECE Profession, Practicing of the ECE Profession, and RA 9292", "CLO2": "Other ECE Relates Statutes such as NTC Memorandum Orders, IECEP, and PECs", "CLO3": "afety Standards such as safety procedures in high risk activities and industries, and incident investigation and reporting"}
ECON_CLO = {"CLO1": "Introduction to Engineering Economics and Money-Time Relationship and Equivalence", "CLO2": "Basic Economy Study Methods", "CLO3": "Decisions Under Certainty, Decisions Recognizing Risk, and Decisions Admitting Uncertainty"}
TECHNO_CLO = {"CLO1": "Technopreneurship Introduction, Customers, and Value", "CLO2": "Proposition, Ethics, Social Responsibilitym and Globalization", "CLO3": " Business Models and Introduction to Intellectual Property, Execution and Business Plan, Financial Analysis and Accounting Basics, and Raising Capital"}
ENGMAN_CLO = {"CLO1": "Evolution of Management Theory, Management and Its Function", "CLO2": "Planning, Leading, Organizing, and Cotrolling", "CLO3": "Managing Product and Service Operations, and Managing the Marketing Function and Finance Function"}
CP2_CLO = {"CLO1": "Introduction to Embedded Systems, Hardware and Software Evolution, and Microprocessors and Microcontrollers", "CLO2": "Arduino Environment", "CLO3": "Raspberry Pi Environment"}
ENVISCI_CLO = {"CLO1": "Nature and Ecology, and Natural Systems and Resources", "CLO2": "Environmental Concerns and Crises, Environmental Impact Assessment", "CLO3": "Sustainable Development"}
ELECTIVE1_CLO = {"CLO1": "Refer to your elective topic.", "CLO2": "Refer to your elective topic.", "CLO3": "Refer to your elective topic."}
ELECTIVE2_CLO = {"CLO1": "Refer to your elective topic.", "CLO2": "Refer to your elective topic.", "CLO3": "Refer to your elective topic."}



# Generate Question Bank
question_bank = [
    f"If my score in {course} is below 60%, what can you recommend to increase the CLO score?"
    for course in course_list_lab + course_list_lec
]

# Generate Answer Bank
answer_bank = [
    f"If the students are struggling with {course}, these are the tips to help them improve:\n"
    "1. Solidify their theoretical knowledge from the lecture discussions.\n"
    "2. Give them off-the-class activities to enhance hands-on skills essential for the laboratory activities.\n"
    "3. Criticize their laboratory reports well to help them improve with laboratory report writing.\n"
    "4. Allow them to work with peers.\n"
    "5. Make sure that the students always submit works on time."
    for course in course_list_lab
]

answer_bank += [
    f"Here's how you can enhance the performance of students in {course}:\n"
    "1. Provide additional learning materials that are fun and interactive (e.g., free online courses).\n"
    "2. Allow them to consult regarding their performance within a given time period.\n"
    "3. Give them more practice problems to deepen their understanding in:\n"
    f"   - {', '.join(CLO.values())}"
    for course, CLO in {
        "Engineering Data Analysis": EDA_CLO, "Differential Calculus": CALC1_CLO, "Calculus 2": CALC2_CLO, 
        "Differential Equation": DE_CLO, "Advanced Engineering Mathematics": ADMATH_CLO, "Electromagnetics": EMAGS_CL0,
        "Electronics 1": ELECS1_CLO, "Electronics 2": ELECS2_CLO, "Electronics 3": ELECS3_CLO,
        "Circuits 1": CIRCUITS1_CLO, "Circuits 2": CIRCUITS2_CLO, "Digital Electronics 1": DIGI1_CLO, "Digital Electronics 2": DIGI2_CLO,
        "Feedback and Control Systems": CONSYS_CLO, "Communications 1": COMMS1_CLO, "Communications 2": COMMS2_CLO,
        "Communications 3": COMMS3_CLO, "Communications 4": COMMS4_CLO, "Signals, Spectra, and Signal Processing": SIGNALS_CLO, "Fundamentals of ECE": PECEP1_CLO, "Fundamentals of ICT": PECEP2_CLO, "Computer Aided Drafting": CAD_CLO, 
        "Computer Programming 1": CP1_CLO, "Computer Programming 2": CP2_CLO, "Materials Science and Engineering": MATSCIE_CLO,
        "Physics 2": PHYSICS2_CLO, "ECE Laws, Contracts, Ethics, Standards, & Safety": LAWS_CLO, "Engineering Economics": ECON_CLO,
        "Technopreneurship 101": TECHNO_CLO, "Engineering Management": ENGMAN_CLO, "Environmental Science and Engineering": ENVISCI_CLO,
        "ECE Electives 1": ELECTIVE1_CLO, "ECE Electives 2": ELECTIVE2_CLO
    }.items()
]



In [9]:
QA_pair = [(question_bank[i],answer_bank[i]) for i in range(0,len(question_bank))] * 10
print(len(QA_pair))

530


In [10]:
import torch
from transformers import GPT2Tokenizer, GPT2LMHeadModel
from torch.utils.data import Dataset, DataLoader

# Define a simple dataset for training
class SimpleDataset(Dataset):
    def __init__(self, texts, tokenizer, max_length=128):
        tokenizer.pad_token = tokenizer.eos_token  # Set padding token to EOS
        self.encodings = tokenizer(texts, padding=True, truncation=True, max_length=max_length)

    def __len__(self):
        return len(self.encodings["input_ids"])

    def __getitem__(self, idx):
        return {k: torch.tensor(v[idx]) for k, v in self.encodings.items()}
    

# Load pretrained GPT-2 model and tokenizer
tokenizer = GPT2Tokenizer.from_pretrained("gpt2")
model = GPT2LMHeadModel.from_pretrained("gpt2")

dataset = SimpleDataset(QA_pair, tokenizer)
dataloader = DataLoader(dataset, batch_size=2, shuffle=True)

optimizer = torch.optim.AdamW(model.parameters(), lr=5e-5)

# Set model to training mode
model.train()

# Training loop (mock training)
for epoch in range(20):
    print(f"Epoch {epoch+1}/10")
    for batch in dataloader:
        input_ids = batch["input_ids"]
        attention_mask = batch["attention_mask"]

        optimizer.zero_grad()
        outputs = model(input_ids, attention_mask=attention_mask, labels=input_ids)
        loss = outputs.loss
        loss.backward()
        optimizer.step()
        

        print(f"Loss: {loss.item()}")  # Print loss for monitoring

# Save the trained model
model.save_pretrained("trained_gpt2_model")
tokenizer.save_pretrained("trained_gpt2_model")  # Save the tokenizer as well


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.
Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.
Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.
Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.
Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pai

Epoch 1/10
Loss: 4.341828346252441
Loss: 3.839312791824341
Loss: 3.50642728805542
Loss: 3.1969430446624756
Loss: 3.0617403984069824
Loss: 3.0853140354156494
Loss: 2.687647581100464
Loss: 2.178331136703491
Loss: 2.062206506729126
Loss: 2.1633167266845703
Loss: 1.7907960414886475
Loss: 2.1358931064605713
Loss: 2.128115653991699
Loss: 1.6462559700012207
Loss: 1.9246402978897095
Loss: 1.6424267292022705
Loss: 1.6357505321502686
Loss: 1.6356399059295654
Loss: 1.164262294769287
Loss: 1.463477611541748
Loss: 1.0230739116668701
Loss: 1.282071828842163
Loss: 1.1370865106582642
Loss: 0.9923839569091797
Loss: 1.1278736591339111
Loss: 1.2451562881469727
Loss: 0.9631757140159607
Loss: 0.8078498244285583
Loss: 0.7937136292457581
Loss: 0.934597373008728
Loss: 0.6575653553009033
Loss: 0.8214573264122009
Loss: 0.9924315810203552
Loss: 0.963103711605072
Loss: 0.7874841094017029
Loss: 0.8237584829330444
Loss: 0.5376525521278381
Loss: 0.8896641135215759
Loss: 0.6314139366149902
Loss: 0.9352530241012573
Lo

KeyboardInterrupt: 

# Evaluation Loop

In [None]:
# Load the trained model and tokenizer
model = GPT2LMHeadModel.from_pretrained("trained_gpt2_model")
tokenizer = GPT2Tokenizer.from_pretrained("trained_gpt2_model")
model.eval()

def ask_question(question):
    input_text = f"Question: {question} Answer:"
    input_ids = tokenizer.encode(input_text, return_tensors="pt")
    attention_mask = torch.ones(input_ids.shape, dtype=torch.long)

    with torch.no_grad():
        output = model.generate(
            input_ids,
            attention_mask=attention_mask,
            max_length=120,
            num_return_sequences=1,
            no_repeat_ngram_size=2,
            do_sample=True,
            top_k=50,
            top_p=0.95,
            temperature=0.7
        )

    answer = tokenizer.decode(output[0], skip_special_tokens=False)
    return answer.split("Answer:")[-1].strip()  # Return the generated answer

# Example questions
print(ask_question("If my score in ECE Elective 2 (Lec) is below 60%, what can you recommend to increase the CLO score?"))

In [None]:
# Example questions
print(ask_question("If my score in ECE Elective 2 (Lab) is below 60%, what can you recommend to increase the CLO score?"))

In [None]:
print(ask_question(f'If my score in Environmental Science and Engineering is below 60%, what can you recommend to increase the CLO score?'))