<a href="https://colab.research.google.com/github/alidiusk/cs2731-final-project/blob/main/syntheticNLPfinalProject.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# CS 2731 Final Project Notebook

Below is a collection of code chunks and text used to perform data exploration, 
model training, and evalutation metrics.

This notebook is a collaboration between...
* Simon Schueller - sps67
    /;'
* Lichang Chen - lic138
    * [List contributions here]
* Daniel Sokolowski - das369
    * Load human-made sciq datasets
    * Added Q&A formatting fixes: 
        * Drop NaN's
        * Drop placeholders
        * Remove periods
        * Replace pipes with commas
        * Extracted duplicate questions
    * Shuffle answers and distractors together
    * Pair up MC letters & answers 
    * Implemented T5 pre-trained model loading 
    * Implemented T5 model fine-tuning
    * Implemented T5 model predictions & recording
    * Implemented T5 model prediction performance metrics calculations

## 0.1 Library imports

In [2]:
# Import the necessary libraries
import pandas as pd
import json
import numpy as np

! pip install datasets --q
from datasets import load_dataset

[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m474.6/474.6 kB[0m [31m28.6 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m212.5/212.5 kB[0m [31m23.5 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m110.5/110.5 kB[0m [31m13.7 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m224.5/224.5 kB[0m [31m24.9 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m134.3/134.3 kB[0m [31m8.1 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.0/1.0 MB[0m [31m54.5 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m268.8/268.8 kB[0m [31m32.0 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m149.6/149.6 kB[0m [31m20.8 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━

##0.2 Load sciq (human-made) data

In [3]:
# Load the sciq datasets from the Huggingface.co website
sciq_df = load_dataset("sciq")

sciq_train_df = pd.DataFrame(sciq_df['train'])
sciq_validation_df = pd.DataFrame(sciq_df['validation'])
sciq_test_df = pd.DataFrame(sciq_df['test'])

# Rename columns to common mapping to reuse shuffle/preprocessing fucntions for 
# synthetic and sciq datasets
new_names = {'question': 'Question', 'distractor3': 'Distractor3', 'distractor1': 'Distractor1', 'distractor2': 'Distractor2', 'correct_answer': 'Answer'}
sciq_train_df = sciq_train_df.rename(columns=new_names)
sciq_validation_df = sciq_validation_df.rename(columns=new_names)
sciq_test_df = sciq_test_df.rename(columns=new_names)

Downloading builder script:   0%|          | 0.00/3.56k [00:00<?, ?B/s]

Downloading metadata:   0%|          | 0.00/1.81k [00:00<?, ?B/s]

Downloading readme:   0%|          | 0.00/6.84k [00:00<?, ?B/s]

Downloading and preparing dataset sciq/default to /root/.cache/huggingface/datasets/sciq/default/0.1.0/50e5c6e3795b55463819d399ec417bfd4c3c621105e00295ddb5f3633d708493...


Downloading data:   0%|          | 0.00/2.82M [00:00<?, ?B/s]

Generating train split:   0%|          | 0/11679 [00:00<?, ? examples/s]

Generating validation split:   0%|          | 0/1000 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/1000 [00:00<?, ? examples/s]

Dataset sciq downloaded and prepared to /root/.cache/huggingface/datasets/sciq/default/0.1.0/50e5c6e3795b55463819d399ec417bfd4c3c621105e00295ddb5f3633d708493. Subsequent calls will reuse this data.


  0%|          | 0/3 [00:00<?, ?it/s]

In [4]:
print(sciq_train_df)

                                                Question      Distractor3  \
0      What type of organism is commonly used in prep...          viruses   
1      What phenomenon makes global winds blow northe...  tropical effect   
2      Changes from a less-ordered state to a more-or...      endothermic   
3         What is the least dangerous radioactive decay?       zeta decay   
4      Kilauea in hawaii is the world’s most continuo...            magma   
...                                                  ...              ...   
11674  The enzyme pepsin plays an important role in t...           lipids   
11675  What remains a constant of radioactive substan...          acidity   
11676  Terrestrial ecosystems, also known for their d...       substrates   
11677  High explosives create shock waves that exceed...       turbulence   
11678  What do you call a structure composed of two o...           system   

            Distractor1         Distractor2                Answer  \
0     

## 1.1 Load synthetic data

In [5]:
url = 'https://raw.githubusercontent.com/alidiusk/cs2731-final-project/main/data/sorted-questions.csv?token=GHSAT0AAAAAAB6T4654GM2APAJNT77F5VCEZA6AJPA'
synthetic_df = pd.read_csv(url)
# Dataset is now stored in a Pandas Dataframe
synthetic_df.head(5)

print(synthetic_df.shape)

(6131, 6)


## 1.2 Data Format Cleaning

##### 1.2.1 Drop NaN Topic, Question, Answer, Distractor1, Distractor2, Distractor3

In [6]:
# Check for NaN Topics, Questions, Answers, Distractor1, Distractor2, Distractor3:
# (Useful printout to view faulty ChatGPT questions)
print('Missing Topic:')
print(synthetic_df[pd.isna(synthetic_df['Topic'])])
print('-------------------------------------------------')
print('Missing Question:')
print(synthetic_df[pd.isna(synthetic_df['Question'])])
print('-------------------------------------------------')
print('Missing Answer:')
print(synthetic_df[pd.isna(synthetic_df['Answer'])])
print('-------------------------------------------------')
print('Missing Distractor1:')
print(synthetic_df[pd.isna(synthetic_df['Distractor1'])])
print('-------------------------------------------------')
print('Missing Distractor2:')
print(synthetic_df[pd.isna(synthetic_df['Distractor2'])])
print('-------------------------------------------------')
print('Missing Distractor3:')
print(synthetic_df[pd.isna(synthetic_df['Distractor3'])])
print('-------------------------------------------------')

# Drop all questions w/ NaN Topic, Question, Answer, Distractor1, Distractor2, or Distractor3:
synthetic_df = synthetic_df.drop(synthetic_df[pd.isna(synthetic_df['Topic'])].index)
synthetic_df = synthetic_df.drop(synthetic_df[pd.isna(synthetic_df['Question'])].index)
synthetic_df = synthetic_df.drop(synthetic_df[pd.isna(synthetic_df['Answer'])].index)
synthetic_df = synthetic_df.drop(synthetic_df[pd.isna(synthetic_df['Distractor1'])].index)
synthetic_df = synthetic_df.drop(synthetic_df[pd.isna(synthetic_df['Distractor2'])].index)
synthetic_df = synthetic_df.drop(synthetic_df[pd.isna(synthetic_df['Distractor3'])].index)

Missing Topic:
Empty DataFrame
Columns: [Topic, Question, Answer, Distractor1, Distractor2, Distractor3]
Index: []
-------------------------------------------------
Missing Question:
                                                  Topic Question Answer  \
39                                           And so on.      NaN    NaN   
6099  The format requested cannot be displayed in a ...      NaN    NaN   

     Distractor1 Distractor2 Distractor3  
39           NaN         NaN         NaN  
6099         NaN         NaN         NaN  
-------------------------------------------------
Missing Answer:
                                                  Topic  \
39                                           And so on.   
232                                           Astronomy   
6099  The format requested cannot be displayed in a ...   

                         Question Answer Distractor1 Distractor2 Distractor3  
39                            NaN    NaN         NaN         NaN         NaN  
2

##### 1.2.2 Drop Duplicated Headers

In [7]:
# Some topics, questions, answers, and distractors 
# are literally just the words "Topic", " "Question", 
# "Answer", "Distractor", "Distracto1", "Distractor2", "Distractor3"
# (Probably an artifact from copy-and-pasting ChatGPT outputs into .csv file)


print('df[Topic] == "Topic"')
print(synthetic_df[synthetic_df['Topic'] == 'Topic'])
print('-------------------------------------------------')
print('df[Question] == "Question"')
print(synthetic_df[synthetic_df['Question'] == 'Question'])
print('-------------------------------------------------')
print('df[Answer] == "Answer"')
print(synthetic_df[synthetic_df['Answer'] == 'Answer'])
print('-------------------------------------------------')
print('df[Distrctor1] == "Distractor1"')
print(synthetic_df[synthetic_df['Distractor1'] == 'Distractor1'])
print('-------------------------------------------------')
print('df[Distrctor2] == "Distractor2"')
print(synthetic_df[synthetic_df['Distractor2'] == 'Distractor2'])
print('-------------------------------------------------')
print('df[Distrctor3] == "Distractor3"')
print(synthetic_df[synthetic_df['Distractor3'] == 'Distractor3'])
print('-------------------------------------------------')

# Drop all questions with placeholders "Topic", "Question", "Answer", 
# "Distractor1", "Distractor2", or "Distractor3":
synthetic_df = synthetic_df.drop(synthetic_df[synthetic_df['Topic'] == 'Topic'].index)
synthetic_df = synthetic_df.drop(synthetic_df[synthetic_df['Question'] == 'Question'].index)
synthetic_df = synthetic_df.drop(synthetic_df[synthetic_df['Answer'] == 'Answer'].index)
synthetic_df = synthetic_df.drop(synthetic_df[synthetic_df['Distractor1'] == 'Distractor1'].index)
synthetic_df = synthetic_df.drop(synthetic_df[synthetic_df['Distractor2'] == 'Distractor2'].index)
synthetic_df = synthetic_df.drop(synthetic_df[synthetic_df['Distractor3'] == 'Distractor3'].index)


df[Topic] == "Topic"
      Topic  Question  Answer  Distractor1  Distractor2  Distractor3
6100  Topic  Question  Answer  Distractor1   Distractor  Distractor3
6101  Topic  Question  Answer  Distractor1  Distractor2  Distractor3
6102  Topic  Question  Answer  Distractor1  Distractor2  Distractor3
6103  Topic  Question  Answer  Distractor1   Distractor  Distractor3
6104  Topic  Question  Answer  Distractor1  Distractor2  Distractor3
6105  Topic  Question  Answer  Distractor1  Distractor2  Distractor3
6106  Topic  Question  Answer  Distractor1  Distractor2  Distractor3
6107  Topic  Question  Answer  Distractor1   Distractor  Distractor3
6108  Topic  Question  Answer  Distractor1  Distractor2  Distractor3
6109  Topic  Question  Answer  Distractor1  Distractor2  Distractor3
6110  Topic  Question  Answer  Distractor1  Distractor2  Distractor3
6111  Topic  Question  Answer  Distractor1   Distractor  Distractor3
-------------------------------------------------
df[Question] == "Question"
     

##### 1.2.3 Remove Ordered List Prefixes from Answers & Distractors

In [8]:
# Clean up distractors by removing ordered list prefixes

def remove_prefix(ans):
  # Ordered List potential formats
  possibilities = ["A) ", "A. ", "B) ", "B. ", "C) ", "C.", "D) ", "D. "]
  for pos in possibilities:

    # Remove all "X) " or "X. " from answers/distractors 
    if pos in ans:
      return str(ans).replace(pos, "")
  
  # Return original ans if no "X) " or "X. " to remove
  return ans

synthetic_df["Answer"] = synthetic_df["Answer"].apply(remove_prefix)
synthetic_df["Distractor1"] = synthetic_df["Distractor1"].apply(remove_prefix)
synthetic_df["Distractor2"] = synthetic_df["Distractor2"].apply(remove_prefix)
synthetic_df["Distractor3"] = synthetic_df["Distractor3"].apply(remove_prefix)
synthetic_df.head(5)


Unnamed: 0,Topic,Question,Answer,Distractor1,Distractor2,Distractor3
0,Anatomy,What is the part of the eye that detects color...,Cones,Rods,Retina,Optic nerve
1,Anatomy,Which bone is commonly referred to as the knee...,Patella,Femur,Tibia,Fibula
2,Anatomy,What is the name for the upper arm bone?,Humerus,Radius,Ulna,Femur
3,Anatomy,What part of the brain is responsible for regu...,Hypothalamus,Thalamus,Cerebellum,Medulla oblongata
4,Anatomy,What organ produces insulin in the human body?,Pancreas,Liver,Kidney,Heart


##### 1.2.4 Remove period suffixes in answers and distractors

In [9]:
# Some answers end with a period, but most don't:

print('Answer ends w/ period:')
print(synthetic_df[synthetic_df['Answer'].str.endswith('.')])
print('-------------------------------------------------')
print('Distractor1 ends w/ period:')
print(synthetic_df[synthetic_df['Distractor1'].str.endswith('.')])
print('-------------------------------------------------')
print('Distractor2 ends w/ period:')
print(synthetic_df[synthetic_df['Distractor2'].str.endswith('.')])
print('-------------------------------------------------')
print('Distractor3 ends w/ period:')
print(synthetic_df[synthetic_df['Distractor3'].str.endswith('.')])
print('-------------------------------------------------')

Answer ends w/ period:
          Topic                                 Question  \
42    Astronomy                         What is a comet?   
43    Astronomy                          What is a star?   
55    Astronomy                    What is a black hole?   
81    Astronomy                     What is a supernova?   
83    Astronomy                    What is a light-year?   
...         ...                                      ...   
5958    Physics              What is Pascal's principle?   
5967    Physics  What is the definition of a black hole?   
5971    Physics    What is Newton's first law of motion?   
5975    Physics                   What is Coulomb's law?   
6060    Physics              What is the law of inertia?   

                                                 Answer  \
42                                    A type of planet.   
43      A celestial object that produces its own light.   
55    A region of space with a gravitational field s...   
81          When a s

In [10]:
def remove_suffix(ans):

  # Drop period at the end of answers and distractors
  if ans[-1] == '.':
    return ans[:-1]
  
  # Return original ans if no ending period to remove
  return ans


# Remove period suffixes in answers and distractors
synthetic_df["Answer"] = synthetic_df["Answer"].apply(remove_suffix)
synthetic_df["Distractor1"] = synthetic_df["Distractor1"].apply(remove_suffix)
synthetic_df["Distractor2"] = synthetic_df["Distractor2"].apply(remove_suffix)
synthetic_df["Distractor3"] = synthetic_df["Distractor3"].apply(remove_suffix)
synthetic_df.head(5)

Unnamed: 0,Topic,Question,Answer,Distractor1,Distractor2,Distractor3
0,Anatomy,What is the part of the eye that detects color...,Cones,Rods,Retina,Optic nerve
1,Anatomy,Which bone is commonly referred to as the knee...,Patella,Femur,Tibia,Fibula
2,Anatomy,What is the name for the upper arm bone?,Humerus,Radius,Ulna,Femur
3,Anatomy,What part of the brain is responsible for regu...,Hypothalamus,Thalamus,Cerebellum,Medulla oblongata
4,Anatomy,What organ produces insulin in the human body?,Pancreas,Liver,Kidney,Heart


##### 1.2.5 Replace pipe characters "|" with commas

In [11]:
# Some answers & distractors have weird pipe "|" symbols in them,
# I think they're supposed to be commas

print('Answer has a pipe "|" in it:')
print(synthetic_df[synthetic_df['Answer'].str.contains('\|')])
print('-------------------------------------------------')
print('Distractor1 has a pipe "|" in it:')
print(synthetic_df[synthetic_df['Distractor1'].str.contains('\|')])
print('-------------------------------------------------')
print('Distractor2 has a pipe "|" in it:')
print(synthetic_df[synthetic_df['Distractor2'].str.contains('\|')])
print('-------------------------------------------------')
print('Distractor3 has a pipe "|" in it:')
print(synthetic_df[synthetic_df['Distractor3'].str.contains('\|')])
print('-------------------------------------------------')

Answer has a pipe "|" in it:
          Topic                                   Question  \
72    Astronomy                          What is a galaxy?   
84    Astronomy                          What is a meteor?   
87    Astronomy          What are the four jovian planets?   
127   Astronomy  What is the speed of light (in a vacuum)?   
138   Astronomy                       What is an asteroid?   
...         ...                                        ...   
5978    Physics           What is the third law of motion?   
6000    Physics    What is the speed of light in a vacuum?   
6003    Physics    What is the speed of light in a vacuum?   
6030    Physics                What is the speed of light?   
6060    Physics                What is the law of inertia?   

                                                 Answer  \
72    A system of stars| gas| and dust held together...   
84    A small| fast-moving celestial object that bur...   
87                     Jupiter| Saturn| Uranus| N

In [12]:
def replace_pipes(ans):

  # Drop period at the end of answers and distractors
  return ans.replace("|", ",")


# Remove period suffixes in answers and distractors
synthetic_df["Answer"] = synthetic_df["Answer"].apply(replace_pipes)
synthetic_df["Distractor1"] = synthetic_df["Distractor1"].apply(replace_pipes)
synthetic_df["Distractor2"] = synthetic_df["Distractor2"].apply(replace_pipes)
synthetic_df["Distractor3"] = synthetic_df["Distractor3"].apply(replace_pipes)
synthetic_df.head(5)

Unnamed: 0,Topic,Question,Answer,Distractor1,Distractor2,Distractor3
0,Anatomy,What is the part of the eye that detects color...,Cones,Rods,Retina,Optic nerve
1,Anatomy,Which bone is commonly referred to as the knee...,Patella,Femur,Tibia,Fibula
2,Anatomy,What is the name for the upper arm bone?,Humerus,Radius,Ulna,Femur
3,Anatomy,What part of the brain is responsible for regu...,Hypothalamus,Thalamus,Cerebellum,Medulla oblongata
4,Anatomy,What organ produces insulin in the human body?,Pancreas,Liver,Kidney,Heart


## 1.3 Removing Duplicates

We observe the shape of the data before removing or altering any rows

In [13]:
synthetic_df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 6013 entries, 0 to 6130
Data columns (total 6 columns):
 #   Column       Non-Null Count  Dtype 
---  ------       --------------  ----- 
 0   Topic        6013 non-null   object
 1   Question     6013 non-null   object
 2   Answer       6013 non-null   object
 3   Distractor1  6013 non-null   object
 4   Distractor2  6013 non-null   object
 5   Distractor3  6013 non-null   object
dtypes: object(6)
memory usage: 328.8+ KB


In [14]:
# MANY questions are identical duplicates, e.g.:
# ...
# 5977    Physics            What is the speed of light in a vacuum?   
# 6000    Physics            What is the speed of light in a vacuum?   
# 6003    Physics            What is the speed of light in a vacuum?
# ...

# group the DataFrame by 'group' column and count the number of rows in each group
### group_sizes = df.groupby('group').size()

# filter out groups of size larger than 1
### filtered_groups = group_sizes[group_sizes != 1].index.tolist()

# filter the original DataFrame by the selected groups
### result = df[df['group'].isin(filtered_groups)]

# How often was each question repeated?
question_counts = synthetic_df.groupby('Question').size()
question_counts = question_counts.sort_values(ascending=False)

# Keep only questions with counts >=2
repeated_questions = question_counts[question_counts > 1].index.tolist()

# print(question_counts.head(60))
# print(question_counts.index.tolist())
# print(f'\nNumber of repeated questions: {len(repeated_questions)}')
# print(repeated_questions)

# THIS PART IS TRICKY... How to keep just ONE copy of a repeated question?
col_names = synthetic_df.columns
repeated_df = pd.DataFrame(columns=col_names)
for question in repeated_questions:
  if question in synthetic_df['Question'].unique():
    curr_df = synthetic_df.loc[synthetic_df['Question'] == question]
    # take most frequent distractors and answer choice from duplicates
    answers = curr_df["Answer"].value_counts()
    dist1 = curr_df["Distractor1"].value_counts()
    dist2 = curr_df["Distractor2"].value_counts()
    dist3 = curr_df["Distractor3"].value_counts()
    curr_dict = {"Topic": [curr_df["Topic"].iloc[0]], "Question": [curr_df["Question"].iloc[0]], "Answer": [answers.idxmax()], "Distractor1": [dist1.idxmax()], "Distractor2": [dist2.idxmax()], "Distractor3": [dist3.idxmax()]}
    curr_df = pd.DataFrame(curr_dict, index=[0])
    repeated_df = pd.concat([repeated_df, curr_df], axis=0)

unique_df = synthetic_df[~synthetic_df['Question'].isin(repeated_questions)]






In [16]:
# View repeated questions
print(question_counts.head(20))
print()

Question
What is the chemical symbol for gold?           78
What is the powerhouse of the cell?             55
What is the formula for acceleration?           55
What is the chemical formula for water?         52
What is the largest organ in the human body?    47
What is the formula for force?                  47
What is the formula for power?                  46
What is the formula for work?                   45
What is the chemical symbol for sodium?         41
What is the formula for kinetic energy?         38
What is the smallest unit of life?              36
What is the chemical symbol for oxygen?         36
What is the law of conservation of energy?      32
What is the SI unit of force?                   32
What is the chemical symbol for iron?           32
What is the formula for pressure?               32
What is the formula for momentum?               31
What is the formula for potential energy?       29
What is the atomic number of carbon?            28
What is the pH of a ne

In [None]:
final_df = pd.concat([unique_df, repeated_df], axis = 0)

In [None]:
final_df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 3413 entries, 0 to 0
Data columns (total 6 columns):
 #   Column       Non-Null Count  Dtype 
---  ------       --------------  ----- 
 0   Topic        3413 non-null   object
 1   Question     3413 non-null   object
 2   Answer       3413 non-null   object
 3   Distractor1  3413 non-null   object
 4   Distractor2  3413 non-null   object
 5   Distractor3  3413 non-null   object
dtypes: object(6)
memory usage: 186.6+ KB


In [None]:
#synthetic_df.Topic.tolist()

## 1.4 Providing Cleaned Data

Here we check to see the effects of our data cleaning process from sections **1.2** and **1.3** on our dataframe and set it to a variable `synth_df` to be used throughout the rest of the notebook.

In [None]:
synth_df = final_df
synth_df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 3413 entries, 0 to 0
Data columns (total 6 columns):
 #   Column       Non-Null Count  Dtype 
---  ------       --------------  ----- 
 0   Topic        3413 non-null   object
 1   Question     3413 non-null   object
 2   Answer       3413 non-null   object
 3   Distractor1  3413 non-null   object
 4   Distractor2  3413 non-null   object
 5   Distractor3  3413 non-null   object
dtypes: object(6)
memory usage: 186.6+ KB


## 2 Dataset Analysis

#### 2.1 Most common topics?

In [None]:
synth_df

Unnamed: 0,Topic,Question,Answer,Distractor1,Distractor2,Distractor3
0,Anatomy,What is the part of the eye that detects color...,Cones,Rods,Retina,Optic nerve
1,Anatomy,Which bone is commonly referred to as the knee...,Patella,Femur,Tibia,Fibula
2,Anatomy,What is the name for the upper arm bone?,Humerus,Radius,Ulna,Femur
3,Anatomy,What part of the brain is responsible for regu...,Hypothalamus,Thalamus,Cerebellum,Medulla oblongata
4,Anatomy,What organ produces insulin in the human body?,Pancreas,Liver,Kidney,Heart
...,...,...,...,...,...,...
0,Physics,What is the definition of work?,Force times distance,Mass times velocity,Force times velocity,Displacement over time
0,Physics,What is the unit of force in the metric system?,Newton,Joule,Watt,Volt
0,Biology,What is the name of the molecule that carries ...,Hemoglobin,Myoglobin,Chlorophyll,ATP
0,Biology,What is the function of the cerebellum?,To control movement and balance,To process sensory information,To regulate body temperature,To control hormone production


## 3.0 Models

## 3.1 Partitioning Data

In [None]:
# Shuffle the df
synth_mixed_df = synth_df.sample(frac=1, random_state=1234)

train_frac = 0.7  # 70% for training
validation_frac = 0.3   # 30% for testing

train_size = int(train_frac * len(synth_mixed_df))
validation_size = int(validation_frac * len(synth_mixed_df))

synth_train_df = synth_mixed_df.iloc[:train_size]
synth_validation_df = synth_mixed_df.iloc[train_size:train_size + validation_size]

synth_train_df.info()
synth_validation_df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 2389 entries, 5273 to 5466
Data columns (total 6 columns):
 #   Column       Non-Null Count  Dtype 
---  ------       --------------  ----- 
 0   Topic        2389 non-null   object
 1   Question     2389 non-null   object
 2   Answer       2389 non-null   object
 3   Distractor1  2389 non-null   object
 4   Distractor2  2389 non-null   object
 5   Distractor3  2389 non-null   object
dtypes: object(6)
memory usage: 130.6+ KB
<class 'pandas.core.frame.DataFrame'>
Int64Index: 1023 entries, 3437 to 1266
Data columns (total 6 columns):
 #   Column       Non-Null Count  Dtype 
---  ------       --------------  ----- 
 0   Topic        1023 non-null   object
 1   Question     1023 non-null   object
 2   Answer       1023 non-null   object
 3   Distractor1  1023 non-null   object
 4   Distractor2  1023 non-null   object
 5   Distractor3  1023 non-null   object
dtypes: object(6)
memory usage: 55.9+ KB


## 3.2 Shuffle the answer and distractors for each question

In [None]:
# Custom functions to shuffle answer and distractors together

def shuffleABCD(row):
    np.random.shuffle(row.values)
    return row

def shuffleDataset(df):

    # Save ground truth answers
    #answers = synth_train_df[['Question','Answer']]
    answers = df['Answer']


    # Rename the df's columns
    df = df.rename(columns={'Answer': 'A', \
                            'Distractor1': 'B', \
                            'Distractor2': 'C', \
                            'Distractor3': 'D'})
    
    # Shuffle subset of df: answer & distractors
    df_ABCD = df[['A','B','C','D']]
    df_ABCD = df_ABCD.apply(shuffleABCD, axis=1)

    # Re-assign original df columns to shuffled df columns
    df['A'] = df_ABCD['A']
    df['B'] = df_ABCD['B']
    df['C'] = df_ABCD['C']
    df['D'] = df_ABCD['D']

    # Record tuples of answers: ({A,B,C,D}, answer text)
    answer_tuples = [None]*answers.size
    counter = 0

    for index, row in df.iterrows():
        
        # Find and record letter that corresponds to correct answer
        if row['A'] == answers.iat[counter]:
            answer_tuples[counter] = ('A', answers.iat[counter])
        elif row['B'] == answers.iat[counter]:
            answer_tuples[counter] = ('B', answers.iat[counter])
        elif row['C'] == answers.iat[counter]:
            answer_tuples[counter] = ('C', answers.iat[counter])
        elif row['D'] == answers.iat[counter]:
            answer_tuples[counter] = ('D', answers.iat[counter])
        
        counter += 1

    return df, answer_tuples

# TEST of shuffle function:
quiz = synth_train_df.copy(deep=True)

print('PRE-SHUFFLE:')
print(quiz)

quiz, answers = shuffleDataset(quiz)

print('POST_SHUFFLE:')
print(quiz)
print(answers)

PRE-SHUFFLE:
          Topic                                           Question  \
5273    Physics  What is the force that keeps an object in circ...   
524     Biology  What is the genetic material found in all livi...   
1422    Biology  What is the name for the smallest particle of ...   
216   Astronomy  What is the term for a small body of frozen ga...   
2041    Biology  In what part of the body are red blood cells p...   
...         ...                                                ...   
0       Physics                    What is the definition of work?   
0       Biology  What is the process by which plants convert li...   
5568    Physics        What is the equation for calculating force?   
4233    Geology  What is the term for the process by which rock...   
5466    Physics  What kind of electromagnetic radiation has the...   

                    Answer          Distractor1           Distractor2  \
5273     Centripetal force              Gravity        Magnetic force   


In [None]:
#Shuffle the answers and distractors in the datasets
synth_train_X, synth_train_answers = shuffleDataset(synth_train_df)
synth_validation_X, synth_validation_answers = shuffleDataset(synth_validation_df)
sciq_train_X, sciq_train_answers = shuffleDataset(sciq_train_df)
sciq_validation_X, sciq_validation_answers = shuffleDataset(sciq_validation_df)
sciq_test_X, sciq_test_answers = shuffleDataset(sciq_test_df)

'''
print(synth_train_X)
print('------------------------------------')
print(synth_train_answers)
print('------------------------------------')
print(synth_validation_X)
print('------------------------------------')
print(synth_validation_answers)
'''

# Use list comprehensions to make model output 
# JUST the letter of the answer selection
synth_train_answers_ABCD = [tup[0] for tup in synth_train_answers]
synth_validation_answers_ABCD = [tup[0] for tup in synth_validation_answers]
sciq_train_answers_ABCD = [tup[0] for tup in sciq_train_answers]
sciq_validation_answers_ABCD = [tup[0] for tup in sciq_validation_answers]
sciq_test_answers_ABCD = [tup[0] for tup in sciq_test_answers]

'''
print('********************************************************')
print('********************************************************')
print('********************************************************')

print(synth_train_X)
print('------------------------------------')
print(synth_train_answers_ABCD)
print('------------------------------------')
print(synth_validation_X)
print('------------------------------------')
print(synth_validation_answers_ABCD)


print('********************************************************')
print('********************************************************')
print('********************************************************')
'''

print(sciq_test_X)
print('------------------------------------')
print(sciq_test_answers)
print('------------------------------------')
print(sciq_test_answers_ABCD)

                                              Question                     D  \
0    Compounds that are capable of accepting electr...                Oxygen   
1    What term in biotechnology means a genetically...                 adult   
2    Vertebrata are characterized by the presence o...                 Bones   
3    What is the height above or below sea level ca...              latitude   
4    Ice cores, varves and what else indicate the e...                 magma   
..                                                 ...                   ...   
995  In the case of the moose, predation is an addi...                 color   
996            Where do short period comets come from?           kuiper belt   
997  Only after implantation can an embryo develop ...                mammal   
998  What are atoms with unstable nuclei are consid...           radioactive   
999          In what form is atmospheric sulfur found?  sulfur dioxide (so2)   

                   B              C    

## 3.3 Model Fine-tuning

In [None]:
# Install and import required libraries

! pip install simplet5
! pip install transformers
! pip install torch
! pip install --upgrade sentencepiece

import torch
import transformers
from tqdm.auto import tqdm
from torch.utils.data import DataLoader
from transformers import AutoModelForSequenceClassification
from transformers import AutoTokenizer
from transformers import T5Tokenizer
import sys
from simplet5 import SimpleT5

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


INFO:pytorch_lightning.utilities.seed:Global seed set to 42


In [None]:
#@title Data preprocess (for use with simpleT5)
def preprocess_data(X, answers, max_seq_length=512):
    
    print('Processing X...')
    #input_ids = [None]*X.shape[0]
    #attention_mask = [None]*X.shape[0]
    
    source_text = [None]*X.shape[0]
    counter = 0
    for index, row in X.iterrows():

        #currInput = 'Pick the right answer to this multiple choice question: '
        #currInput += row['Question']
        
        currInput = row['Question'].lower()
        currInput += ' \\n (a) ' + row['A'].lower()
        currInput += ' (b) ' + row['B'].lower()
        currInput += ' (c) ' + row['C'].lower()
        currInput += ' (d) ' + row['D'].lower()# + ' </s>'
        #print(currInput)
        
        #currInputIDs = tokenizer.encode_plus(currInput, 
        #                                    padding='max_length', 
        #                                    truncation=True, 
        #                                    max_length=max_seq_length,
        #                                    return_tensors='pt')
        
        source_text[counter] = currInput
        
        #input_ids[counter] = currInput
        #input_ids[counter] = currInputIDs.to_tensor()
        #input_ids[counter] = torch.tensor(currInputIDs.input_ids)
        #input_ids[counter] = currInputIDs.input_ids.clone().detach()

        #attention_mask[counter] = currInputIDs.ne(0)
        #attention_mask[counter] = (currInputIDs != 0).to_tensor()
        #attention_mask[counter]= torch.tensor((currInputIDs != 0))
        #attention_mask[counter]= torch.tensor(currInputIDs.attention_mask)
        #attention_mask[counter]= currInputIDs.attention_mask.clone().detach()

        counter += 1


    print('Processing answers...')
    #target_ids = [None]*answers.shape[0]
    #target_attention_mask = [None]*answers.shape[0]
    #target_text = [None]*answers.shape[0]
    target_text = [None]*len(answers)

    counter = 0
    #for index, row in answers.iterrows():
    for tup in answers:

        #currTarget = row['Answer']
        currTarget = tup[0].lower()

        #print(currTarget)
        #currTargetIDs = tokenizer.encode_plus(currTarget, 
        #                                    padding='max_length', 
        #                                    truncation=True, 
        #                                    max_length=max_seq_length, 
        #                                    return_tensors='pt')
        
        target_text[counter] = currTarget
        
        #target_ids[counter] = currTargetIDs.to_tensor()
        #target_ids[counter] = torch.tensor(currTargetIDs.input_ids)
        #target_ids[counter] = currTargetIDs.input_ids.clone().detach()

        #target_attention_mask[counter] = currTargetIDs.ne(0)
        #target_attention_mask[counter] = (currTargetIDs != 0).to_tensor()
        #target_attention_mask[counter] = torch.tensor((currTargetIDs != 0))
        #target_attention_mask[counter] = torch.tensor(currTargetIDs.attention_mask)
        #target_attention_mask[counter] = currTargetIDs.attention_mask.clone().detach()

        counter += 1

    #input_ids = torch.cat(input_ids, dim=0)
    #attention_mask = torch.cat(attention_mask, dim=0)
    #target_ids = torch.cat(target_ids, dim=0)
    #target_attention_mask = torch.cat(target_attention_mask, dim=0)

    output_df = pd.DataFrame(data = {'source_text': source_text, 'target_text': target_text })
    return output_df
    #return {'input_ids': input_ids, 'attention_mask': attention_mask, 'target_ids': target_ids, 'target_attention_mask': target_attention_mask}

In [None]:
#@title Get Un-fine-tuned base T5 Model



'''
null_df = pd.DataFrame(
    data={'source_text': [''], 
          'target_text': ['']})

#null_df = pd.DataFrame(columns=['source_text','target_text'])

null_train_df = null_df
null_validation_df = null_df

base_model = SimpleT5()
base_model.from_pretrained(model_type="t5", model_name="allenai/unifiedqa-t5-small")

# Perform [null] training using below settings
base_model.train(train_df= None, #null_train_df, # pandas dataframe with 2 columns: source_text & target_text
            eval_df= None, #null_validation_df, # pandas dataframe with 2 columns: source_text & target_text
            source_max_token_len = 512, 
            target_max_token_len = 128,
            batch_size = 8,
            max_epochs = 3,
            use_gpu = True,
            outputdir = "base_outputs",
            early_stopping_patience_epochs = 0,
            precision = 32
            )
'''

'\nnull_df = pd.DataFrame(\n    data={\'source_text\': [\'\'], \n          \'target_text\': [\'\']})\n\n#null_df = pd.DataFrame(columns=[\'source_text\',\'target_text\'])\n\nnull_train_df = null_df\nnull_validation_df = null_df\n\nbase_model = SimpleT5()\nbase_model.from_pretrained(model_type="t5", model_name="allenai/unifiedqa-t5-small")\n\n# Perform [null] training using below settings\nbase_model.train(train_df= None, #null_train_df, # pandas dataframe with 2 columns: source_text & target_text\n            eval_df= None, #null_validation_df, # pandas dataframe with 2 columns: source_text & target_text\n            source_max_token_len = 512, \n            target_max_token_len = 128,\n            batch_size = 8,\n            max_epochs = 3,\n            use_gpu = True,\n            outputdir = "base_outputs",\n            early_stopping_patience_epochs = 0,\n            precision = 32\n            )\n'

In [None]:
#@title Fine-tune Synthetic Dataset T5 Model

#tokenizer = T5Tokenizer.from_pretrained('t5-base')
#tokenizer = T5Tokenizer.from_pretrained("allenai/unifiedqa-t5-small")

from torch.utils.data import DataLoader
from transformers import T5ForConditionalGeneration, Trainer, TrainingArguments
from simplet5 import SimpleT5

synth_train_df = preprocess_data(synth_train_X, synth_train_answers_ABCD)
synth_validation_df = preprocess_data(synth_validation_X, synth_validation_answers_ABCD)

# *** Make synth_validation_df the same # of rows/instances as sciq_validation_df ***
# i.e. Use only first 1000 rows out of the 1023 in synth_validation_df set 
synth_validation_df = synth_validation_df.iloc[:1000]

#print('Preprocessing complete')

synth_model = SimpleT5()
#model = T5ForConditionalGeneration.from_pretrained('t5-small')
#model = T5ForConditionalGeneration.from_pretrained("allenai/unifiedqa-t5-small")

# Choose which pre-trained model type to use:
# -------------------------------------------
#model.from_pretrained(model_type="t5", model_name="t5-base")
synth_model.from_pretrained(model_type="t5", model_name="allenai/unifiedqa-t5-small")

# Perform training using below settings
synth_model.train(train_df=synth_train_df, # pandas dataframe with 2 columns: source_text & target_text
            eval_df=synth_validation_df, # pandas dataframe with 2 columns: source_text & target_text
            source_max_token_len = 512, 
            target_max_token_len = 128,
            batch_size = 8,
            max_epochs = 3,
            use_gpu = True,
            outputdir = "synth_outputs",
            early_stopping_patience_epochs = 0,
            precision = 32
            )

Processing X...
Processing answers...
Processing X...
Processing answers...


INFO:pytorch_lightning.utilities.distributed:GPU available: True, used: True
INFO:pytorch_lightning.utilities.distributed:TPU available: False, using: 0 TPU cores
INFO:pytorch_lightning.utilities.distributed:IPU available: False, using: 0 IPUs
INFO:pytorch_lightning.accelerators.gpu:LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]
INFO:pytorch_lightning.callbacks.model_summary:
  | Name  | Type                       | Params
-----------------------------------------------------
0 | model | T5ForConditionalGeneration | 60.5 M
-----------------------------------------------------
60.5 M    Trainable params
0         Non-trainable params
60.5 M    Total params
242.026   Total estimated model params size (MB)


Validation sanity check: 0it [00:00, ?it/s]

INFO:pytorch_lightning.utilities.seed:Global seed set to 42


Training: 0it [00:00, ?it/s]

Validating: 0it [00:00, ?it/s]

Validating: 0it [00:00, ?it/s]

Validating: 0it [00:00, ?it/s]

In [None]:
# @title Fine-tune Sciq Dataset T5 Model

sciq_train_df = preprocess_data(sciq_train_X, sciq_train_answers_ABCD)
sciq_validation_df = preprocess_data(sciq_validation_X, sciq_validation_answers_ABCD)
sciq_test_df = preprocess_data(sciq_test_X, sciq_test_answers_ABCD)

sciq_train_df

#print('Preprocessing complete')

sciq_model = SimpleT5()
#model = T5ForConditionalGeneration.from_pretrained('t5-small')
#model = T5ForConditionalGeneration.from_pretrained("allenai/unifiedqa-t5-small")

# *** Make sciq_train_df the same # of rows/instances as synth_train_df ***
# i.e. Use only first 2389 rows out of the 11679 in sciq_train_df set 
sciq_train_df = sciq_train_df.iloc[:2389]   #[:2389]

#model.from_pretrained(model_type="t5", model_name="t5-base")
sciq_model.from_pretrained(model_type="t5", model_name="allenai/unifiedqa-t5-small")
# train

sciq_model.train(train_df=sciq_train_df, # pandas dataframe with 2 columns: source_text & target_text
            eval_df=sciq_validation_df, # pandas dataframe with 2 columns: source_text & target_text
            source_max_token_len = 512, 
            target_max_token_len = 128,
            batch_size = 8,
            max_epochs = 3,
            use_gpu = True,
            outputdir = "sciq_outputs",
            early_stopping_patience_epochs = 0,
            precision = 32
            )

Processing X...
Processing answers...
Processing X...
Processing answers...
Processing X...
Processing answers...


INFO:pytorch_lightning.utilities.distributed:GPU available: True, used: True
INFO:pytorch_lightning.utilities.distributed:TPU available: False, using: 0 TPU cores
INFO:pytorch_lightning.utilities.distributed:IPU available: False, using: 0 IPUs
INFO:pytorch_lightning.accelerators.gpu:LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]
INFO:pytorch_lightning.callbacks.model_summary:
  | Name  | Type                       | Params
-----------------------------------------------------
0 | model | T5ForConditionalGeneration | 60.5 M
-----------------------------------------------------
60.5 M    Trainable params
0         Non-trainable params
60.5 M    Total params
242.026   Total estimated model params size (MB)


Validation sanity check: 0it [00:00, ?it/s]

INFO:pytorch_lightning.utilities.seed:Global seed set to 42


Training: 0it [00:00, ?it/s]

Validating: 0it [00:00, ?it/s]

Validating: 0it [00:00, ?it/s]

Validating: 0it [00:00, ?it/s]

## 3.4 Model Evaluations

In [None]:
#@title Load models (requires manual input for now)

# IN GOOGLE COLAB:
# --Go to "Files" (lefthand sidebar)
# --Expand "outputs" folder
# --Find subfolder looking like "simplet5-epoch-0-train-loss-0.5888-val-loss-0.3764"
# --Click the triple dots next to this subfolder --> "Copy path"
# --Paste the copied path string into the second arguments of
#        synth_model.load_model and below

# (^ Work in progress: How to do this programmatically?)

synth_model.load_model('t5', 
                       #'/content/synth_outputs/simplet5-epoch-0-train-loss-0.9594-val-loss-0.7024')
                       #'/content/synth_outputs/simplet5-epoch-0-train-loss-0.9811-val-loss-0.7066'
                       ### Used more training epochs: 1 --> 3
                       #'/content/synth_outputs/simplet5-epoch-2-train-loss-0.6556-val-loss-0.6043')
                       #'/content/synth_outputs/simplet5-epoch-2-train-loss-0.6392-val-loss-0.5925')
                       #'/content/synth_outputs/simplet5-epoch-2-train-loss-0.667-val-loss-0.6221'
                       #'/content/synth_outputs/simplet5-epoch-2-train-loss-0.6638-val-loss-0.5975'
                       #'/content/synth_outputs/simplet5-epoch-2-train-loss-0.6079-val-loss-0.5632')
                       ### Improved formatting of source_text to better match UnifiedQA base model
                       #'/content/synth_outputs/simplet5-epoch-2-train-loss-0.4309-val-loss-0.3798')
                       #'/content/synth_outputs/simplet5-epoch-2-train-loss-0.4147-val-loss-0.3769')
                       #'/content/synth_outputs/simplet5-epoch-2-train-loss-0.4254-val-loss-0.3775')
                       ### Half-size Sciq training dataset experiments
                       #'/content/synth_outputs/simplet5-epoch-2-train-loss-0.4178-val-loss-0.3662')
                       #'/content/synth_outputs/simplet5-epoch-2-train-loss-0.4175-val-loss-0.3756')
                       "/content/synth_outputs/simplet5-epoch-2-train-loss-0.418-val-loss-0.3758")

sciq_model.load_model('t5',
                      #'/content/sciq_outputs/simplet5-epoch-0-train-loss-0.7752-val-loss-0.6147'
                      #'/content/sciq_outputs/simplet5-epoch-0-train-loss-0.7689-val-loss-0.5948'
                      ### Used more training epochs: 1 --> 3
                      #'/content/sciq_outputs/simplet5-epoch-2-train-loss-0.5233-val-loss-0.5479'
                      #'/content/sciq_outputs/simplet5-epoch-2-train-loss-0.53-val-loss-0.5288'
                      #'/content/sciq_outputs/simplet5-epoch-2-train-loss-0.5214-val-loss-0.5806'
                      #'/content/sciq_outputs/simplet5-epoch-2-train-loss-0.6754-val-loss-0.646')
                      #'/content/sciq_outputs/simplet5-epoch-2-train-loss-0.6327-val-loss-0.6544')
                      ### Improved formatting of source_text to better match UnifiedQA base model
                      #'/content/sciq_outputs/simplet5-epoch-2-train-loss-0.4396-val-loss-0.4179')
                      #'/content/sciq_outputs/simplet5-epoch-2-train-loss-0.4323-val-loss-0.4187')
                      #'/content/sciq_outputs/simplet5-epoch-2-train-loss-0.4333-val-loss-0.4236')
                      ### Half-size Sciq training dataset experiments
                      #'/content/sciq_outputs/simplet5-epoch-2-train-loss-0.4774-val-loss-0.4415')
                      # '/content/sciq_outputs/simplet5-epoch-2-train-loss-0.4737-val-loss-0.4527')
                      "/content/sciq_outputs/simplet5-epoch-2-train-loss-0.4319-val-loss-0.417")


404 Client Error: Not Found for url: https://huggingface.co//content/synth_outputs/simplet5-epoch-2-train-loss-0.418-val-loss-0.3758/resolve/main/config.json


OSError: ignored

## 3.4 Generate Model Predictions on Eval Sets

In [None]:
sciq_test_df 

In [None]:
# Generate test set predictions using the fine-tuned models:
def predict_all(model, eval_df, eval_df_X):

    questions = [None]*eval_df.shape[0]
    correct_ABCD = [None]*eval_df.shape[0]
    correct_answer = [None]*eval_df.shape[0]
    prediction_ABCD = [None]*eval_df.shape[0]
    prediction_answer = [None]*eval_df.shape[0]
    counter = 0

    for index, row in eval_df.iterrows() :
        
        # Dev test code: Test run first 6 eval questions
        '''
        prediction = model.predict(row['source_text'])
        print(index)
        print('Question:')
        print(row['source_text'])
        print('------------------------------')
        print('Correct answer:')
        print(row['target_text'])
        print(eval_df_X.iat[index, eval_df_X.columns.get_loc(row['target_text'].upper())])
        print('------------------------------')
        print('Predicted answer:')
        print(prediction[0])
        if prediction[0] in ['a','b','c','d']:
            print(eval_df_X.iat[index, eval_df_X.columns.get_loc(prediction[0].upper())])
        else:
            print('[Prediction is not in {A,B,C,D}]')
        print('\n')
        if index == 5:
            sys.exit('[END OF TEST]')
        '''

        # Question/prompt fed into model:
        questions[counter] = row['source_text']
        #print(questions[counter])


        # The letter corresponding to the correct answer:
        correct_ABCD[counter] = row['target_text']
        #print(correct_ABCD[counter])

        # What the correct answer corresponds to:
        correct_answer[counter] = eval_df_X.iat[index, eval_df_X.columns.get_loc(row['target_text'].upper())]
        #correct_answer[counter] = eval_df_X.at[index, eval_df_X.columns.get_loc(row['target_text'].upper())]
        
        #print(correct_answer[counter])

        # The letter of the answer predicted by the model:
        prediction_ABCD[counter] = model.predict(row['source_text'])[0]
        #print(prediction_ABCD[counter])

        # What the predicted letter answer corresponds to
        # (if the predicted letter is 'a','b','c','d'):
        if prediction_ABCD[counter] in ['a','b','c','d']:
            prediction_answer[counter] = \
                eval_df_X.iat[index, eval_df_X.columns.get_loc(prediction_ABCD[counter].upper())]
        else:
            prediction_answer[counter] = '[ERROR: Prediction is not in {a,b,c,d}]'

        counter += 1

    # Collect accumulated results together into a DataFrame
    output_df = pd.DataFrame(data={'Question': questions,
                                    'CorrectABCD': correct_ABCD,
                                    'CorrectAnswer': correct_answer,
                                    'PredictedABCD': prediction_ABCD,
                                    'PredictedAnswer': prediction_answer})

    return output_df 

from google.colab import files

synth_predictions = predict_all(synth_model, sciq_test_df, sciq_test_X)
#print(synth_predictions)
synth_predictions.to_csv('synth_predictions.csv', encoding = 'utf-8-sig')
files.download('synth_predictions.csv')

sciq_predictions = predict_all(sciq_model, sciq_test_df, sciq_test_X)
sciq_predictions.to_csv('sciq_predictions.csv', encoding = 'utf-8-sig')
files.download('sciq_predictions.csv')


In [None]:
sciq_test_X

In [None]:
# Generate test set predictions using the base (not fine-tuned) model:

from transformers import T5Tokenizer, T5ForConditionalGeneration

model_name = "allenai/unifiedqa-t5-small" # you can specify the model type/size here
tokenizer = T5Tokenizer.from_pretrained(model_name)
base_model = T5ForConditionalGeneration.from_pretrained(model_name)

def run_model(input_string, **generator_args):
    input_ids = tokenizer.encode(input_string, return_tensors="pt")
    res = base_model.generate(input_ids, **generator_args)
    return tokenizer.batch_decode(res, skip_special_tokens=True)

# For un-fine-tuned base models:
def base_predict_all(eval_df, eval_df_X):

    questions = [None]*eval_df.shape[0]
    correct_ABCD = [None]*eval_df.shape[0]
    correct_answer = [None]*eval_df.shape[0]
    prediction_ABCD = [None]*eval_df.shape[0]
    prediction_answer = [None]*eval_df.shape[0]
    counter = 0

    for index, row in eval_df.iterrows() :
        

        # Question fed into model:
        questions[counter] = row['source_text']
        #print(questions[counter])


        # The letter corresponding to the correct answer:
        correct_ABCD[counter] = row['target_text']
        #print(correct_ABCD[counter])

        # What the correct answer corresponds to:
        correct_answer[counter] = eval_df_X.iat[index, eval_df_X.columns.get_loc(row['target_text'].upper())]
        #print(correct_answer[counter])

        # The answer predicted by the base model:
        prediction_answer[counter] = run_model(row['source_text'])[0]
        #print(prediction_ABCD[counter])

        # The letter the predicted answer corresponds to:

        if prediction_answer[counter] == eval_df_X.iat[index, eval_df_X.columns.get_loc('A')]:
            prediction_ABCD[counter] = 'a'
        elif prediction_answer[counter] == eval_df_X.iat[index, eval_df_X.columns.get_loc('B')]:
            prediction_ABCD[counter] = 'b'
        elif prediction_answer[counter] == eval_df_X.iat[index, eval_df_X.columns.get_loc('C')]:
            prediction_ABCD[counter] = 'c'
        elif prediction_answer[counter] == eval_df_X.iat[index, eval_df_X.columns.get_loc('D')]:
            prediction_ABCD[counter] = 'd'
        else:
            prediction_answer[counter] = '[ERROR: Prediction does not map to {a,b,c,d}]'

        counter += 1

    # Collect accumulated results together into a DataFrame
    output_df = pd.DataFrame(data={'Question': questions,
                                    'CorrectABCD': correct_ABCD,
                                    'CorrectAnswer': correct_answer,
                                    'PredictedABCD': prediction_ABCD,
                                    'PredictedAnswer': prediction_answer})

    return output_df 


#test_question = '''Pick the right answer to this multiple choice question: What is the name of the substance that is doing the dissolving in a solution?\nA) Concentration B) Solute C) Solvent D) Solution </s>'''
#results = synth_model.predict(test_question)
#print(results)


base_predictions = base_predict_all(sciq_test_df, sciq_test_X)
print(base_predictions)
base_predictions.to_csv('base_predictions.csv', encoding = 'utf-8-sig')
files.download('base_predictions.csv')



In [None]:
#base_predictions
#sciq_test_X

In [None]:
# Assess how fine model performs overall across all questions
def overall_accuracy(preds_df):

    # Calculate number of rows where predicted letter matches actual correct letter
    num_correct = preds_df[preds_df['PredictedABCD'] == preds_df['CorrectABCD']].shape[0]

    # Return overall accuracy: # of correct predictions / total # of all predictions   
    return num_correct / preds_df.shape[0]

print(overall_accuracy(base_predictions))
print(overall_accuracy(synth_predictions))
print(overall_accuracy(sciq_predictions))

In [None]:
# Assess accuracy, precision, recall, and f1 score for each letter class (a,b,c,d)
def binary_letter_metrics(preds_df, letter):
    
    # Count of true positives: Model predicted letter, 
    #                          and correct answer was letter
    num_true_pos = preds_df.loc[(preds_df['PredictedABCD'] == letter) 
                                 & (preds_df['CorrectABCD'] == letter)].shape[0]

    # Count of true negatives: Model predicted NOT letter, 
    #                          and correct answer was NOT letter
    num_true_neg = preds_df.loc[(preds_df['PredictedABCD'] != letter)
                                & (preds_df['CorrectABCD'] != letter)].shape[0]

    # Count of false positives: Model predicted letter,
    #                           but correct answer was NOT letter
    num_false_pos = preds_df.loc[(preds_df['PredictedABCD'] == letter)
                                 & (preds_df['CorrectABCD'] != letter)].shape[0]

    # Count of false negatives: Model predicted NOT letter,
    #                           but correct answer was letter
    num_false_neg = preds_df.loc[(preds_df['PredictedABCD'] != letter)
                                 & (preds_df['CorrectABCD'] == letter)].shape[0]    

    # Total count of rows
    num_total = preds_df.shape[0]

    # Dev test: Print out intermediate values
    print(f'num_true_pos = {num_true_pos}')
    print(f'num_true_neg = {num_true_neg}')
    print(f'num_false_pos = {num_false_pos}')
    print(f'num_false_neg = {num_false_neg}')
    print(f'num_total = {num_total}')



    # Calculate letter-specific accuracy
    accuracy = (num_true_pos + num_true_neg) / num_total

    # Calculate letter-specific precision
    precision = num_true_pos / (num_true_pos + num_false_pos)

    # Calculate letter-specific recall
    recall = num_true_pos / (num_true_pos + num_false_neg)

    # Calculate letter-specific f1 score
    f1_score = (2 * precision * recall) / (precision + recall)


    # Dictionary to record letter-specific metric calculations
    outputs_dict = {}
    outputs_dict[letter + '_accuracy'] = accuracy
    outputs_dict[letter + '_precision'] = precision
    outputs_dict[letter + '_recall'] = recall
    outputs_dict[letter + '_f1'] = f1_score

    df_dict = {}
    df_dict['accuracy'] = accuracy
    df_dict['precision'] = precision
    df_dict['recall'] = recall
    df_dict['f1'] = f1_score
    df_dict['TP'] = num_true_pos
    df_dict['TN'] = num_true_neg
    df_dict['FP'] = num_false_pos
    df_dict['FN'] = num_false_neg

    return (outputs_dict, df_dict)


print(binary_letter_metrics(synth_predictions, 'a')[0])
print('---------------------------------------------')
print(binary_letter_metrics(synth_predictions, 'b')[0])
print('---------------------------------------------')
print(binary_letter_metrics(synth_predictions, 'c')[0])
print('---------------------------------------------')
print(binary_letter_metrics(synth_predictions, 'd')[0])
print('---------------------------------------------')
print('---------------------------------------------')
print('---------------------------------------------') 
print(binary_letter_metrics(sciq_predictions, 'a')[0])
print('---------------------------------------------') 
print(binary_letter_metrics(sciq_predictions, 'b')[0])
print('---------------------------------------------')
print(binary_letter_metrics(sciq_predictions, 'c')[0])
print('---------------------------------------------')
print(binary_letter_metrics(sciq_predictions, 'd')[0])
print('---------------------------------------------')
print('---------------------------------------------')
print('---------------------------------------------')
print('---------------------------------------------') 
print(binary_letter_metrics(base_predictions, 'a')[0])
print('---------------------------------------------') 
print(binary_letter_metrics(base_predictions, 'b')[0])
print('---------------------------------------------')
print(binary_letter_metrics(base_predictions, 'c')[0])
print('---------------------------------------------')
print(binary_letter_metrics(base_predictions, 'd')[0])
print('---------------------------------------------')

In [None]:
# @title Save out results to csv
from google.colab import files

synth_a = binary_letter_metrics(synth_predictions, 'a')[1]
synth_b = binary_letter_metrics(synth_predictions, 'b')[1]
synth_c = binary_letter_metrics(synth_predictions, 'c')[1]
synth_d = binary_letter_metrics(synth_predictions, 'd')[1]
synth_res_df = pd.DataFrame([synth_a, synth_b, synth_c, synth_d])
synth_res_df.to_csv('synth_results.csv', encoding = 'utf-8-sig')
files.download('synth_results.csv')

sciq_a = binary_letter_metrics(sciq_predictions, 'a')[1]
sciq_b = binary_letter_metrics(sciq_predictions, 'b')[1]
sciq_c = binary_letter_metrics(sciq_predictions, 'c')[1]
sciq_d = binary_letter_metrics(sciq_predictions, 'd')[1]
sciq_res_df = pd.DataFrame([sciq_a, sciq_b, sciq_c, sciq_d])
sciq_res_df.to_csv('sciq_results.csv', encoding = 'utf-8-sig')
files.download('sciq_results.csv')

base_a = binary_letter_metrics(base_predictions, 'a')[1]
base_b = binary_letter_metrics(base_predictions, 'b')[1]
base_c = binary_letter_metrics(base_predictions, 'c')[1]
base_d = binary_letter_metrics(base_predictions, 'd')[1]
base_res_df = pd.DataFrame([base_a, base_b, base_c, base_d])
base_res_df.to_csv('base_results.csv', encoding = 'utf-8-sig')
files.download('base_results.csv')



In [None]:
print(f'sciq_train_df.shape: {sciq_train_df.shape}')
print(f'sciq_validation_df.shape: {sciq_validation_df.shape}')
print(f'synth_train_df.shape: {synth_train_df.shape}')
print(f'synth_validation_df.shape: {synth_validation_df.shape}')
print(f'sciq_test_df.shape: {sciq_test_df.shape}')

# 4.0 Data Analysis

In [None]:
#@title Load in the predictions from the three models
import csv
import pandas as pd

url = 'https://raw.githubusercontent.com/alidiusk/cs2731-final-project/main/results/base_predictions.csv'
base_pred_df = pd.read_csv(url, skip_blank_lines=True, encoding="ISO_8859-1")
base_pred_df.head()

In [None]:
# Sciq predictions
url = 'https://raw.githubusercontent.com/alidiusk/cs2731-final-project/main/results/sciq_predictions.csv'
sciq_pred_df = pd.read_csv(url, skip_blank_lines=True, encoding="ISO_8859-1")
sciq_pred_df.head()

In [None]:
# Synthetic predictions
url = 'https://raw.githubusercontent.com/alidiusk/cs2731-final-project/main/results/synth_predictions.csv'
synth_pred_df = pd.read_csv(url, skip_blank_lines=True, encoding="ISO_8859-1")
synth_pred_df.head()

In [None]:
#@title Create confusion matrix based on each model's predictions
from sklearn.metrics import confusion_matrix

# Was having a nan issue
base_pred_df = base_pred_df.dropna(subset=['PredictedABCD', 'CorrectABCD'])
base_confusion = confusion_matrix(base_pred_df["CorrectABCD"], base_pred_df["PredictedABCD"])
print('Base Model Confusion Matrix')
print(base_confusion)

sciq_confusion = confusion_matrix(sciq_pred_df["CorrectABCD"], sciq_pred_df["PredictedABCD"])
print('\nSciq Model Confusion Matrix')
print(sciq_confusion)

synth_confusion = confusion_matrix(synth_pred_df["CorrectABCD"], synth_pred_df["PredictedABCD"])
print('\nSynth Model Confusion Matrix')
print(synth_confusion)

In [None]:
from sklearn.metrics import classification_report
base_report = pd.DataFrame(classification_report(base_pred_df["CorrectABCD"], base_pred_df["PredictedABCD"], output_dict=True))
sciq_report = pd.DataFrame(classification_report(sciq_pred_df["CorrectABCD"], sciq_pred_df["PredictedABCD"], output_dict=True))
synth_report = pd.DataFrame(classification_report(synth_pred_df["CorrectABCD"], synth_pred_df["PredictedABCD"], output_dict=True))

In [None]:
#@title show base model classification report
base_report

In [None]:
#@title show sciq model classification report
sciq_report

In [None]:
#@title show synth model classification report
synth_report

In [None]:
import matplotlib.pyplot as plt
import seaborn as sns

plt.figure(figsize=(10,6))
fx_base=sns.heatmap(base_confusion, annot=True, fmt=".2f",cmap="GnBu")
fx_base.set_title('Base Model Confusion Matrix \n');
fx_base.set_xlabel('\n Predicted Values\n')
fx_base.set_ylabel('Actual Values\n');
fx_base.xaxis.set_ticklabels(['A','B','C', 'D'])
fx_base.yaxis.set_ticklabels(['A','B','C', 'D'])
plt.show()

In [None]:
plt.figure(figsize=(10,6))
fx_sciq=sns.heatmap(sciq_confusion, annot=True, fmt=".2f",cmap="GnBu")
fx_sciq.set_title('Sciq Model Confusion Matrix \n');
fx_sciq.set_xlabel('\n Predicted Values\n')
fx_sciq.set_ylabel('Actual Values\n');
fx_sciq.xaxis.set_ticklabels(['A','B','C', 'D'])
fx_sciq.yaxis.set_ticklabels(['A','B','C', 'D'])
plt.show()

In [None]:
plt.figure(figsize=(10,6))
fx_synth=sns.heatmap(synth_confusion, annot=True, fmt=".2f",cmap="GnBu")
fx_synth.set_title('Synth Model Confusion Matrix \n');
fx_synth.set_xlabel('\n Predicted Values\n')
fx_synth.set_ylabel('Actual Values\n');
fx_synth.xaxis.set_ticklabels(['A','B','C', 'D'])
fx_synth.yaxis.set_ticklabels(['A','B','C', 'D'])
plt.show()

In [None]:
#@title Explore counts of correct answers test datasets

print("Base Counts")
print(base_pred_df["CorrectABCD"].value_counts())

print("\nSciq Counts")
print(sciq_pred_df["CorrectABCD"].value_counts())

print("\nSynth Counts")
print(synth_pred_df["CorrectABCD"].value_counts())

print("\nSciq Train")
print(sciq_train_df["target_text"].value_counts())

print("\nSynth Train")
print(synth_train_df["target_text"].value_counts())

print("\nSciq Validation")
print(sciq_validation_df["target_text"].value_counts())

print("\nSynth Validation")
print(synth_validation_df["target_text"].value_counts())