# Loading Questions Dataset

In [1]:
import requests
import json

url = "https://ap-southeast-1.aws.data.mongodb-api.com/app/data-pgkmv/endpoint/data/v1/action/find"

subject_name = 'h2_mathematics'

payload = json.dumps({
    "collection": "questions",
    "database": f"{subject_name}",
    "dataSource": "Cluster0",

    # change the filter parameters below as fit
    # filter parameters : 'paper_no', 'source', 'paper_type', 'qn_no', 'year'
    "filter": {

    },

    # qn_content is the string of the questions
    # remove unnecessary parameters if the information is not needed
    "projection": {
        "paper_no": 1,
        "source": 1,
        "paper_type": 1,
        "qn_no": 1,
        "year": 1,
        "qn_content": 1
    }
})

headers = {
    'Content-Type': 'application/json',
    'Access-Control-Request-Headers': '*',

    # provide the API key
    'api-key': 'fxhN8w9bOuk37F8FC3lGwqs0MpgGvH5qo0SubuFiUEUHXmeX0dl85c7mBRDoPI1N',
}

response = requests.request("POST", url, headers=headers, data=payload)

# qn_dict is a list of dict objects that represents a question
qn_dict = json.loads(response.text)['documents']

# Get Embedding for Each Topic

## Open AI's ADA

In [1]:
import openai

openai.api_key = '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'

model_name = 'ADA'

with open('./static/topic_list.txt') as f:
    reader = list(f.readlines())
    reader = [i.strip() for i in reader]

topic_embeddings = []

for topic in reader:
    embedding = openai.Embedding.create(
        input=topic, model="text-embedding-ada-002"
    )["data"][0]["embedding"]
    topic_embeddings.append(embedding)

emb_dict = {}

for i in range(len(reader)):
    emb_dict[reader[i]] = topic_embeddings[i]


## Open AI GPT3.5 Turbo

In [None]:
import openai
import os
from tqdm import tqdm

model_name = 'ADA'

with open('query.txt') as f:
    query = f.read()


for yr in tqdm(range(2016, 2023)):
    qn_list = os.listdir(f'./{yr}')

    for qn in tqdm(qn_list):
        with open(f'./{yr}/{qn}') as f:
            qn_content = f.read()
        response = openai.ChatCompletion.create(
            model="gpt-3.5-turbo",
            messages=[
                {"role": "system", "content": f"{query}"},
                {"role": "user", "content": f"{qn_content}"},
            ]
        )

        result = ''
        for choice in response.choices:
            result += choice.message.content

        yr, paper_type, paper_no, qn_no = qn.split('_')
        with open('chat_gpt.csv', 'a') as fappend:
            fappend.write(f'{yr}, {paper_no[1:]}, {qn_no[1:-4]}, {response}\n')
            print(f'{yr}, {paper_no[1:]}, {qn_no[1:-4]} Appended')

## Hugging Face Models

### Using `sentence_transformers` Python package (Recommended)

In [None]:
from sentence_transformers import SentenceTransformer

model_name = 'sentence-t5-base'

model_st = SentenceTransformer(model_name)

with open('./static/topic_list.txt') as f:
    reader = list(f.readlines())
    reader = [i.strip() for i in reader]

topic_embeddings = []

for topic in reader:
    embedding = list(model_st.encode(topic))
    topic_embeddings.append(embedding)

emb_dict = {}
for i in range(len(reader)):
    emb_dict[reader[i]] = topic_embeddings[i]

### Using Hugging Face's API (Not really reliable)

In [None]:
from retry import retry

model_id = "sentence-transformers/all-MiniLM-L6-v2" #Insert sentence-transformers model here
hf_token = "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" #Insert hugging face token here

model_name = model_id.split('/')[-1]

api_url = f"https://api-inference.huggingface.co/pipeline/feature-extraction/{model_id}"
headers = {"Authorization": f"Bearer {hf_token}"}

@retry(tries=3, delay=10)
def query(texts):
    response = requests.post(api_url, headers=headers, json={"inputs": texts})
    result = response.json()
    if isinstance(result, list):
      return result
    elif list(result.keys())[0] == "error":
      raise RuntimeError(
          "The model is currently loading, please re-run the query."
      )

with open('./static/topic_list.txt') as f:
    reader = list(f.readlines())
    reader = [i.strip() for i in reader]

topic_embeddings = []

for topic in reader:
    embedding = query(topic)
    topic_embeddings.append(embedding)

emb_dict = {}
for i in range(len(reader)):
    emb_dict[reader[i]] = topic_embeddings[i]

# Computing Cosine Similarity and Ranking

In [22]:
from scipy import spatial

with open(f'classifier_{model_name}.csv', 'w') as f:

    for qn in qn_dict:

        qn_reader = qn['qn_content']

        qn_emb = list(model_st.encode(qn_reader))

        top_ranking = []
        for top in emb_dict.keys():
            top_ranking.append(
                [top, 1 - spatial.distance.cosine(qn_emb, emb_dict[top])])
        top_ranking.sort(key=lambda x: x[1], reverse=True)

        if 'paper_type' in qn.keys():
            f.write(
                f"{qn['year']}, {qn['source']}, {qn['paper_type']}, {qn['paper_no']}, {qn['qn_no']}, {top_ranking[0][0]}\n")
            print(
                f"{qn['year']}, {qn['source']}, {qn['paper_type']}, {qn['paper_no']}, {qn['qn_no']} written")
        else:
            f.write(
                f"{qn['year']}, {qn['source']}, {qn['paper_no']}, {qn['qn_no']}, {top_ranking[0][0]}\n")
            print(
                f"{qn['year']}, {qn['source']}, {qn['paper_no']}, {qn['qn_no']} written")

[['Equations and Inequalities', 0.7774526555953692], ['Differential Equations', 0.767300029288762], ['Probability', 0.7649195363437734], ['Arithmetic and Geometric Progression Series', 0.7552602909489934], ['Binomial and Normal Distributions', 0.7528335041950401], ['Hypothesis Testing', 0.7506100300724892], ['Permutations and Combinations', 0.7490133732071345], ['Graphing', 0.7480238521710697], ['Complex Numbers', 0.739927314630146], ['Discrete Random Variables', 0.7356197988835765], ['Functions', 0.73542898382419], ['Correlation and Regression', 0.733309480048664], ['Applications of Integrations - Area and Volume', 0.7321327714008581], ["Binomial Expansion, Maclaurin's Series and Small Angle Approximations", 0.7318222419675863], ['Sequences and Series', 0.730393367644041], ['Integration', 0.7228908565779405], ['Vectors', 0.7209592021343751], ['Differentiation and its Applications', 0.7195421110201605]]
2012, ALVL, 1, 1 written
[['Applications of Integrations - Area and Volume', 0.8135

RateLimitError: Rate limit reached for default-global-with-image-limits in organization org-eRxa3T3nXgybfFy0Bs1MYGHH on requests per min. Limit: 60 / min. Please try again in 1s. Contact support@openai.com if you continue to have issues. Please add a payment method to your account to increase your rate limit. Visit https://platform.openai.com/account/billing to add a payment method.

# Model used
https://www.sbert.net/docs/pretrained_models.html#semantic-search
- all-distilroberta-v1
- all-MiniLM-L12-v1
- all-mpnet-base-v2
- gtr-t5-base
- all-sentence-t5-base
- openai GPT3.5
- ada

measure 
- cosine similarity

In [46]:
from sentence_transformers import SentenceTransformer

model_name = 'sentence-t5-base'

model_st = SentenceTransformer(model_name)

with open('./static/topic_list.txt') as f:
    reader = list(f.readlines())
    reader = [i.strip() for i in reader]

topic_embeddings = []

for topic in reader:
    embedding = list(model_st.encode(topic))
    topic_embeddings.append(embedding)

emb_dict = {}
for i in range(len(reader)):
    emb_dict[reader[i]] = topic_embeddings[i]

Downloading (…)2bb58/.gitattributes: 100%|██████████| 1.18k/1.18k [00:00<00:00, 1.17MB/s]
Downloading (…)_Pooling/config.json: 100%|██████████| 190/190 [00:00<00:00, 190kB/s]
Downloading (…)/2_Dense/config.json: 100%|██████████| 115/115 [00:00<00:00, 115kB/s]
Downloading pytorch_model.bin: 100%|██████████| 2.36M/2.36M [00:00<00:00, 19.8MB/s]
Downloading rust_model.ot: 100%|██████████| 2.36M/2.36M [00:00<00:00, 18.6MB/s]
Downloading (…)21dd52bb58/README.md: 100%|██████████| 2.01k/2.01k [00:00<00:00, 2.01MB/s]
Downloading (…)dd52bb58/config.json: 100%|██████████| 1.39k/1.39k [00:00<00:00, 693kB/s]
Downloading (…)ce_transformers.json: 100%|██████████| 122/122 [00:00<00:00, 122kB/s]
Downloading (…)52bb58/convert.ipynb: 100%|██████████| 74.6k/74.6k [00:00<00:00, 7.46MB/s]
Downloading (…)8/convert_to_fp16.py: 100%|██████████| 198/198 [00:00<00:00, 197kB/s]
Downloading pytorch_model.bin: 100%|██████████| 219M/219M [00:06<00:00, 35.8MB/s] 
Downloading (…)nce_bert_config.json: 100%|██████████| 

In [47]:
from scipy import spatial

with open(f'classifier_{model_name}.csv', 'w') as f:

    for qn in qn_dict:

        qn_reader = qn['qn_content']

        qn_emb = list(model_st.encode(qn_reader))

        top_ranking = []
        for top in emb_dict.keys():
            top_ranking.append(
                [top, 1 - spatial.distance.cosine(qn_emb, emb_dict[top])])
        top_ranking.sort(key=lambda x: x[1], reverse=True)

        if 'paper_type' in qn.keys():
            f.write(
                f"{qn['year']}, {qn['source']}, {qn['paper_type']}, {qn['paper_no']}, {qn['qn_no']}, {top_ranking[0][0]}\n")
            print(
                f"{qn['year']}, {qn['source']}, {qn['paper_type']}, {qn['paper_no']}, {qn['qn_no']} written")
        else:
            f.write(
                f"{qn['year']}, {qn['source']}, {qn['paper_no']}, {qn['qn_no']}, {top_ranking[0][0]}\n")
            print(
                f"{qn['year']}, {qn['source']}, {qn['paper_no']}, {qn['qn_no']} written")

2012, ALVL, 1, 1 written
2012, ALVL, 1, 10 written
2012, ALVL, 1, 11 written
2012, ALVL, 1, 2 written
2012, ALVL, 1, 3 written
2012, ALVL, 1, 4 written
2012, ALVL, 1, 5 written
2012, ALVL, 1, 6 written
2012, ALVL, 1, 7 written
2012, ALVL, 1, 8 written
2012, ALVL, 1, 9 written
2012, ALVL, 2, 1 written
2012, ALVL, 2, 10 written
2012, ALVL, 2, 2 written
2012, ALVL, 2, 3 written
2012, ALVL, 2, 4 written
2012, ALVL, 2, 5 written
2012, ALVL, 2, 6 written
2012, ALVL, 2, 7 written
2012, ALVL, 2, 8 written
2012, ALVL, 2, 9 written
2013, ALVL, 1, 1 written
2013, ALVL, 1, 10 written
2013, ALVL, 1, 11 written
2013, ALVL, 1, 2 written
2013, ALVL, 1, 3 written
2013, ALVL, 1, 4 written
2013, ALVL, 1, 5 written
2013, ALVL, 1, 6 written
2013, ALVL, 1, 7 written
2013, ALVL, 1, 8 written
2013, ALVL, 1, 9 written
2013, ALVL, 2, 1 written
2013, ALVL, 2, 10 written
2013, ALVL, 2, 11 written
2013, ALVL, 2, 12 written
2013, ALVL, 2, 2 written
2013, ALVL, 2, 3 written
2013, ALVL, 2, 4 written
2013, ALVL, 2, 5 

# Inserting Prediction into a Database 

In [66]:
import csv

with open('classifier_all-distilroberta-v1.csv') as f:
    dr = csv.DictReader(f)
    to_db = [(i)]

<csv.DictReader object at 0x00000219640C6950>


In [None]:
import sqlite3

conn = sqlite3.connect('classifiers.db')
cursor = conn.cursor()
cursor.execute()

conn.close()

# Evaluating Metrics

## Importing Database with Predicted Topics into Pandas DataFrame

In [109]:
import sqlite3
import pandas as pd

conn = sqlite3.connect('classifiers.db')

with open('combiner.sql') as f:
    sql_script = f.read()

df = pd.read_sql_query(sql_script, conn)

null_df=df[df['TOPIC']=='']

print(null_df)


df = df[~df['TOPIC'].isnull()]

print(df)

conn.close()

no_of_labelled_qns = df.shape[0]

Empty DataFrame
Columns: [YEAR, PAPER, PAPER NO, QUESTION NO, TOPIC, ADA_TOPIC, GPT_TOPIC, all-distilroberta-v1_TOPIC, all-MiniLM-L12-v1_TOPIC, all-mpnet-base-v2_TOPIC, gtr-t5-base_TOPIC, sentence-t5-base_TOPIC]
Index: []
     YEAR  PAPER  PAPER NO  QUESTION NO   
0    2013   ALVL         1            1  \
1    2013   ALVL         2            1   
2    2013   ALVL         1            2   
3    2013   ALVL         2            2   
4    2013   ALVL         1            3   
..    ...    ...       ...          ...   
168  2021   ALVL         2            9   
169  2021   ALVL         1           10   
170  2021   ALVL         2           10   
171  2021   ALVL         1           11   
172  2021   ALVL         2           11   

                                              TOPIC   
0                        Equations and Inequalities  \
1                                         Functions   
2                                          Graphing   
3              Differentiation and its Ap

In [126]:
from sklearn.metrics import precision_recall_fscore_support as score
from sklearn.metrics import accuracy_score

classifier_names = list(df)[5:]

with open('./static/topic_list.txt') as f:
    topic_list = list(f.readlines())
    topic_list = [i.strip() for i in topic_list]

print(topic_list)

for cl in classifier_names:
    df_to_use = df[~df[cl].isnull()]
    accuracy = accuracy_score(df_to_use['TOPIC'], df_to_use[cl])
    print(cl)
    precision, recall, fone, support = score(df_to_use['TOPIC'], df_to_use[cl],average=None,labels = topic_list,)
    print(precision)
    print(recall)
    print(fone)
    print(support)
    print('-------------------------------------------')

['Graphing', 'Functions', 'Equations and Inequalities', 'Sequences and Series', 'Arithmetic and Geometric Progression Series', 'Differentiation and its Applications', "Binomial Expansion, Maclaurin's Series and Small Angle Approximations", 'Integration', 'Applications of Integrations - Area and Volume', 'Differential Equations', 'Vectors', 'Complex Numbers', 'Permutations and Combinations', 'Probability', 'Discrete Random Variables', 'Binomial and Normal Distributions', 'Hypothesis Testing', 'Correlation and Regression']
ADA_TOPIC
[0.         0.         0.23076923 0.         0.4375     0.
 0.5        0.         0.35294118 0.18518519 1.         0.86666667
 0.66666667 0.66666667 0.         0.64285714 1.         0.85714286]
[0.         0.         0.375      0.         0.7        0.
 0.71428571 0.         0.54545455 1.         0.1875     1.
 0.66666667 0.88888889 0.         0.81818182 0.625      0.75      ]
[0.         0.         0.28571429 0.         0.53846154 0.
 0.58823529 0.         0

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
