In [30]:
import csv
import json
import numpy as np
import pandas as pd
import gensim.downloader

from sklearn.model_selection import train_test_split
from keras.models import Sequential
from keras.layers import Dense
from sklearn.preprocessing import StandardScaler

In [15]:
word_to_vec = gensim.downloader.load("glove-wiki-gigaword-100")

In [16]:
raw_data_fd = open('raw_data.json')
raw_data = json.load(raw_data_fd)

In [17]:
# Feature Creation Fn's
def example(row):
    print(row.name)
    return row


def question_matching(row):
    keywords = {
        'q0': set(['load', 'dataset', 'csv', 'file']),
        'q1': set(['shape', 'summary', 'head', 'map', 'missing', 'label']),
        'q2': set(['shuffle', 'seperate', 'split', 'training', '80', '20']),
        'q3': set(['correlation', 'feature', 'selection', 'hypothetical']),
        'q4': set(['hyperparameter', 'tune', 'gridsearchcv']),
        'q5': set(['retrain', 'hyperparameter', 'decision', 'tree', 'plot']),
        'q6': set(['predict', 'classification', 'accuracy', 'confusion', 'matrix']),
        'q7': set(['information', 'gain', 'entropy', 'formula'])
    }
    name = row.name
    prompt_answer_pairs = raw_data.get(name)

    question_dict = {key: 0 for key in keywords}
    for pair in prompt_answer_pairs:
        prompt_set = set(pair[0].split())
        match_counts = {key: 0 for key in keywords}

        for question_key, keywords_set in keywords.items():
            match_counts[question_key] += len(prompt_set.intersection(keywords_set))

        max_match = max(match_counts.values())
        for key, value in match_counts.items():
            if value == max_match:
                question_dict[key] += 1

    for i in range(0, 8):
        row[f'question_match_{i}'] = question_dict[f'q{i}']

    return row


def length_and_count(row):
    prompt_sum_of_words = 0
    answer_sum_of_words = 0
    for prompt, answer in raw_data[row.name]:
        prompt_sum_of_words += len(prompt)
        answer_sum_of_words += len(answer)

    pair_count = len(raw_data[row.name])
    row['pair_count'] = pair_count
    row['avg_prompt_length'] = prompt_sum_of_words / pair_count
    row['avg_answer_length'] = answer_sum_of_words / pair_count

    return row


def vectorized_prompts(row):
    key = row.name
    prompt_answer_pairs = raw_data[key]
    prompt_vector = np.zeros(word_to_vec.vector_size)

    for each_pair in prompt_answer_pairs:
        text = each_pair[0]
        words = text.split()
        word_vectors = []

        for word in words:
            if word in word_to_vec:
                word_vectors.append(word_to_vec[word])

        if word_vectors:  # Calculate the average of word vectors along the columns (axis=0)
            prompt_vector = np.mean(word_vectors, axis=0)

    for i, val in enumerate(prompt_vector):
        row[f"prompt_vector_{i}"] = prompt_vector[i]
    return row


def vectorized_answers(row):
    key = row.name
    prompt_answer_pairs = raw_data[key]
    prompt_vector = np.zeros(word_to_vec.vector_size)

    for each_pair in prompt_answer_pairs:
        text = each_pair[1]
        words = text.split()
        word_vectors = []

        for word in words:
            if word in word_to_vec:
                word_vectors.append(word_to_vec[word])

        if word_vectors:  # Calculate the average of word vectors along the columns (axis=0)
            prompt_vector = np.mean(word_vectors, axis=0)

    for i, val in enumerate(prompt_vector):
        row[f"answer_vector_{i}"] = prompt_vector[i]
    return row


# Row processing
def our_super_great_row_processor(row):
    row = question_matching(row)
    row = length_and_count(row)
    row = vectorized_prompts(row)
    row = vectorized_answers(row)
    return row

In [18]:
columns = [f"prompt_vector_{i}" for i in range(100)]
columns += [f"answer_vector_{i}" for i in range(100)]
columns += [f"question_match_{i}" for i in range(8)]
columns += ["pair_count", "avg_prompt_length", "avg_answer_length", "grade"]

dataframe = pd.DataFrame(index=raw_data.keys(), columns=columns)
dataframe.apply(our_super_great_row_processor, axis=1)

Unnamed: 0,prompt_vector_0,prompt_vector_1,prompt_vector_2,prompt_vector_3,prompt_vector_4,prompt_vector_5,prompt_vector_6,prompt_vector_7,prompt_vector_8,prompt_vector_9,...,question_match_2,question_match_3,question_match_4,question_match_5,question_match_6,question_match_7,pair_count,avg_prompt_length,avg_answer_length,grade
b73f91f8-732f-4a48-bcbd-eadbbb457a94,-0.105305,0.146404,0.244801,0.010134,-0.100589,0.013972,0.184923,0.107906,-0.105608,0.104179,...,6,7,4,5,5,3,17,455.941176,1580.352941,
746b8f06-1e89-43b8-b73c-1121eecfc854,-0.099691,0.317647,0.297639,-0.162343,-0.302724,0.090270,0.017619,0.268344,-0.027193,0.128519,...,1,1,1,1,1,1,1,101.000000,347.000000,
30283b91-7fc3-4125-985b-b441f0f489d6,-0.233918,0.285974,0.382238,0.136774,0.157068,0.194427,0.194111,0.320898,-0.298705,0.203647,...,4,5,4,5,4,5,14,509.285714,1653.642857,
ef5b3fbc-f5d2-4446-bb4f-7d8b2a3026e9,-0.247697,0.323605,0.214531,0.088791,-0.090610,0.141679,0.263739,0.114925,-0.068034,-0.129276,...,38,36,28,48,33,51,100,159.030000,1459.850000,
106ffe99-c787-4d09-9076-4ba411eb68b1,-0.418465,0.338650,0.194245,-0.311205,0.033230,-0.248920,0.005590,-0.200436,-0.139080,-0.193729,...,12,13,11,13,11,13,25,215.360000,1572.080000,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2b9cf078-c56b-4020-9197-cd9f7d4f909c,-0.132189,0.299466,0.317161,-0.080648,-0.029858,0.324425,-0.277981,0.131381,-0.264373,0.161799,...,8,8,8,6,8,5,15,523.466667,1676.600000,
a70ebc32-7ee1-456f-9fa1-bef302fb0e78,-0.213674,0.248164,0.187931,-0.135470,0.145471,0.211850,0.077825,0.131165,-0.311816,0.076470,...,6,7,5,6,6,5,13,80.923077,1698.846154,
b24c3a33-2952-4ae4-9f2d-643d8fdbc600,-0.259950,0.331705,0.353328,-0.253111,-0.141396,0.189478,-0.271301,0.117444,-0.035993,-0.178271,...,26,18,20,30,26,17,59,631.525424,1275.474576,
6d5742c1-77c4-429c-8f6e-ef1262ca5557,-0.266226,0.349401,0.347730,-0.344043,-0.213856,0.288501,-0.077220,0.229046,-0.102184,-0.159140,...,36,35,37,41,37,34,66,240.515152,1277.787879,


In [19]:
grades_fd = open("./materials/scores.csv")
grades_csv_reader = csv.reader(grades_fd)

for i, row in enumerate(grades_csv_reader):
    if i > 0:
        key = row[1].strip()
        grade = float(row[2].strip())
        dataframe.at[key, 'grade'] = grade

In [20]:
dataframe.dropna(inplace=True)

In [21]:
dataframe = dataframe.apply(pd.to_numeric, errors='coerce')
dataframe = dataframe.reset_index().drop("index", axis=1)

In [22]:
train_data, test_data, train_labels, test_labels = train_test_split(
    dataframe.drop(columns=['grade']), dataframe['grade'], test_size=0.2, random_state=42
)

scaler = StandardScaler()
train_data_scaled = scaler.fit_transform(train_data)
test_data_scaled = scaler.transform(test_data)

train_data = np.asarray(train_data_scaled).astype(np.float32)
test_data = np.asarray(test_data_scaled).astype(np.float32)


train_labels = np.asarray(train_labels).astype(np.int32)
test_labels = np.asarray(test_labels).astype(np.int32)

In [33]:
model = Sequential()
model.add(Dense(1000, activation='relu', input_shape=(211,)))
model.add(Dense(10000, activation='relu'))
model.add(Dense(1, activation='linear'))

model.compile(optimizer='adam', loss='mean_squared_error', metrics=['mean_squared_error'])
model.summary()

Model: "sequential_5"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 dense_15 (Dense)            (None, 1000)              212000    
                                                                 
 dense_16 (Dense)            (None, 10000)             10010000  
                                                                 
 dense_17 (Dense)            (None, 1)                 10001     
                                                                 
Total params: 10232001 (39.03 MB)
Trainable params: 10232001 (39.03 MB)
Non-trainable params: 0 (0.00 Byte)
_________________________________________________________________


In [35]:
model.fit(
    train_data,
    train_labels,
    epochs=14,
    batch_size=32,
    validation_split=0.2,
)

Epoch 1/14
Epoch 2/14
Epoch 3/14
Epoch 4/14
Epoch 5/14
Epoch 6/14
Epoch 7/14
Epoch 8/14
Epoch 9/14
Epoch 10/14
Epoch 11/14
Epoch 12/14
Epoch 13/14
Epoch 14/14


<keras.src.callbacks.History at 0x28b51eb30>