In [96]:
import csv
import json
import numpy as np
import pandas as pd
import gensim.downloader

from sklearn.model_selection import train_test_split
from keras.models import Sequential
from keras.layers import Dense

In [97]:
# Pre-trained GloVe word embeddings, fetched through gensim's downloader API.
word_to_vec = gensim.downloader.load("glove-wiki-gigaword-300")

# Parse the raw data and release the file handle as soon as parsing is done.
# The feature functions in this notebook index `raw_data` by key and iterate
# over the stored value as a sequence of (prompt, answer) pairs.
with open('raw_data.json', encoding='utf-8') as raw_data_fd:
    raw_data = json.load(raw_data_fd)

In [98]:
# Feature creation functions
def example(row):
    """Print the row's ``name`` attribute and hand the row back unchanged.

    Minimal template for a row-level feature function: it takes one row,
    may inspect it, and must return it.
    """
    label = row.name
    print(label)
    return row


def question_matching(row):
    """Count how many prompts of this row are attributed to each question key.

    Every prompt stored under ``raw_data[row.name]`` is lower-cased, split on
    whitespace and intersected with one keyword set per question key
    (``q0`` .. ``q7``). The prompt is attributed to the key with the largest
    overlap; ties go to the lowest-numbered key. A prompt that shares no
    keyword with any question is not attributed to anything.

    Args:
        row: Object exposing ``.name`` (the key into ``raw_data``) and item
            assignment, e.g. a pandas Series handed over by ``DataFrame.apply``.

    Returns:
        The same ``row`` with ``question_match_0`` .. ``question_match_7`` set
        to the per-question prompt counts (all zeros if the key has no data).
    """
    keywords = {
        'q0': {'load', 'dataset', 'csv', 'file'},
        'q1': {'shape', 'summary', 'head', 'map', 'missing', 'label'},
        # 'seperate' is kept because prompts may contain the misspelling too.
        'q2': {'shuffle', 'seperate', 'separate', 'split', 'training', '80', '20'},
        'q3': {'correlation', 'feature', 'selection', 'hypothetical'},
        'q4': {'hyperparameter', 'tune', 'gridsearchcv'},
        'q5': {'retrain', 'hyperparameter', 'decision', 'tree', 'plot'},
        'q6': {'predict', 'classification', 'accuracy', 'confusion', 'matrix'},
        'q7': {'information', 'gain', 'entropy', 'formula'},
    }
    # A missing key yields no pairs instead of a TypeError when iterating None.
    prompt_answer_pairs = raw_data.get(row.name) or []

    question_dict = {key: 0 for key in keywords}
    for pair in prompt_answer_pairs:
        # Keywords are all lower-case, so normalise the prompt before matching.
        prompt_set = set(pair[0].lower().split())
        match_counts = {
            question_key: len(prompt_set & keywords_set)
            for question_key, keywords_set in keywords.items()
        }

        question_label = max(match_counts, key=match_counts.get)
        # With all-zero counts max() would return 'q0'; do not count such
        # unrelated prompts as evidence for question 0.
        if match_counts[question_label] > 0:
            question_dict[question_label] += 1

    for i in range(len(keywords)):
        row[f'question_match_{i}'] = question_dict[f'q{i}']

    return row


def length_and_count(row):
    """Attach the pair count and average prompt/answer lengths to ``row``.

    Lengths are measured with ``len()`` on each prompt and answer, so for
    string entries they are character counts. An entry with no pairs gets
    averages of ``0.0`` instead of raising ``ZeroDivisionError``.

    Args:
        row: Object exposing ``.name`` (the key into ``raw_data``) and item
            assignment, e.g. a pandas Series handed over by ``DataFrame.apply``.

    Returns:
        The same ``row`` with ``pair_count``, ``avg_prompt_length`` and
        ``avg_answer_length`` set.
    """
    pairs = raw_data[row.name]
    pair_count = len(pairs)

    prompt_total = 0
    answer_total = 0
    for prompt, answer in pairs:
        prompt_total += len(prompt)
        answer_total += len(answer)

    row['pair_count'] = pair_count
    # Guard the division so an empty entry yields neutral features.
    row['avg_prompt_length'] = prompt_total / pair_count if pair_count else 0.0
    row['avg_answer_length'] = answer_total / pair_count if pair_count else 0.0

    return row


def vectorized_prompts(row):
    """Embed all prompts of this row into one fixed-size vector.

    For every pair under ``raw_data[row.name]`` the prompt (element 0) is
    split on whitespace and the embeddings of the words known to
    ``word_to_vec`` are averaged. The per-prompt averages are then averaged
    again, so every prompt with at least one known word contributes equally.
    If no prompt contains a known word the result is the zero vector.

    Args:
        row: Object exposing ``.name`` (the key into ``raw_data``) and item
            assignment, e.g. a pandas Series handed over by ``DataFrame.apply``.

    Returns:
        The same ``row`` with ``prompt_vector_0`` .. ``prompt_vector_{d-1}``
        set, where ``d`` is ``word_to_vec.vector_size``.
    """
    per_prompt_means = []

    for each_pair in raw_data[row.name]:
        word_vectors = [
            word_to_vec[word]
            for word in each_pair[0].split()
            if word in word_to_vec
        ]
        if word_vectors:
            # Average the word vectors along the columns (axis=0).
            per_prompt_means.append(np.mean(word_vectors, axis=0))

    if per_prompt_means:
        # Aggregate over ALL prompts rather than keeping only the last one.
        prompt_vector = np.mean(per_prompt_means, axis=0)
    else:
        prompt_vector = np.zeros(word_to_vec.vector_size)

    for i, val in enumerate(prompt_vector):
        row[f"prompt_vector_{i}"] = val
    return row


def vectorized_answers(row):
    """Embed all answers of this row into one fixed-size vector.

    For every pair under ``raw_data[row.name]`` the answer (element 1) is
    split on whitespace and the embeddings of the words known to
    ``word_to_vec`` are averaged. The per-answer averages are then averaged
    again, so every answer with at least one known word contributes equally.
    If no answer contains a known word the result is the zero vector.

    Args:
        row: Object exposing ``.name`` (the key into ``raw_data``) and item
            assignment, e.g. a pandas Series handed over by ``DataFrame.apply``.

    Returns:
        The same ``row`` with ``answer_vector_0`` .. ``answer_vector_{d-1}``
        set, where ``d`` is ``word_to_vec.vector_size``.
    """
    per_answer_means = []

    for each_pair in raw_data[row.name]:
        word_vectors = [
            word_to_vec[word]
            for word in each_pair[1].split()
            if word in word_to_vec
        ]
        if word_vectors:
            # Average the word vectors along the columns (axis=0).
            per_answer_means.append(np.mean(word_vectors, axis=0))

    if per_answer_means:
        # Aggregate over ALL answers rather than keeping only the last one.
        answer_vector = np.mean(per_answer_means, axis=0)
    else:
        answer_vector = np.zeros(word_to_vec.vector_size)

    for i, val in enumerate(answer_vector):
        row[f"answer_vector_{i}"] = val
    return row


# Row processing
def our_super_great_row_processor(row):
    """Run every feature function on ``row`` and return the enriched row.

    The functions are applied in a fixed order: question matching, length and
    count statistics, prompt embedding, answer embedding. Each one receives
    the row returned by the previous one.

    Args:
        row: A row as handed over by ``DataFrame.apply(..., axis=1)``.

    Returns:
        The row after all feature functions have filled in their columns.
    """
    row = question_matching(row)
    row = length_and_count(row)
    row = vectorized_prompts(row)
    # Without this call the answer_vector_* columns are never populated.
    row = vectorized_answers(row)
    return row

In [99]:
# Column layout of the feature table: one column per embedding dimension for
# prompts and answers, the eight question-match counters, the length/count
# statistics, and finally the target column 'grade'.
columns = [f"prompt_vector_{i}" for i in range(word_to_vec.vector_size)]
columns += [f"answer_vector_{i}" for i in range(word_to_vec.vector_size)]
columns += [f"question_match_{i}" for i in range(8)]
columns += ["pair_count", "avg_prompt_length", "avg_answer_length", "grade"]

# One (initially empty) row per key in raw_data.
dataframe = pd.DataFrame(index=list(raw_data), columns=columns)

# DataFrame.apply returns a NEW frame built from the rows the function
# returns; mutating the row passed in is not a supported way to update the
# original frame, so keep the returned result.
dataframe = dataframe.apply(our_super_great_row_processor, axis=1)

In [100]:
# Attach the grade from scores.csv to the matching row of the feature table.
# The key is read from the second CSV column and the grade from the third.
# newline='' is what the csv module expects for files passed to csv.reader.
with open("./materials/scores.csv", newline='') as grades_fd:
    grades_csv_reader = csv.reader(grades_fd)
    next(grades_csv_reader, None)  # skip the header row

    for row in grades_csv_reader:
        if len(row) < 3:
            # Blank or truncated line: nothing to read.
            continue
        key = row[1].strip()
        if key not in dataframe.index:
            # .at would otherwise append a new row with no features at all.
            continue
        grade = float(row[2].strip())
        dataframe.at[key, 'grade'] = grade

In [101]:
# Rows that never received a grade cannot serve as supervised examples, and a
# NaN label cannot be cast to a meaningful number, so drop them first.
graded = dataframe.dropna(subset=['grade'])

# 80/20 split with a fixed seed so the partition is reproducible.
train_data, test_data, train_labels, test_labels = train_test_split(
    graded.drop(columns=['grade']), graded['grade'], test_size=0.2, random_state=42
)

train_data = np.asarray(train_data).astype(np.float32)
test_data = np.asarray(test_data).astype(np.float32)

# Grades are parsed as floats and the model is trained with a squared-error
# loss, so keep them as floats; an integer cast would truncate fractions.
train_labels = np.asarray(train_labels).astype(np.float32)
test_labels = np.asarray(test_labels).astype(np.float32)

In [104]:
# Fully connected regression network trained with mean squared error.
# The input width is taken from the feature matrix so it stays in sync with
# the feature columns instead of being hard-coded.
model = Sequential()
# NOTE(review): this first layer has no activation, i.e. it is purely linear —
# confirm that is intended.
model.add(Dense(10000, input_shape=(train_data.shape[1],)))
model.add(Dense(10000, activation='tanh'))
model.add(Dense(10000, activation='tanh'))
# Linear output: a tanh here would confine predictions to [-1, 1], which
# cannot represent the unscaled grade values used as targets.
model.add(Dense(1, activation='linear'))
model.compile(optimizer='sgd', loss='mean_squared_error', metrics=['mean_squared_error'])
model.summary()

Model: "sequential_26"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 dense_76 (Dense)            (None, 10000)             6120000   
                                                                 
 dense_77 (Dense)            (None, 10000)             100010000 
                                                                 
 dense_78 (Dense)            (None, 10000)             100010000 
                                                                 
 dense_79 (Dense)            (None, 1)                 10001     
                                                                 
Total params: 206150001 (786.40 MB)
Trainable params: 206150001 (786.40 MB)
Non-trainable params: 0 (0.00 Byte)
_________________________________________________________________


In [None]:
# Fit the network on the training split for 50 passes over the data.
model.fit(x=train_data, y=train_labels, epochs=50)