# Coding Question Difficulty Prediction

This notebook predicts how difficult competitive programming questions are to solve, using labelled data from [codeforces.com](http://codeforces.com/).

It classifies problems into three difficulty levels (Easy, Medium, and Hard) using a dataset of around 2000 questions, each containing the question text, the input specification, and the output specification.

## Imports

In [44]:
import numpy as np
from keras.models import Sequential
from keras.layers import Dense
from keras.layers import LSTM, Dropout
from keras.layers.embeddings import Embedding
from keras.preprocessing import sequence
import keras.utils

import nltk
import glob
import os.path
import pprint

## Data Processing

In [45]:
def tokenize_and_lower(string):
    '''
    Tokenizes `string` with NLTK and returns the tokens joined by single
    spaces, with the whole result lower-cased.
    '''
    tokens = nltk.word_tokenize(string)
    joined = " ".join(tokens)
    return joined.lower()

In [46]:
def load_data():
    '''
    Reads the complexity labels and all question files and returns four
    parallel lists:
    q_text, input, output, label

    The labels come from "questions-complexity.csv" (first column is the
    question id, second-to-last column is the label). The question files are
    read from "./questions/*.txt" in sorted filename order; each file is split
    on blank lines and sections 2, 3 and 4 are taken as the question text, the
    input specification and the output specification.

    Raises KeyError if a question file has no entry in the CSV and IndexError
    if a question file has fewer than five blank-line-separated sections.
    '''

    complexities = dict()
    # `with` guarantees the handle is closed even if parsing fails.
    with open("questions-complexity.csv", encoding='utf-8') as complexity_file:
        complexity_file.readline()  # skip the header row

        for line in complexity_file:
            fields = line.strip().split(",")
            if len(fields) < 2:
                # Blank or malformed line: there is no second-to-last column.
                continue
            complexities[fields[0]] = fields[-2]

    question_files = sorted(glob.glob("./questions/*.txt"))
    questions = []
    inputs = []
    outputs = []
    labels = []

    for f in question_files:
        with open(f, encoding='utf-8') as handle:
            text = handle.read()

        text_split = text.split("\n\n")
        question = text_split[2]
        input_text = text_split[3]
        output_text = text_split[4]

        # Removes 'Input' and 'Output' prefixes (only when actually present,
        # so a section without the heading does not lose its first characters).
        if input_text.startswith('Input'):
            input_text = input_text[len('Input'):]
        if output_text.startswith('Output'):
            output_text = output_text[len('Output'):]

        questions.append(tokenize_and_lower(question))
        inputs.append(tokenize_and_lower(input_text))
        outputs.append(tokenize_and_lower(output_text))

        # splitext removes only the extension; str.strip(".txt") would also
        # eat leading/trailing 't', 'x' and '.' characters of the id itself.
        question_id = os.path.splitext(os.path.basename(f))[0]
        labels.append(complexities[question_id])

    return questions, inputs, outputs, labels

In [47]:
def populate_word_frequencies(word_freq, text):
    '''
    Adds the token counts of `text` to the dict `word_freq` (token -> count).

    `text` is tokenized with NLTK. `word_freq` is updated in place and also
    returned. The first occurrence of a token counts as 1.
    '''
    for token in nltk.word_tokenize(text):
        word_freq[token] = word_freq.get(token, 0) + 1
    return word_freq

In [48]:
def create_vocab(questions, inputs, outputs):
    '''
    Builds the vocabulary from the question, input and output texts
    (the lists returned from load_data).

    Returns a tuple (top_words, vocab_size): the list of distinct tokens
    sorted in descending order of frequency, and the length of that list.
    '''
    counts = {}
    for corpus in (questions, inputs, outputs):
        for text in corpus:
            populate_word_frequencies(counts, text)

    ranked = sorted(counts, key=counts.get, reverse=True)
    return ranked, len(ranked)

In [49]:
def get_frequency_vector(text, vocab):
    '''
    Converts `text` into a list of vocabulary indices, one per NLTK token.

    `vocab` is either a sequence of words, in which case a word's index is its
    (first) position in the sequence, or a dict mapping word -> index.
    Despite the name, the result holds vocabulary positions, not counts.

    Raises ValueError if a token is not in the vocabulary.
    '''
    if isinstance(vocab, dict):
        index_of = vocab
    else:
        # Build the lookup once per call instead of scanning the list for
        # every token. setdefault keeps the first position, like list.index.
        index_of = {}
        for position, word in enumerate(vocab):
            index_of.setdefault(word, position)

    vector = []
    for token in nltk.word_tokenize(text):
        if token not in index_of:
            raise ValueError("%r is not in vocab" % (token,))
        vector.append(index_of[token])
    return vector

In [50]:
def vectorize_data(questions, labels, vocab, inputs=None, outputs=None):
    '''
    Turns the texts into lists of vocabulary indices and the labels into a
    one-hot matrix.

    questions, inputs, outputs: lists of texts (as returned from load_data).
    labels: list of "Easy" / "Medium" / "Hard" (already encoded 0 / 1 / 2 is
        accepted as well).
    vocab: vocabulary as accepted by get_frequency_vector.

    `inputs` and `outputs` are optional only for backward compatibility: when
    omitted, the notebook-level variables of the same name are used, which is
    what this function always did. Prefer passing them explicitly.

    Returns (question_vectors, input_vectors, output_vectors, labels_vector).
    Raises ValueError for a label that is not one of the three classes.
    '''
    if inputs is None:
        inputs = globals()["inputs"]
    if outputs is None:
        outputs = globals()["outputs"]

    question_vectors = [get_frequency_vector(ques, vocab) for ques in questions]
    input_vectors = [get_frequency_vector(input_text, vocab) for input_text in inputs]
    output_vectors = [get_frequency_vector(output_text, vocab) for output_text in outputs]

    label_ids = {"Easy": 0, "Medium": 1, "Hard": 2}
    encoded_labels = []
    for label in labels:
        if isinstance(label, str):
            if label not in label_ids:
                raise ValueError("Unknown difficulty label: %r" % (label,))
            encoded_labels.append(label_ids[label])
        elif label in (0, 1, 2):
            encoded_labels.append(label)
        else:
            raise ValueError("Unknown difficulty label: %r" % (label,))

    # Fix the width to the three classes so the matrix always matches the
    # 3-unit softmax layer, even if one class is missing from `labels`.
    labels_vector = keras.utils.to_categorical(encoded_labels, num_classes=len(label_ids))

    return question_vectors, input_vectors, output_vectors, labels_vector

In [51]:
# Load the raw texts, build the vocabulary and convert everything to
# index sequences and one-hot labels.
questions, inputs, outputs, labels = load_data()
vocab, vocab_size = create_vocab(questions, inputs, outputs)
question_vectors, input_vectors, output_vectors, labels = vectorize_data(questions, labels, vocab)

# The first `n_train` samples (in sorted-filename order, no shuffling) are
# used for training; the remainder is held out for testing.
n_train = 1700

question_vectors_train = question_vectors[:n_train]
input_vectors_train = input_vectors[:n_train]
output_vectors_train = output_vectors[:n_train]
labels_train = labels[:n_train]

question_vectors_test = question_vectors[n_train:]
input_vectors_test = input_vectors[n_train:]
output_vectors_test = output_vectors[n_train:]
labels_test = labels[n_train:]



## Config

In [52]:
# Lengths passed as `maxlen` to pad_sequences for the question, input and
# output token sequences respectively.
max_question_length = 300
max_input_length = 50
max_output_length = 50
# Second argument of the Embedding layer (size of each word vector).
embedding_vector_length = 32
# Number of epochs passed to model.fit.
n_epochs = 5

## Training

In [74]:
def build_model(question_vectors_train, input_vectors_train, output_vectors_train, labels_train, vocab_size, max_question_length, max_input_length, max_output_length, embedding_vector_length, n_epochs):
    '''
    Builds, trains and returns the Embedding -> LSTM(10) -> Dense(3, softmax)
    classifier.

    question_vectors_train, input_vectors_train, output_vectors_train: lists
        of index sequences; each is padded to its max_*_length and the three
        are concatenated along axis 1 into one sequence per sample.
    labels_train: training targets passed to model.fit (one-hot rows for the
        3-way softmax).
    vocab_size: first argument of the Embedding layer.
    embedding_vector_length: second argument of the Embedding layer.
    n_epochs: number of training epochs.

    Prints the model summary after training and returns the fitted model.
    '''
    # NOTE(review): pad_sequences pads with 0 by default, and 0 also appears
    # to be the vocabulary index of the most frequent word, so padding is
    # indistinguishable from that word - confirm whether that is intended.
    question_vectors_sequence = sequence.pad_sequences(question_vectors_train, maxlen=max_question_length)
    input_vectors_sequence = sequence.pad_sequences(input_vectors_train, maxlen=max_input_length)
    output_vectors_sequence = sequence.pad_sequences(output_vectors_train, maxlen=max_output_length)
    X = np.concatenate([question_vectors_sequence, input_vectors_sequence, output_vectors_sequence], axis=1)

    model = Sequential()
    model.add(Embedding(vocab_size, embedding_vector_length))
    model.add(LSTM(10))
    model.add(Dense(3, activation = "softmax"))

    model.compile(loss="categorical_crossentropy", optimizer="adam", metrics=["accuracy"])
    model.fit(X, labels_train, epochs=n_epochs, batch_size=64, verbose=1)

    # model.summary() prints the table itself and returns None, so it must
    # not be passed to print().
    print('SUMMARY:')
    model.summary()
    return model

In [70]:
# The sequences have different lengths; dtype=object is required for NumPy to
# build a 1-D array of lists instead of raising on the ragged input.
np.array(question_vectors_train, dtype=object).shape

(1700,)

In [None]:
# Train the classifier on the training split with the settings from the Config section.
model = build_model(
    question_vectors_train=question_vectors_train,
    input_vectors_train=input_vectors_train,
    output_vectors_train=output_vectors_train,
    labels_train=labels_train,
    vocab_size=vocab_size,
    max_question_length=max_question_length,
    max_input_length=max_input_length,
    max_output_length=max_output_length,
    embedding_vector_length=embedding_vector_length,
    n_epochs=n_epochs,
)

Epoch 1/5
Epoch 2/5
Epoch 3/5

## Testing

In [85]:
def test_model(model, question_vectors_test, input_vectors_test, output_vectors_test, labels_test, max_question_length, max_input_length, max_output_length):
    '''
    Evaluates `model` on the test split and prints the accuracy as a percentage.

    The three groups of index sequences are padded to their respective
    max_*_length and concatenated along axis 1, the same way as for training.
    The accuracy is read from position 1 of the list returned by model.evaluate.
    '''
    padded_parts = [
        sequence.pad_sequences(vectors, maxlen=limit)
        for vectors, limit in (
            (question_vectors_test, max_question_length),
            (input_vectors_test, max_input_length),
            (output_vectors_test, max_output_length),
        )
    ]
    features = np.concatenate(padded_parts, axis=1)

    scores = model.evaluate(features, labels_test)
    accuracy_percent = scores[1]*100
    print("Accuracy: %.2f%%" % (accuracy_percent))

In [None]:
# Evaluate the trained model on the held-out test split.
test_model(
    model=model,
    question_vectors_test=question_vectors_test,
    input_vectors_test=input_vectors_test,
    output_vectors_test=output_vectors_test,
    labels_test=labels_test,
    max_question_length=max_question_length,
    max_input_length=max_input_length,
    max_output_length=max_output_length,
)