# Coding Question Difficulty Prediction

This notebook is a project with the aim of predicting difficulty in answering competitive coding questions using labelled data from [codeforces.com](http://codeforces.com/).

It classifies problems into three levels of difficulty- Easy, Medium, and Hard from a dataset of around 2000 questions containing - The question text, the input, and the output specifications

## Imports

In [8]:
import numpy as np
from keras.models import Sequential
from keras.layers import Dense
from keras.layers import LSTM, Dropout
from keras.layers.embeddings import Embedding
from keras.preprocessing import sequence
import keras.utils

import nltk
import json
import glob
import os.path
import pprint

In [9]:
GLOVE_PATH = 'glove_50_glove_100'

## Data Processing

In [10]:
def tokenize_and_lower(string):
    return " ".join(nltk.word_tokenize(string)).lower()

In [11]:
def load_data():
    '''
    reads all question files and returns - 
    q_text, input, output, label
    '''

    complexities = dict()
    complexity_file = open("questions-complexity.csv", encoding='utf-8')
    complexity_file.readline()

    for line in complexity_file:
        line = line.strip().split(",")
        complexities[line[0]] = line[-2]
    complexity_file.close()


    question_files = sorted(glob.glob("./questions/*.txt"))
    questions = []
    inputs = []
    outputs = []
    labels = []

    for f in question_files:
        handle = open(f, encoding='utf-8')
        text = handle.read()
        handle.close()
        text_split = text.split("\n\n")
        question = text_split[2]	### + [3] and [4] for Input and Output requirement text
        input_text = text_split[3]
        output_text = text_split[4]

        # Removes 'Input' and 'Output' prefixes
        input_text = input_text[len('Input'):]
        output_text = output_text[len('Output'):]

        question = tokenize_and_lower(question)
        input_text = tokenize_and_lower(input_text)
        output_text = tokenize_and_lower(output_text)

        questions.append(question)
        inputs.append(input_text)
        outputs.append(output_text)
        labels.append(complexities[os.path.basename(f).strip(".txt")])

    return questions, inputs, outputs, labels

## Feature Extraction

In [41]:
def get_categories(labels):
    labels = [0 if l == "Easy" else 1 if l == "Medium" else 2 if l == "Hard" else l for l in labels]
    labels_vector = keras.utils.to_categorical(labels)
    return labels

### Word Frequency feature extraction

In [12]:
def populate_word_frequencies(word_freq, text):
    for token in nltk.word_tokenize(text):
        if token not in word_freq:
            word_freq[token] = 0
        else:
            word_freq[token] += 1
    return word_freq

In [13]:
def create_vocab(questions, inputs, outputs):
    '''
    takes list of questions (returned from load_data) and returns a list of words sorted in desending order of frequency
    '''
    word_freq = dict()
    for ques in questions:
        populate_word_frequencies(word_freq, ques)
            
    for input_text in inputs:
        populate_word_frequencies(word_freq, input_text)
        
    for output_text in outputs:
        populate_word_frequencies(word_freq, output_text)
        
    top_words = sorted(word_freq, key = lambda w : word_freq[w], reverse = True)
    vocab_size = len(top_words)

    return top_words, vocab_size

In [14]:
def get_frequency_vector(text, vocab):
    vector = []
    for token in nltk.word_tokenize(text):
        vector.append(vocab.index(token))
    return vector

In [133]:
def get_frequency_vectors(questions, inputs, outputs, vocab):
    question_vectors = []
    input_vectors = []
    output_vectors = []
    
    for ques in questions:
        question_vectors.append(get_frequency_vector(ques, vocab))

    for input_text in inputs:
        input_vectors.append(get_frequency_vector(input_text, vocab))
        
    for output_text in outputs:
        output_vectors.append(get_frequency_vector(output_text, vocab))
        
    return question_vectors, input_vectors, output_vectors

### Word Embedding Feature Extraction

In [134]:
vecs = np.load("glove_50_glove_100/glove_vectors_100d.npy")
vecs50 = np.load("glove_50_glove_100/glove_vectors_50d.npy")

In [135]:
with open('glove_50_glove_100/words.txt', encoding='utf-8') as f:
    content = f.readlines()
words = [x.strip() for x in content] 

In [136]:
wordidx = json.load(open('glove_50_glove_100/wordsidx.txt'))

In [137]:
def get_embedding_vector(text, embedding_size, vecs, vecs50, wordidx):
    vector = []
    for token in nltk.word_tokenize(text.lower()):
        if embedding_size == '50':
            try:
                vector.append(vecs50[wordidx[token]])
            except:
                print('No vector present:', token)
        elif embedding_size == '100':
            try:
                vector.append(vecs[wordidx[token]])
            except:
                print('NO vector present:', token)
    #shouldn't be doing this
    return np.concatenate(vector) if vector else np.array([0.0])

In [138]:
x = get_embedding_vector('Hi how are you ?', '50', vecs, vecs50, wordidx)
len(x)
y = np.array(0)
y

array(0)

In [139]:
def get_embedding_vectors(questions, inputs, outputs, vecs, vecs50, wordidx, embedding_size='50'):
    question_vectors = []
    input_vectors = []
    output_vectors = []
    
    for ques in questions:
        question_vectors.append(get_embedding_vector(ques, embedding_size, vecs, vecs50, wordidx))
        
    for input_text in inputs:
        input_vectors.append(get_embedding_vector(input_text, embedding_size, vecs, vecs50, wordidx))
        
    for output_text in outputs:
        output_vectors.append(get_embedding_vector(output_text, embedding_size, vecs, vecs50, wordidx))
        
#     print('q:', question_vectors[0])
    return question_vectors, input_vectors, output_vectors

## Feature Selection

In [140]:
algorithm = 'embeddings'
algorithm = 'frequencies'

In [141]:
questions, inputs, outputs, labels = load_data()
labels = get_categories(labels)
vocab, vocab_size = create_vocab(questions, inputs, outputs)
if algorithm == 'frequencies':
#     vocab, vocab_size = create_vocab(questions, inputs, outputs)
    question_vectors, input_vectors, output_vectors = get_frequency_vectors(questions, inputs, outputs, vocab)
elif algorithm == 'embeddings':
    question_vectors, input_vectors, output_vectors = get_embedding_vectors(questions, inputs, outputs, vecs, vecs50, wordidx, embedding_size='50')
    
question_vectors_train, input_vectors_train, output_vectors_train, labels_train = question_vectors[:1700], input_vectors[:1700], output_vectors[:1700], labels[:1700]
question_vectors_test, input_vectors_test, output_vectors_test, labels_test = question_vectors[1700:], input_vectors[1700:], output_vectors[1700:], labels[1700:]


In [142]:
len(question_vectors_train[1])

77

## Config

In [143]:
max_question_length = 300
max_input_length = 50
max_output_length = 50
embedding_vector_length = 32
n_epochs = 5

## Training

In [144]:
def build_model(question_vectors_train, input_vectors_train, output_vectors_train, labels_train, vocab_size, max_question_length, max_input_length, max_output_length, embedding_vector_length, n_epochs):
    
    question_vectors_sequence = sequence.pad_sequences(question_vectors_train, maxlen=max_question_length)
    input_vectors_sequence = sequence.pad_sequences(input_vectors_train, maxlen=max_input_length)
    output_vectors_sequence = sequence.pad_sequences(output_vectors_train, maxlen=max_output_length)
    
    model = Sequential()
    model.add(Embedding(vocab_size, embedding_vector_length))

    model.add(LSTM(50))

    model.add(Dense(3, activation = "softmax"))

    model.compile(loss="categorical_crossentropy", optimizer="adam", metrics=["accuracy"])

    print('shapes', question_vectors_sequence.shape, input_vectors_sequence.shape, output_vectors_sequence.shape)
    print('q:', question_vectors_sequence[0])
    print('i:', input_vectors_sequence[0])
    print('o:', output_vectors_sequence[0])
    X = np.concatenate([question_vectors_sequence, input_vectors_sequence, output_vectors_sequence], axis=1)
    model.fit(X, labels_train, epochs=n_epochs, batch_size=64, verbose=1)

    print('SUMMARY:', model.summary())
    return model

In [146]:
model = build_model(question_vectors_train, input_vectors_train, output_vectors_train, labels_train, vocab_size, max_question_length, max_input_length, max_output_length, embedding_vector_length, n_epochs)

shapes (1700, 300) (1700, 50) (1700, 50)
q: [   0    0    0    0    0    0    0    0    0    0    0    0    0    0
    0    0    0    0    0    0    0    0    0    0    0    0    0    0
    0    0    0    0    0    0    0    0    0    0    0    0    0    0
    0    0    0    0    0    0    0    0    0    0    0    0    0    0
    0    0    0    0    0    0    0    0    0    0    0    0    0    0
    0    0    0    0    0    0    0    0    0    0    0    0    0    0
    0    0    0    0    0    0    0    0    0    0    0    0    0    0
    0    0    0    0    0    0    0    0    0    0    0    0    0    0
    0    0    0    0    0    0    0    0    0    0    0    0    0    0
    0    0    0    0    0    0    0    0    0    0    0    0    0    0
    0    0    0    0    0    0    0    0    0    0    0    0    0    0
    0    0    0    0    0    0    0    0    0    0    0    0    0    0
    0    0    0    0    0    0    0    0    0    0    0    0    0    0
    0    0    0    0    0    0   

AttributeError: 'int' object has no attribute 'ndim'

## Testing

In [20]:
def test_model(model, question_vectors_test, input_vectors_test, output_vectors_test, labels_test, max_question_length, max_input_length, max_output_length):
    question_vectors_sequence = sequence.pad_sequences(question_vectors_test, maxlen=max_question_length)
    input_vectors_sequence = sequence.pad_sequences(input_vectors_test, maxlen=max_input_length)
    output_vectors_sequence = sequence.pad_sequences(output_vectors_test, maxlen=max_output_length)

    X = np.concatenate([question_vectors_sequence, input_vectors_sequence, output_vectors_sequence], axis=1)
    scores = model.evaluate(X, labels_test)
    print("Accuracy: %.2f%%" % (scores[1]*100))

In [21]:
test_model(model, question_vectors_test, input_vectors_test, output_vectors_test, labels_test, max_question_length, max_input_length, max_output_length)

Accuracy: 43.54%
