In [61]:
import numpy as np
import pandas as pd
import nltk
from gensim.models import Word2Vec
from gensim.utils import simple_preprocess
from nltk.corpus import stopwords
# import spacy
from sklearn.decomposition import PCA
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\SANKARASUBRAMANIYAN\AppData\Roaming\nltk_data
[nltk_data]     ...
[nltk_data]   Package stopwords is already up-to-date!


True

# Utility Functions

In [62]:
'''
    input  :  word list of a sentence
    output :  processed word list of the sentence
'''

def remove_stopwords(sentence): 
    processed = []
    for word in sentence:
        if word not in stopwords.words('english'):
            processed.append(word)
    return processed

In [63]:
def doc_to_vec(vect, word_list, size, model):
    # Performing Vector Addition
    processed = simple_preprocess(vect.lower())
    docVector = np.zeros(size)
    for word in processed:
        if word in word_list:
            docVector = np.add(docVector, model.wv.get_vector(word))
    return docVector

In [64]:
def cosine_similarity(A, B):
    '''
    Input:
        A: a numpy array which corresponds to a word vector
        B: A numpy array which corresponds to a word vector
    Output:
        cos: numerical number representing the cosine similarity between A and B.
    '''
    
    dot = np.dot(A,B)
    norma = np.sqrt(np.dot(A,A))
    normb = np.sqrt(np.dot(B,B))
    cos = dot / (norma*normb)

    return cos

# Word2Vec

In [65]:
df = pd.read_csv('../OOPS DataSet Capstone - Sheet1.csv')
sentences = []
for index, value in df['text'].items():
    lines = value.split('.')
    for l in lines:
        line = l.strip()
        line = list(set(simple_preprocess(line)))
        sentences.append(line)
sentences = [ele for ele in sentences if ele != []]
sentences = [remove_stopwords(ele) for ele in sentences]
processed_sentences = sentences

In [66]:
word2vec = Word2Vec(window=10,
    min_count=2,
    workers=4)
word2vec.build_vocab(processed_sentences, progress_per=1000)
word2vec.train(processed_sentences, total_examples=word2vec.corpus_count, epochs=word2vec.epochs)

word_list = word2vec.wv.vocab
word_list = list(word_list.keys())
X = []
for word in word_list:
    X.append(word2vec.wv.get_vector(word))

In [67]:
word2vec.wv.get_vector('object')

array([ 3.1603856e-03,  1.5921918e-03, -1.2394958e-03,  1.7883796e-03,
       -2.8283536e-03, -3.3068170e-03, -1.8937411e-03,  4.6930420e-03,
        5.5075935e-03, -2.2822230e-03, -3.8919998e-03, -4.0077786e-03,
       -4.1251937e-03,  4.4413013e-03, -1.2605322e-04,  1.1332654e-04,
        1.8688955e-03,  4.0708045e-03,  2.3878686e-04, -9.8954581e-05,
       -2.1594528e-03, -2.1043813e-04,  4.5751571e-03, -3.8466335e-03,
       -9.6808595e-04,  2.6526139e-03,  2.8365059e-03, -1.5282293e-03,
       -2.0992036e-03,  4.0526828e-03, -2.6974308e-03, -1.4509683e-03,
        2.3959980e-03,  1.8345255e-03,  7.0129480e-04,  1.5585661e-03,
        4.1427296e-03,  1.7751551e-03,  4.0892488e-04, -2.3791678e-03,
       -3.6442443e-04, -3.5762172e-03, -2.1894162e-03, -3.9240546e-03,
        4.3779477e-03, -3.3614188e-03,  1.1156620e-03,  4.0065576e-03,
        2.5993821e-03, -2.3393170e-03,  6.1229384e-04, -1.2450933e-03,
        3.4097352e-03, -1.6111290e-03, -3.4407666e-03,  1.2851272e-04,
      

In [68]:
def word2vec_similarity(answer_key, student_answer):
    VECTOR_SIZE = len(X[0])
    answer_key_vector = doc_to_vec(answer_key, word_list, VECTOR_SIZE, word2vec)
    student_answer_vector = doc_to_vec(student_answer, word_list, VECTOR_SIZE, word2vec)
    sim_score = cosine_similarity(answer_key_vector, student_answer_vector)*100
    return sim_score

# Doc2Vec

In [69]:
from gensim.models.doc2vec import Doc2Vec, TaggedDocument
from nltk.tokenize import word_tokenize
import nltk
nltk.download('punkt')

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\SANKARASUBRAMANIYAN\AppData\Roaming\nltk_data
[nltk_data]     ...
[nltk_data]   Package punkt is already up-to-date!


True

In [70]:
train_data = df['text'].values
topics = df['topic'].values

sentences = []
for index, value in df['text'].items():
    lines = value.split('.')
    for l in lines:
        line = l.strip()
        sentences.append(line)

train_data = [ele for ele in sentences if ele != [] and ele !='' and ele!=' ']

tagged_data = [TaggedDocument(words=word_tokenize(_d.lower()), tags=[str(i)]) for i, _d in enumerate(train_data)]

In [71]:
max_epochs = 100
vec_size = 300
alpha = 0.025

model = Doc2Vec(size=vec_size,
                alpha=alpha, 
                min_alpha=0.00025,
                min_count=2,
                negative=5,
                dm =1) # dm=1 means distributed memory PV-DM
  
model.build_vocab(tagged_data)

for epoch in range(max_epochs):
    # print('iteration {0}'.format(epoch))
    model.train(tagged_data,
                total_examples=model.corpus_count,
                epochs=model.iter)
    # decrease the learning rate
    model.alpha -= 0.0002
    # fix the learning rate, no decay
    model.min_alpha = model.alpha



In [72]:
def doc2vec_similarity(answer_key, student_answer):
    A = model.infer_vector(answer_key.split())
    B = model.infer_vector(student_answer.split())
    return cosine_similarity(A,B)*100

# Flask API

In [73]:
import flask
from flask import request, jsonify
import requests
import json

In [74]:
from flask_cors import CORS
app = flask.Flask(__name__)
CORS(app)

<flask_cors.extension.CORS at 0x1866f8cfec8>

In [75]:
@app.route('/', methods=['GET'])
def welcome():
    return "<h1>Auto Grading</h1> <br/> <p> This site is a prototype API for <h3>Capstone Design Project</h3> </p>"

In [76]:
@app.route('/grades/word2vec', methods=['GET', 'POST'])
def word2vec_grade():
    if request.method == 'GET':
        return "<h1>Grading API</h1> <br/>"  
    elif request.method == 'POST':
#         doc_key = request.form['doc_key']
#         doc_ans = request.form['doc_ans']
        json_data = request.get_json()
        doc_key = json_data['given_ans']
        doc_ans = json_data['student_ans']
        grade = word2vec_similarity(doc_key, doc_ans)
        response = {'grader':'word2vec','grade':grade}
        return json.dumps(response)

In [77]:
@app.route('/grades/doc2vec', methods=['GET', 'POST'])
def doc2vec():
    if request.method == 'GET':
        return "<h1>Grading API</h1> <br/>"  
    elif request.method == 'POST':
        json_data = request.get_json()
        doc_key = json_data['given_ans']
        doc_ans = json_data['student_ans']
        grade = doc2vec_similarity(doc_key, doc_ans)
        response = {'grader':'doc2vec','grade':grade}
        return json.dumps(response)

In [78]:
app.run(port=4458)

 * Serving Flask app "__main__" (lazy loading)
 * Environment: production
   Use a production WSGI server instead.
 * Debug mode: off


 * Running on http://127.0.0.1:4458/ (Press CTRL+C to quit)
127.0.0.1 - - [17/Dec/2021 19:37:46] "OPTIONS /grades/word2vec HTTP/1.1" 200 -
127.0.0.1 - - [17/Dec/2021 19:37:46] "POST /grades/word2vec HTTP/1.1" 200 -
127.0.0.1 - - [17/Dec/2021 19:37:50] "OPTIONS /grades/doc2vec HTTP/1.1" 200 -
127.0.0.1 - - [17/Dec/2021 19:37:50] "POST /grades/doc2vec HTTP/1.1" 200 -


In [40]:
word2vec_similarity(
    "Polymorphism is the method in an object-oriented programming language that performs different things as per the object which calls it", 
    "Polymorphism performs a single action in different ways as per the object")

62.28183609051862

In [None]:
list_of_words = "Polymorphism is the method in an object-oriented programming language that performs different things as per the object which calls it".split()
len(model.infer_vector(list_of_words))

In [None]:
list_of_words = "Polymorphism performs a single action in different ways as per the object".split()
len(model.infer_vector(list_of_words))

In [None]:
doc2vec_similarity("Class is user-defined data types that act as the blueprint for individual objects, attributes, and methods", 
    "A class is a blueprint for creating objects, providing initial values for state and implementations of behavior")

In [25]:
word2vec.wv.get_vector(word)

AttributeError: 'function' object has no attribute 'wv'

In [26]:
gensim.model__version__

NameError: name 'gensim' is not defined