# Automatic Essay Scoring

In [14]:
import numpy as np
import pandas as pd
import gensim 
import nltk
from nltk.corpus import stopwords
from sklearn.linear_model import LogisticRegression
from sklearn.cross_validation import train_test_split

stopwords = set(stopwords.words('english'))

First, we can set up the dataframes and explore the data. We will drop columns that we don't need and those with NaN values. There was one row without a domain1_score, which I removed. Some essays also contained domain2 or domain3 scores, but since not all the data has that field, I will ignore that for now.

In [15]:
data = pd.ExcelFile('./data/training_set_rel3.xls')
df = data.parse("training_set")
df = df.drop('rater1_domain1', 1)
df = df.drop('rater2_domain1', 1)
df = df.dropna(axis = 1)

df.head()

Unnamed: 0,essay_id,essay_set,essay,domain1_score
0,1,1,"Dear local newspaper, I think effects computer...",8
1,2,1,"Dear @CAPS1 @CAPS2, I believe that using compu...",9
2,3,1,"Dear, @CAPS1 @CAPS2 @CAPS3 More and more peopl...",7
3,4,1,"Dear Local Newspaper, @CAPS1 I have found that...",10
4,5,1,"Dear @LOCATION1, I know having computers has a...",8


In [16]:
essays = df['essay']
essays[0]

"Dear local newspaper, I think effects computers have on people are great learning skills/affects because they give us time to chat with friends/new people, helps us learn about the globe(astronomy) and keeps us out of troble! Thing about! Dont you think so? How would you feel if your teenager is always on the phone with friends! Do you ever time to chat with your friends or buisness partner about things. Well now - there's a new way to chat the computer, theirs plenty of sites on the internet to do so: @ORGANIZATION1, @ORGANIZATION2, @CAPS1, facebook, myspace ect. Just think now while your setting up meeting with your boss on the computer, your teenager is having fun on the phone not rushing to get off cause you want to use it. How did you learn about other countrys/states outside of yours? Well I have by computer/internet, it's a new way to learn about what going on in our time! You might think your child spends a lot of time on the computer, but ask them so question about the econom

Here, we can see that personally identifiying information has been replaces with @NER where NER is a NER tag. We can remove these symbols to avoid interferring with the spell checking counts.

In [42]:
def replace_punc(text):
    return text.replace("@", "").replace("%", "")

Features #TODO write description of features

In [8]:
# Setup pre-trained word2vec model
model = gensim.models.KeyedVectors.load_word2vec_format('data/GoogleNews-vectors-negative300.bin.gz', binary=True)
#model = gensim.models.KeyedVectors.load_word2vec_format('data/GoogleNews-vectors-negative300.bin.gz', binary=True, limit=500000)

In [26]:
def essay_2_vec(essay):
    words = [w for w in essay if not w in stopwords]
    words = [w for w in essay if w.isalpha()]
    vector = []
    for w in words:
        try:
            vector.append(model[w])
        except:
            continue
    vector = np.array(vector)
    v = vector.sum(axis=0)
    return v / np.sqrt((v ** 2).sum())


num_rows = df.shape[0]
essays = df['essay'].values

#initialize dataframe columns
df['word_count'] = np.nan 
df['sentence_count'] = np.nan
df['avg_word_length'] = np.nan 
df['num_exclamation_marks'] = np.nan
df['num_question_marks'] = np.nan
df['num_stopwords'] = np.nan
df['word2vec'] = np.nan

# TODO
#df['num_advanced_words'] = np.nan
#df['spelling_errors'] = np.nan

for i in range(num_rows):
    
    # Turn essay into list of words
    text = essays[i].split(" ")
    
    # Set word count
    df.set_value(i,'word_count', len(text))
    
    # Sentence count
    df.set_value(i, 'sentence_count', len(nltk.tokenize.sent_tokenize(essays[i])))
    
    # Average word length
    word_len = sum(len(word) for word in text) / len(text)
    df.set_value(i, 'avg_word_length', word_len)
    
    # Number of exclamation marks
    df.set_value(i, "num_exclamation_marks", sum(word.count("!") for word in essays[i]))
    
    # Number of question marks
    df.set_value(i, "num_question_marks", sum(word.count("?") for word in essays[i]))
    
    # Number of stop words
    df.set_value(i, "num_stopwords", sum([1 for word in text if word.lower() in stopwords]))

    # Word2Vec conversion
    x = essay_2_vec(essays[i])
    word2vec_avg = sum(list(x)) / len(list(x))
    df.set_value(i,'word2vec', word2vec_avg)
    



### Text data

We need to convert the essay strings into some numerical form. We could use Word2Vec, a TF-IDF Vectorizer, etc.

In [27]:
df.head()

Unnamed: 0,essay_id,essay_set,essay,domain1_score,word_count,sentence_count,avg_word_length,num_exclamation_marks,num_question_marks,num_stopwords,word2vec
0,1,1,"Dear local newspaper, I think effects computer...",8,338.0,16.0,4.550296,4.0,2.0,168.0,-0.003144
1,2,1,"Dear @CAPS1 @CAPS2, I believe that using compu...",9,419.0,20.0,4.463007,1.0,1.0,189.0,-0.002944
2,3,1,"Dear, @CAPS1 @CAPS2 @CAPS3 More and more peopl...",7,279.0,14.0,4.526882,0.0,0.0,140.0,-0.003154
3,4,1,"Dear Local Newspaper, @CAPS1 I have found that...",10,524.0,27.0,5.041985,2.0,1.0,222.0,-0.003218
4,5,1,"Dear @LOCATION1, I know having computers has a...",8,465.0,30.0,4.526882,0.0,0.0,236.0,-0.003019


We have added all the features, so now we can start to explore correlations between features and scores (to ensure we are making correct assumptions and to discover potential new features), and perform the logistic regression.

In [37]:
x = df[['word_count','sentence_count','avg_word_length','num_exclamation_marks', 'num_question_marks', 'num_stopwords', 'word2vec']]
y = df['domain1_score']
X_train, X_test, y_train, y_test = train_test_split(x, y, test_size=0.3, random_state=0)
logistic_reg = LogisticRegression()
logistic_reg.fit(X_train, y_train)

LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=1,
          penalty='l2', random_state=None, solver='liblinear', tol=0.0001,
          verbose=0, warm_start=False)

In [38]:
predictions = logistic_reg.predict(X_test)
print('Logistic regression classifier accuracy: {:.2f}'.format(logistic_reg.score(X_test, y_test)))

Logistic regression classifier accuracy: 0.42
