# Automatic Essay Scoring

In [1]:
import numpy as np
import pandas as pd
import gensim 
from gensim.models.doc2vec import Doc2Vec
import nltk
from nltk.corpus import stopwords
from sklearn.linear_model import LogisticRegression
from sklearn.cross_validation import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import cohen_kappa_score

stopwords = set(stopwords.words('english'))



First, we can set up the dataframes and explore the data. We will drop columns that we don't need and those with NaN values. There was one row without a domain1_score, which I removed. Some essays also contained domain2 or domain3 scores, but since not all the data has that field, I will ignore that for now.

In [2]:
data = pd.ExcelFile('./data/training_set_rel3.xls')
df = data.parse("training_set")
df = df.drop('rater1_domain1', 1)
df = df.drop('rater2_domain1', 1)
df = df.dropna(axis = 1)

df

Unnamed: 0,essay_id,essay_set,essay,domain1_score
0,1,1,"Dear local newspaper, I think effects computer...",8
1,2,1,"Dear @CAPS1 @CAPS2, I believe that using compu...",9
2,3,1,"Dear, @CAPS1 @CAPS2 @CAPS3 More and more peopl...",7
3,4,1,"Dear Local Newspaper, @CAPS1 I have found that...",10
4,5,1,"Dear @LOCATION1, I know having computers has a...",8
5,6,1,"Dear @LOCATION1, I think that computers have a...",8
6,7,1,Did you know that more and more people these d...,10
7,8,1,@PERCENT1 of people agree that computers make ...,10
8,9,1,"Dear reader, @ORGANIZATION1 has had a dramatic...",9
9,10,1,In the @LOCATION1 we have the technology of a ...,9


In [3]:
# Normalize all scores since each essay set has a different range and standard deviation
df['normalized_score']=np.nan
df.groupby('essay_set')['domain1_score'].describe()

Unnamed: 0_level_0,count,mean,std,min,25%,50%,75%,max
essay_set,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
1,1783.0,8.528323,1.538565,2.0,8.0,8.0,10.0,12.0
2,1800.0,3.415556,0.774512,1.0,3.0,3.0,4.0,6.0
3,1726.0,1.848204,0.815157,0.0,1.0,2.0,2.0,3.0
4,1771.0,1.431395,0.940133,0.0,1.0,1.0,2.0,3.0
5,1805.0,2.408864,0.970821,0.0,2.0,2.0,3.0,4.0
6,1800.0,2.72,0.97063,0.0,2.0,3.0,3.0,4.0
7,1569.0,16.06246,4.58535,2.0,13.0,16.0,19.0,24.0
8,723.0,36.950207,5.753502,10.0,33.0,37.0,40.0,60.0


In [4]:
len(df)

12977

In [5]:
from sklearn import preprocessing

def normalized(df):
    max_range = df['domain1_score'].max()
    min_range = df['domain1_score'].min()
    df.normalized_score = (df['domain1_score'] - min_range) / (max_range - min_range)
    return df

# Normalize each essay set
set_1 = df[df['essay_set'].values == 1]
set_1 = normalized(set_1)

set_2 = df[df['essay_set'].values == 2]
set_2 = normalized(set_2)

set_3 = df[df['essay_set'].values == 3]
set_3 = normalized(set_3)

set_4 = df[df['essay_set'].values == 4]
set_4 = normalized(set_4)

set_5 = df[df['essay_set'].values == 5]
set_5 = normalized(set_5)

set_6 = df[df['essay_set'].values == 6]
set_6 = normalized(set_6)

set_7 = df[df['essay_set'].values == 7]
set_7 = normalized(set_7)

set_8 = df[df['essay_set'].values == 8]
set_8 = normalized(set_8)

df = pd.concat([set_1, set_2, set_3, set_4, set_5, set_6, set_7, set_8])

df.groupby('essay_set')['domain1_score'].describe()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  self[name] = value


Unnamed: 0_level_0,count,mean,std,min,25%,50%,75%,max
essay_set,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
1,1783.0,8.528323,1.538565,2.0,8.0,8.0,10.0,12.0
2,1800.0,3.415556,0.774512,1.0,3.0,3.0,4.0,6.0
3,1726.0,1.848204,0.815157,0.0,1.0,2.0,2.0,3.0
4,1771.0,1.431395,0.940133,0.0,1.0,1.0,2.0,3.0
5,1805.0,2.408864,0.970821,0.0,2.0,2.0,3.0,4.0
6,1800.0,2.72,0.97063,0.0,2.0,3.0,3.0,4.0
7,1569.0,16.06246,4.58535,2.0,13.0,16.0,19.0,24.0
8,723.0,36.950207,5.753502,10.0,33.0,37.0,40.0,60.0


In [6]:
df

Unnamed: 0,essay_id,essay_set,essay,domain1_score,normalized_score
0,1,1,"Dear local newspaper, I think effects computer...",8,0.60
1,2,1,"Dear @CAPS1 @CAPS2, I believe that using compu...",9,0.70
2,3,1,"Dear, @CAPS1 @CAPS2 @CAPS3 More and more peopl...",7,0.50
3,4,1,"Dear Local Newspaper, @CAPS1 I have found that...",10,0.80
4,5,1,"Dear @LOCATION1, I know having computers has a...",8,0.60
5,6,1,"Dear @LOCATION1, I think that computers have a...",8,0.60
6,7,1,Did you know that more and more people these d...,10,0.80
7,8,1,@PERCENT1 of people agree that computers make ...,10,0.80
8,9,1,"Dear reader, @ORGANIZATION1 has had a dramatic...",9,0.70
9,10,1,In the @LOCATION1 we have the technology of a ...,9,0.70


In [7]:
essays = df['essay']
essays[0]

"Dear local newspaper, I think effects computers have on people are great learning skills/affects because they give us time to chat with friends/new people, helps us learn about the globe(astronomy) and keeps us out of troble! Thing about! Dont you think so? How would you feel if your teenager is always on the phone with friends! Do you ever time to chat with your friends or buisness partner about things. Well now - there's a new way to chat the computer, theirs plenty of sites on the internet to do so: @ORGANIZATION1, @ORGANIZATION2, @CAPS1, facebook, myspace ect. Just think now while your setting up meeting with your boss on the computer, your teenager is having fun on the phone not rushing to get off cause you want to use it. How did you learn about other countrys/states outside of yours? Well I have by computer/internet, it's a new way to learn about what going on in our time! You might think your child spends a lot of time on the computer, but ask them so question about the econom

Here, we can see that personally identifiying information has been replaces with @NER where NER is a NER tag. We can remove these symbols to avoid interferring with the spell checking counts.

In [8]:
# Function to get all text from each essay - to build doc2vec
def all_essays(df):
    for (i, essay) in enumerate(df['essay']):
        yield gensim.models.doc2vec.TaggedDocument(gensim.utils.simple_preprocess(essay), [i])
        

all_essay_lst = all_essays(df)
model = gensim.models.doc2vec.Doc2Vec(vector_size=50, min_count=2, epochs=40)
model.build_vocab(all_essay_lst)
%time model.train(all_essay_lst, total_examples=model.corpus_count, epochs=model.epochs)

CPU times: user 50 ms, sys: 17.6 ms, total: 67.6 ms
Wall time: 61.8 ms


Features #TODO write description of features

In [9]:
# Setup pre-trained word2vec model
#model = gensim.models.KeyedVectors.load_word2vec_format('data/GoogleNews-vectors-negative300.bin.gz', binary=True)
#model = gensim.models.KeyedVectors.load_word2vec_format('data/GoogleNews-vectors-negative300.bin.gz', binary=True, limit=500000)

In [16]:
num_rows = df.shape[0]
essays = df['essay'].values

#initialize dataframe columns
df['word_count'] = np.nan 
df['sentence_count'] = np.nan
df['avg_word_length'] = np.nan 
df['num_exclamation_marks'] = np.nan
df['num_question_marks'] = np.nan
df['num_stopwords'] = np.nan
df['word2vec_concat'] = np.nan

df['noun_count'] = np.nan
df['verb_count'] = np.nan
df['foreign_count'] = np.nan
df['adj_count'] = np.nan
df['conj_count'] = np.nan
df['adv_count'] = np.nan


def replace_punc(text):
    return text.replace("@", "").replace("%", "")

def get_pos_tags(essay):
    nouns = verbs = foreign = adj = adv = conj = 0
    tokens = nltk.word_tokenize(essay)
    for token in tokens:
        pos_tag = nltk.pos_tag(nltk.word_tokenize(token))
        for (_, tag) in (pos_tag):
            if tag[0] == "N":
                nouns += 1
            elif tag[0] == "V":
                verbs += 1
            elif tag[0:2] == "FW":
                foreign += 1
            elif tag[0] == "J":
                adj += 1
            elif tag[0] == "R":
                adv += 1
            elif tag[0:2] == "CC" or tag[0:2] == "IN":
                conj += 1
    
    return [nouns, verbs, foreign, adj, adv, conj]


for i in range(num_rows):
    
    # Remove placeholders
    text = replace_punc(essays[i])
    
    # Turn essay into list of words
    text = essays[i].split(" ")
    
    # Set word count
    df.set_value(i,'word_count', len(text))
    
    # Sentence count
    df.set_value(i, 'sentence_count', len(nltk.tokenize.sent_tokenize(essays[i])))
    
    # Average word length
    word_len = sum(len(word) for word in text) / len(text)
    df.set_value(i, 'avg_word_length', word_len)
    
    # Number of exclamation marks
    df.set_value(i, "num_exclamation_marks", sum(word.count("!") for word in essays[i]))
    
    # Number of question marks
    df.set_value(i, "num_question_marks", sum(word.count("?") for word in essays[i]))
    
    # Number of stop words
    df.set_value(i, "num_stopwords", sum([1 for word in text if word.lower() in stopwords]))

    # Word2Vec conversion - min + max
    df.set_value(i, 'word2vec_concat', min(model.docvecs[i]) + max(model.docvecs[i]))
    
    # POS tag counts
    pos_lst = get_pos_tags(essays[i])
    df.set_value(i,'noun_count', pos_lst[0])
    df.set_value(i,'verb_count', pos_lst[1])
    df.set_value(i,'foreign_count', pos_lst[2])
    df.set_value(i,'adj_count', pos_lst[3])
    df.set_value(i,'adv_count', pos_lst[4])
    df.set_value(i,'conj_count', pos_lst[5])



In [17]:
def get_tfidf_vectors(essays):
    vectorizer = TfidfVectorizer(ngram_range=(1, 3), max_df=0.9, min_df=5, max_features=400, stop_words="english", binary=True)
    tfidf_vectors = vectorizer.fit_transform(essays)
    new_df = pd.DataFrame(tfidf_vectors.toarray(), columns=vectorizer.get_feature_names())
    
    return pd.concat([df, new_df], axis=1)

df = get_tfidf_vectors(essays)
df.head()

Unnamed: 0,essay_id,essay_set,essay,domain1_score,normalized_score,word_count,sentence_count,avg_word_length,num_exclamation_marks,num_question_marks,...,world,wouldn,write,writing,wrong,year,years,yes,york,young
0,1,1,"Dear local newspaper, I think effects computer...",8,0.6,338.0,16.0,4.550296,4.0,2.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,2,1,"Dear @CAPS1 @CAPS2, I believe that using compu...",9,0.7,419.0,20.0,4.463007,1.0,1.0,...,0.121001,0.151485,0.173362,0.173767,0.0,0.0,0.0,0.0,0.0,0.0
2,3,1,"Dear, @CAPS1 @CAPS2 @CAPS3 More and more peopl...",7,0.5,279.0,14.0,4.526882,0.0,0.0,...,0.12576,0.0,0.0,0.0,0.15899,0.0,0.0,0.0,0.0,0.0
3,4,1,"Dear Local Newspaper, @CAPS1 I have found that...",10,0.8,524.0,27.0,5.041985,2.0,1.0,...,0.111728,0.0,0.160076,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,5,1,"Dear @LOCATION1, I know having computers has a...",8,0.6,465.0,30.0,4.526882,0.0,0.0,...,0.116441,0.145776,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


### Text data

We need to convert the essay strings into some numerical form. We could use Word2Vec, a TF-IDF Vectorizer, etc.

We have added all the features, so now we can start to explore correlations between features and scores (to ensure we are making correct assumptions and to discover potential new features), and perform the logistic regression.

In [None]:
'''         
Worsened: 'word2vec_avg', 'noun_count','adj_count', 'adv_count', 'foreign_count',
'''
from sklearn.model_selection import StratifiedKFold

x = df.drop(['domain1_score', 'normalized_score', 'essay'], axis=1)
# Can't test for normalized score on linear regression due to floats (must be int)
# Try other regressors?
y = df['domain1_score']
X_train, X_test, y_train, y_test = train_test_split(x, y, test_size=0.3, random_state=0)

'''
# 5 fold cross validation to avoid overfitting
x = np.array(x)
y = np.array(y)
kfold = StratifiedKFold(n_splits=5, shuffle=True, random_state=0)
for train_index, test_index in kfold.split(x, y):
    X_train, X_test = x[train_index], x[test_index]
    y_train, y_test = y[train_index], y[test_index]
'''
    
    
logistic_reg = LogisticRegression()
logistic_reg.fit(X_train, y_train)

In [13]:
predictions = logistic_reg.predict(X_test)
print('Logistic regression classifier accuracy: {:.2f}'.format(logistic_reg.score(X_test, y_test)))

Logistic regression classifier accuracy: 0.46


In [14]:
print(cohen_kappa_score(predictions, y_test, weights="quadratic"))

0.7850311982617879
