In [3]:
from utlis import *
import numpy as np
import pandas as pd
import time
import nltk
from collections import Counter
from nltk import word_tokenize, pos_tag
import string
import enchant
from enchant.checker import SpellChecker
import re

In [8]:

def _cleanText(t):
    '''
    t string, raw text input
    ret t string, a list of words
    '''
    t = t.lower()
    t = re.sub(r'[^\w\s]','',t)
    t = re.sub(r'\s*(\(\d)|(\))\s*', '', t)
    #t = t.split()
    return t

def _nltktag(text):
    """
    Using nltk.word_tokenize to tag words as 'NN', 'DT'
    for extracting noun, verb, adj
    """
    words = word_tokenize(text)
    tagged_words = pos_tag(words)
    return tagged_words

def _wordCount(text):
    """
    input: string 
    output: int -- Count of words
    """
    return sum(Counter(text.split()).values())

def _longWordCount(text):
    """
    input: string
    output: int -- Count of Long words
    
    """
    #Average word length without stop words is 5.6
    ##threshold = 6
    long_words = [word for word in text.split() if len(word)>6]
    return sum(Counter(long_words).values())

def _partOfSpeechCount(text):
    
    tagged_words = _nltktag(text)
    #Noun Count
    listnn = [w[0] for w in tagged_words if w[1] in ['NN', 'NNP', 'NNPS','NNS']]
    nnCount = sum(Counter(listnn).values())
    #Verb Count
    listvb = [w[0] for w in tagged_words if w[1] in ['VB', 'VBD', 'VBG', 'VBN', 'VBP', 'VBZ']]
    verbCount = sum(Counter(listvb).values())
    #Adjective Count
    listadj = [w[0] for w in tagged_words if w[1] in ['JJ', 'JJR', 'JJS']]
    adjCount = sum(Counter(listadj).values())
    #Adverb Count
    listadvb = [w[0] for w in tagged_words if w[1] in ['RR', 'RBR', 'RBS']]
    advbCount = sum(Counter(listadvb).values())
    return nnCount, verbCount, adjCount, advbCount

def _commaCount(text):
    return text.count(',')

def _punctuationCount(text):
    count = lambda l1,l2: sum([1 for x in l1 if x in l2])
    return count(text,set(string.punctuation)) 

def _sentenceCount(text):
    return len(nltk.sent_tokenize(text))

def _wordLengthAvg(text):
    l = text.split()
    return sum(map(len, l))/float(len(l))



def _spellingError(text):
    """
    return: Count of misspelled words
    """
    my_dict = enchant.Dict("en_US")
    my_checker = SpellChecker(my_dict)
    my_checker.set_text(text)
    return len([error.word for error in my_checker])

def _lexicalDiversity(t):
    """
    t input seq, String
    ---------
    return float ratio
    """
    return len(set(t)) / len(t)

def _quotationMark(t):
    '''
    t string, raw input
    ret li, ceil of pairs of quatation contained in input text
    '''
    li = re.findall('"',t)
    n = len(li)
    n = int(np.ceil(n/2))
    return n
    
def _exclamationMarks(text):
    return text.count('!')

def _featureExtraction(text):
    """
    input: essay as a long string
    
    output:feature vector
    elements in output: 
    1. word count 
    2. long word count
    3. noun word count
    4. verb count
    5. comma count
    6. punctuation count
    7. sentence count
    8. adjective count
    9. adverb count
    10. lexical diversity
    11. quatation mark
    12. word length
    13. spelling error
    14*.bracket count
    15*.exclamation count
    16*. Foreign words count
    """
    wordCount = _wordCount(text)
    longWordCount = _longWordCount(text)
    nounCount, verbCount, adjCount, advbCount = _partOfSpeechCount(text)
    commaCount = _commaCount(text)
    puncCount = _punctuationCount(text)
    sentCount = _sentenceCount(text)
    lexDiv = _lexicalDiversity(text)
    quatMarkCount = _quotationMark(text)
    avgWordLen = _wordLengthAvg(text)
    spelErrorCount = _spellingError(text)
    #brcktCount = _br
    exclamationCount = _exclamationMarks(text)
    
    f = np.array([wordCount, longWordCount, nounCount, verbCount, commaCount, puncCount, sentCount, 
                 adjCount, advbCount, lexDiv, quatMarkCount, avgWordLen, spelErrorCount])
    
    f_res = _addedByStep(f)
    
    return f_res #feature vector

def _addedByStep(vec):
    """
    input: vec
    output: vector that added up at each element
    """
    return [sum(vec[0:i+1]) for i in range(0,len(vec))] 


    
    

In [9]:
#read training data
training_ori = pd.read_excel("./training_set_rel3.xlsx", sheetname="training_set", header=0)
training = pd.read_csv("./training_final.csv", sep=',',header=0, index_col=0)
training.head()

Unnamed: 0,essay_id,essay_set,essay,final_score
0,1,1,"Dear local newspaper, I think effects computer...",8.0
1,2,1,"Dear @CAPS1 @CAPS2, I believe that using compu...",9.0
2,3,1,"Dear, @CAPS1 @CAPS2 @CAPS3 More and more peopl...",7.0
3,4,1,"Dear Local Newspaper, @CAPS1 I have found that...",10.0
4,5,1,"Dear @LOCATION1, I know having computers has a...",8.0


In [11]:
for i in range(1,9):
    print("{} essays in Topic {}.".format(training[training['essay_set']==i].shape[0], i))

1783 essays in Topic 1.
1800 essays in Topic 2.
1726 essays in Topic 3.
1772 essays in Topic 4.
1805 essays in Topic 5.
1800 essays in Topic 6.
1569 essays in Topic 7.
723 essays in Topic 8.


In [None]:
#generate feature vector for all essays
start = time.time()
training['f_vec'] = [_featureExtraction(essay) for essay in training['essay']]
print(time.time() - start)

In [12]:
nltk.download('averaged_perceptron_tagger')

[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /Users/twff/nltk_data...
[nltk_data]   Unzipping taggers/averaged_perceptron_tagger.zip.


True

In [22]:
t1_training = training[training['essay_set']==1]
t2_training = training[training['essay_set']==2]
t3_training = training[training['essay_set']==3]
t4_training = training[training['essay_set']==4]
t5_training = training[training['essay_set']==5]
t6_training = training[training['essay_set']==6]
t7_training = training[training['essay_set']==7]
t8_training = training[training['essay_set']==8]

# 5-fold cross validation for Topic 1

In [23]:
from sklearn.model_selection import train_test_split
from sklearn import linear_model
from sklearn.metrics import cohen_kappa_score
from skll.metrics import kappa

In [24]:
X_train, X_test, y_train, y_test = train_test_split(list(t1_training['f_vec']),list(t1_training['final_score']), test_size=0.2)
regr = linear_model.LinearRegression()
regr.fit(X_train, y_train)
y_pred = regr.predict(X_test)
y_pred = [round(y) for y in y_pred]
kappa(y_test, y_pred, weights='quadratic', allow_off_by_one=False)



0.81336387331174476

In [31]:
from sklearn.model_selection import cross_val_score
from sklearn.pipeline import make_pipeline
from sklearn import preprocessing
from sklearn import svm
from sklearn.metrics.scorer import make_scorer

In [32]:
cv=5

In [40]:
scoring = make_scorer(kappa, weights='quadratic', allow_off_by_one=False)
clf = make_pipeline(preprocessing.StandardScaler(), svm.SVR(C=1))
cross_val_score(clf, X_train, y_train, cv=cv,scoring = scoring)

array([ 0.83156453,  0.75660678,  0.84354651,  0.79866334,  0.82395923])

In [None]:
clf = make_pipeline(preprocessing.StandardScaler(), svm.SVR(C=1))
cross_val_score(clf, X_train, y_train, cv=cv,scoring = scoring)

In [42]:
from sklearn.ensemble import RandomForestRegressor

In [43]:
clf = make_pipeline(preprocessing.StandardScaler(), RandomForestRegressor(max_depth=2, random_state=0))
cross_val_score(clf, X_train, y_train, cv=cv,scoring = scoring)

array([ 0.79873638,  0.71606179,  0.75596735,  0.73723823,  0.78399106])

In [44]:
from sklearn.ensemble import AdaBoostRegressor

In [45]:
clf = make_pipeline(preprocessing.StandardScaler(), AdaBoostRegressor())
cross_val_score(clf, X_train, y_train, cv=cv,scoring = scoring)

array([ 0.8278806 ,  0.75744681,  0.80544973,  0.815279  ,  0.79057534])

In [46]:
from sklearn.neural_network import MLPRegressor

In [47]:
clf = make_pipeline(preprocessing.StandardScaler(), MLPRegressor())
cross_val_score(clf, X_train, y_train, cv=cv,scoring = scoring)

array([ 0.83316016,  0.75574183,  0.8036342 ,  0.79862869,  0.80290169])