In [1]:
from utlis import *
import numpy as np
import pandas as pd
import time
import nltk
from collections import Counter
from nltk import word_tokenize, pos_tag
import string
import enchant
from enchant.checker import SpellChecker
import re

In [2]:

def _cleanText(t):
    '''
    t string, raw text input
    ret t string, a list of words
    '''
    t = t.lower()
    t = re.sub(r'[^\w\s]','',t)
    t = re.sub(r'\s*(\(\d)|(\))\s*', '', t)
    #t = t.split()
    return t

def _nltktag(text):
    """
    Using nltk.word_tokenize to tag words as 'NN', 'DT'
    for extracting noun, verb, adj
    """
    words = word_tokenize(text)
    tagged_words = pos_tag(words)
    return tagged_words

def _wordCount(text):
    """
    input: string 
    output: int -- Count of words
    """
    return sum(Counter(text.split()).values())

def _longWordCount(text):
    """
    input: string
    output: int -- Count of Long words
    
    """
    #Average word length without stop words is 5.6
    ##threshold = 6
    long_words = [word for word in text.split() if len(word)>6]
    return sum(Counter(long_words).values())

def _partOfSpeechCount(text):
    
    tagged_words = _nltktag(text)
    #Noun Count
    listnn = [w[0] for w in tagged_words if w[1] in ['NN', 'NNP', 'NNPS','NNS']]
    nnCount = sum(Counter(listnn).values())
    #Verb Count
    listvb = [w[0] for w in tagged_words if w[1] in ['VB', 'VBD', 'VBG', 'VBN', 'VBP', 'VBZ']]
    verbCount = sum(Counter(listvb).values())
    #Adjective Count
    listadj = [w[0] for w in tagged_words if w[1] in ['JJ', 'JJR', 'JJS']]
    adjCount = sum(Counter(listadj).values())
    #Adverb Count
    listadvb = [w[0] for w in tagged_words if w[1] in ['RR', 'RBR', 'RBS']]
    advbCount = sum(Counter(listadvb).values())
    return nnCount, verbCount, adjCount, advbCount

def _commaCount(text):
    return text.count(',')

def _punctuationCount(text):
    count = lambda l1,l2: sum([1 for x in l1 if x in l2])
    return count(text,set(string.punctuation)) 

def _sentenceCount(text):
    return len(nltk.sent_tokenize(text))

def _wordLengthAvg(text):
    l = text.split()
    return sum(map(len, l))/float(len(l))



def _spellingError(text):
    """
    return: Count of misspelled words
    """
    my_dict = enchant.Dict("en_US")
    my_checker = SpellChecker(my_dict)
    my_checker.set_text(text)
    return len([error.word for error in my_checker])

def _lexicalDiversity(t):
    """
    t input seq, String
    ---------
    return float ratio
    """
    return len(set(t)) / len(t)

def _quotationMark(t):
    '''
    t string, raw input
    ret li, ceil of pairs of quatation contained in input text
    '''
    li = re.findall('"',t)
    n = len(li)
    n = int(np.ceil(n/2))
    return n
    
def _exclamationMarks(text):
    return text.count('!')

def _featureExtraction(text):
    """
    input: essay as a long string
    
    output:feature vector
    elements in output: 
    1. word count 
    2. long word count
    3. noun word count
    4. verb count
    5. comma count
    6. punctuation count
    7. sentence count
    8. adjective count
    9. adverb count
    10. lexical diversity
    11. quatation mark
    12. word length
    13. spelling error
    14*.bracket count
    15*.exclamation count
    16*. Foreign words count
    """
    wordCount = _wordCount(text)
    longWordCount = _longWordCount(text)
    nounCount, verbCount, adjCount, advbCount = _partOfSpeechCount(text)
    commaCount = _commaCount(text)
    puncCount = _punctuationCount(text)
    sentCount = _sentenceCount(text)
    lexDiv = _lexicalDiversity(text)
    quatMarkCount = _quotationMark(text)
    avgWordLen = _wordLengthAvg(text)
    spelErrorCount = _spellingError(text)
    #brcktCount = _br
    exclamationCount = _exclamationMarks(text)
    
    #f = np.array([wordCount, longWordCount, nounCount, verbCount, commaCount, puncCount, sentCount, 
    #             adjCount, advbCount, lexDiv, quatMarkCount, avgWordLen, spelErrorCount])
    f = [wordCount, longWordCount, nounCount, verbCount, commaCount, puncCount, sentCount, 
                 adjCount, advbCount, lexDiv, quatMarkCount, avgWordLen, spelErrorCount]
    #f_res = _addedByStep(f)
    
    return f#_res #feature vector

def _addedByStep(vec):
    """
    input: vec
    output: vector that added up at each element
    """
    return [sum(vec[0:i+1]) for i in range(0,len(vec))] 


    
    

In [3]:
#read training data
training_ori = pd.read_excel("./training_set_rel3.xlsx", sheetname="training_set", header=0)
training = pd.read_csv("./training_final.csv", sep=',',header=0, index_col=0)
training.head()

Unnamed: 0,essay_id,essay_set,essay,final_score
0,1,1,"Dear local newspaper, I think effects computer...",8.0
1,2,1,"Dear @CAPS1 @CAPS2, I believe that using compu...",9.0
2,3,1,"Dear, @CAPS1 @CAPS2 @CAPS3 More and more peopl...",7.0
3,4,1,"Dear Local Newspaper, @CAPS1 I have found that...",10.0
4,5,1,"Dear @LOCATION1, I know having computers has a...",8.0


In [4]:
for i in range(1,9):
    print("{} essays in Topic {}.".format(training[training['essay_set']==i].shape[0], i))

1783 essays in Topic 1.
1800 essays in Topic 2.
1726 essays in Topic 3.
1771 essays in Topic 4.
1805 essays in Topic 5.
1800 essays in Topic 6.
1569 essays in Topic 7.
723 essays in Topic 8.


## Possibly should not use f_res = _addedByStep(f)
* (1)+(2) means use both features, not add them up, from my understanding.

In [5]:
import nltk
nltk.download('punkt')
nltk.download('averaged_perceptron_tagger')

[nltk_data] Downloading package punkt to
[nltk_data]     /Users/liuzhaopeng/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /Users/liuzhaopeng/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!


True

In [6]:
#generate feature vector for all essays
start = time.time()
training['f_vec'] = [_featureExtraction(essay) for essay in training['essay']]
print(time.time() - start)

596.1736183166504


In [7]:
t1_training = training[training['essay_set']==1]
t2_training = training[training['essay_set']==2]
t3_training = training[training['essay_set']==3]
t4_training = training[training['essay_set']==4]
t5_training = training[training['essay_set']==5]
t6_training = training[training['essay_set']==6]
t7_training = training[training['essay_set']==7]
t8_training = training[training['essay_set']==8]

# 5-fold cross validation

In [8]:
from sklearn.model_selection import train_test_split
from sklearn import linear_model
from sklearn.metrics import cohen_kappa_score
from skll.metrics import kappa
from sklearn.model_selection import cross_val_score
from sklearn.pipeline import make_pipeline
from sklearn import preprocessing
from sklearn import svm
from sklearn.metrics.scorer import make_scorer
from sklearn.ensemble import RandomForestRegressor
from sklearn.ensemble import AdaBoostRegressor
from sklearn.neural_network import MLPRegressor

In [9]:
def fit_predict(data_x, data_y, model):
    # data is x
    X_train, X_test, y_train, y_test = train_test_split(list(data_x),list(data_y), test_size=0.2)
    
    # define score function
    scoring = make_scorer(kappa, weights='quadratic', allow_off_by_one=False)
    cv=5
    
    # linear model
    if model == 'lr':
        clf = make_pipeline(preprocessing.StandardScaler(), linear_model.LinearRegression())
        return cross_val_score(clf, X_train, y_train, cv=cv,scoring = scoring)

    if model == 'svm':
        clf = make_pipeline(preprocessing.StandardScaler(), svm.SVR(C=1))
        return cross_val_score(clf, X_train, y_train, cv=cv,scoring = scoring)
    
    if model == 'rf':
        clf = make_pipeline(preprocessing.StandardScaler(), RandomForestRegressor(max_depth=2, random_state=0))
        return cross_val_score(clf, X_train, y_train, cv=cv,scoring = scoring)
        
    if model == 'adaboost':
        clf = make_pipeline(preprocessing.StandardScaler(), AdaBoostRegressor())
        return cross_val_score(clf, X_train, y_train, cv=cv,scoring = scoring)
        
    if model == 'mlp':
        clf = make_pipeline(preprocessing.StandardScaler(), MLPRegressor())
        return cross_val_score(clf, X_train, y_train, cv=cv,scoring = scoring)

In [10]:
for i,dataset in enumerate([t1_training,t2_training,t3_training,t4_training,t5_training,t6_training,t7_training,t8_training]):
    print('Dataset: t{}_training'.format(i))
    for model in ['lr']:
        #print('Model: {}'.format(model))
        print('result: {}'.format(fit_predict(dataset.f_vec, dataset.final_score,model)))

Dataset: t0_training
result: [ 0.79410329  0.84519962  0.82203126  0.80080995  0.82335623]
Dataset: t1_training
result: [ 0.71895482  0.77254924  0.70834801  0.67786578  0.71294489]
Dataset: t2_training
result: [ 0.58356546  0.68027361  0.64412417  0.67737982  0.62882034]
Dataset: t3_training
result: [ 0.63748915  0.67241363  0.67419737  0.65802102  0.61670004]
Dataset: t4_training
result: [ 0.78120203  0.79508202  0.82523605  0.76724398  0.78259676]
Dataset: t5_training
result: [ 0.69343692  0.64370546  0.63628664  0.61796747  0.68322211]
Dataset: t6_training
result: [ 0.67493395  0.71455707  0.62422377  0.65734591  0.68928801]
Dataset: t7_training
result: [ 0.7034551   0.65297574  0.50187859  0.66882304  0.57493464]


In [11]:
dataset = t1_training
selected = [] # from 0 to 12
temp = []
max_result = 0
while(1):
    add_feature = None
    for i in range(13):
        temp_selected = selected.copy()

        if i not in selected:
            temp_selected.append(i)
            print('searching range: ', temp_selected)

        data_x = dataset.f_vec.apply(lambda x: [x[i] for i in temp_selected])
        data_y = dataset.final_score
        temp_result = np.mean(fit_predict(data_x, data_y, model = 'lr'))

        if temp_result>max_result:
            add_feature = i
            max_result = temp_result

    if add_feature != None:
        selected.append(add_feature)
    else:
        break
    print('add_feature: ',add_feature, max_result)
    print('*'*60)

searching range:  [0]
searching range:  [1]
searching range:  [2]
searching range:  [3]
searching range:  [4]
searching range:  [5]
searching range:  [6]
searching range:  [7]
searching range:  [8]
searching range:  [9]
searching range:  [10]
searching range:  [11]
searching range:  [12]
add_feature:  0 0.754501171258
************************************************************
searching range:  [0, 1]
searching range:  [0, 2]
searching range:  [0, 3]
searching range:  [0, 4]
searching range:  [0, 5]
searching range:  [0, 6]
searching range:  [0, 7]
searching range:  [0, 8]
searching range:  [0, 9]
searching range:  [0, 10]
searching range:  [0, 11]
searching range:  [0, 12]
add_feature:  1 0.80253224446
************************************************************
searching range:  [0, 1, 2]
searching range:  [0, 1, 3]
searching range:  [0, 1, 4]
searching range:  [0, 1, 5]
searching range:  [0, 1, 6]
searching range:  [0, 1, 7]
searching range:  [0, 1, 8]
searching range:  [0, 1, 9]
s

In [12]:

def forward_selection(dataset):
    # selected features
    selected = [] # from 0 to 12
    # max kappa value
    max_result = 0
    while(1):
        # features we will add 
        add_feature = None
        for i in range(13):
            temp_selected = selected.copy()
            
            # only select features not selected
            if i not in selected:
                temp_selected.append(i)
                print('searching range: ', temp_selected)
            
            # calculate kappa for current feature set
            data_x = dataset.f_vec.apply(lambda x: [x[i] for i in temp_selected])
            data_y = dataset.final_score
            temp_result = np.mean(fit_predict(data_x, data_y, model = 'lr'))
            
            # get better result, update
            if temp_result>max_result:
                add_feature = i
                max_result = temp_result
        
        if add_feature != None:
            selected.append(add_feature)
        # 
        else:
            break
        print('temp_result: ', temp_result)
        print('add_feature: ',add_feature, max_result)
        print('*'*60)
    return selected, max_result

In [13]:
result_dict = {}
kappa_dict = {}
for i,dataset in enumerate([t1_training,t2_training,t3_training,t4_training,t5_training,t6_training,t7_training,t8_training]):
    print('Dataset: t{}_training'.format(i))
    a,b = forward_selection(dataset)
    result_dict['Dataset: t{}_training'.format(i)] = a
    kappa_dict['Dataset: t{}_training'.format(i)] = b

Dataset: t0_training
searching range:  [0]
searching range:  [1]
searching range:  [2]
searching range:  [3]
searching range:  [4]
searching range:  [5]
searching range:  [6]
searching range:  [7]
searching range:  [8]
searching range:  [9]
searching range:  [10]
searching range:  [11]
searching range:  [12]
temp_result:  0.11736513318
add_feature:  1 0.762907433649
************************************************************
searching range:  [1, 0]
searching range:  [1, 2]
searching range:  [1, 3]
searching range:  [1, 4]
searching range:  [1, 5]
searching range:  [1, 6]
searching range:  [1, 7]
searching range:  [1, 8]
searching range:  [1, 9]
searching range:  [1, 10]
searching range:  [1, 11]
searching range:  [1, 12]
temp_result:  0.753856092116
add_feature:  0 0.791724096985
************************************************************
searching range:  [1, 0, 2]
searching range:  [1, 0, 3]
searching range:  [1, 0, 4]
searching range:  [1, 0, 5]
searching range:  [1, 0, 6]
search

searching range:  [0, 1, 6, 3]
searching range:  [0, 1, 6, 4]
searching range:  [0, 1, 6, 5]
searching range:  [0, 1, 6, 7]
searching range:  [0, 1, 6, 8]
searching range:  [0, 1, 6, 9]
searching range:  [0, 1, 6, 10]
searching range:  [0, 1, 6, 11]
searching range:  [0, 1, 6, 12]
Dataset: t3_training
searching range:  [0]
searching range:  [1]
searching range:  [2]
searching range:  [3]
searching range:  [4]
searching range:  [5]
searching range:  [6]
searching range:  [7]
searching range:  [8]
searching range:  [9]
searching range:  [10]
searching range:  [11]
searching range:  [12]
temp_result:  0.299274254732
add_feature:  0 0.634411640489
************************************************************
searching range:  [0, 1]
searching range:  [0, 2]
searching range:  [0, 3]
searching range:  [0, 4]
searching range:  [0, 5]
searching range:  [0, 6]
searching range:  [0, 7]
searching range:  [0, 8]
searching range:  [0, 9]
searching range:  [0, 10]
searching range:  [0, 11]
searching 

searching range:  [0, 6, 9, 7, 8]
searching range:  [0, 6, 9, 7, 10]
searching range:  [0, 6, 9, 7, 11]
searching range:  [0, 6, 9, 7, 12]
temp_result:  0.658476476647
add_feature:  10 0.676268146067
************************************************************
searching range:  [0, 6, 9, 7, 10, 1]
searching range:  [0, 6, 9, 7, 10, 2]
searching range:  [0, 6, 9, 7, 10, 3]
searching range:  [0, 6, 9, 7, 10, 4]
searching range:  [0, 6, 9, 7, 10, 5]
searching range:  [0, 6, 9, 7, 10, 8]
searching range:  [0, 6, 9, 7, 10, 11]
searching range:  [0, 6, 9, 7, 10, 12]
Dataset: t7_training
searching range:  [0]
searching range:  [1]
searching range:  [2]
searching range:  [3]
searching range:  [4]
searching range:  [5]
searching range:  [6]
searching range:  [7]
searching range:  [8]
searching range:  [9]
searching range:  [10]
searching range:  [11]
searching range:  [12]
temp_result:  0.0844031750239
add_feature:  1 0.559813669603
************************************************************
s

In [14]:
result_dict

{'Dataset: t0_training': [1, 0, 9, 12, 4, 3, 11, 1, 4],
 'Dataset: t1_training': [1, 9, 12, 6, 8, 3],
 'Dataset: t2_training': [0, 1, 6],
 'Dataset: t3_training': [0, 8, 10, 5, 1],
 'Dataset: t4_training': [0, 11, 4],
 'Dataset: t5_training': [0, 6, 7, 6],
 'Dataset: t6_training': [0, 6, 9, 7, 10],
 'Dataset: t7_training': [1, 5, 4, 6, 9, 12]}

In [15]:
kappa_dict

{'Dataset: t0_training': 0.82993675916491527,
 'Dataset: t1_training': 0.75521525430711067,
 'Dataset: t2_training': 0.65379128514785623,
 'Dataset: t3_training': 0.670339186482515,
 'Dataset: t4_training': 0.79477457062787582,
 'Dataset: t5_training': 0.66451563015261628,
 'Dataset: t6_training': 0.67626814606734897,
 'Dataset: t7_training': 0.66570534236835288}