In [1]:
import warnings
warnings.filterwarnings("ignore")
import numpy as np
import json
import pandas as pd
import re
import string
from nltk.tokenize import word_tokenize,sent_tokenize
import time
from collections import Counter
from pipe import transform_text_func,FeatureExtractor, ImputeNA, CategoricalEncoding,text
from scipy.sparse import hstack
from sklearn.linear_model import Ridge
from sklearn.metrics import accuracy_score, make_scorer,mean_squared_error
from sklearn.preprocessing import StandardScaler
from nltk.stem import PorterStemmer
from scipy.sparse import csr_matrix
from sklearn.pipeline import make_pipeline, make_union 
from itertools import compress
from collections import defaultdict
import bar

In [2]:
def generate_interaction(feature_list):
    total = len(feature_list)*(len(feature_list)-1)/2
    step = 0
    for i,ai in enumerate(feature_list):
        for j,bj in enumerate(feature_list):
            if i<j:
                x = total_data[ai]
                y = total_data[bj]
                t = []
                for l in range(total_data.shape[0]):
                    t.append(str(x[l])+' '+ str(y[l]))
                total_data[ai+'_'+bj] = t
                step +=1
                bar.drawProgressBar(step/total)
                
                
def upper_prob(data):
    uppercase = []
    total = len(data)
    step = 0
    for i in data:
        length = len(i.split())
        tmp = []
        for j in i:
            if j.isupper():
                tmp.append(j)
        uppercase.append(len(tmp)/length)
    return(uppercase)

def scale(data):
    data = np.array(data).reshape(-1,1)
    scaler = StandardScaler()
    data = scaler.fit_transform(data)
    return(data)

def tokenize_stop(text):
    text = text.replace('.',' ')
    text = text.split()
    return(text)

def stop_and_max_feature(data, top_frequent_num, word_least_frequency_num,scale_v,stop_v):
    x_lower = [sublist.lower() for sublist in data]
    x_lower = [tokenize_stop(i) for i in x_lower]
    x_unlist = []
    for i in x_lower:
        x_unlist += i
    vocab_dic = Counter(x_unlist)
    stopwords_num = top_frequent_num

    print('      total vocab: ',len(vocab_dic.most_common()))
    maxfeature = len([i[0] for i in vocab_dic.most_common() if i[1]>word_least_frequency_num])
    print('      vocab size frequency >', word_least_frequency_num, ': ', maxfeature)

    stop = [i[0] for i in vocab_dic.most_common(stopwords_num)]

    x_n_level = [list(compress(x_lower, list(np.array(y_total)==i))) for i in np.unique(y_total)]

    x_n_level_unlist = [[] for i in range(len(np.unique(y_total)))]
    for i in range(len(x_n_level)):
        for j in x_n_level[i]:
            x_n_level_unlist[i] += j
        
    multilevel_vocab = []
    for i in range(len(np.unique(y_total))):
        multilevel_vocab.append(Counter(x_n_level_unlist[i]))

    multilevel_stop = defaultdict(list)
    for i in range(len(np.unique(y_total))):
        tt = len(x_n_level_unlist[i])
        for j in stop:
            multilevel_stop[j].append(multilevel_vocab[i][j]/tt)

    stop_var = [(key,np.std(value)*scale_v) for key,value in multilevel_stop.items() ]
    stop = [i[0] for i in stop_var if i[1]<stop_v]
    return(stop,maxfeature)

def tokenize(text):
    try:
        punctuation = string.punctuation.replace('#','')
        regex = re.compile('[' +re.escape(punctuation) +']')
        text = regex.sub(" ", text) # remove punctuation
        text = text.replace('#1','')
        text = text.replace('#2','')
        text = text.replace('#3','')
        text = text.replace('#4','')
        text = text.replace('#5','')
        text = text.replace('#6','')
        text = text.replace('#7','')
        text = text.replace('#8','')
        text = text.replace('#9','')
        ps = PorterStemmer()
        tokens = []
        tokens_ = [s.split() for s in sent_tokenize(text)]
        for token_by_sent in tokens_:
            tokens += token_by_sent   
        filtered_tokens = [ps.stem(w.lower()) for w in tokens]
        return filtered_tokens
    except TypeError as e: print(text,e)

In [11]:
print('Read Data......')
train = pd.read_csv('training_data.csv',header= 0 ,delimiter='\t|\n')
test = pd.read_csv('test_data.csv',header= 0 ,delimiter='\t|\n')
total_data = pd.concat([train,test],ignore_index=True)

print('Generate Interaction Features Between Categorical Features......')
s = time.time()
cate_list = ['age_cat','sex','stay_cat','lang','er','category']
generate_interaction(cate_list)            
print('\ntime elapsed: ', time.time()-s,'\n')

x_total = list(total_data.comment)
y_total = list(train.score)

print('Calculate Uppercase Probability to Features......')
s = time.time()
upper_p = upper_prob(x_total)
new_up = scale(upper_p)
print('time elapsed: ', time.time()-s,'\n')

print('Choose Specific Stop Words and Max Text Feature Number......')
s=time.time() 
stop, maxfeature = stop_and_max_feature(x_total,250,0,1000,0.1)
print('time elapsed: ', time.time()-s,'\n')

# Cate pipeline
onehot_list = ['age_cat', 'sex', 'stay_cat', 'lang', 'er','age_cat_sex', 'age_cat_stay_cat',
       'age_cat_lang', 'age_cat_er', 'age_cat_category', 'sex_stay_cat',
       'sex_lang', 'sex_er', 'sex_category', 'stay_cat_lang', 'stay_cat_er',
       'stay_cat_category', 'lang_er', 'lang_category', 'er_category']
onehot_pipeline = make_pipeline(FeatureExtractor(onehot_list),
                                CategoricalEncoding('OneHot'),
                                )

print('-------Transform to Features(word tf-idf level)-------')
s=time.time()
comment_word_tfidf_pipeline = make_pipeline(FeatureExtractor('comment'),
                                text(method='tfidf', ngram = 3, max_f = maxfeature, 
                                     binary = False, stopwords=stop,tokenizer=tokenize,analyzer ='word'))

feature_union_word_tfidf = make_union(
    onehot_pipeline,
    comment_word_tfidf_pipeline
)
X_word_tfidf = feature_union_word_tfidf.fit_transform(total_data)
print('time elapsed: ', time.time()-s)

print('      Add Length and Upper Prob to Features......')
s = time.time()
length = [(X_word_tfidf[i,]!=0).sum() for i in range(X_word_tfidf.shape[0])]
new_l = scale(length)
X_word_tfidf = hstack([X_word_tfidf,new_l],format='csr')
X_word_tfidf = hstack([X_word_tfidf,new_up],format='csr')

print('      X_word_tfidf shape: ',X_word_tfidf.shape)
print('      time elapsed: ', time.time()-s)
X_word_tfidf_train = X_word_tfidf[0:train.shape[0],]
X_word_tfidf_test = X_word_tfidf[train.shape[0]:X_word_tfidf.shape[0],]

print('-------Word TF-IDF Data Complete-------')

print('\n-------Ridge Model-------')
s = time.time()
ridge_model_word_tfidf = Ridge(alpha = 1.5)
ridge_model_word_tfidf = ridge_model_word_tfidf.fit(X_word_tfidf_train, y_total)


ridge_train_word_tfidf_res = ridge_model_word_tfidf.predict(X_word_tfidf_train)
ridge_train_word_tfidf_res = [10 if i >10 else round(i) for i in ridge_train_word_tfidf_res]
ridge_train_word_tfidf_res = np.array([0 if i<0 else round(i) for i in ridge_train_word_tfidf_res])

print("      train accuracy:", mean_squared_error(ridge_train_word_tfidf_res, y_total))
print('      time elapsed: ', time.time()-s)

ridge_test_word_tfidf_res = ridge_model_word_tfidf.predict(X_word_tfidf_test)
print('-------Word TF-IDF Ridge Model Complete-------')


print('\n-------Transform to Features(char level)-------')
comment_char_pipeline = make_pipeline(FeatureExtractor('comment'),
                                text(method='tfidf', ngram = 3, max_f = maxfeature, 
                                     binary = False, stopwords=stop,tokenizer=tokenize,analyzer ='char'))

feature_union_char = make_union(
    onehot_pipeline,
    comment_char_pipeline
)
X_char = feature_union_char.fit_transform(total_data)
print('time elapsed: ', time.time()-s)

print('      Add Length and Upper Prob to Features......')
s = time.time()
length = [(X_char[i,]!=0).sum() for i in range(X_char.shape[0])]
new_l = scale(length)
X_char = hstack([X_char,new_l],format='csr')
X_char = hstack([X_char,new_up],format='csr')

print('      X_char shape: ',X_char.shape)
print('      time elapsed: ', time.time()-s)

X_char_train = X_char[0:train.shape[0],]
X_char_test = X_char[train.shape[0]:X_char.shape[0],]

print('-------Character Data Complete-------')

print('\n-------Ridge Model-------')
s = time.time()
ridge_char_model = Ridge(alpha = 1.5)
ridge_char_model = ridge_char_model.fit(X_char_train, y_total)


ridge_char_train_res = ridge_char_model.predict(X_char_train)
ridge_char_train_res = [10 if i >10 else round(i) for i in ridge_char_train_res]
ridge_char_train_res = np.array([0 if i<0 else round(i) for i in ridge_char_train_res])

print("      train accuracy:", mean_squared_error(ridge_char_train_res, y_total))
print('      time elapsed: ', time.time()-s)

ridge_test_char_res = ridge_char_model.predict(X_char_test)
print('-------Character Ridge Model Complete-------')


print('\n-------Transform to Features(count level)-------')
comment_word_cv_pipeline = make_pipeline(FeatureExtractor('comment'),
                                text(method='cv', ngram = 3, max_f = maxfeature, 
                                     binary = False, stopwords=stop,tokenizer=tokenize,analyzer ='word'))

feature_union_word_cv = make_union(
    onehot_pipeline,
    comment_word_cv_pipeline
)
X_word_cv = feature_union_word_cv.fit_transform(total_data)
print('time elapsed: ', time.time()-s)

print('      Add Length and Upper Prob to Features......')
s = time.time()
length = [(X_word_cv[i,]!=0).sum() for i in range(X_word_cv.shape[0])]
new_l = scale(length)
X_word_cv = hstack([X_word_cv,new_l],format='csr')
X_word_cv = hstack([X_word_cv,new_up],format='csr')

print('      X_word_cv shape: ',X_word_cv.shape)
print('      time elapsed: ', time.time()-s)


X_word_cv_train = X_word_cv[0:train.shape[0],]
X_word_cv_test = X_word_cv[train.shape[0]:X_word_cv.shape[0],]

print('-------Word Count Vectorize Data Complete-------')

print('\n-------Ridge Model-------')
s = time.time()
ridge_word_cv_model = Ridge(alpha = 1.5)
ridge_word_cv_model = ridge_word_cv_model.fit(X_word_cv_train, y_total)


ridge_word_cv_train_res = ridge_word_cv_model.predict(X_word_cv_train)
ridge_word_cv_train_res = [10 if i >10 else round(i) for i in ridge_word_cv_train_res]
ridge_word_cv_train_res = np.array([0 if i<0 else round(i) for i in ridge_word_cv_train_res])

print("      train accuracy:", mean_squared_error(ridge_word_cv_train_res, y_total))
print('      time elapsed: ', time.time()-s)

ridge_word_cv_test_res = ridge_word_cv_model.predict(X_word_cv_test)
print('-------Word Count Vectorize Ridge Model Complete-------')

print('\n-------Ensemble 3 Results-------')
result = 0.15*ridge_test_char_res+0.8*ridge_test_word_tfidf_res+0.05*ridge_word_cv_test_res
result = [10 if i >10 else round(i) for i in result]
result = [0 if i<0 else round(i) for i in result]
# print("test accuracy:", mean_squared_error(result, y_test))

Read Data......
Generate Interaction Features Between Categorical Features......
time elapsed:  50.74075436592102 

Calculate Uppercase Probability to Features......
time elapsed:  1.1400959491729736 

Choose Specific Stop Words and Max Text Feature Number......
      total vocab:  49793
      vocab size frequency > 0 :  49793
time elapsed:  1.766697645187378 

-------Transform to Features(word tf-idf level)-------
time elapsed:  73.19932651519775
      Add Length and Upper Prob to Features......
      X_word_tfidf shape:  (178778, 50239)
      time elapsed:  30.5923912525177
-------Word TF-IDF Data Complete-------

-------Ridge Model-------
      train accuracy: 2.66386078617
      time elapsed:  34.059736490249634
-------Word TF-IDF Ridge Model Complete-------

-------Transform to Features(char level)-------
time elapsed:  58.46917462348938
      Add Length and Upper Prob to Features......
      X_char shape:  (178778, 22258)
      time elapsed:  31.602362632751465
-------Character D