In [29]:
import numpy as np
import json
import pandas as pd
import tensorflow
from nltk.corpus import stopwords
import re
import string
from nltk.tokenize import word_tokenize,sent_tokenize
from sklearn.model_selection import train_test_split
import matplotlib.pyplot as plt
import time
from collections import Counter
from pipe import transform_text_func
from scipy.sparse import hstack
from sklearn.linear_model import RidgeClassifier
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import RepeatedKFold
from sklearn.metrics import accuracy_score, make_scorer
from sklearn.preprocessing import StandardScaler
from nltk.stem import PorterStemmer
from scipy.sparse import csr_matrix
import logging
from gensim import corpora,models,similarities
import gensim

In [2]:
print('Read Data......')
s = time.time()
# Stream the JSON-lines file one record at a time and keep only the two fields
# used downstream, so the raw text, the list of lines and the parsed dicts are
# never all held in memory at once.
x = []  # review texts
y = []  # star ratings shifted down by one (stars - 1)
with open("review.json", encoding="utf8") as f:
    for line in f:
        line = line.strip()
        if not line:
            continue  # tolerate blank lines instead of failing in json.loads
        review = json.loads(line)
        x.append(review['text'])
        y.append(review['stars'] - 1)
print('total sample number: ',len(x))
print('time elapsed: ', time.time()-s)

-------read data-------
total sample number:  5261669
time elapsed:  225.13795733451843


In [5]:
# Restrict the working set to the first `n_sample` reviews and their labels.
n_sample = 100000
x_total, y_total = x[:n_sample], y[:n_sample]

In [None]:
def upper_prob(data):
    """Return, for each text, the number of uppercase characters per word.

    Parameters
    ----------
    data : iterable of str
        Raw texts.

    Returns
    -------
    list of float
        ``uppercase_character_count / whitespace_separated_word_count`` for
        each text, in input order. A text without any word yields ``0.0``.
    """
    uppercase = []
    for text in data:
        n_words = len(text.split())
        n_upper = sum(1 for ch in text if ch.isupper())
        # Empty or whitespace-only texts have no words: report 0.0 rather than
        # raising ZeroDivisionError.
        uppercase.append(n_upper / n_words if n_words else 0.0)
    return uppercase

def scale(data):
    """Standardise ``data`` and return it as a single-column 2-D array.

    The values are reshaped to ``(-1, 1)`` and passed through a freshly
    fitted ``StandardScaler``.
    """
    column = np.array(data).reshape(-1, 1)
    return StandardScaler().fit_transform(column)

def tokenize_stop(text):
    """Split ``text`` on whitespace, treating '.' and ',' as whitespace too."""
    # Map both separators to a space in a single pass, then split.
    separators_to_space = str.maketrans('.,', '  ')
    return text.translate(separators_to_space).split()

def stop_and_max_feature(data, top_frequent_num, word_least_frequency_num, scale_v, stop_var, labels=None):
    """Select stop words: frequent tokens whose relative frequency barely varies across classes.

    Parameters
    ----------
    data : sequence of str
        Raw texts; each is lowercased and tokenised with ``tokenize_stop``.
    top_frequent_num : int
        Number of most frequent tokens considered as stop-word candidates.
    word_least_frequency_num : int
        Only used for the printed report: tokens occurring more often than
        this are counted as the "vocab size".
    scale_v : float
        Multiplier applied to each candidate's standard deviation.
    stop_var : float
        Threshold; a candidate is kept when ``std * scale_v < stop_var``.
    labels : sequence, optional
        Class label of each text, aligned with ``data``. Defaults to the
        notebook-level ``y_total``. If the two lengths differ, only the
        common prefix is used for the per-class counts.

    Returns
    -------
    list of str
        Selected stop words, ordered from most to least frequent.
    """
    if labels is None:
        labels = y_total  # notebook-level labels
    label_list = np.asarray(labels).tolist()

    docs = [tokenize_stop(text.lower()) for text in data]

    # Corpus-wide token counts.
    vocab = Counter()
    for tokens in docs:
        vocab.update(tokens)

    print('    total vocab: ', len(vocab))
    maxfeature = sum(1 for count in vocab.values() if count > word_least_frequency_num)
    print('    vocab size frequency >', word_least_frequency_num, ': ', maxfeature)

    candidates = [word for word, _ in vocab.most_common(top_frequent_num)]

    # One token counter per class, filled in a single pass over the documents.
    level_counts = {level: Counter() for level in np.unique(label_list).tolist()}
    for tokens, label in zip(docs, label_list):
        level_counts[label].update(tokens)
    level_totals = [(counts, sum(counts.values())) for counts in level_counts.values()]

    selected = []
    for word in candidates:
        # Relative frequency of the word within each class; a class without
        # any token contributes 0.0 instead of dividing by zero.
        freqs = [counts[word] / total if total else 0.0 for counts, total in level_totals]
        if np.std(freqs) * scale_v < stop_var:
            selected.append(word)
    return selected

def tokenize(text):
    """Strip punctuation, split into words and Porter-stem each lowercased word.

    Every ``string.punctuation`` character except ``!`` and ``?`` is replaced
    by a space before the text is split.

    NOTE(review): a later cell of this notebook defines another function named
    ``tokenize``; once that cell has run, the name refers to that version.

    Returns
    -------
    list of str
        Stemmed tokens. If a ``TypeError`` is raised (e.g. ``text`` is not a
        string), the input and the error are printed and an empty list is
        returned, so callers always receive a list.
    """
    try:
        punctuation = string.punctuation.replace('!','').replace('?','')
        regex = re.compile('[' +re.escape(punctuation) +']')
        text = regex.sub(" ", text) # remove punctuation
        ps = PorterStemmer()
        tokens = []
        for sentence in sent_tokenize(text):
            tokens.extend(sentence.split())
        return [ps.stem(w.lower()) for w in tokens]
    except TypeError as e:
        print(text, e)
        return []

# Preprocessing

## choose specific stop words

In [6]:
print('-------choose specific stop words-------')
s = time.time()

# Punctuation to strip: everything in string.punctuation except ! ? ' ~
punctuation = ''.join(ch for ch in string.punctuation if ch not in "!?'~")
regex = re.compile('[' + re.escape(punctuation) + ']')


def tokenize(text):
    """Replace the punctuation matched by ``regex`` with spaces and split on whitespace.

    NOTE(review): this rebinds the name ``tokenize`` used by the earlier
    stemming tokenizer defined in this notebook.
    """
    return regex.sub(" ", text).split()


# Lowercase and tokenise every sampled review, then count tokens corpus-wide.
x_lower = [tokenize(doc.lower()) for doc in x_total]
x_unlist = [token for doc in x_lower for token in doc]
vocab_dic = Counter(x_unlist)
stopwords_num = 250
# print([i[0] for i in vocab_dic.most_common(stopwords_num)],'\n\n')
print('total vocab: ', len(vocab_dic))
maxfeature = sum(1 for count in vocab_dic.values() if count > 1)
print('vocab size frequency > 1: ', maxfeature)
print('time elapsed: ', time.time()-s)

-------choose specific stop words-------
['the', 'and', 'i', 'a', 'to', 'was', 'of', 'it', 'is', 'for', 'in', 'my', 'that', 'with', 'this', 'they', 'but', 'we', 'you', 'on', 'have', 'not', 'had', 'so', 'were', 'at', 'are', 'food', 'good', 'place', 'be', 'as', 'very', 'me', 'great', 'there', 'all', 'if', 'out', 'like', 'just', 'here', 'service', 'our', 'time', 'get', 'one', 'their', 'from', 'when', 'would', 'or', 'up', "it's", 'back', 'an', 'go', 'about', 'will', 'really', 'he', 'what', 'which', 'she', 'been', 'no', 'your', 'some', 'also', 'only', 'can', 'more', 'them', 'us', 'by', 'because', 'other', 'nice', 'got', "don't", 'even', 'do', 'after', 'well', "i'm", 'has', "i've", 'best', 'always', 'too', 'ordered', 'love', 'did', 'than', 'came', 'staff', "didn't", 'first', 'order', 'little', 'never', 'went', 'friendly', 'try', 'definitely', 'restaurant', 'much', 'come', 'people', 'could', 'chicken', 'her', 'over', 'then', 'pretty', 'made', 'make', 'again', '2', 'how', 'experience', '5', 'm

### choose stop words via variance

In [7]:
from collections import defaultdict
from itertools import compress

# Candidate stop words: the `stopwords_num` most frequent tokens of the sample.
stop = [word for word, _ in vocab_dic.most_common(stopwords_num)]

# x_lower was built from x_total, so it must be masked with the matching
# y_total labels (not the full-corpus `y`).
labels = np.array(y_total)
assert len(x_lower) == len(labels), "x_lower and y_total must be aligned"
levels = np.unique(labels)

# Tokenised reviews grouped by class, then flattened to one token list per class.
x_5level = [list(compress(x_lower, (labels == level).tolist())) for level in levels]
x_5level_unlist = [[token for doc in docs for token in doc] for docs in x_5level]
multilevel_vocab = [Counter(tokens) for tokens in x_5level_unlist]

# For every candidate, its relative frequency within each class.
multilevel_stop = defaultdict(list)
for tokens, vocab in zip(x_5level_unlist, multilevel_vocab):
    tt = len(tokens)
    for word in stop:
        # A class without tokens contributes 0.0 instead of dividing by zero.
        multilevel_stop[word].append(vocab[word] / tt if tt else 0.0)

# Keep only candidates whose (scaled) spread across classes is below 0.1.
stop_var = [(word, np.std(freqs) * 1000) for word, freqs in multilevel_stop.items()]
stop = [word for word, spread in stop_var if spread < 0.1]
# import pickle
# pickle.dump(multilevel_stop,open('multilevel_stop','wb'))

In [19]:
print('-------transform to features-------')
s = time.time()

# Options for the project-local text pipeline, gathered in one place.
pipeline_options = dict(
    method='tfidf',
    ngram=3,
    max_f=maxfeature,
    binary=True,
    stopwords=stop,
    token=tokenize,
    analyzer='word',
)
review_pipeline = transform_text_func(**pipeline_options)
X = review_pipeline.fit_transform(x_total)
print('time elapsed: ', time.time()-s)

-------transform to features-------
time elapsed:  210.51831817626953


## Add length as new feature

In [20]:
print('-------add length to features-------')
s = time.time()
# "Length" here is the number of non-zero feature columns of each row of X,
# computed with a single reduction instead of slicing X one row at a time.
length = np.asarray((X != 0).sum(axis=1)).reshape(-1, 1)
scaler = StandardScaler()
new_l = scaler.fit_transform(length)
# Request CSR explicitly: left to its default, hstack may return a COO matrix,
# which does not support the row indexing used when this cell is run again.
# NOTE: every run of this cell appends one more column to X.
X = hstack([X, new_l], format='csr')
print('X shape: ',X.shape)
print('time elapsed: ', time.time()-s)

-------add length to features-------




X shape:  (100000, 58844)
time elapsed:  16.90064287185669


## Mutual Information feature selection

In [21]:
# print('-------mutual information feature selection-------')
# s = time.time()
# from sklearn.feature_selection import f_regression,mutual_info_classif
# from sklearn.feature_selection import SelectKBest
# print('Original data size: ',X.shape)
# X_new = SelectKBest(mutual_info_classif, k=30000).fit_transform(X, y_total)
# print('new data size: ',X_new.shape)
# print('time elapsed: ', time.time()-s)

## Add LDA

In [None]:
# print('-------add LDA topic features-------')
# logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.INFO)
# dictionary = corpora.Dictionary(x_lower)
# corpus = [dictionary.doc2bow(text) for text in x_lower]
# model = models.LdaModel(corpus, id2word=dictionary, num_topics=100)


# def bow_to_matrix(bow,lda_model):

#     result = np.zeros((len(bow),lda_model.get_topics().shape[0]),dtype=np.float64)

#     for i,each in enumerate(bow):

#         # each format as [(31,1),(38,1),(40,1)]

#         l = lda_model.get_document_topics(each) #l format as [(27, 0.20886971), (34, 0.17654318), (41, 0.50100213), (86, 0.060251668)]

#         a = [each[0] for each in l]

#         b = [each[1] for each in l]

#         result[i,a] = b     

#     result = csr_matrix(result)

#     return result
# youbian = bow_to_matrix(corpus,model)

2018-03-01 23:52:58,311 : INFO : adding document #0 to Dictionary(0 unique tokens: [])
2018-03-01 23:52:59,200 : INFO : adding document #10000 to Dictionary(38435 unique tokens: ['a', 'and', 'around', 'ask', "can't"]...)
2018-03-01 23:53:00,034 : INFO : adding document #20000 to Dictionary(53218 unique tokens: ['a', 'and', 'around', 'ask', "can't"]...)
2018-03-01 23:53:00,772 : INFO : adding document #30000 to Dictionary(65114 unique tokens: ['a', 'and', 'around', 'ask', "can't"]...)
2018-03-01 23:53:01,507 : INFO : adding document #40000 to Dictionary(74152 unique tokens: ['a', 'and', 'around', 'ask', "can't"]...)
2018-03-01 23:53:02,258 : INFO : adding document #50000 to Dictionary(82136 unique tokens: ['a', 'and', 'around', 'ask', "can't"]...)
2018-03-01 23:53:02,997 : INFO : adding document #60000 to Dictionary(89479 unique tokens: ['a', 'and', 'around', 'ask', "can't"]...)
2018-03-01 23:53:03,715 : INFO : adding document #70000 to Dictionary(96385 unique tokens: ['a', 'and', 'arou

2018-03-01 23:54:34,182 : INFO : topic #13 (0.010): 0.037*"and" + 0.034*"buffet" + 0.029*"the" + 0.025*"average" + 0.024*"for" + 0.023*"was" + 0.023*"food" + 0.021*"priced" + 0.020*"pasta" + 0.019*"a"
2018-03-01 23:54:34,182 : INFO : topic #5 (0.010): 0.041*"the" + 0.040*"and" + 0.039*"to" + 0.038*"i" + 0.021*"a" + 0.015*"he" + 0.015*"my" + 0.012*"they" + 0.012*"in" + 0.011*"of"
2018-03-01 23:54:34,182 : INFO : topic #41 (0.010): 0.076*"hamburger" + 0.059*"!!!" + 0.055*"chimi" + 0.054*"ya" + 0.051*"monte" + 0.042*"frites" + 0.012*"la" + 0.012*"de" + 0.012*"trop" + 0.011*"un"
2018-03-01 23:54:34,197 : INFO : topic #88 (0.010): 0.050*"and" + 0.042*"the" + 0.035*"a" + 0.031*"to" + 0.029*"is" + 0.025*"great" + 0.025*"you" + 0.019*"place" + 0.017*"are" + 0.016*"i"
2018-03-01 23:54:34,199 : INFO : topic #28 (0.010): 0.032*"a" + 0.031*"the" + 0.030*"and" + 0.022*"is" + 0.018*"of" + 0.018*"i" + 0.017*"to" + 0.014*"for" + 0.012*"in" + 0.012*"it"
2018-03-01 23:54:34,228 : INFO : topic diff=inf, 

2018-03-01 23:55:28,615 : INFO : topic #18 (0.010): 0.058*"the" + 0.045*"i" + 0.044*"was" + 0.031*"to" + 0.030*"it" + 0.030*"and" + 0.016*"food" + 0.015*"a" + 0.013*"not" + 0.012*"but"
2018-03-01 23:55:28,634 : INFO : topic diff=inf, rho=0.288675
2018-03-01 23:55:28,650 : INFO : PROGRESS: pass 0, at document #26000/100000
2018-03-01 23:55:34,656 : INFO : merging changes from 2000 documents into a model of 100000 documents
2018-03-01 23:55:35,895 : INFO : topic #71 (0.010): 0.229*"sushi" + 0.086*"drive" + 0.045*"thru" + 0.038*"chef" + 0.019*"service" + 0.019*"is" + 0.019*"the" + 0.018*"employees" + 0.017*"friendly!" + 0.016*"items"
2018-03-01 23:55:35,895 : INFO : topic #68 (0.010): 0.122*"station" + 0.055*"gas" + 0.055*"shake" + 0.040*"pastries" + 0.032*"smoothies" + 0.029*"chow" + 0.027*"mein" + 0.023*"wasting" + 0.023*"l" + 0.022*"convenience"
2018-03-01 23:55:35,899 : INFO : topic #29 (0.010): 0.075*"the" + 0.074*"was" + 0.047*"and" + 0.034*"i" + 0.032*"a" + 0.027*"it" + 0.020*"had"

2018-03-01 23:56:22,191 : INFO : topic #56 (0.010): 0.055*"i" + 0.039*"and" + 0.037*"the" + 0.034*"a" + 0.026*"to" + 0.025*"my" + 0.017*"was" + 0.016*"of" + 0.014*"in" + 0.012*"that"
2018-03-01 23:56:22,191 : INFO : topic #28 (0.010): 0.043*"games" + 0.037*"game" + 0.022*"a" + 0.019*"and" + 0.018*"the" + 0.017*"play" + 0.016*"d" + 0.014*"croissant" + 0.014*"sports" + 0.013*"try!"
2018-03-01 23:56:22,203 : INFO : topic #29 (0.010): 0.080*"was" + 0.078*"the" + 0.050*"and" + 0.036*"i" + 0.033*"a" + 0.029*"it" + 0.022*"had" + 0.015*"but" + 0.015*"very" + 0.015*"with"
2018-03-01 23:56:22,203 : INFO : topic #53 (0.010): 0.070*"the" + 0.044*"of" + 0.031*"a" + 0.029*"and" + 0.026*"to" + 0.023*"in" + 0.016*"that" + 0.014*"i" + 0.014*"is" + 0.012*"you"
2018-03-01 23:56:22,232 : INFO : topic diff=inf, rho=0.229416
2018-03-01 23:56:31,384 : INFO : -10.369 per-word bound, 1322.5 perplexity estimate based on a held-out corpus of 2000 documents with 197124 words
2018-03-01 23:56:31,384 : INFO : PROGR

2018-03-01 23:57:18,228 : INFO : topic #10 (0.010): 0.329*"car" + 0.086*"wash" + 0.031*"computer" + 0.020*"con" + 0.020*"washed" + 0.018*"detail" + 0.015*"dropping" + 0.015*"jason" + 0.014*"clinic" + 0.010*"people!"
2018-03-01 23:57:18,249 : INFO : topic diff=inf, rho=0.200000
2018-03-01 23:57:18,267 : INFO : PROGRESS: pass 0, at document #52000/100000
2018-03-01 23:57:24,676 : INFO : merging changes from 2000 documents into a model of 100000 documents
2018-03-01 23:57:25,932 : INFO : topic #0 (0.010): 0.096*"the" + 0.038*"and" + 0.029*"of" + 0.028*"a" + 0.026*"with" + 0.018*"was" + 0.017*"salad" + 0.015*"steak" + 0.013*"i" + 0.012*"for"
2018-03-01 23:57:25,932 : INFO : topic #11 (0.010): 0.391*"club" + 0.094*"grand" + 0.054*"switched" + 0.034*"wipe" + 0.029*"comedy" + 0.024*"hangover" + 0.022*"ears" + 0.021*"manger" + 0.015*"angela" + 0.011*"hairdressers"
2018-03-01 23:57:25,937 : INFO : topic #57 (0.010): 0.128*"turkey" + 0.090*"country" + 0.051*"jack" + 0.050*"boxes" + 0.040*"browni

2018-03-01 23:58:21,598 : INFO : topic #15 (0.010): 0.034*"az" + 0.029*"mark" + 0.026*"tom" + 0.022*"the" + 0.021*"ac" + 0.021*"worker" + 0.020*"hill" + 0.020*"budget" + 0.017*"rate" + 0.016*"missed"
2018-03-01 23:58:21,598 : INFO : topic #86 (0.010): 0.042*"a" + 0.034*"and" + 0.026*"the" + 0.025*"of" + 0.021*"wife" + 0.016*"they" + 0.016*"to" + 0.015*"is" + 0.013*"for" + 0.013*"in"
2018-03-01 23:58:21,612 : INFO : topic #10 (0.010): 0.376*"car" + 0.077*"wash" + 0.028*"computer" + 0.023*"clinic" + 0.021*"washed" + 0.020*"con" + 0.016*"jason" + 0.015*"people!" + 0.014*"dropping" + 0.014*"detail"
2018-03-01 23:58:21,612 : INFO : topic #31 (0.010): 0.289*"french" + 0.193*"toast" + 0.098*"yum" + 0.043*"soooo" + 0.043*"satisfy" + 0.034*"shirts" + 0.027*"denny's" + 0.018*"spoon" + 0.016*"beats" + 0.014*"closet"
2018-03-01 23:58:21,634 : INFO : topic diff=inf, rho=0.176777
2018-03-01 23:58:21,650 : INFO : PROGRESS: pass 0, at document #66000/100000
2018-03-01 23:58:27,981 : INFO : merging cha

2018-03-01 23:59:07,319 : INFO : topic diff=inf, rho=0.162221
2018-03-01 23:59:07,324 : INFO : PROGRESS: pass 0, at document #78000/100000
2018-03-01 23:59:14,050 : INFO : merging changes from 2000 documents into a model of 100000 documents
2018-03-01 23:59:15,267 : INFO : topic #24 (0.010): 0.044*"the" + 0.031*"and" + 0.027*"flavors" + 0.026*"sweet" + 0.020*"a" + 0.018*"their" + 0.017*"waffle" + 0.015*"waffles" + 0.014*"delicious" + 0.013*"strawberry"
2018-03-01 23:59:15,267 : INFO : topic #44 (0.010): 0.027*"the" + 0.027*"i" + 0.026*"a" + 0.020*"was" + 0.019*"and" + 0.019*"office" + 0.017*"to" + 0.015*"in" + 0.011*"apartment" + 0.010*"that"
2018-03-01 23:59:15,272 : INFO : topic #60 (0.010): 0.100*"the" + 0.036*"is" + 0.034*"in" + 0.032*"and" + 0.031*"a" + 0.031*"vegas" + 0.025*"to" + 0.022*"of" + 0.013*"are" + 0.013*"at"
2018-03-01 23:59:15,272 : INFO : topic #49 (0.010): 0.070*"gin" + 0.067*"attraction" + 0.050*"su" + 0.050*"charcuterie" + 0.011*"hangs" + 0.009*"nouveau" + 0.008*"k

In [None]:
# lda_matrix = scaler.fit_transform(youbian)
# X = hstack([X,lda_matrix])

## Add NOT_

# Model

In [None]:
# Hold out 33% of the rows for evaluation; the fixed random_state pins the split.
x_train, x_test, y_train, y_test = train_test_split(
    X,
    y_total,
    test_size=0.33,
    random_state=109,
)

In [23]:
print('-------Ridge-------')
s = time.time()
# Earlier grid-search variant, kept for reference:
# rkf = RepeatedKFold(n_splits=2, n_repeats=1)
# parameters = {'alpha':[5]}
# accu_score = make_scorer(accuracy_score)
# ridge_model = RidgeClassifier()
# ridge_cv = GridSearchCV(ridge_model, parameters,cv=rkf, pre_dispatch=4, return_train_score = True,scoring=accu_score)
# ridge_cv.fit(x_train, y_train)
# print(ridge_cv.cv_results_)
# ridge_train_res = ridge_cv.predict(x_train)
# ridge_test_res = ridge_cv.predict(x_test)

# Fit a single ridge classifier with a fixed regularisation strength.
ridge_model = RidgeClassifier(alpha=3).fit(x_train, y_train)
ridge_train_res = ridge_model.predict(x_train)
ridge_test_res = ridge_model.predict(x_test)

# accuracy_score takes (y_true, y_pred); the value does not depend on the order.
train_accuracy = accuracy_score(y_train, ridge_train_res)
test_accuracy = accuracy_score(y_test, ridge_test_res)
print("train accuracy:", train_accuracy)
print("test accuracy:", test_accuracy)
print('time elapsed: ', time.time()-s)

-------Ridge-------
train accuracy: 0.8340149253731344
test accuracy: 0.6690909090909091
time elapsed:  31.54082465171814


In [None]:
import lightgbm as lgb
num_train, num_feature = x_train.shape

# Wrap both splits as LightGBM datasets; the evaluation set is created with the
# training set as its `reference`.
lgb_train = lgb.Dataset(x_train, y_train, free_raw_data=False)
lgb_test = lgb.Dataset(x_test, y_test, reference=lgb_train, free_raw_data=False)

params = {
    'boosting_type': 'gbdt',
    'objective': 'multiclassova',
    'metric': 'multi_error',
    'num_leaves': 31,
    'learning_rate': 0.05,
    'num_class': 5
}
print('Start training...')

# Early stopping is passed as a callback: the `early_stopping_rounds` keyword of
# lgb.train is not accepted by LightGBM >= 4.0.
# NOTE(review): the test split doubles as the early-stopping validation set, so
# the reported multi_error is optimistic; consider a separate validation split.
gbm = lgb.train(params,
                lgb_train,
                num_boost_round=1000,
                valid_sets=[lgb_test],
                callbacks=[lgb.early_stopping(stopping_rounds=5)])



0.645636