In [101]:
import pandas as pd
import numpy as np
import jieba
import jieba.analyse
import matplotlib.pylab as plt
import hashlib
import time
import pickle
import sys
%matplotlib inline

from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn import svm
from sklearn.metrics import f1_score

In [102]:
# Column names of the tagged CSV files, in file order.
# The first two entries are the non-target columns; the remaining twenty are
# the per-aspect target columns (the cells below iterate over label_list[2:]).
label_list = [
    'id',
    'content',
    # location_*
    'location_traffic_convenience',
    'location_distance_from_business_district',
    'location_easy_to_find',
    # service_*
    'service_wait_time',
    'service_waiters_attitude',
    'service_parking_convenience',
    'service_serving_speed',
    # price_*
    'price_level',
    'price_cost_effective',
    'price_discount',
    # environment_*
    'environment_decoration',
    'environment_noise',
    'environment_space',
    'environment_cleaness',
    # dish_*
    'dish_portion',
    'dish_taste',
    'dish_look',
    'dish_recommendation',
    # others_*
    'others_overall_experience',
    'others_willing_to_consume_again',
]
# Read the train / validation / test CSVs (in that order) into DataFrames.
total_train, total_valid, total_test = (
    pd.read_csv(csv_path)
    for csv_path in ('data/train_tag.csv', 'data/valid_tag.csv', 'data/test_tag.csv')
)

In [110]:
# Preview the first row of the test frame.
# NOTE(review): this cell's execution count (In [110]) is higher than that of
# every cell below it, i.e. it was last run out of document order, after the
# prediction/export cell. The saved output may therefore not show the
# freshly loaded CSV; use Restart Kernel -> Run All to refresh it.
print(total_test.head(1))

   id                                            content  \
0   0  " 我 想 说 他们 家 的 优惠活动 好 持久 啊 ， 我 预售 的 时候 买 的 券 ，...   

   location_traffic_convenience  location_distance_from_business_district  \
0                            -2                                        -2   

   location_easy_to_find  service_wait_time  service_waiters_attitude  \
0                     -2                 -2                         1   

   service_parking_convenience  service_serving_speed  price_level  \
0                           -2                     -2            0   

                ...                 environment_decoration  environment_noise  \
0               ...                                     -2                 -2   

   environment_space  environment_cleaness  dish_portion  dish_taste  \
0                 -2                     1             1           1   

   dish_look  dish_recommendation  others_overall_experience  \
0         -2                   -2                          1   


In [103]:
def train_binary(train_data, valid_data, label, global_tfidf):
    """Fit a linear SVM for ``label`` on the binary split and predict the validation rows.

    Parameters
    ----------
    train_data, valid_data : mapping/DataFrame with a ``'content'`` column;
        ``train_data`` must also contain the ``label`` column.
    label : name of the target column in ``train_data``.
    global_tfidf : already-fitted vectorizer exposing ``transform``.

    Returns
    -------
    (model, preds) : the fitted ``LinearSVC`` and its predictions for
        ``valid_data['content']``.
    """
    vectorise = global_tfidf.transform
    classifier = svm.LinearSVC(C=1.0)
    classifier.fit(vectorise(train_data['content']), train_data[label])
    return classifier, classifier.predict(vectorise(valid_data['content']))

In [104]:
def train_sentiment(train_data, valid_data, label, global_tfidf):
    """Fit a linear SVM for ``label`` on the sentiment split and predict the validation rows.

    The text in ``train_data['content']`` / ``valid_data['content']`` is turned
    into features with the already-fitted ``global_tfidf``; the classifier is a
    ``LinearSVC`` with ``C=1.0`` trained against ``train_data[label]``.

    Returns the fitted model together with its predictions on the validation
    features, as a ``(model, preds)`` tuple.
    """
    x_train = global_tfidf.transform(train_data['content'])
    y_train = train_data[label]
    x_valid = global_tfidf.transform(valid_data['content'])

    svc = svm.LinearSVC(C=1.0)
    svc.fit(x_train, y_train)
    valid_predictions = svc.predict(x_valid)
    return svc, valid_predictions

In [105]:
def predict_true_validy(pred_binary, pred_sentiment, sentiment_model, true_valid, global_tfidf):
    """Combine the binary and sentiment models into final predictions for ``true_valid``.

    The sentiment model is applied to every row of ``true_valid['content']``;
    wherever the corresponding binary prediction is ``0`` the result is
    replaced by ``-2``.

    Parameters
    ----------
    pred_binary : array-like of binary predictions, one per row of
        ``true_valid`` and in the same row order.
    pred_sentiment : unused; kept so existing callers keep working.
    sentiment_model : fitted classifier exposing ``predict``.
    true_valid : mapping/DataFrame with a ``'content'`` column.
    global_tfidf : already-fitted vectorizer exposing ``transform``.

    Returns
    -------
    numpy.ndarray with one prediction per row of ``true_valid``.

    Raises
    ------
    ValueError
        If ``pred_binary`` and the sentiment predictions differ in shape.
    """
    true_valid_tfidf = global_tfidf.transform(true_valid['content'])
    true_preds = np.asarray(sentiment_model.predict(true_valid_tfidf))
    # Use the argument, not a notebook-level `binary_preds` variable: the
    # function must not depend on whatever the calling cell happened to define.
    binary_flags = np.asarray(pred_binary)
    if binary_flags.shape != true_preds.shape:
        raise ValueError(
            'pred_binary has shape %s but the sentiment predictions have shape %s'
            % (binary_flags.shape, true_preds.shape)
        )
    return np.where(binary_flags == 0, -2, true_preds)

In [106]:
def valid_score(binary_preds, sentiment_preds, true_preds, binary_validy, sentiment_validy, true_validy, method='macro'):
    """Print the F1 score of each stage and return the final ("true") one.

    Three scores are computed with ``f1_score(..., average=method)`` and
    printed in this order: binary, sentiment, true. Each ``*_validy`` argument
    is the ground truth for the matching ``*_preds`` argument.

    Returns
    -------
    The F1 score of ``true_preds`` against ``true_validy``.
    """
    stages = (
        ('Binary f1 score:%.6f', binary_validy, binary_preds),
        ('Sentiment f1 score:%.6f', sentiment_validy, sentiment_preds),
        ('True f1 score:%.6f', true_validy, true_preds),
    )
    score = None
    for template, expected, predicted in stages:
        score = f1_score(expected, predicted, average=method)
        print(template % score)
    # After the loop `score` holds the value for the last ("true") stage.
    return score

In [107]:
print('Learing global tfidf')
# A single vectorizer shared by every per-label model: 1- to 3-gram features,
# dropping terms seen in fewer than 3 documents or in more than 90% of them.
# The on/off options are passed as real booleans; newer scikit-learn releases
# validate parameter types and are expected to reject integer flags here.
global_tfidf = TfidfVectorizer(ngram_range=(1, 3), min_df=3, max_df=0.9,
                               use_idf=True, smooth_idf=True, sublinear_tf=True)
# Only the fitted vocabulary / IDF weights are needed at this point, so use
# fit() rather than fit_transform(), whose (large) output was being discarded.
global_tfidf.fit(total_train['content'])
print('Done\n')

# label -> {'binary_model': ..., 'sentiment_model': ...}; consumed by the
# test-set prediction cell further down.
model_dict = {}
# Running sum of the per-label "true" F1 scores; averaged after the loop.
f1_cnt = 0.0

for label in label_list[2:]:  # skip the two leading non-target columns
    print(label)
    model_dict[label] = {}
    # Per-label pre-split CSV files share this path prefix.
    path = './process_data/'+label+'/'+label
    binary_train = pd.read_csv(path+'_binary_train.csv')
    binary_valid = pd.read_csv(path+'_binary_valid.csv')
    sentiment_train = pd.read_csv(path+'_sentiment_train.csv')
    sentiment_valid = pd.read_csv(path+'_sentiment_valid.csv')
    true_valid = pd.read_csv(path+'_true_valid.csv')

    # Two-stage scheme: a binary model and a sentiment model are trained
    # separately, then combined into the final prediction for true_valid.
    # NOTE(review): the combination assumes binary_valid and true_valid list
    # the same rows in the same order -- confirm against the preprocessing.
    binary_model, binary_preds = train_binary(binary_train, binary_valid, label, global_tfidf)
    sentiment_model, sentiment_preds = train_sentiment(sentiment_train, sentiment_valid, label, global_tfidf)
    true_preds = predict_true_validy(binary_preds, sentiment_preds, sentiment_model, true_valid, global_tfidf)
    f1_cnt += valid_score(binary_preds, sentiment_preds, true_preds, binary_valid[label], sentiment_valid[label], true_valid[label], method='macro')

    model_dict[label]['binary_model'] = binary_model
    model_dict[label]['sentiment_model'] = sentiment_model

    print('-----------------------------------------------------------\n')
# Unweighted mean of the per-label "true" macro-F1 scores.
print("Total F1 score: %.6f"%(f1_cnt/len(label_list[2:])))

Learing global tfidf
Done

location_traffic_convenience
Binary f1 score:0.909223
Sentiment f1 score:0.445998
True f1 score:0.518536
-----------------------------------------------------------

location_distance_from_business_district
Binary f1 score:0.762434
Sentiment f1 score:0.338111
True f1 score:0.382753
-----------------------------------------------------------

location_easy_to_find


  'precision', 'predicted', average, warn_for)


Binary f1 score:0.849838
Sentiment f1 score:0.542367
True f1 score:0.552356
-----------------------------------------------------------

service_wait_time
Binary f1 score:0.770590
Sentiment f1 score:0.603667
True f1 score:0.534505
-----------------------------------------------------------

service_waiters_attitude
Binary f1 score:0.901887
Sentiment f1 score:0.659873
True f1 score:0.682335
-----------------------------------------------------------

service_parking_convenience
Binary f1 score:0.936759
Sentiment f1 score:0.507194
True f1 score:0.589327
-----------------------------------------------------------

service_serving_speed
Binary f1 score:0.815770
Sentiment f1 score:0.585592
True f1 score:0.552197
-----------------------------------------------------------

price_level
Binary f1 score:0.833629
Sentiment f1 score:0.648454
True f1 score:0.628894
-----------------------------------------------------------

price_cost_effective
Binary f1 score:0.839400
Sentiment f1 score:0.610913

In [108]:
def generate_test_output(model_dict, label_list, test_file, global_tfidf):
    """Predict every target column for ``test_file`` and return the filled-in table.

    For each label in ``label_list[2:]`` the label's binary and sentiment
    models are applied to the TF-IDF features of ``test_file['content']``.
    The sentiment prediction is kept, except where the binary model predicts
    ``0``, in which case the value is ``-2``.

    Parameters
    ----------
    model_dict : ``{label: {'binary_model': ..., 'sentiment_model': ...}}``
        with fitted models exposing ``predict``.
    label_list : column names; the first two entries are skipped.
    test_file : DataFrame with a ``'content'`` column. It is NOT modified.
    global_tfidf : already-fitted vectorizer exposing ``transform``.

    Returns
    -------
    A new DataFrame: a copy of ``test_file`` with one predicted column per label.
    """
    test_tfidf = global_tfidf.transform(test_file['content'])
    # Work on a copy so the caller's frame (and any cell that displays it)
    # is not silently changed by running this function.
    result = test_file.copy()
    for label in label_list[2:]:
        models = model_dict[label]
        binary_preds = np.asarray(models['binary_model'].predict(test_tfidf))
        sentiment_preds = np.asarray(models['sentiment_model'].predict(test_tfidf))
        # Assign the raw array (positional) rather than a freshly indexed
        # Series, which would be aligned on index labels and yield NaN for
        # frames whose index is not 0..n-1.
        result[label] = np.where(binary_preds == 0, -2, sentiment_preds)
    return result

In [109]:
# Predict every target column for the test split, print the first row as a
# quick sanity check, and write the full table to result.csv.
test_result = generate_test_output(
    model_dict=model_dict,
    label_list=label_list,
    test_file=total_test,
    global_tfidf=global_tfidf,
)
print(test_result.head(n=1))
test_result.to_csv('result.csv', index=False)

   id                                            content  \
0   0  " 我 想 说 他们 家 的 优惠活动 好 持久 啊 ， 我 预售 的 时候 买 的 券 ，...   

   location_traffic_convenience  location_distance_from_business_district  \
0                            -2                                        -2   

   location_easy_to_find  service_wait_time  service_waiters_attitude  \
0                     -2                 -2                         1   

   service_parking_convenience  service_serving_speed  price_level  \
0                           -2                     -2            0   

                ...                 environment_decoration  environment_noise  \
0               ...                                     -2                 -2   

   environment_space  environment_cleaness  dish_portion  dish_taste  \
0                 -2                     1             1           1   

   dish_look  dish_recommendation  others_overall_experience  \
0         -2                   -2                          1   
