# classification.py
from util import *
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.model_selection import train_test_split
from sklearn.metrics import roc_curve, auc, precision_recall_curve
from sklearn.metrics import classification_report
from matplotlib import pyplot as plt
from sklearn import svm
from sklearn.pipeline import Pipeline
from sklearn.model_selection import GridSearchCV
import numpy as np
import pandas as pd


def label_reviews(data, text='review_text', label='review_rating'):
    """
    Label reviews by rating percentile: 0 for the low half, 1 for the high half.
    """
    # filter out reviews that have no rating value; copy so that adding the
    # label column below does not trigger pandas' SettingWithCopyWarning
    data = data[data[label].notnull()].copy()
    label_based_col = data[label]
    # split at the median rating: bottom half -> 0, top half -> 1
    data['label'] = pd.qcut(label_based_col, [0, 1 / 2, 1], labels=False)
    # data['label'] = np.where(data['review_rating'] >= 8.3, 1, 0)
    # only keep the review text and the label
    data = data[[text, 'label']]
    # reset the index of the dataframe
    data = data.reset_index(drop=True)
    return data
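
# A minimal usage sketch (the CSV path and loading step are assumptions, not
# part of this module; the column names match the function's defaults):
#
#   reviews = pd.read_csv("reviews.csv")  # hypothetical input file
#   labeled = label_reviews(reviews)
#   print(labeled['label'].value_counts())  # roughly balanced 0/1 classes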


def get_tfidf(data):
    tfidf_vect = TfidfVectorizer()
    dtm = tfidf_vect.fit_transform(data['review_text'])
    # map feature indices back to words so the output is readable
    voc_lookup = {idx: word for word, idx in tfidf_vect.vocabulary_.items()}
    doc0 = dtm[0].toarray()[0]
    # argsort takes no array argument; sort the tf-idf weights descending
    top_words = doc0.argsort()[::-1][0:10]
    print("top words for the first document")
    for i in top_words:
        print("{0}:\t{1:.3f}".format(voc_lookup[i], doc0[i]))
    # return the split instead of discarding it, so callers can use it
    return train_test_split(dtm, data["label"], test_size=0.3, random_state=0)
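
# Usage sketch: get_tfidf both prints the top tf-idf terms of the first
# document and returns a ready train/test split of the document-term matrix
# (assumes `labeled` comes from label_reviews above):
#
#   x_train, x_test, y_train, y_test = get_tfidf(labeled)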


def create_model(x_train, y_train, x_test, y_test,
                 model_type="svm", min_df=1, stop_words=None,
                 print_result=True, algorithm_para=1.0):
    # vectorize the train data, then transform the test data with the same fit
    tfidf_vect = TfidfVectorizer(stop_words=stop_words, min_df=min_df)
    x_train_tfidf = tfidf_vect.fit_transform(x_train)
    if model_type == "svm":
        model = svm.LinearSVC(C=algorithm_para)
        clf = model.fit(x_train_tfidf, y_train)
    elif model_type == "nb":
        model = MultinomialNB(alpha=algorithm_para)
        clf = model.fit(x_train_tfidf, y_train)
    else:
        print("Unknown model type; expected 'svm' or 'nb'")
        return None
    x_transformed = tfidf_vect.transform(x_test)
    # LinearSVC has no predict_proba, so use its decision margin instead
    predict_p = (clf.predict_proba(x_transformed) if model_type == "nb"
                 else clf.decision_function(x_transformed))
    predict = clf.predict(x_transformed)
    if print_result:
        print(classification_report(y_test, predict))
    # reduce to one score per sample: P(class 1) for NB, the margin for SVM
    scores = predict_p[:, 1] if model_type == "nb" else predict_p
    # compute the ROC and PRC curves from the same scores
    fpr, tpr, thresholds = roc_curve(y_test, scores)
    precision, recall, thresholds = precision_recall_curve(y_test, scores)
    prc_score = auc(recall, precision)
    auc_score = auc(fpr, tpr)
    if print_result:
        print("AUC : {:.2%}  PRC : {:.2%}".format(auc_score, prc_score))
        plt.plot(fpr, tpr)
        plt.plot([0, 1], [0, 1], 'k--')
        plt.xlim([0.0, 1.0])
        plt.ylim([0.0, 1.05])
        plt.xlabel('False Positive Rate')
        plt.ylabel('True Positive Rate')
        if model_type == "svm":
            plt.title('SVM-AUC')
        elif model_type == "nb":
            plt.title('NB-AUC')
        # plt.title('ROC curve')
        # plt.legend(loc="lower right")
        plt.show()
        plt.plot(recall, precision)
        plt.xlim([0.0, 1.0])
        plt.ylim([0.0, 1.05])
        plt.xlabel('Recall')
        plt.ylabel('Precision')
        if model_type == "svm":
            plt.title('SVM-PRC')
        elif model_type == "nb":
            plt.title('NB-PRC')
        # plt.legend(loc="lower right")
        plt.show()
    return auc_score, prc_score
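
# Usage sketch: unlike get_tfidf, create_model expects raw text and vectorizes
# internally, so split the text column itself (`labeled` as assumed above):
#
#   x_train, x_test, y_train, y_test = train_test_split(
#       labeled['review_text'], labeled['label'], test_size=0.3, random_state=0)
#   auc_score, prc_score = create_model(x_train, y_train, x_test, y_test,
#                                       model_type="svm", stop_words="english")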


def search_para(docs, y):
    # define the pipeline: tf-idf vectorizer followed by a linear SVM
    pipeline = Pipeline([
        ('tfidf', TfidfVectorizer()),
        ('clf', svm.LinearSVC())
    ])
    parameters = {'tfidf__min_df': [1, 2, 5, 10],
                  'tfidf__stop_words': [None, "english"],
                  'clf__C': [0.5, 1.0, 2.0],
                  }
    # define the grid search with 5-fold cross-validation
    grid_search = GridSearchCV(pipeline, param_grid=parameters, cv=5, scoring='f1_macro')
    # fit the grid search
    grid_search.fit(docs, y)
    # report the best configuration and its cross-validated f1_macro score
    print(grid_search.best_params_)
    print(grid_search.best_score_)
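
# Usage sketch (same assumed `labeled` dataframe; GridSearchCV refits the
# whole pipeline for every parameter combination, so this can take a while):
#
#   search_para(labeled['review_text'], labeled['label'])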


def sample_size_impact(docs, y):
    svm_aucs = []
    nb_aucs = []
    # train on 10% to 80% of the data and track the AUC of both models
    train_fractions = [i / 10 for i in range(1, 9)]
    for frac in train_fractions:
        x_train, x_test, y_train, y_test = train_test_split(docs, y, test_size=1 - frac, random_state=0)
        for mt in ['svm', 'nb']:
            auc_score, prc_score = create_model(x_train, y_train, x_test, y_test,
                                                model_type=mt, stop_words='english', print_result=False)
            if mt == 'svm':
                svm_aucs.append(auc_score)
            else:
                nb_aucs.append(auc_score)
    # plot a line chart showing the relationship between sample size and AUC
    plt.plot(train_fractions, svm_aucs, label='svm')
    plt.plot(train_fractions, nb_aucs, label='nb')
    plt.xlabel('Training Set Fraction')
    plt.ylabel('AUC Score')
    plt.legend()
    plt.show()
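
# Usage sketch (same assumed `labeled` dataframe; this retrains both models at
# eight training-set fractions, so expect it to be slow):
#
#   sample_size_impact(labeled['review_text'], labeled['label'])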


def get_helpfulness_rate(data):
    upvote = data['review_helpfulness_upvote']
    total = data['review_helpfulness_total']
    # guard against division by zero: reviews with no votes get NaN, not inf
    data['helpfulness_rate'] = upvote / total.replace(0, np.nan)
    return data
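
# Usage sketch (assumes the raw dataframe carries the two helpfulness columns
# referenced above):
#
#   rated = get_helpfulness_rate(raw_reviews)
#   print(rated['helpfulness_rate'].describe())


# A minimal end-to-end sketch tying the pieces together. The CSV path is a
# placeholder assumption; the column names follow the defaults used above.
if __name__ == "__main__":
    raw_reviews = pd.read_csv("reviews.csv")  # hypothetical input file
    labeled = label_reviews(raw_reviews)
    x_train, x_test, y_train, y_test = train_test_split(
        labeled['review_text'], labeled['label'], test_size=0.3, random_state=0)
    create_model(x_train, y_train, x_test, y_test,
                 model_type="nb", stop_words="english")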