-
Notifications
You must be signed in to change notification settings - Fork 0
/
task02.py
72 lines (49 loc) · 2.17 KB
/
task02.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
import numpy as np
import pandas as pd
import sklearn.model_selection as skl_ms
import sklearn.linear_model as skl_lm
import sklearn.metrics as skl_m
import sklearn.feature_extraction.text as skl_text
def prepare(data):
    """Split the comment dataset and turn its text into bag-of-words features.

    Only the binary ``toxic`` label is modelled; the other label columns
    (``severe_toxic``, ``obscene``, ``threat``, ``insult``,
    ``identity_hate``) are discarded when present.

    Args:
        data: DataFrame with at least a ``comment_text`` column (raw text)
            and a ``toxic`` column (target label). It is not modified.

    Returns:
        A tuple ``(X, y, X_test, y_test)`` where ``X`` / ``X_test`` are the
        token-count matrices for the 80% training and 20% test split, and
        ``y`` / ``y_test`` are the matching ``toxic`` label columns.
    """
    # Drop the unused labels on a copy so the caller's frame stays intact;
    # errors="ignore" lets a frame that already lacks them pass through.
    unused_labels = ["severe_toxic", "obscene", "threat", "insult", "identity_hate"]
    data = data.drop(columns=unused_labels, errors="ignore")

    # Fixed random_state keeps the split reproducible between runs.
    teach_data, test_data = skl_ms.train_test_split(data, test_size=0.2, random_state=5)

    # stop_words must be the *string* 'english' to select the built-in list;
    # a set such as {'english'} would only filter the literal word "english".
    vectorizer = skl_text.CountVectorizer(
        lowercase=True,
        ngram_range=(1, 1),
        strip_accents="unicode",
        stop_words="english",
        analyzer="word",
    )

    # Learn the vocabulary from the training split only, so nothing from the
    # held-out comments leaks into the features.
    X = vectorizer.fit_transform(teach_data["comment_text"])
    y = teach_data["toxic"]
    X_test = vectorizer.transform(test_data["comment_text"])
    y_test = test_data["toxic"]
    return X, y, X_test, y_test
def _print_scores(y_true, y_pred, suffix=""):
    """Print the classification report and macro-averaged F1 for predictions.

    Args:
        y_true: Ground-truth labels.
        y_pred: Predicted labels, aligned with ``y_true``.
        suffix: Text appended to the printed headings to tell runs apart.
    """
    report = skl_m.classification_report(y_true, y_pred)
    f1 = skl_m.f1_score(y_true, y_pred, average='macro')
    print(f"Classification_report{suffix}\n ", report)
    print(f"f1{suffix}:", f1)


def start(csv_path="train.csv"):
    """Train and evaluate toxic-comment classifiers, printing their scores.

    Fits a logistic regression with default regularization first, then
    runs a grid search over ``C`` scored by macro F1, and prints a
    classification report and macro F1 on the held-out split for each.

    Args:
        csv_path: Path of the CSV file to load. Defaults to ``"train.csv"``
            in the current working directory.
    """
    print("Read data from csv")
    data = pd.read_csv(csv_path)

    print("Prepare data")
    X, y, X_test, y_test = prepare(data)

    print("Start regression")
    regression = skl_lm.LogisticRegression(random_state=0, max_iter=1000)
    regression.fit(X, y)

    print("Start predict")
    _print_scores(y_test, regression.predict(X_test))

    print("-"*20, "\nStart grid search")
    # Score candidates with the same macro F1 that is reported afterwards.
    f1_scorer = skl_m.make_scorer(skl_m.f1_score, average='macro')
    grid_search = skl_ms.GridSearchCV(
        skl_lm.LogisticRegression(random_state=0, penalty='l2', max_iter=1000),
        dict(C=np.arange(0.01, 1, 0.1)),
        scoring=f1_scorer,
    )
    grid_search.fit(X, y)

    print("Start predict\n")
    # GridSearchCV.predict uses the estimator refitted with the best C.
    _print_scores(y_test, grid_search.predict(X_test), suffix=" for grid search")
# Run the full train/evaluate pipeline only when executed as a script,
# not when this module is imported.
if __name__ == "__main__":
    start()