In [20]:
import csv

# import matplotlib.pyplot as plt
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import GridSearchCV, train_test_split
from sklearn.metrics import classification_report, confusion_matrix

In [3]:
def read_csv_column(col):
    with open('./sentiment_lemma.txt') as file:
        reader = csv.reader(file, delimiter='\t')
        header = next(reader)
        return [row[col] for row in reader]    

In [4]:
x_all = read_csv_column(1)
y_all = read_csv_column(0)

In [5]:
cv = CountVectorizer(min_df=0.0005, max_df=0.10)

In [6]:
x_all_cv = cv.fit_transform(x_all)
x_train, x_test, y_train, y_test = train_test_split(x_all_cv, y_all)

In [22]:
lr = LogisticRegression()
#lr = LogisticRegression(solver='sag', max_iter=10000, verbose=2)

In [46]:
tuned_parameters = [
    {'C': [1, 10, 100, 1000], 'solver': ['newton-cg']},
    {'C': [1, 10, 100, 1000], 'solver': ['lbfgs']},
    {'C': [1, 10, 100, 1000], 'solver': ['liblinear'], 'penalty':['l1', 'l2']},
    {'C': [1, 10, 100, 1000], 'solver': ['sag']},
    {'C': [1, 10, 100, 1000], 'solver': ['saga'], 'penalty':['l1', 'l2']}
    ]

In [47]:
score = 'f1'
clf = GridSearchCV(
    lr, # 識別器
    tuned_parameters, # 最適化したいパラメータセット 
    cv=5, # 交差検定の回数
    scoring='%s_weighted' % score ) # モデルの評価関数の指定

In [48]:
%%time
clf.fit(x_train, y_train)



GridSearchCV(cv=5, error_score='raise-deprecating',
             estimator=LogisticRegression(C=1.0, class_weight=None, dual=False,
                                          fit_intercept=True,
                                          intercept_scaling=1, l1_ratio=None,
                                          max_iter=100, multi_class='warn',
                                          n_jobs=None, penalty='l2',
                                          random_state=None, solver='warn',
                                          tol=0.0001, verbose=0,
                                          warm_start=False),
             iid='warn', n_jobs=None,
             param_grid=[{'C': [1, 10, 100, 1000], 'solver': ['newton-cg']},
                         {'C': [1, 10, 100, 1000], 'solver': ['lbfgs']},
                         {'C': [1, 10, 100, 1000], 'penalty': ['l1', 'l2'],
                          'solver': ['liblinear']},
                         {'C': [1, 10, 100, 1000], 'solver': ['sa

In [62]:
print(clf.best_params_)
print(clf.best_score_)

{'C': 1, 'solver': 'newton-cg'}
0.7275942635260467


In [64]:
# スコアの一覧を取得
gs_result = pd.DataFrame.from_dict(clf.cv_results_)
gs_result.to_csv('gs_result.csv')

In [31]:
len(cv.get_feature_names())

3348

In [32]:
 # 最高性能のモデルを取得し、テストデータを分類
best = clf.best_estimator_
pred = best.predict(x_test)
clf.score(x_test, y_test)

0.7457096170380763

In [33]:
clf.score(x_train, y_train)

0.8913224026566202

In [34]:
y_pred = clf.predict(x_test)

In [35]:
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.75      0.75      0.75      1366
           1       0.74      0.74      0.74      1300

    accuracy                           0.75      2666
   macro avg       0.75      0.75      0.75      2666
weighted avg       0.75      0.75      0.75      2666



In [36]:
print(confusion_matrix(y_test, y_pred))

[[1021  345]
 [ 333  967]]
