# data는 e9t(Lucy Park)님께서 github에 공유해주신 네이버 영화평점 데이터를 사용하였습니다.
# https://github.com/e9t/nsmc

In [1]:
# data를 읽어오는 함수
def read_txt(path_to_file):
    """Read a tab-separated ratings file into parallel text and label lists.

    The first line is treated as a header and skipped. Each remaining
    non-empty line must hold exactly three tab-separated fields: the first
    is discarded, the second is kept as the text, and the third is parsed
    as an integer label.

    Args:
        path_to_file: Path of the file to read. It is decoded as UTF-8.

    Returns:
        A ``(txt_ls, label_ls)`` tuple of two lists of equal length, holding
        the texts and their integer labels in file order.

    Raises:
        ValueError: If a non-empty line does not have exactly three fields
            or its label is not an integer.
    """
    txt_ls = []
    label_ls = []

    # Pin the encoding: the default depends on the platform locale, which
    # would mis-decode (or fail on) non-ASCII text on non-UTF-8 systems.
    with open(path_to_file, encoding='utf-8') as f:
        # Skip the header row; the default keeps an empty file from raising.
        next(f, None)
        # Iterate the file lazily instead of materialising every line first.
        for line in f:
            row = line.rstrip('\n')
            if not row:
                # Tolerate blank lines (e.g. a trailing newline at EOF).
                continue
            _, txt, label = row.split('\t')
            txt_ls.append(txt)
            label_ls.append(int(label))

    return txt_ls, label_ls

In [2]:
# Load the train and test splits; each call returns a (texts, labels) pair.
x_train, y_train = read_txt('../ratings_train.txt')
x_test, y_test = read_txt('../ratings_test.txt')

In [3]:
# Sanity check: texts and labels must have matching lengths within each split.
len(x_train), len(y_train), len(x_test), len(y_test)

(150000, 150000, 50000, 50000)

In [4]:
# Peek at the first training text to confirm it was decoded correctly.
x_train[0]

'아 더빙.. 진짜 짜증나네요 목소리'

## 파이프라인 (Pipeline) 생성

In [6]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import Pipeline

In [8]:
# Two-step pipeline: TF-IDF features feed a logistic-regression classifier.
# The step names ('tfidf', 'clf') are the prefixes used when addressing each
# step's hyper-parameters as '<step>__<param>'.
pipeline = Pipeline([
    ('tfidf', TfidfVectorizer()),
    ('clf', LogisticRegression()),
])

## 그리드 서치

In [None]:
from sklearn.model_selection import GridSearchCV

In [22]:
# Search space, keyed as '<pipeline step>__<parameter>'.
# 3 n-gram ranges x 3 min_df x 3 max_df x 3 solvers = 81 candidates.
parameters = {
    'tfidf__ngram_range' : ((1,1),(1,2),(1,3)),  # unigrams up to trigrams
    'tfidf__min_df' : (1, 3,10),  # integer document-frequency lower bounds
    'tfidf__max_df' : (0.1, 0.25, 0.5),  # fractional document-frequency upper bounds
    'clf__solver' : ('lbfgs', 'saga', 'newton-cg')
}

# 2-fold cross-validation over every candidate (81 x 2 = 162 fits), run on
# 4 parallel workers with progress logging enabled via verbose=10.
grid_search = GridSearchCV(pipeline, parameters, cv=2, n_jobs=4, verbose=10)
grid_search.fit(x_train, y_train)

Fitting 2 folds for each of 81 candidates, totalling 162 fits


[Parallel(n_jobs=4)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=4)]: Done   5 tasks      | elapsed:   33.1s
[Parallel(n_jobs=4)]: Done  10 tasks      | elapsed:   47.3s
[Parallel(n_jobs=4)]: Done  17 tasks      | elapsed:  1.2min
[Parallel(n_jobs=4)]: Done  24 tasks      | elapsed:  1.8min
[Parallel(n_jobs=4)]: Done  33 tasks      | elapsed:  2.2min
[Parallel(n_jobs=4)]: Done  42 tasks      | elapsed:  2.9min
[Parallel(n_jobs=4)]: Done  53 tasks      | elapsed:  3.4min
[Parallel(n_jobs=4)]: Done  64 tasks      | elapsed:  4.0min
[Parallel(n_jobs=4)]: Done  77 tasks      | elapsed:  4.6min
[Parallel(n_jobs=4)]: Done  90 tasks      | elapsed:  5.2min
[Parallel(n_jobs=4)]: Done 105 tasks      | elapsed:  5.9min
[Parallel(n_jobs=4)]: Done 120 tasks      | elapsed:  6.7min
[Parallel(n_jobs=4)]: Done 137 tasks      | elapsed:  7.6min
[Parallel(n_jobs=4)]: Done 154 tasks      | elapsed:  8.5min
[Parallel(n_jobs=4)]: Done 162 out of 162 | elapsed:  8.8min finished


GridSearchCV(cv=2, error_score='raise-deprecating',
       estimator=Pipeline(memory=None,
     steps=[('tfidf', TfidfVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.float64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=None, min_df=1,
        ngram_range=(1, 1), norm='l2', preprocessor=None, smooth_idf=True,...penalty='l2', random_state=None, solver='warn',
          tol=0.0001, verbose=0, warm_start=False))]),
       fit_params=None, iid='warn', n_jobs=4,
       param_grid={'clf__solver': ('lbfgs', 'saga', 'newton-cg'), 'tfidf__max_df': (0.1, 0.25, 0.5), 'tfidf__ngram_range': ((1, 1), (1, 2), (1, 3)), 'tfidf__min_df': (1, 3, 10)},
       pre_dispatch='2*n_jobs', refit=True, return_train_score='warn',
       scoring=None, verbose=10)

# Test

In [36]:
# Show the parameter combination selected by the grid search.
grid_search.best_params_

{'clf__solver': 'saga',
 'tfidf__max_df': 0.25,
 'tfidf__min_df': 1,
 'tfidf__ngram_range': (1, 1)}

In [32]:
from sklearn.metrics import classification_report
from sklearn.metrics import accuracy_score

In [33]:
# Take the pipeline corresponding to the best parameter combination.
best_model = grid_search.best_estimator_

In [34]:
# Predict labels for the held-out test texts and score them against y_test.
predict = best_model.predict(x_test)
accuracy = accuracy_score(y_test, predict)

In [35]:
# Print overall accuracy followed by the per-class classification report.
print('Accuracy : ',accuracy)
print(classification_report(y_test, predict))

Accuracy :  0.81242
              precision    recall  f1-score   support

           0       0.79      0.84      0.82     24827
           1       0.83      0.78      0.81     25173

   micro avg       0.81      0.81      0.81     50000
   macro avg       0.81      0.81      0.81     50000
weighted avg       0.81      0.81      0.81     50000

