<a href="https://colab.research.google.com/github/amy-hyunji/korean_multi_label_SA/blob/master/BOG.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
# Notebook setup: plotting configuration, library imports, and NLTK tokenizer data

%matplotlib inline
%config InlineBackend.figure_format = "retina"

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

import nltk
from nltk.corpus import stopwords
from nltk.stem import SnowballStemmer
from nltk.tokenize import word_tokenize, regexp_tokenize, TweetTokenizer

from sklearn.preprocessing import label_binarize
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.model_selection import train_test_split, StratifiedKFold, cross_val_score
from sklearn.pipeline import make_pipeline, Pipeline
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import make_scorer, accuracy_score, f1_score
from sklearn.metrics import roc_curve, auc
from sklearn.metrics import confusion_matrix, roc_auc_score, recall_score, precision_score

from google.colab import files
import io

# Fetch NLTK's 'punkt' tokenizer data into the local nltk_data directory
# (presumably what word_tokenize relies on at runtime — confirm against the NLTK docs).
nltk.download('punkt')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


True

In [5]:
# Open Colab's upload dialog; the result is indexed by filename and each
# value supports len(), which is reported below as the size in bytes.
uploaded = files.upload()

# Echo every uploaded file together with its size.
for fn in uploaded:
    print(
        'User uploaded file "{name}" with length {length} bytes'.format(
            name=fn,
            length=len(uploaded[fn]),
        )
    )

Saving final_remove_dup_no_neutral_test.csv to final_remove_dup_no_neutral_test.csv
Saving final_remove_dup_no_neutral_train.csv to final_remove_dup_no_neutral_train.csv
User uploaded file "final_remove_dup_no_neutral_test.csv" with length 695403 bytes
User uploaded file "final_remove_dup_no_neutral_train.csv" with length 2847879 bytes


In [0]:
# Decode each uploaded CSV payload as UTF-8 text and parse it into a DataFrame.
train_df = pd.read_csv(
    io.StringIO(uploaded["final_remove_dup_no_neutral_train.csv"].decode("utf-8"))
)
test_df = pd.read_csv(
    io.StringIO(uploaded["final_remove_dup_no_neutral_test.csv"].decode("utf-8"))
)

# Pull the "sentence" and "label" columns out as plain Python lists:
# X_* hold the sentences, y_* hold the labels.
X_train, y_train = train_df["sentence"].tolist(), train_df["label"].tolist()
X_test, y_test = test_df["sentence"].tolist(), test_df["label"].tolist()

In [0]:
def tokenize(text):
    """Split ``text`` into a sequence of tokens using NLTK's ``word_tokenize``.

    This callable is what the notebook hands to ``CountVectorizer`` as its
    ``tokenizer``.
    """
    tokens = word_tokenize(text)
    return tokens

def stem(doc):
    """Return a generator of stemmed tokens for ``doc``.

    Every token yielded by ``analyzer(doc)`` is passed through
    ``stemmer.stem``; the result is a lazy generator expression, not a list.

    NOTE(review): neither ``stemmer`` nor ``analyzer`` is defined in the
    visible notebook cells, and no visible cell calls this function. Calling
    it as-is would presumably raise ``NameError`` — define both names (or
    drop this helper) before relying on it.
    """
    return (stemmer.stem(w) for w in analyzer(doc))

# Seed NumPy's global RNG before any model objects are built.
np.random.seed(1)

# Bag-of-words features: lowercased unigram counts, tokenised by the
# notebook's tokenize() helper, with no stop-word filtering.
vectorizer = CountVectorizer(
    tokenizer=tokenize,
    analyzer='word',
    ngram_range=(1, 1),
    lowercase=True,
    stop_words=None,
)

# Shuffled, stratified 5-fold splitter with a fixed seed.
kfolds = StratifiedKFold(n_splits=5, shuffle=True, random_state=1)

# Vectoriser followed by a class-balanced RBF SVM. The step names are the
# lowercase class names, which is what the 'svc__C' grid key below refers to.
pipeline_svm = Pipeline([
    ("countvectorizer", vectorizer),
    ("svc", SVC(kernel="rbf", probability=True, class_weight="balanced")),
])

# Grid search over the SVM regularisation strength C, scored by macro-F1.
grid_svm = GridSearchCV(
    pipeline_svm,
    param_grid={'svc__C': [0.01, 0.1, 1]},
    scoring="f1_macro",
    cv=kfolds,
    n_jobs=-1,
    verbose=1,
)

In [14]:
# Run the grid search on the training split and time it; grid_svm is rebound
# to whatever fit() returns.
# NOTE(review): the saved output of this cell shows a KeyboardInterrupt and its
# execution count (14) is higher than the cells below it (9-13), so the scores
# displayed further down appear to come from an earlier, completed run.
%time grid_svm = grid_svm.fit(X_train, y_train)

Fitting 5 folds for each of 3 candidates, totalling 15 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 2 concurrent workers.


KeyboardInterrupt: ignored

In [9]:
# Score the fitted grid search on the test split. grid_svm was built with
# scoring="f1_macro", so this is presumably macro-F1 rather than accuracy —
# confirm against the GridSearchCV.score documentation.
grid_svm.score(X_test, y_test)

0.6040400275240534

In [10]:
# Hyper-parameter setting selected by the grid search (only 'svc__C' is searched).
grid_svm.best_params_

{'svc__C': 1}

In [11]:
# Cross-validation score recorded for the selected setting; the grid search was
# configured with scoring="f1_macro" and the 5-fold kfolds splitter.
grid_svm.best_score_

0.5957950283932433

In [0]:
def report_results(model, X, y):
    """Evaluate a fitted classifier on ``(X, y)`` and return summary metrics.

    Parameters
    ----------
    model
        Fitted estimator exposing ``predict``.
    X
        Samples passed to ``model.predict``.
    y
        Ground-truth labels aligned with ``X``.

    Returns
    -------
    dict
        ``'acc'`` is the overall accuracy; ``'f1'``, ``'precision'`` and
        ``'recall'`` are computed with ``average=None``, i.e. one value per
        class rather than a single aggregate.
    """
    # The previous version also evaluated model.predict_proba(X)[:, 1] and
    # threw the result away. That cost an extra pass over X, required the
    # model to support predict_proba, and the column-1 slice is a
    # binary-classification idiom that does not apply to multi-class labels.
    pred = model.predict(X)

    return {
        'f1': f1_score(y, pred, average=None),
        'acc': accuracy_score(y, pred),
        'precision': precision_score(y, pred, average=None),
        'recall': recall_score(y, pred, average=None),
    }

In [13]:
# Per-class F1 / precision / recall plus overall accuracy of the selected
# estimator on the test split.
report_results(grid_svm.best_estimator_, X_test, y_test)

{'acc': 0.6057255676209279,
 'f1': array([0.65456652, 0.61592795, 0.55240913, 0.59325651]),
 'precision': array([0.65242494, 0.56458401, 0.6188447 , 0.5963106 ]),
 'recall': array([0.6567222 , 0.67754468, 0.49885496, 0.59023355])}