<a href="https://colab.research.google.com/github/amy-hyunji/korean_multi_labeled_SA/blob/master/BOG.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [0]:
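# Notebook setup: plotting configuration, imports, and NLTK tokenizer data
# (the datasets themselves are uploaded and parsed in the following cells)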

%matplotlib inline
%config InlineBackend.figure_format = "retina"

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

import nltk
from nltk.corpus import stopwords
from nltk.stem import SnowballStemmer
from nltk.tokenize import word_tokenize, regexp_tokenize, TweetTokenizer

from sklearn.preprocessing import label_binarize
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.model_selection import train_test_split, StratifiedKFold, cross_val_score
from sklearn.pipeline import make_pipeline, Pipeline
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import make_scorer, accuracy_score, f1_score
from sklearn.metrics import roc_curve, auc
from sklearn.metrics import confusion_matrix, roc_auc_score, recall_score, precision_score

from google.colab import files
import io

# Fetch the Punkt tokenizer models that nltk's word_tokenize relies on.
# NOTE(review): recent NLTK releases look for the 'punkt_tab' resource
# instead - confirm against the installed NLTK version.
nltk.download('punkt')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [0]:
# Open the Colab file picker; the result maps each uploaded filename to its raw bytes.
uploaded = files.upload()

# Echo every uploaded file with its size as a quick sanity check.
for fn in uploaded:
  print(
      'User uploaded file "{name}" with length {length} bytes'.format(
          name=fn,
          length=len(uploaded[fn]),
      )
  )

Saving final_without_neutral_test.csv to final_without_neutral_test (1).csv
Saving final_without_neutral_train.csv to final_without_neutral_train (1).csv
User uploaded file "final_without_neutral_test.csv" with length 198815 bytes
User uploaded file "final_without_neutral_train.csv" with length 4938603 bytes


In [0]:
# Decode the uploaded bytes as UTF-8 and parse each CSV into a DataFrame.
train_df = pd.read_csv(
    io.StringIO(uploaded["final_without_neutral_train.csv"].decode("utf-8"))
)
test_df = pd.read_csv(
    io.StringIO(uploaded["final_without_neutral_test.csv"].decode("utf-8"))
)

# Plain Python lists: "sentence" holds the inputs, "label" the targets.
X_train, y_train = train_df["sentence"].tolist(), train_df["label"].tolist()
X_test, y_test = test_df["sentence"].tolist(), test_df["label"].tolist()

In [0]:
def tokenize(text):
    """Split ``text`` into a list of tokens using NLTK's ``word_tokenize``."""
    tokens = word_tokenize(text)
    return tokens

def stem(doc):
    """Return a generator of ``stemmer.stem(w)`` for each ``w`` in ``analyzer(doc)``.

    NOTE(review): neither ``stemmer`` nor ``analyzer`` is defined in the
    visible notebook cells, so calling this function as-is raises
    ``NameError``. It is also not referenced by any visible cell.
    Presumably ``stemmer`` was meant to be a ``SnowballStemmer`` instance and
    ``analyzer`` a vectorizer's ``build_analyzer()`` result - confirm, or
    remove this helper.
    """
    return (stemmer.stem(w) for w in analyzer(doc))

# Single seed shared by every stochastic component configured in this cell.
SEED = 1

# Bag-of-words features: lower-cased unigram counts over NLTK word tokens.
vectorizer = CountVectorizer(
    analyzer='word',
    tokenizer=tokenize,
    # token_pattern is ignored when a custom tokenizer is supplied; setting it
    # to None avoids scikit-learn's "token_pattern will not be used" warning.
    token_pattern=None,
    lowercase=True,
    ngram_range=(1, 1),
    stop_words=None)

# Shuffled, stratified 5-fold splitter with a fixed seed so folds are repeatable.
kfolds = StratifiedKFold(n_splits=5, shuffle=True, random_state=SEED)

np.random.seed(SEED)

# probability=True makes SVC run an extra internal cross-validation to
# calibrate probabilities, which is random and noticeably slows every fit.
# The global NumPy seed set above does not reach the worker processes spawned
# by n_jobs=-1, so the estimator's own random_state is pinned explicitly.
pipeline_svm = make_pipeline(
    vectorizer,
    SVC(probability=True,
        kernel="rbf",
        class_weight="balanced",
        random_state=SEED))

# Search over the SVM regularisation strength C, selecting by macro-averaged F1.
# 'svc' is the step name make_pipeline derives from the SVC class name.
grid_svm = GridSearchCV(
    pipeline_svm,
    param_grid={'svc__C': [0.01, 0.1, 1, 2, 3, 5, 10]},
    cv=kfolds,
    scoring="f1_macro",
    verbose=1,
    n_jobs=-1)

In [0]:
# Run the cross-validated search over svc__C and time it. fit() returns the
# fitted search object itself, so rebinding grid_svm keeps the same object.
%time grid_svm = grid_svm.fit(X_train, y_train)

Fitting 5 folds for each of 7 candidates, totalling 35 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 2 concurrent workers.


In [0]:
# Held-out test score of the refit best pipeline; the search was built with
# scoring="f1_macro", so this value is macro-averaged F1, not accuracy.
grid_svm.score(X_test, y_test)

In [0]:
# Hyper-parameter setting (svc__C) with the best mean cross-validation score.
grid_svm.best_params_

In [0]:
# Mean cross-validated macro-F1 of the best setting (training folds only,
# not the test-set score).
grid_svm.best_score_

In [0]:
def report_results(model, X, y):
    """Evaluate a fitted classifier on ``(X, y)`` and collect standard metrics.

    Parameters
    ----------
    model : fitted estimator exposing ``predict``.
    X : inputs in whatever form ``model.predict`` accepts.
    y : true labels aligned with ``X``.

    Returns
    -------
    dict
        ``'acc'`` is the overall accuracy. ``'f1'``, ``'precision'`` and
        ``'recall'`` are computed with ``average=None``, i.e. one value per
        class rather than a single averaged number.
    """
    # Only hard predictions are needed for these metrics. The earlier
    # predict_proba call was never used, cost an extra pass over X, and
    # required the model to support probability estimates.
    pred = model.predict(X)

    return {
        'f1': f1_score(y, pred, average=None),
        'acc': accuracy_score(y, pred),
        'precision': precision_score(y, pred, average=None),
        'recall': recall_score(y, pred, average=None),
    }

In [0]:
# Per-class F1/precision/recall and overall accuracy of the refit best
# pipeline on the held-out test set.
report_results(grid_svm.best_estimator_, X_test, y_test)