In [5]:
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import classification_report, confusion_matrix
from konlpy.tag import Twitter
from konlpy.tag import Kkma
import pandas as pd
import numpy as np

In [45]:
def basic_input(frozen_dir='./private/files/seodam_together_notags0326.csv', unfrozen_dir='./private/files/unfrozen2_3500.csv', row_limit=3211):
    """Load the frozen/unfrozen CSV files and return a 90/10 train/test split.

    Rows from the unfrozen file are labelled 0 and rows from the frozen file
    are labelled 1.  Every unfrozen row gets weight 1; frozen rows take their
    weight from the ``freeze`` column.

    Parameters
    ----------
    frozen_dir : str
        Path of the CSV supplying the ``text`` and ``freeze`` columns.
    unfrozen_dir : str
        Path of the CSV supplying the ``text2`` column.
    row_limit : int
        Maximum number of rows taken from the top of the unfrozen file.

    Both files must contain an ``'Unnamed: 0'`` column; it is dropped after
    loading.

    Returns
    -------
    tuple
        ``(X_train, X_test, y_train, y_test, w_train)``.  The test-set
        weights produced by the split are discarded.
    """
    df_frozen = pd.read_csv(frozen_dir).drop(['Unnamed: 0'], axis=1)
    df_unfrozen = pd.read_csv(unfrozen_dir).drop(['Unnamed: 0'], axis=1)[:row_limit]

    unfrozen = np.array(df_unfrozen['text2'])
    frozen = np.array(df_frozen['text'])

    # Size labels and weights from the rows actually loaded instead of a
    # hard-coded count, so texts, labels and weights always line up, even
    # when row_limit changes or a file holds fewer rows than expected.
    n_unfrozen = len(unfrozen)
    n_frozen = len(frozen)

    seodam_x = np.append(unfrozen, frozen)
    seodam_y = np.append(np.zeros(n_unfrozen, dtype=int), np.ones(n_frozen, dtype=int))
    weight0 = np.append(np.ones(n_unfrozen, dtype=int), np.array(df_frozen['freeze']))

    # Fixed random_state keeps the split reproducible between runs.
    X_train, X_test, y_train, y_test, w_train, _ = train_test_split(
        seodam_x, seodam_y, weight0, test_size=0.1, random_state=0)

    return X_train, X_test, y_train, y_test, w_train

def tokenize_filtered(doc):
    """Tokenize ``doc`` into ``'token/POS'`` strings, dropping function-like tags.

    The text is tagged with ``Twitter().pos(doc, norm=True, stem=True)`` and
    every pair whose tag is ``Josa``, ``Punctuation``, ``Determiner`` or
    ``URL`` is discarded.

    This function is deliberately not called ``tokenize_basic``: that name
    is also used by the unfiltered tokenizer defined later in this file,
    whose definition would replace this one and make the filtering variant
    unreachable.

    Parameters
    ----------
    doc : str
        Text to tokenize.

    Returns
    -------
    list of str
        One ``'token/POS'`` string per retained pair, in tagger order.
    """
    excluded_tags = ('Josa', 'Punctuation', 'Determiner', 'URL')
    # A new tagger is constructed on every call, as in the original code.
    tagger = Twitter()
    return ['/'.join(pair)
            for pair in tagger.pos(doc, norm=True, stem=True)
            if pair[1] not in excluded_tags]

def tokenize_noun(doc):
    """Return the nouns that a freshly constructed ``Twitter`` tagger finds in ``doc``."""
    # The tagger is built per call and its nouns() result is returned unchanged.
    return Twitter().nouns(doc)

def tokenize_basic(doc):
    """Tokenize ``doc`` into ``'token/POS'`` strings without any filtering.

    Tags the text with ``Twitter().pos(doc, norm=True, stem=True)`` and joins
    each returned pair with ``'/'``, keeping the tagger's order.
    """
    tagged_pairs = Twitter().pos(doc, norm=True, stem=True)
    tokens = []
    for pair in tagged_pairs:
        tokens.append('/'.join(pair))
    return tokens

In [46]:
def get_one_report(X_train, y_train, X_test, y_test):
    """Train a count-vectorizer + MultinomialNB pipeline and report on the test set.

    The vectorizer uses the module-level ``tokenize`` callable, which must
    already be defined when this function runs.

    Returns the value of ``classification_report(y_test, predictions)``.
    """
    steps = [
        ('vect', CountVectorizer(tokenizer=tokenize)),
        ('clf', MultinomialNB()),
    ]
    classifier = Pipeline(steps)
    classifier.fit(X_train, y_train)
    predictions = classifier.predict(X_test)
    return classification_report(y_test, predictions)

def get_conf_matrix(X_train, y_train, X_test, y_test):
    """Train a count-vectorizer + MultinomialNB pipeline and score it on the test set.

    The vectorizer uses the module-level ``tokenize`` callable, which must
    already be defined when this function runs.

    Returns the value of ``confusion_matrix(y_test, predictions)``.
    """
    vectorizer = CountVectorizer(tokenizer=tokenize)
    naive_bayes = MultinomialNB()
    nb_pipeline = Pipeline([('vect', vectorizer), ('clf', naive_bayes)])
    nb_pipeline.fit(X_train, y_train)
    return confusion_matrix(y_test, nb_pipeline.predict(X_test))

def get_one_report_tfidv(X_train, y_train, X_test, y_test):
    """Train a TF-IDF + MultinomialNB pipeline and report on the test set.

    The vectorizer uses the module-level ``tokenize`` callable, which must
    already be defined when this function runs.

    Parameters
    ----------
    X_train, y_train
        Training texts and their labels, passed to ``Pipeline.fit``.
    X_test, y_test
        Held-out texts and their true labels.

    Returns
    -------
    The value of ``classification_report(y_test, predictions)``.
    """
    model_tfidv = Pipeline([
                ('tfidv', TfidfVectorizer(tokenizer=tokenize)),
                ('clf', MultinomialNB())])
    # Fit and predict with the pipeline built above.  The previous code
    # referred to an undefined local ``model``, so it either raised
    # NameError or silently used whatever ``model`` was left in the kernel.
    model_tfidv.fit(X_train, y_train)
    result = model_tfidv.predict(X_test)
    return classification_report(y_test, result)

In [41]:
def misclassificated_samples(X_train, y_train, X_test, y_test):
    """Return the test texts that the count-vectorizer + MultinomialNB model gets wrong.

    The vectorizer uses the module-level ``tokenize`` callable, which must
    already be defined when this function runs.

    Parameters
    ----------
    X_train, y_train
        Training texts and labels, passed to ``Pipeline.fit``.
    X_test, y_test
        Held-out texts and true labels; both must support boolean-mask
        indexing / element-wise comparison (e.g. numpy arrays).

    Returns
    -------
    tuple of (list, list)
        First list: texts whose true label is 0 but were predicted as 1.
        Second list: texts whose true label is 1 but were predicted as 0.
        Byte strings are decoded as UTF-8; other values are returned as-is.
    """
    model = Pipeline([
            ('vect', CountVectorizer(tokenizer=tokenize)),
            ('clf', MultinomialNB())])
    model.fit(X_train, y_train)
    result = model.predict(X_test)

    # NOTE(review): if label 1 is the positive class, "true 0 / predicted 1"
    # is conventionally a false positive, so these two names look swapped.
    # The names and return order are kept unchanged for existing callers;
    # confirm the intended convention.
    mask_fn = np.logical_and(y_test == 0, result == 1)
    mask_fp = np.logical_and(y_test == 1, result == 0)

    def _as_text(text):
        # Decode only raw byte strings; text that is already decoded has no
        # usable .decode() and is passed through untouched.
        return text.decode('utf-8') if isinstance(text, bytes) else text

    # Both selections index X_test (the first one previously used the
    # misspelled, undefined name X_text).
    false_negative = [_as_text(text) for text in X_test[mask_fn]]
    false_positive = [_as_text(text) for text in X_test[mask_fp]]

    return false_negative, false_positive

In [48]:
# Build the 90/10 split from the default CSV paths and row limit; the fifth
# value returned by basic_input is the training-set weights.
X_train, X_test, y_train, y_test, weight = basic_input()
# Confusion matrix of the count-vectorizer + MultinomialNB pipeline on the held-out split.
test = get_conf_matrix(X_train, y_train, X_test, y_test)

In [51]:
# Share of true class-1 samples predicted as class 1 (row 1 of the confusion
# matrix holds true label 1; columns are predicted labels), i.e. class-1 recall.
# The parenthesised form prints the same value under Python 2 and is also
# valid Python 3, unlike the bare print statement.
print(float(test[1, 1]) / (test[1, 0] + test[1, 1]))

0.801204819277
