In [1]:
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import classification_report, confusion_matrix
from konlpy.tag import Twitter
from konlpy.tag import Kkma
import pandas as pd
import numpy as np

In [2]:
# Load the two corpora. Each CSV carries a saved index column ('Unnamed: 0')
# that is dropped right after reading.
df_frozen = (
    pd.read_csv('./files/seodam_together_notags0323.csv')
    .drop(['Unnamed: 0'], axis=1)
)
# Only the first 3213 rows of the unfrozen corpus are kept.
# NOTE(review): presumably chosen to match the frozen corpus size -- confirm.
df_unfrozen = (
    pd.read_csv('./files/unfrozen2_3500.csv')
    .drop(['Unnamed: 0'], axis=1)
    .iloc[:3213]
)

In [7]:
# Sanity check: head(0) prints an empty frame together with its column names;
# the row count is appended on the same line.
# Single-argument print(...) produces the same output on Python 2 and Python 3,
# whereas the bare print statement is a SyntaxError on Python 3.
print('{} {}'.format(df_unfrozen.head(0), len(df_unfrozen['date'])))
print('')
print('{} {}'.format(df_frozen.head(0), len(df_frozen['freeze'])))

Empty DataFrame
Columns: [date, text2]
Index: [] 3213

Empty DataFrame
Columns: [freeze, text]
Index: [] 3213


In [3]:
# Raw documents of each class as NumPy arrays.
unfrozen = np.array(df_unfrozen['text2'])
frozen = np.array(df_frozen['text'])

# Feature vector: unfrozen texts first, frozen texts second.
seodam_x = np.append(unfrozen, frozen)

# Labels follow the same order: 0 = unfrozen, 1 = frozen.
# Lengths are taken from the arrays themselves rather than a hard-coded 3213,
# so labels stay aligned with the texts if either corpus changes size.
seodam_y = np.append(
    np.zeros(len(unfrozen), dtype=int),
    np.ones(len(frozen), dtype=int),
)
assert len(seodam_x) == len(seodam_y), "texts and labels must be aligned"

In [4]:
# Hold out 10% of the documents for evaluation; random_state is fixed so the
# split is repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    seodam_x,
    seodam_y,
    test_size=0.1,
    random_state=2
)

### 냉동이 아니면 0, 냉동이면 1로 라벨링한 X, y 배열(numpy array) 생성

In [6]:
# Bag-of-words baseline using CountVectorizer's default tokenization.
vect = CountVectorizer()
# fit_transform learns the vocabulary and builds the count matrix in a single
# pass, instead of tokenizing the training texts twice (fit, then transform).
# NOTE(review): X_train is rebound from raw texts to the count matrix, so this
# cell cannot be re-run without re-running the train_test_split cell first.
X_train = vect.fit_transform(X_train)

In [7]:
# Multinomial Naive Bayes with default settings, trained on the count features.
# The fit call stays as the last expression so the estimator repr is displayed.
clf = MultinomialNB()
clf.fit(X=X_train, y=y_train)

MultinomialNB(alpha=1.0, class_prior=None, fit_prior=True)

In [8]:
# Encode the held-out texts with the vocabulary learned from the training set.
# NOTE(review): rebinds X_test from raw texts to a matrix; re-running this cell
# requires re-running the train_test_split cell first.
X_test = vect.transform(X_test)

In [9]:
# Predicted labels for the held-out set (0 = unfrozen, 1 = frozen, as encoded in seodam_y).
result = clf.predict(X_test)

In [10]:
# Per-class precision/recall/F1, a blank separator line, then the confusion
# matrix (rows = true label, columns = predicted label).
# Single-argument print(...) works identically on Python 2 and Python 3.
print(classification_report(y_test, result))
print('')
print(confusion_matrix(y_test, result))

             precision    recall  f1-score   support

          0       0.74      0.95      0.83       328
          1       0.62      0.22      0.32       137

avg / total       0.71      0.73      0.68       465


[[310  18]
 [107  30]]


### 형태소 분석기 사용(Twitter, noun)

In [5]:
# Shared tagger instance used by tokenize() below.
pos_tagger = Twitter()


def tokenize(doc):
    """Return the nouns that the Twitter tagger extracts from `doc`.

    Passed to CountVectorizer as its `tokenizer`, so it is called once per
    document with the raw text.
    """
    nouns = pos_tagger.nouns(doc)
    return nouns

In [6]:
# Bag-of-words over the nouns produced by tokenize().
vect = CountVectorizer(tokenizer=tokenize)
# fit_transform runs the tagger over the training texts once; the previous
# fit-then-transform sequence tokenized every training document twice.
# NOTE(review): X_train / X_test are rebound from raw texts to count matrices,
# so this cell cannot be re-run without re-running the train_test_split cell.
X_train = vect.fit_transform(X_train)
X_test = vect.transform(X_test)

In [7]:
# Default multinomial Naive Bayes trained on the noun-count features.
# The fit call stays last so the estimator repr is shown as the cell output.
clf = MultinomialNB()
clf.fit(X=X_train, y=y_train)

MultinomialNB(alpha=1.0, class_prior=None, fit_prior=True)

In [8]:
# Predict labels for the held-out noun-count features.
result = clf.predict(X_test)

In [9]:
# Classification report, a 50-asterisk separator, then the confusion matrix
# (rows = true label, columns = predicted label).
# Single-argument print(...) works identically on Python 2 and Python 3.
print(classification_report(y_test, result))
print('*' * 50)
print(confusion_matrix(y_test, result))

             precision    recall  f1-score   support

          0       0.83      0.69      0.75       316
          1       0.74      0.86      0.79       327

avg / total       0.78      0.77      0.77       643

**************************************************
[[217  99]
 [ 46 281]]


### 형태소 분석기 사용(Twitter, 품사 태깅 + 정규화/어간 추출)

In [35]:
# Shared tagger instance used by tokenize() below.
pos_tagger = Twitter()


def tokenize(doc):
    """Tokenize `doc` into slash-joined strings, one per tagged token.

    The tagger is called with norm=True and stem=True; every element it
    returns (a (token, tag) pair per the KoNLPy docs) is joined with '/'.
    """
    tokens = []
    for tagged in pos_tagger.pos(doc, norm=True, stem=True):
        tokens.append('/'.join(tagged))
    return tokens

In [38]:
# Bag-of-words over the slash-joined tagger output produced by tokenize().
vect = CountVectorizer(tokenizer=tokenize)
# fit_transform runs the tagger over the training texts once; the previous
# fit-then-transform sequence tokenized every training document twice.
# NOTE(review): X_train is rebound from raw texts to the count matrix, so this
# cell cannot be re-run without re-running the train_test_split cell first.
X_train = vect.fit_transform(X_train)

In [39]:
# Encode the held-out texts with the vocabulary learned from the training set.
# NOTE(review): rebinds X_test from raw texts to a matrix; re-running this cell
# requires re-running the train_test_split cell first.
X_test = vect.transform(X_test)

In [40]:
# Default multinomial Naive Bayes trained on the tagged-token count features.
# The fit call stays last so the estimator repr is shown as the cell output.
clf = MultinomialNB()
clf.fit(X=X_train, y=y_train)

MultinomialNB(alpha=1.0, class_prior=None, fit_prior=True)

In [41]:
# Predict labels for the held-out tagged-token features.
result = clf.predict(X_test)

In [42]:
# Per-class precision/recall/F1, a blank separator line, then the confusion
# matrix (rows = true label, columns = predicted label).
# Single-argument print(...) works identically on Python 2 and Python 3.
print(classification_report(y_test, result))
print('')
print(confusion_matrix(y_test, result))

             precision    recall  f1-score   support

          0       0.80      0.79      0.79       314
          1       0.80      0.81      0.81       329

avg / total       0.80      0.80      0.80       643


[[247  67]
 [ 61 268]]


### 형태소 분석기 사용(꼬꼬마, 품사 태깅)

In [22]:
# Shared Kkma tagger instance used by tokenize() below.
pos_tagger = Kkma()


def tokenize(doc):
    """Tokenize `doc` into slash-joined strings, one per tagged morpheme.

    Every element returned by the Kkma tagger's pos() (a (morpheme, tag)
    pair per the KoNLPy docs) is joined with '/'.
    """
    tagged_morphemes = pos_tagger.pos(doc)
    return ['/'.join(pair) for pair in tagged_morphemes]

In [23]:
# Bag-of-words over the slash-joined Kkma output produced by tokenize().
vect = CountVectorizer(tokenizer=tokenize)
# fit_transform runs the tagger over the training texts once; the previous
# fit-then-transform sequence tokenized every training document twice.
# NOTE(review): X_train is rebound from raw texts to the count matrix, so this
# cell cannot be re-run without re-running the train_test_split cell first.
X_train = vect.fit_transform(X_train)

In [13]:
# Leftover interactive help lookup (`MultinomialNB?`) removed: it is
# IPython-only syntax and not part of the analysis. See the scikit-learn
# documentation for MultinomialNB's parameters.

In [24]:
# Default multinomial Naive Bayes trained on the Kkma-token count features.
# The fit call stays last so the estimator repr is shown as the cell output.
clf = MultinomialNB()
clf.fit(X=X_train, y=y_train)

MultinomialNB(alpha=1.0, class_prior=None, fit_prior=True)

In [25]:
# Encode the held-out texts with the vocabulary learned from the training set.
# NOTE(review): rebinds X_test from raw texts to a matrix; re-running this cell
# requires re-running the train_test_split cell first.
X_test = vect.transform(X_test)

In [26]:
# Predict labels for the held-out Kkma-token features.
result = clf.predict(X_test)

In [27]:
# Per-class precision/recall/F1, a blank separator line, then the confusion
# matrix (rows = true label, columns = predicted label).
# Single-argument print(...) works identically on Python 2 and Python 3.
print(classification_report(y_test, result))
print('')
print(confusion_matrix(y_test, result))

             precision    recall  f1-score   support

          0       0.91      0.63      0.74       328
          1       0.49      0.85      0.62       137

avg / total       0.79      0.69      0.71       465


[[206 122]
 [ 20 117]]


### TF-IDF (Twitter 품사 태깅 토큰 사용)

In [7]:
# Shared tagger instance used by tokenize() below.
pos_tagger = Twitter()


def tokenize(doc):
    """Tokenize `doc` into slash-joined strings, one per tagged token.

    Calls the tagger with norm=True and stem=True and joins every returned
    element (a (token, tag) pair per the KoNLPy docs) with '/'.
    """
    tagged = pos_tagger.pos(doc, norm=True, stem=True)
    return list(map('/'.join, tagged))

In [8]:
# TF-IDF weighting over the slash-joined tagger output produced by tokenize().
tfidv = TfidfVectorizer(tokenizer=tokenize)
# fit_transform learns vocabulary and IDF weights and returns the weighted
# matrix in one pass; the previous fit-then-transform sequence tokenized every
# training document twice.
# NOTE(review): X_train is rebound from raw texts to the TF-IDF matrix, so this
# cell cannot be re-run without re-running the train_test_split cell first.
X_train = tfidv.fit_transform(X_train)

In [9]:
# Default multinomial Naive Bayes trained on the TF-IDF features.
# The fit call stays last so the estimator repr is shown as the cell output.
clf = MultinomialNB()
clf.fit(X=X_train, y=y_train)

MultinomialNB(alpha=1.0, class_prior=None, fit_prior=True)

In [10]:
# Encode the held-out texts with the vocabulary and IDF weights learned from
# the training set.
# NOTE(review): rebinds X_test from raw texts to a matrix; re-running this cell
# requires re-running the train_test_split cell first.
X_test = tfidv.transform(X_test)

In [11]:
# Predict labels for the held-out TF-IDF features.
result = clf.predict(X_test)

In [12]:
# Per-class precision/recall/F1, a blank separator line, then the confusion
# matrix (rows = true label, columns = predicted label).
# Single-argument print(...) works identically on Python 2 and Python 3.
print(classification_report(y_test, result))
print('')
print(confusion_matrix(y_test, result))

             precision    recall  f1-score   support

          0       0.71      1.00      0.83       328
          1       1.00      0.01      0.03       137

avg / total       0.79      0.71      0.59       465


[[328   0]
 [135   2]]
