In [2]:
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.cross_validation import StratifiedKFold, ShuffleSplit, cross_val_score
from sklearn.pipeline import Pipeline
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import classification_report, confusion_matrix
from konlpy.tag import Twitter
from konlpy.tag import Kkma
from collections import Counter

def cv_input(frozen_dir='./private/files/seodam_together_notags0326.csv', unfrozen_dir='./private/files/unfrozen_mixed0402.csv', row_limit=3211):
    df_frozen = pd.read_csv(frozen_dir).drop(['Unnamed: 0'], axis=1)
    df_unfrozen = pd.read_csv(unfrozen_dir).drop(['Unnamed: 0'], axis=1)[:row_limit]

    unfrozen = np.array(df_unfrozen['text2'])
    frozen = np.array(df_frozen['text'])

    weight0 = np.append(np.array(np.ones(row_limit, dtype=int)), np.array(df_frozen['freeze']))
    seodam_x = np.append(unfrozen, frozen)
    seodam_y = np.append(np.zeros(row_limit, dtype=int), np.ones(row_limit, dtype=int))
    return (seodam_x, seodam_y, weight0)

def make_stopwords(stwd_dir='stopwords.txt'):
    stop_words = []
    with open(stwd_dir, 'r') as reader :
        stop_words0 = reader.readlines()
        stop_words1 = stop_words0[0].split(',')
    for words in stop_words1:
        stop_words.append(words.decode('utf-8'))
    return stop_words

def tokenize_basic(doc):
    pos_tagger = Twitter()
    return ['/'.join(t) for t in pos_tagger.pos(doc, norm=True, stem=True)]

def tokenize_noun(doc):
    pos_tagger = Twitter()
    return pos_tagger.nouns(doc)

def tokenize_filtered(doc):
    tagger = Twitter()
    token_list = []
    for t in tagger.pos(doc, norm=True, stem=True):
        if t[1] != 'Josa' and t[1] != 'Punctuation' and t[1] != 'Determiner' and t[1] != 'URL' :
            token_list.append('/'.join(t))
    return token_list

In [22]:
X, y, w = cv_input()

In [26]:
with open('corpus.txt', 'r') as corp:
    corpus = corp.read().decode('utf-8')

In [None]:
lst = tokenize_filtered(corpus)

In [2]:
df_frozen = pd.read_csv('./private/files/seodam_together_notags0326.csv').drop(['Unnamed: 0'], axis=1)
df_unfrozen = pd.read_csv('./private/files/unfrozen2_3500.csv').drop(['Unnamed: 0'], axis=1)[:3211]

### 조사, 문장부호 제거

In [61]:
df_unfrozen

Unnamed: 0,date,text2
0,2016/10/18 00:03,오 imgur 이거 이머저라고 읽는구낭 지금까지 임구르인줄
1,2016/10/18 00:04,회계학원론 공부중인데 5장 정산표랑 마감분개 문제푸는데 자꾸 말도안되는 숫자 ...
2,2016/10/18 00:11,인문대 오버워치 리그 규정 보고 생각난건데 굳이 자기 계정이어야하는 이유가 뭘까? ...
3,2016/10/18 00:15,유튜브에서 이라크 모술 탈환전 생중계중 https://youtu.be/dtl8Not...
4,2016/10/18 00:21,허정 교수님 경원 영강이라서 절평일텐데 그래도 한문제 틀려서 팍팍 학점...


In [4]:
unfrozen = np.array(df_unfrozen['text2'])
frozen = np.array(df_frozen['text'])

In [60]:
tagger = Twitter()
base = ''
for words in unfrozen[:10]:
    base += ' ' + words.decode('utf-8')

In [58]:
tagger = Twitter()
def tokenize(doc):
    token_list = []
    for t in tagger.pos(doc, norm=True, stem=True):
        if t[1] != 'Josa' and t[1] != 'Punctuation' and t[1] != 'Determiner' and t[1] != 'URL' :
            token_list.append('/'.join(t))
    return token_list

### Stop words 추가

In [4]:
unfrozen = np.array(df_unfrozen['text2'])
frozen = np.array(df_frozen['text'])
seodam_x = np.append(unfrozen[:20], frozen[:20])
seodam_y = np.append(np.zeros(20, dtype=int), np.ones(20, dtype=int))

In [11]:
tagger = Twitter()
def tokenize(doc):
    token_list = []
    for t in tagger.pos(doc, norm=True, stem=True):
        if t[1] != 'Josa' and t[1] != 'Punctuation' and t[1] != 'Determiner' and t[1] != 'URL' :
            token_list.append('/'.join(t))
    return token_list

In [16]:
model = Pipeline([
        ('vect', CountVectorizer(tokenizer=tokenize)),
        ('clf', MultinomialNB())
])

In [19]:
cv = ShuffleSplit(40, n_iter=5, random_state=2)
for k, (train_index, test_index) in enumerate(cv) :
    X_train0 = seodam_x[train_index]
    y_train = seodam_y[train_index]
    X_test0 = seodam_x[test_index]
    
    model.fit(X_train0, y_train)
    result = model.predict(X_test0)
    print classification_report(seodam_y[test_index], result)
    print "*" * 50

             precision    recall  f1-score   support

          0       1.00      0.33      0.50         3
          1       0.33      1.00      0.50         1

avg / total       0.83      0.50      0.50         4

**************************************************
             precision    recall  f1-score   support

          0       0.00      0.00      0.00         3
          1       0.25      1.00      0.40         1

avg / total       0.06      0.25      0.10         4

**************************************************
             precision    recall  f1-score   support

          0       0.00      0.00      0.00         2
          1       0.33      0.50      0.40         2

avg / total       0.17      0.25      0.20         4

**************************************************
             precision    recall  f1-score   support

          0       0.00      0.00      0.00         1
          1       0.67      0.67      0.67         3

avg / total       0.50      0.50      0.5

In [25]:
cv = ShuffleSplit(40, n_iter=5, random_state=2)
score = cross_val_score(model,seodam_x, seodam_y, scoring=['f1_weighted', 'precision'], cv=cv)

TypeError: 'list' object is not callable

In [24]:
score

array([ 0.5       ,  0.1       ,  0.2       ,  0.5       ,  0.73333333])

In [8]:
cv = StratifiedKFold(seodam_y, n_folds=10, random_state=2 )
for k, (train_idx, test_idx) in enumerate(cv) :
    print k
    print 'train_idx: ', train_idx
    print 'test_idx: ', test_idx
    print "*" *50

0
train_idx:  [ 2  3  4  5  6  7  8  9 10 11 12 13 14 15 16 17 18 19 22 23 24 25 26 27 28
 29 30 31 32 33 34 35 36 37 38 39]
test_idx:  [ 0  1 20 21]
**************************************************
1
train_idx:  [ 0  1  4  5  6  7  8  9 10 11 12 13 14 15 16 17 18 19 20 21 24 25 26 27 28
 29 30 31 32 33 34 35 36 37 38 39]
test_idx:  [ 2  3 22 23]
**************************************************
2
train_idx:  [ 0  1  2  3  6  7  8  9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 26 27 28
 29 30 31 32 33 34 35 36 37 38 39]
test_idx:  [ 4  5 24 25]
**************************************************
3
train_idx:  [ 0  1  2  3  4  5  8  9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 28
 29 30 31 32 33 34 35 36 37 38 39]
test_idx:  [ 6  7 26 27]
**************************************************
4
train_idx:  [ 0  1  2  3  4  5  6  7 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26
 27 30 31 32 33 34 35 36 37 38 39]
test_idx:  [ 8  9 28 29]
**********************************************

In [10]:
result = [1,0,1,0,1,0,1,0,1,0]
y_test = [1,0,0,1,1,1,1,0,1,0]
xx = confusion_matrix(y_test, result)