In [15]:
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB
from sklearn.cross_validation import StratifiedKFold, ShuffleSplit, cross_val_score
from sklearn.pipeline import Pipeline
from sklearn.metrics import classification_report, confusion_matrix
from konlpy.tag import Twitter

In [29]:
def make_input(frozen_dir='./private/files/seodam_together_notags0326.csv', unfrozen_dir='./private/files/unfrozen2_3500.csv', row_limit=3211, return_weights=False):
    """Load the frozen / unfrozen CSV files and build classifier inputs.

    Both files are read with pandas and their 'Unnamed: 0' column is dropped.
    Only the unfrozen file is truncated to its first `row_limit` rows; the
    frozen file is used in full.

    Parameters
    ----------
    frozen_dir : str
        Path of the CSV holding the positive class. Its 'text' column supplies
        the documents; its 'freeze' column supplies the weights (read only
        when `return_weights` is true).
    unfrozen_dir : str
        Path of the CSV holding the negative class. Its 'text2' column
        supplies the documents.
    row_limit : int
        Maximum number of unfrozen rows to keep.
    return_weights : bool
        When true, also return a per-sample weight array.

    Returns
    -------
    (seodam_x, seodam_y) or (seodam_x, seodam_y, weight0)
        seodam_x : unfrozen documents followed by frozen documents.
        seodam_y : 0 for every unfrozen document, 1 for every frozen one.
        weight0  : 1 for every unfrozen document, the 'freeze' value for
                   every frozen one (only when `return_weights` is true).
    """
    df_frozen = pd.read_csv(frozen_dir).drop(['Unnamed: 0'], axis=1)
    df_unfrozen = pd.read_csv(unfrozen_dir).drop(['Unnamed: 0'], axis=1)[:row_limit]

    unfrozen = np.array(df_unfrozen['text2'])
    frozen = np.array(df_frozen['text'])

    # Size the labels from the arrays actually loaded rather than from
    # row_limit, so x and y stay aligned when a file is shorter or longer
    # than row_limit.
    n_unfrozen = len(unfrozen)
    n_frozen = len(frozen)

    seodam_x = np.append(unfrozen, frozen)
    seodam_y = np.append(np.zeros(n_unfrozen, dtype=int), np.ones(n_frozen, dtype=int))

    if not return_weights:
        return (seodam_x, seodam_y)

    weight0 = np.append(np.ones(n_unfrozen, dtype=int), np.array(df_frozen['freeze']))
    return (seodam_x, seodam_y, weight0)

def make_stopwords(stwd_dir='stopwords.txt'):
    """Read a comma-separated stop-word file and return the words as a list.

    The file is decoded as UTF-8. Every line is split on commas, surrounding
    whitespace (including the trailing newline) is stripped from each word,
    and empty entries are dropped. An empty file yields an empty list.

    Parameters
    ----------
    stwd_dir : str
        Path of the stop-word file.

    Returns
    -------
    list
        Unicode stop words in file order.
    """
    # Function-scope import keeps this cell self-contained; io.open accepts an
    # `encoding` argument on both Python 2 and Python 3.
    import io

    with io.open(stwd_dir, 'r', encoding='utf-8') as reader:
        content = reader.read()

    stop_words = []
    for line in content.splitlines():
        for word in line.split(','):
            word = word.strip()
            if word:
                stop_words.append(word)

    return stop_words

def tokenize(doc):
    """Tokenize `doc` with the Twitter tagger and return 'a/b' joined items.

    The document is tagged with normalisation and stemming enabled. Items
    whose tag (element 1) is 'Josa', 'Punctuation', 'Determiner' or 'URL' are
    skipped; every other item is joined with '/' and collected.

    Parameters
    ----------
    doc : str
        Raw document text.

    Returns
    -------
    list
        One '/'-joined string per kept item, in tagger order.
    """
    # Build the tagger once and cache it on the function object instead of
    # constructing a new Twitter() for every document.
    tagger = getattr(tokenize, '_tagger', None)
    if tagger is None:
        tagger = Twitter()
        tokenize._tagger = tagger

    excluded_tags = ('Josa', 'Punctuation', 'Determiner', 'URL')
    return ['/'.join(t)
            for t in tagger.pos(doc, norm=True, stem=True)
            if t[1] not in excluded_tags]

### Stop words 없이

In [13]:
# Two-step pipeline: token counts from the custom tokenizer feed a
# multinomial Naive Bayes classifier. The step names are what the
# `clf__sample_weight` fit parameter used elsewhere in this notebook refers to.
model = Pipeline(steps=[
    ('vect', CountVectorizer(tokenizer=tokenize)),
    ('clf', MultinomialNB()),
])

sample_weight 없이

In [19]:
# The legacy sklearn.cross_validation.ShuffleSplit takes the number of samples
# as its first argument; derive it from the data instead of hard-coding 6422
# so the splitter follows the dataset if its size changes.
# NOTE(review): `seodam_x` / `seodam_y` are not assigned in the visible cells —
# presumably `seodam_x, seodam_y = make_input()`; confirm.
cv = ShuffleSplit(len(seodam_x), random_state=0)
# Recall of the positive class on each shuffled split, without sample weights.
recall_rate = cross_val_score(model, seodam_x, seodam_y, scoring='recall', cv=cv)

In [21]:
# Single-argument print calls behave the same on Python 2 and Python 3.
print(recall_rate)
print(recall_rate.mean())

[ 0.80120482  0.7987013   0.83024691  0.79635258  0.8496732   0.81213873
  0.81168831  0.81927711  0.8044164   0.81987578]
0.814357514639


sample_weight 추가

In [23]:
# Same splitter as the unweighted run; the sample count comes from the data
# rather than the hard-coded 6422.
cv = ShuffleSplit(len(seodam_x), random_state=0)
# Same scoring as before, but the classifier step receives per-sample weights.
# NOTE(review): `weight0` is not assigned in the visible cells (make_input
# builds an array of that name but does not return it by default) — confirm
# where it comes from and that it has one entry per row of `seodam_x`.
recall_rate = cross_val_score(model, seodam_x, seodam_y, scoring='recall', cv=cv, fit_params={'clf__sample_weight': weight0})

In [25]:
# Single-argument print calls behave the same on Python 2 and Python 3.
print(recall_rate)
print(recall_rate.mean())

[ 0.81626506  0.81168831  0.84567901  0.80547112  0.84313725  0.82080925
  0.8474026   0.84638554  0.84542587  0.85093168]
0.833319569645


In [28]:
# Re-run the weighted fit split by split to print a full classification
# report for each one. The sample count is taken from the data rather than
# the hard-coded 6422.
cv = ShuffleSplit(len(seodam_x), random_state=0)
for k, (train_index, test_index) in enumerate(cv):
    X_train0 = seodam_x[train_index]
    y_train = seodam_y[train_index]
    X_test0 = seodam_x[test_index]

    # Weights are sliced with the same indices as the training documents so
    # each weight stays attached to its own sample.
    model.fit(X_train0, y_train, clf__sample_weight=weight0[train_index])
    result = model.predict(X_test0)

    # Single-argument print calls behave the same on Python 2 and Python 3.
    print(k)
    print(classification_report(seodam_y[test_index], result))
    print("*" * 50)

0
             precision    recall  f1-score   support

          0       0.80      0.78      0.79       311
          1       0.80      0.82      0.81       332

avg / total       0.80      0.80      0.80       643

**************************************************
1
             precision    recall  f1-score   support

          0       0.80      0.71      0.76       335
          1       0.72      0.81      0.76       308

avg / total       0.77      0.76      0.76       643

**************************************************
2
             precision    recall  f1-score   support

          0       0.83      0.78      0.81       319
          1       0.80      0.85      0.82       324

avg / total       0.81      0.81      0.81       643

**************************************************
3
             precision    recall  f1-score   support

          0       0.79      0.78      0.79       314
          1       0.80      0.81      0.80       329

avg / total       0.79      0.79 

### Twitter 사용

In [9]:
# Module-level tagger, created once and reused by every tokenize() call.
tagger = Twitter()
def tokenize(doc):
    """Tag `doc` and return the kept items as '/'-joined strings.

    Tagging runs with normalisation and stemming enabled. Items tagged
    'Josa', 'Punctuation', 'Determiner' or 'URL' are left out.

    NOTE(review): this notebook defines `tokenize` in another cell as well;
    whichever cell ran last is the one in effect.
    """
    skipped_tags = ('Josa', 'Punctuation', 'Determiner', 'URL')
    return ['/'.join(pair)
            for pair in tagger.pos(doc, norm=True, stem=True)
            if pair[1] not in skipped_tags]

In [24]:
# Count-vectorise the documents, ignoring the first 15 entries of the
# stop-word list.
# NOTE(review): `stop_words2`, `X_train` and `X_test` are not assigned in the
# visible cells — confirm where they are created.
vect = CountVectorizer(tokenizer=tokenize, stop_words=stop_words2[:15])
# fit_transform learns the vocabulary and builds the training matrix in one
# pass, so the training documents are tokenised once instead of twice.
# X_train / X_test are rebound from documents to matrices here, so this cell
# cannot be re-run without first recreating the raw split.
X_train = vect.fit_transform(X_train)
X_test = vect.transform(X_test)

In [25]:
# Multinomial Naive Bayes with default settings, trained on whatever
# X_train / y_train currently hold in the notebook namespace.
clf = MultinomialNB()
# Left as the cell's last expression so the fitted estimator's repr is shown.
clf.fit(X_train, y_train)

MultinomialNB(alpha=1.0, class_prior=None, fit_prior=True)

In [26]:
# Predicted labels for the held-out matrix; `result` is read by the report cell.
result = clf.predict(X_test)

In [27]:
# Single-argument print calls behave the same on Python 2 and Python 3;
# print('') emits the blank separator line on both (a bare `print()` would
# show "()" on Python 2).
print(classification_report(y_test, result))
print('')
print(confusion_matrix(y_test, result))

             precision    recall  f1-score   support

          0       0.81      0.79      0.80       316
          1       0.80      0.82      0.81       327

avg / total       0.80      0.80      0.80       643


[[250  66]
 [ 60 267]]


### TF-IDF

In [30]:
# TF-IDF features using the same tokenizer and the same 15-entry stop-word
# slice as the count-vectoriser cell.
# NOTE(review): X_train / X_test are rebound here. If this cell runs directly
# after the CountVectorizer cell, they already hold count matrices rather
# than documents — confirm the raw split is recreated before this cell.
tfidv = TfidfVectorizer(tokenizer=tokenize, stop_words=stop_words2[:15])
X_train = tfidv.fit_transform(X_train)
X_test = tfidv.transform(X_test)

In [31]:
# Fresh default Multinomial Naive Bayes, trained on whatever X_train / y_train
# currently hold; this rebinds `clf` from the earlier cell.
clf = MultinomialNB()
# Left as the cell's last expression so the fitted estimator's repr is shown.
clf.fit(X_train, y_train)

MultinomialNB(alpha=1.0, class_prior=None, fit_prior=True)

In [32]:
# Predicted labels for the held-out matrix; `result` is read by the report cell.
result = clf.predict(X_test)

In [33]:
# Single-argument print calls behave the same on Python 2 and Python 3;
# print('') emits the blank separator line on both (a bare `print()` would
# show "()" on Python 2).
print(classification_report(y_test, result))
print('')
print(confusion_matrix(y_test, result))

             precision    recall  f1-score   support

          0       0.78      0.86      0.82       316
          1       0.85      0.76      0.81       327

avg / total       0.82      0.81      0.81       643


[[273  43]
 [ 77 250]]
