In [15]:
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.svm import SVC
from sklearn.cross_validation import ShuffleSplit, cross_val_score
from sklearn.pipeline import Pipeline
from sklearn.metrics import classification_report
from konlpy.tag import Twitter

In [41]:
def make_input(frozen_dir='./private/files/seodam_together_notags0326.csv', unfrozen_dir='./private/files/unfrozen2_3500.csv', row_limit=3211):
    """Load both CSVs and build the corpus, labels and per-sample weights.

    Parameters
    ----------
    frozen_dir : str
        Path of the CSV holding the "frozen" posts. Its ``text`` column
        supplies the documents and its ``freeze`` column the sample weights.
    unfrozen_dir : str
        Path of the CSV holding the "unfrozen" posts. Its ``text2`` column
        supplies the documents.
    row_limit : int
        Maximum number of unfrozen rows to keep (the frozen file is used in
        full).

    Returns
    -------
    tuple
        ``(seodam_x, seodam_y, weight0)``: the unfrozen documents followed by
        the frozen ones, the matching labels (0 = unfrozen, 1 = frozen), and
        the matching weights (1 for every unfrozen row, ``freeze`` for every
        frozen row). All three arrays have the same length.
    """
    # Both files carry a leftover index column named 'Unnamed: 0'.
    df_frozen = pd.read_csv(frozen_dir).drop(['Unnamed: 0'], axis=1)
    df_unfrozen = pd.read_csv(unfrozen_dir).drop(['Unnamed: 0'], axis=1)[:row_limit]

    unfrozen = np.array(df_unfrozen['text2'])
    frozen = np.array(df_frozen['text'])

    # Size labels and weights from the rows actually loaded, not from
    # row_limit: the unfrozen file may hold fewer than row_limit rows and the
    # frozen file is never truncated, so row_limit alone can misalign X/y/w.
    n_unfrozen = len(unfrozen)
    n_frozen = len(frozen)

    weight0 = np.append(np.ones(n_unfrozen, dtype=int), np.array(df_frozen['freeze']))
    seodam_x = np.append(unfrozen, frozen)
    seodam_y = np.append(np.zeros(n_unfrozen, dtype=int), np.ones(n_frozen, dtype=int))

    return (seodam_x, seodam_y, weight0)

def make_stopwords(stwd_dir='stopwords.txt'):
    """Read a comma-separated stop-word list from the first line of a file.

    Parameters
    ----------
    stwd_dir : str
        Path of a UTF-8 text file whose first line holds the stop words
        separated by commas. Any further lines are ignored.

    Returns
    -------
    list
        The stop words as unicode strings, with surrounding whitespace
        (including the line's trailing newline) removed and empty entries
        dropped. An empty file yields an empty list.
    """
    # Local import keeps the block self-contained; io.open decodes UTF-8 the
    # same way on Python 2 and Python 3, so no manual .decode() is needed.
    import io

    with io.open(stwd_dir, 'r', encoding='utf-8') as reader:
        first_line = reader.readline()

    # Strip each entry so the last word does not keep the trailing newline,
    # which would stop it from ever matching a token.
    stripped = (word.strip() for word in first_line.split(','))
    return [word for word in stripped if word]

def tokenize(doc):
    """Split a document into 'token/TAG' strings using the Twitter tagger.

    The document is tagged with ``norm=True`` and ``stem=True``; every pair
    whose tag is 'Josa', 'Punctuation', 'Determiner' or 'URL' is dropped and
    the remaining pairs are joined with '/'.

    Parameters
    ----------
    doc : str
        Raw text of one document.

    Returns
    -------
    list
        One 'token/TAG' string per kept pair, in tagger order.
    """
    # Reuse one tagger across calls instead of constructing a new Twitter()
    # for every document; the instance is cached on the function object.
    tagger = getattr(tokenize, '_tagger', None)
    if tagger is None:
        tagger = tokenize._tagger = Twitter()

    skipped_tags = ('Josa', 'Punctuation', 'Determiner', 'URL')
    return ['/'.join(pair)
            for pair in tagger.pos(doc, norm=True, stem=True)
            if pair[1] not in skipped_tags]

def svc_recall_rate(X, y, knl='linear', stwd=None, len_row=6422, rdst=0):
    """Cross-validate a bag-of-words + SVC pipeline and return recall scores.

    Parameters
    ----------
    X : array-like
        Raw documents.
    y : array-like
        Class labels aligned with ``X``.
    knl : str
        Kernel name passed to ``SVC``.
    stwd : list or None
        Stop words passed to ``CountVectorizer``.
    len_row : int or None
        Number of samples handed to ``ShuffleSplit``. Pass ``None`` to use
        ``len(X)``; the default is kept for existing callers.
    rdst : int
        Random state for ``ShuffleSplit``.

    Returns
    -------
    The value returned by ``cross_val_score`` with ``scoring='recall'``
    (one score per split).
    """
    if len_row is None:
        len_row = len(X)

    # The original spelled this class 'Countvectorizer', which raised
    # NameError before any model could be built.
    vectorizer = CountVectorizer(tokenizer=tokenize, stop_words=stwd)
    model = Pipeline([
            ('vect', vectorizer),
            ('svc', SVC(kernel=knl))])

    cv = ShuffleSplit(len_row, random_state=rdst)
    return cross_val_score(model, X, y, scoring='recall', cv=cv)

def make_report(X, y, weight=None, knl='linear', stwd=None, len_row=6422, rdst=0):
    """Fit the bag-of-words + SVC pipeline on each split and report on it.

    For every (train, test) split produced by ``ShuffleSplit`` the pipeline
    is fitted on the training rows, used to predict the test rows, and a
    ``classification_report`` is printed and collected.

    Parameters
    ----------
    X : indexable by an index array (e.g. numpy array)
        Raw documents.
    y : indexable by an index array
        Class labels aligned with ``X``.
    weight : indexable by an index array, or None
        Per-sample weights aligned with ``X``. When given, the training
        subset is forwarded to the SVC step as ``sample_weight``; when
        ``None`` the model is fitted without sample weights.
    knl : str
        Kernel name passed to ``SVC``.
    stwd : list or None
        Stop words passed to ``CountVectorizer``.
    len_row : int or None
        Number of samples handed to ``ShuffleSplit``. Pass ``None`` to use
        ``len(X)``; the default is kept for existing callers.
    rdst : int
        Random state for ``ShuffleSplit``.

    Returns
    -------
    list
        One classification report per split, in split order.
    """
    if len_row is None:
        len_row = len(X)

    model = Pipeline([
            ('vect', CountVectorizer(tokenizer=tokenize, stop_words=stwd)),
            ('svc', SVC(kernel=knl))])
    cv = ShuffleSplit(len_row, random_state=rdst)

    report_list = []
    for k, (train_index, test_index) in enumerate(cv):
        X_train, y_train = X[train_index], y[train_index]
        X_test, y_test = X[test_index], y[test_index]

        # Only route sample weights to the 'svc' step when the caller gave
        # some; indexing the default None used to raise TypeError.
        fit_params = {}
        if weight is not None:
            fit_params['svc__sample_weight'] = weight[train_index]

        model.fit(X_train, y_train, **fit_params)
        result = model.predict(X_test)
        report = classification_report(y_test, result)
        report_list.append(report)

        # Single-argument print(...) behaves the same on Python 2 and 3.
        print(k)
        print(report)
        print("*" * 50)

    return report_list

In [22]:
# Baseline run with every sample weighted equally.
# make_input returns three values (documents, labels, weights); unpacking
# only two raised ValueError. The file's own weights are ignored here and
# uniform weights are passed explicitly so every sample counts the same.
# NOTE(review): f_dir / uf_dir are not defined in the visible cells; confirm
# they are set earlier in the notebook before Restart & Run All.
X, y, _unused_weight = make_input(f_dir, uf_dir, 3211)
test = make_report(X, y, weight=np.ones(len(y), dtype=int))

0
             precision    recall  f1-score   support

          0       0.74      0.73      0.73       311
          1       0.75      0.77      0.76       332

avg / total       0.75      0.75      0.75       643

**************************************************
1
             precision    recall  f1-score   support

          0       0.81      0.74      0.77       335
          1       0.74      0.81      0.78       308

avg / total       0.78      0.77      0.77       643

**************************************************
2
             precision    recall  f1-score   support

          0       0.81      0.76      0.78       319
          1       0.78      0.83      0.80       324

avg / total       0.79      0.79      0.79       643

**************************************************
3
             precision    recall  f1-score   support

          0       0.78      0.73      0.75       314
          1       0.75      0.81      0.78       329

avg / total       0.77      0.77 

In [40]:
# Weighted run: load the data from make_input's default CSV paths and row
# limit, then pass the returned per-sample weights through to make_report.
X, y, weight = make_input()
test2 = make_report(X, y, weight=weight)

0
             precision    recall  f1-score   support

          0       0.75      0.72      0.73       311
          1       0.75      0.77      0.76       332

avg / total       0.75      0.75      0.75       643

**************************************************
1
             precision    recall  f1-score   support

          0       0.80      0.73      0.77       335
          1       0.73      0.80      0.77       308

avg / total       0.77      0.77      0.77       643

**************************************************
2
             precision    recall  f1-score   support

          0       0.81      0.75      0.78       319
          1       0.77      0.83      0.80       324

avg / total       0.79      0.79      0.79       643

**************************************************
3
             precision    recall  f1-score   support

          0       0.77      0.71      0.74       314
          1       0.75      0.80      0.77       329

avg / total       0.76      0.76 