In [2]:
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import classification_report, confusion_matrix
from konlpy.tag import Twitter
from sklearn.cross_validation import ShuffleSplit, cross_val_score
from sklearn.pipeline import Pipeline

In [3]:
def make_input(frozen_dir='./private/files/seodam_together_notags0326.csv', unfrozen_dir='./private/files/unfrozen2_3500.csv', row_limit=3211):
    """Build the document, label and sample-weight arrays from two CSV files.

    Parameters
    ----------
    frozen_dir : str
        Path to the CSV whose rows are labelled 1. Its ``text`` column
        supplies the documents and its ``freeze`` column supplies the
        per-document sample weights. All rows are used.
    unfrozen_dir : str
        Path to the CSV whose rows are labelled 0. Only its ``text2`` column
        is read, and every row gets a sample weight of 1.
    row_limit : int
        Maximum number of rows taken from the unfrozen CSV.

    Both CSVs must contain an ``Unnamed: 0`` column (it is dropped).

    Returns
    -------
    tuple
        ``(seodam_x, seodam_y, weight0)``: three 1-D arrays of equal length,
        with the unfrozen rows first and the frozen rows after them.
    """
    df_frozen = pd.read_csv(frozen_dir).drop(['Unnamed: 0'], axis=1)
    df_unfrozen = pd.read_csv(unfrozen_dir).drop(['Unnamed: 0'], axis=1)[:row_limit]

    unfrozen = np.array(df_unfrozen['text2'])
    frozen = np.array(df_frozen['text'])

    # Size labels and weights from the rows that were actually loaded rather
    # than from row_limit: the unfrozen file may hold fewer than row_limit
    # rows, and the frozen file is never truncated, so using row_limit for
    # both could leave X, y and the weights with different lengths.
    n_unfrozen = len(unfrozen)
    n_frozen = len(frozen)

    weight0 = np.append(np.ones(n_unfrozen, dtype=int), np.array(df_frozen['freeze']))
    seodam_x = np.append(unfrozen, frozen)
    seodam_y = np.append(np.zeros(n_unfrozen, dtype=int), np.ones(n_frozen, dtype=int))

    return (seodam_x, seodam_y, weight0)

def make_stopwords(stwd_dir='stopwords.txt'):
    """Load the comma-separated stop words stored on the first line of a file.

    Parameters
    ----------
    stwd_dir : str
        Path to a UTF-8 encoded text file. Only its first line is read; any
        further lines are ignored.

    Returns
    -------
    list
        The decoded (unicode) stop words in file order. The line terminator
        is stripped so it does not stay attached to the last word, and empty
        entries (e.g. from a trailing comma) are dropped. An empty file
        yields an empty list.
    """
    # Local import so the block does not depend on a notebook-level import.
    # io.open decodes while reading on both Python 2 and Python 3, which
    # replaces the Python-2-only ``str.decode('utf-8')`` call.
    import io

    with io.open(stwd_dir, 'r', encoding='utf-8') as reader:
        # readline() returns '' for an empty file instead of raising.
        first_line = reader.readline()

    first_line = first_line.rstrip('\r\n')
    return [word for word in first_line.split(',') if word]

def _shared_tagger():
    """Return one shared ``Twitter`` tagger, constructing it on first use."""
    tagger = getattr(_shared_tagger, 'instance', None)
    if tagger is None:
        tagger = Twitter()
        _shared_tagger.instance = tagger
    return tagger


def tokenize(doc):
    """Split a document into ``word/tag`` tokens, skipping unwanted tags.

    The document is tagged with ``Twitter.pos(doc, norm=True, stem=True)``.
    Every ``(word, tag)`` pair whose tag is not ``Josa``, ``Punctuation``,
    ``Determiner`` or ``URL`` is kept and rendered as ``'word/tag'``.

    Parameters
    ----------
    doc : str
        Text to tokenize.

    Returns
    -------
    list
        The ``'word/tag'`` strings in the order the tagger produced them.
    """
    skipped_tags = ('Josa', 'Punctuation', 'Determiner', 'URL')
    # Reuse a single tagger: this function is called once per document by
    # the vectorizer, so building a new tagger on every call is wasted work.
    tagged = _shared_tagger().pos(doc, norm=True, stem=True)
    return ['/'.join(pair) for pair in tagged if pair[1] not in skipped_tags]

In [9]:
def multinomial_recall_rate(X, y, weight=None, len_row=6422, rdst=0):
    """Cross-validate a bag-of-n-grams Multinomial Naive Bayes model on recall.

    The model is a pipeline of ``CountVectorizer`` (using ``tokenize`` as the
    tokenizer, unigrams and bigrams) followed by ``MultinomialNB``.

    Parameters
    ----------
    X : sequence of str
        Documents to classify.
    y : sequence
        Labels aligned with ``X``.
    weight : sequence or None
        Forwarded to the classifier's ``fit`` as ``sample_weight``.
    len_row : int or None
        Sample count handed to ``ShuffleSplit`` as its first argument. Pass
        ``None`` to use ``len(X)``. If it is smaller than ``len(X)``,
        presumably only the first ``len_row`` samples can be drawn into a
        split (legacy ``sklearn.cross_validation`` behaviour; confirm against
        the installed scikit-learn version).
    rdst : int
        ``random_state`` for the splitter.

    Returns
    -------
    The value returned by ``cross_val_score`` with ``scoring='recall'``.

    Raises
    ------
    ValueError
        If ``len_row`` is larger than ``len(X)``.
    """
    n_samples = len(X)
    if len_row is None:
        len_row = n_samples
    elif len_row > n_samples:
        # Fail early with a clear message: the splitter would otherwise hand
        # out indices past the end of X and fail deep inside the CV loop.
        raise ValueError(
            'len_row (%d) exceeds the number of samples in X (%d)'
            % (len_row, n_samples))

    model = Pipeline([
            ('vect', CountVectorizer(tokenizer=tokenize, ngram_range=(1,2))),
            ('clf', MultinomialNB())])
    cv = ShuffleSplit(len_row, random_state=rdst)
    recall_rate = cross_val_score(model, X, y, scoring='recall', cv=cv, fit_params={'clf__sample_weight' : weight})

    return recall_rate

In [10]:
# Load the documents, labels and per-document sample weights.
X, y, weight = make_input()
# Size the splitter from the data that was actually loaded instead of relying
# on the function's hard-coded default matching the CSV row counts.
# NOTE(review): `weight` is unpacked but not passed to the model here, so this
# scores the unweighted model -- confirm that is intentional.
recall_rate = multinomial_recall_rate(X, y, len_row=len(X))