In [26]:
import numpy as np
import pandas as pd
from konlpy.tag import Twitter
from sklearn.model_selection import train_test_split
from sklearn.cross_validation import StratifiedKFold, ShuffleSplit, cross_val_score
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.pipeline import Pipeline
from sklearn.metrics import classification_report, confusion_matrix
from sklearn.learning_curve import validation_curve
from sklearn.grid_search import GridSearchCV




In [3]:
def cv_input(frozen_dir='./private/files/seodam_together_notags0326.csv', unfrozen_dir='./private/files/unfrozen_mixed0402.csv', row_limit=3211):
    """Load both CSV corpora and build the text, label and weight arrays.

    Parameters
    ----------
    frozen_dir : str
        Path to the CSV read into the "frozen" frame. It must contain the
        columns ``'Unnamed: 0'`` (dropped), ``'text'`` and ``'freeze'``.
    unfrozen_dir : str
        Path to the CSV read into the "unfrozen" frame. It must contain the
        columns ``'Unnamed: 0'`` (dropped) and ``'text2'``.
    row_limit : int or None
        Maximum number of unfrozen rows to keep (a plain ``[:row_limit]``
        slice, so ``None`` keeps every row).

    Returns
    -------
    tuple of numpy.ndarray
        ``(seodam_x, seodam_y, weight0)``, all the same length, with the
        unfrozen rows first and the frozen rows after them:

        * ``seodam_x`` -- the ``'text2'`` values followed by the ``'text'`` values.
        * ``seodam_y`` -- ``0`` for every unfrozen row, ``1`` for every frozen row.
        * ``weight0``  -- ``1`` for every unfrozen row, the ``'freeze'`` column
          value for every frozen row.
    """
    df_frozen = pd.read_csv(frozen_dir).drop(['Unnamed: 0'], axis=1)
    df_unfrozen = pd.read_csv(unfrozen_dir).drop(['Unnamed: 0'], axis=1)[:row_limit]

    unfrozen = np.array(df_unfrozen['text2'])
    frozen = np.array(df_frozen['text'])

    # Size labels and weights from the rows actually loaded. Using row_limit
    # for both halves silently produced arrays of mismatched length whenever
    # the frozen file did not have exactly row_limit rows, or the unfrozen
    # file had fewer than row_limit rows.
    n_unfrozen = len(unfrozen)
    n_frozen = len(frozen)

    weight0 = np.append(np.ones(n_unfrozen, dtype=int), np.array(df_frozen['freeze']))
    seodam_x = np.append(unfrozen, frozen)
    seodam_y = np.append(np.zeros(n_unfrozen, dtype=int), np.ones(n_frozen, dtype=int))
    return (seodam_x, seodam_y, weight0)

def tokenize_basic(doc, tagger=None):
    """Tokenize ``doc`` into ``'token/TAG'`` strings.

    Parameters
    ----------
    doc : str
        Text to tag.
    tagger : object, optional
        Any object exposing ``pos(doc, norm=..., stem=...)`` that yields
        sequences of strings. When omitted, a new ``Twitter()`` tagger is
        created for this call, as before. Pass one shared instance to avoid
        constructing a tagger per document.

    Returns
    -------
    list of str
        One entry per tagged token, with its parts joined by ``'/'``.
    """
    if tagger is None:
        tagger = Twitter()
    return ['/'.join(t) for t in tagger.pos(doc, norm=True, stem=True)]

def tokenize_noun(doc, tagger=None):
    """Return whatever the tagger's ``nouns`` method extracts from ``doc``.

    Parameters
    ----------
    doc : str
        Text to process.
    tagger : object, optional
        Any object exposing ``nouns(doc)``. When omitted, a new ``Twitter()``
        tagger is created for this call, as before. Pass one shared instance
        to avoid constructing a tagger per document.

    Returns
    -------
    The value of ``tagger.nouns(doc)``, unchanged.
    """
    if tagger is None:
        tagger = Twitter()
    return tagger.nouns(doc)

def tokenize_filtered(doc, tagger=None, exclude_tags=('Josa', 'Punctuation', 'Determiner', 'URL')):
    """Tokenize ``doc`` into ``'token/TAG'`` strings, dropping unwanted tags.

    Parameters
    ----------
    doc : str
        Text to tag.
    tagger : object, optional
        Any object exposing ``pos(doc, norm=..., stem=...)`` that yields
        sequences of strings whose second element is the tag. When omitted,
        a new ``Twitter()`` tagger is created for this call, as before. Pass
        one shared instance to avoid constructing a tagger per document.
    exclude_tags : iterable of str, optional
        Tags whose tokens are discarded. The default is the set the function
        always filtered: Josa, Punctuation, Determiner and URL.

    Returns
    -------
    list of str
        The kept tokens in their original order, each joined by ``'/'``.
    """
    if tagger is None:
        tagger = Twitter()
    excluded = frozenset(exclude_tags)
    return [
        '/'.join(t)
        for t in tagger.pos(doc, norm=True, stem=True)
        if t[1] not in excluded
    ]

In [8]:
# Load the corpus from the default CSV paths.
# X0 = raw texts, y = 0/1 labels, weight = per-sample weights (see cv_input).
X0, y, weight = cv_input()

In [13]:
# Five evenly spaced candidate values from 1 to 3 (inclusive) for the
# validation-curve sweep.
param_range = np.linspace(start=1, stop=3, num=5)

In [9]:
# Token-count features over the output of tokenize_filtered.
# NOTE(review): the vectorizer is fitted on all of X0 before any
# cross-validation split, so its vocabulary includes the held-out documents.
vect = CountVectorizer(tokenizer=tokenize_filtered)
X = vect.fit_transform(X0) 

In [15]:
# Validation curve for a linear-kernel SVC: recall with cv=5 for every
# candidate C in param_range, computed on the pre-vectorized matrix X.
train_score2, test_score2 = validation_curve(
    SVC(kernel='linear'),
    X,
    y,
    param_name="C",
    param_range=param_range,
    cv=5,
    scoring='recall',
    n_jobs=2
)

In [17]:
# Show the test recall scores for the second value in param_range.
# NOTE(review): the saved output below is a 5x5 array, which looks like the
# full test_score2 rather than row 1 -- the cell was probably edited after it
# was last run; confirm which display was intended and re-run.
test_score2[1]

array([[ 0.63608087,  0.59968847,  0.56697819,  0.61370717,  0.57788162],
       [ 0.62986003,  0.59501558,  0.57009346,  0.61838006,  0.59190031],
       [ 0.63297045,  0.59657321,  0.5623053 ,  0.60903427,  0.59034268],
       [ 0.63763608,  0.59968847,  0.5529595 ,  0.61370717,  0.58878505],
       [ 0.63452566,  0.60903427,  0.55140187,  0.59968847,  0.58566978]])

In [12]:
# NOTE(review): `test_score` is never assigned in the cells shown here, so
# this cell relies on leftover kernel state and will raise NameError after
# Restart & Run All. The saved output has 10 rows, so it presumably came from
# an earlier validation_curve run over 10 parameter values -- TODO restore
# that cell or remove this one.
test_score

array([[ 0.12597201,  0.00623053,  0.        ,  0.00623053,  0.        ],
       [ 0.01710731,  0.00623053,  0.        ,  0.00311526,  0.        ],
       [ 0.01399689,  0.01557632,  0.        ,  0.00311526,  0.        ],
       [ 0.06531882,  0.06697819,  0.        ,  0.00311526,  0.00155763],
       [ 0.18195956,  0.17757009,  0.02492212,  0.03115265,  0.03582555],
       [ 0.36702955,  0.33489097,  0.13395639,  0.1588785 ,  0.15109034],
       [ 0.50388802,  0.47040498,  0.28193146,  0.35981308,  0.27725857],
       [ 0.58631415,  0.5482866 ,  0.42523364,  0.51090343,  0.3894081 ],
       [ 0.6407465 ,  0.605919  ,  0.54049844,  0.57009346,  0.53115265],
       [ 0.63608087,  0.59968847,  0.56697819,  0.61370717,  0.57788162]])

In [28]:
# End-to-end model: raw text -> token counts (tokenize_filtered) -> SVC with
# default settings. The step names 'vect' and 'clf' are the prefixes used for
# grid-search parameter names.
model = Pipeline(steps=[
    ('vect', CountVectorizer(tokenizer=tokenize_filtered)),
    ('clf', SVC()),
])

In [32]:
# Search grid for the RBF-kernel SVC step: C and gamma each sweep the same
# 8 log-spaced values from 1e-4 to 1e3.
param_range = np.logspace(-4,3,8)
param_grid = [{'clf__C': param_range, 'clf__gamma':param_range, 'clf__kernel':['rbf']}]

# ShuffleSplit comes from sklearn.cross_validation here, so its first argument
# is the number of samples. Take it from the data instead of hard-coding 6422,
# and seed the splitter so the random splits are repeatable between runs.
cv = ShuffleSplit(len(y), random_state=0)
# The fit-parameter key must be spelled `sample_weight`; the previous
# 'clf__sample_weigth' would be forwarded to SVC.fit as an unknown keyword.
gs = GridSearchCV(estimator=model, param_grid=param_grid, fit_params={'clf__sample_weight': weight}, cv=cv, scoring='recall', n_jobs=2)

In [None]:
# `model` starts with its own CountVectorizer, so the search must be fitted on
# the raw texts (X0), not on the already-vectorized matrix X.
gs.fit(X0, y)