In [1]:
import numpy as np
import pandas as pd
from konlpy.tag import Twitter
from konlpy.utils import pprint
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.metrics import classification_report, confusion_matrix
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB

In [7]:
# Load both corpora and drop the 'Unnamed: 0' column from each
# (presumably a stale index column written by an earlier to_csv -- confirm).
df_frozen = (
    pd.read_csv('./files/seodam_together_notags0323.csv')
    .drop(['Unnamed: 0'], axis=1)
)
# Only the first 3213 rows of the "unfrozen" file are kept.
df_unfrozen = (
    pd.read_csv('./files/unfrozen2_3500.csv')
    .drop(['Unnamed: 0'], axis=1)
    .iloc[:3213]
)

In [36]:
# Raw texts of each class as NumPy arrays.
unfrozen = np.array(df_unfrozen['text2'])
frozen = np.array(df_frozen['text'])

# Features: all unfrozen texts followed by all frozen texts.
seodam_x = np.append(unfrozen, frozen)
# Labels: 0 = unfrozen, 1 = frozen. The label counts are taken from the arrays
# themselves so they always line up with seodam_x, even if one of the source
# frames does not contain exactly 3213 rows.
seodam_y = np.append(
    np.zeros(len(unfrozen), dtype=int),
    np.ones(len(frozen), dtype=int),
)
assert len(seodam_x) == len(seodam_y), "features and labels are misaligned"

# Deterministic 90/10 split of the raw texts (fixed random_state).
X_train, X_test, y_train, y_test = train_test_split(seodam_x, seodam_y, test_size=0.1, random_state=2)

In [11]:
# Read the stop-word file; only its first line (a comma-separated list) is used.
with open('stopwords.txt', 'r') as reader:
    stop_words0 = reader.readlines()

# Strip surrounding whitespace from every entry -- readlines() keeps the
# trailing newline, which would otherwise stay glued to the last stop word and
# prevent it from ever matching a token -- and drop empty entries.
# An empty file yields an empty list instead of raising IndexError.
first_line = stop_words0[0] if stop_words0 else ''
stop_words = [word.strip() for word in first_line.split(',') if word.strip()]

In [14]:
# Decode the stop words from UTF-8 bytes to text. Entries that are already
# text (Python 3 str, or Python 2 unicode) are kept as they are, because
# calling .decode() on them would raise instead of being a no-op.
stop_words2 = [
    word.decode('utf-8') if isinstance(word, bytes) else word
    for word in stop_words
]

In [17]:
# Preview the first 15 decoded stop words -- the same slice that is passed to
# the vectorizers further down.
pprint(stop_words2[:15])

[하다/Verb,
 들/Suffix,
 있다/Adjective,
 되다/Verb,
 아니다/Adjective,
 없다/Adjective,
 ㅋㅋ/KoreanParticle,
 것/Noun,
 같다/Adjective,
 적/Suffix,
 사람/Noun,
 보다/Verb,
 생각/Noun,
 말/Noun,
 그렇다/Adjective]


### Stop words 없이

In [37]:
tagger = Twitter()


def tokenize(doc):
    """Turn ``doc`` into a list of ``word/POS`` strings.

    The text is tagged with ``tagger.pos(doc, norm=True, stem=True)``. Pairs
    tagged 'Josa', 'Punctuation', 'Determiner' or 'URL' are discarded; every
    remaining (word, tag) pair is joined with '/'.
    """
    skipped_tags = ('Josa', 'Punctuation', 'Determiner', 'URL')
    return [
        '/'.join(pair)
        for pair in tagger.pos(doc, norm=True, stem=True)
        if pair[1] not in skipped_tags
    ]

In [38]:
# Re-derive the raw-text split here instead of consuming X_train / X_test in
# place. The split is deterministic (fixed random_state), so this reproduces
# the split made when the dataset was built and keeps this cell re-runnable:
# the vectorizer always receives text, never an already-vectorized matrix.
# Keep these arguments in sync with the original train_test_split call.
X_train_text, X_test_text, y_train, y_test = train_test_split(
    seodam_x, seodam_y, test_size=0.1, random_state=2)

# Bag-of-words counts over the word/POS tokens, without stop-word filtering.
vect = CountVectorizer(tokenizer=tokenize)
X_train = vect.fit_transform(X_train_text)
# The test texts are only transformed, using the vocabulary fitted on train.
X_test = vect.transform(X_test_text)

In [39]:
# Multinomial Naive Bayes with its hyper-parameters spelled out; these are the
# same values the estimator repr echoes in this cell's output.
clf = MultinomialNB(alpha=1.0, class_prior=None, fit_prior=True)
clf.fit(X_train, y_train)

MultinomialNB(alpha=1.0, class_prior=None, fit_prior=True)

In [40]:
# Predicted labels (0 = unfrozen, 1 = frozen) for the held-out test matrix.
result = clf.predict(X_test)

In [41]:
# Per-class precision / recall / F1, a blank separator line, then the
# confusion matrix. Written with single-argument print(...) calls so the
# output is identical on Python 2 and the cell also runs on Python 3
# (print('') rather than print(), which would print "()" on Python 2).
print(classification_report(y_test, result))
print('')
print(confusion_matrix(y_test, result))

             precision    recall  f1-score   support

          0       0.81      0.78      0.80       316
          1       0.80      0.83      0.81       327

avg / total       0.80      0.80      0.80       643


[[247  69]
 [ 57 270]]


### Twitter 사용 + Stop words 적용

In [9]:
# NOTE: this cell repeats the tagger / tokenize definition that already
# appears earlier in this notebook; the behaviour is the same.
tagger = Twitter()


def tokenize(doc):
    """Return the ``word/POS`` tokens of ``doc``.

    Tags the text via ``tagger.pos(doc, norm=True, stem=True)``, leaves out
    pairs whose tag is 'Josa', 'Punctuation', 'Determiner' or 'URL', and joins
    each remaining (word, tag) pair with '/'.
    """
    skipped_tags = ('Josa', 'Punctuation', 'Determiner', 'URL')
    tagged_pairs = tagger.pos(doc, norm=True, stem=True)
    return ['/'.join(pair) for pair in tagged_pairs if pair[1] not in skipped_tags]

In [24]:
# Re-derive the raw-text split. When the notebook is run top to bottom,
# X_train / X_test already hold the sparse matrices produced by the previous
# experiment, and fitting a text vectorizer on those fails. The split is
# deterministic (fixed random_state), so this reproduces the original one.
# Keep these arguments in sync with the original train_test_split call.
X_train_text, X_test_text, y_train, y_test = train_test_split(
    seodam_x, seodam_y, test_size=0.1, random_state=2)

# Bag-of-words counts over the word/POS tokens, with the first 15 stop words
# filtered out.
vect = CountVectorizer(tokenizer=tokenize, stop_words=stop_words2[:15])
X_train = vect.fit_transform(X_train_text)
# The test texts are only transformed, using the vocabulary fitted on train.
X_test = vect.transform(X_test_text)

In [25]:
# Multinomial Naive Bayes with its hyper-parameters spelled out; these are the
# same values the estimator repr echoes in this cell's output.
clf = MultinomialNB(alpha=1.0, class_prior=None, fit_prior=True)
clf.fit(X_train, y_train)

MultinomialNB(alpha=1.0, class_prior=None, fit_prior=True)

In [26]:
# Predicted labels (0 = unfrozen, 1 = frozen) for the held-out test matrix.
result = clf.predict(X_test)

In [27]:
# Per-class precision / recall / F1, a blank separator line, then the
# confusion matrix. Written with single-argument print(...) calls so the
# output is identical on Python 2 and the cell also runs on Python 3
# (print('') rather than print(), which would print "()" on Python 2).
print(classification_report(y_test, result))
print('')
print(confusion_matrix(y_test, result))

             precision    recall  f1-score   support

          0       0.81      0.79      0.80       316
          1       0.80      0.82      0.81       327

avg / total       0.80      0.80      0.80       643


[[250  66]
 [ 60 267]]


### TF-IDF

In [30]:
# Re-derive the raw-text split. When the notebook is run top to bottom,
# X_train / X_test already hold count matrices from the previous experiment,
# and TfidfVectorizer needs the original texts. The split is deterministic
# (fixed random_state), so this reproduces the original one.
# Keep these arguments in sync with the original train_test_split call.
X_train_text, X_test_text, y_train, y_test = train_test_split(
    seodam_x, seodam_y, test_size=0.1, random_state=2)

# TF-IDF weights over the word/POS tokens, with the first 15 stop words
# filtered out.
tfidv = TfidfVectorizer(tokenizer=tokenize, stop_words=stop_words2[:15])
X_train = tfidv.fit_transform(X_train_text)
# The test texts are only transformed, using the statistics fitted on train.
X_test = tfidv.transform(X_test_text)

In [31]:
# Multinomial Naive Bayes with its hyper-parameters spelled out; these are the
# same values the estimator repr echoes in this cell's output.
clf = MultinomialNB(alpha=1.0, class_prior=None, fit_prior=True)
clf.fit(X_train, y_train)

MultinomialNB(alpha=1.0, class_prior=None, fit_prior=True)

In [32]:
# Predicted labels (0 = unfrozen, 1 = frozen) for the held-out test matrix.
result = clf.predict(X_test)

In [33]:
# Per-class precision / recall / F1, a blank separator line, then the
# confusion matrix. Written with single-argument print(...) calls so the
# output is identical on Python 2 and the cell also runs on Python 3
# (print('') rather than print(), which would print "()" on Python 2).
print(classification_report(y_test, result))
print('')
print(confusion_matrix(y_test, result))

             precision    recall  f1-score   support

          0       0.78      0.86      0.82       316
          1       0.85      0.76      0.81       327

avg / total       0.82      0.81      0.81       643


[[273  43]
 [ 77 250]]
