### 냉동 vs 비냉동 많이 나온 어휘 차이

In [4]:
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import classification_report, confusion_matrix
from konlpy.tag import Twitter
from konlpy.tag import Kkma
from collections import Counter

In [5]:
# Load the "frozen" and "unfrozen" corpora from CSV and discard the
# 'Unnamed: 0' column from each (presumably an index column written by an
# earlier DataFrame.to_csv call -- confirm against the export step).
# NOTE(review): `pd` is not imported in the cells visible here; this assumes
# `import pandas as pd` ran in an earlier cell.
df_frozen = pd.read_csv('./private/files/seodam_together_notags0322.csv').drop(['Unnamed: 0'], axis=1)
df_unfrozen = pd.read_csv('./private/files/unfrozen_mixed0402.csv').drop(['Unnamed: 0'], axis=1)

### 냉동 단어들 vs 비냉동 단어들 비교

In [6]:
def tokenize_basic(doc):
    """Tag ``doc`` with the Twitter tagger and keep every tagged pair.

    A fresh ``Twitter`` instance is created on each call and ``pos`` is run
    with ``norm=True`` and ``stem=True``.

    Returns:
        A list with one string per tagged pair, the pair's elements joined
        by ``'/'``.
    """
    tagged_pairs = Twitter().pos(doc, norm=True, stem=True)
    tokens = []
    for pair in tagged_pairs:
        tokens.append('/'.join(pair))
    return tokens

def tokenize_noun(doc):
    """Return whatever the Twitter tagger's ``nouns`` method yields for ``doc``.

    A fresh ``Twitter`` instance is created on each call.
    """
    return Twitter().nouns(doc)

def tokenize_filtered(doc):
    """Tag ``doc`` and drop pairs whose tag is in the excluded set.

    ``Twitter().pos`` is run with ``norm=True`` and ``stem=True``; any pair
    whose second element is 'Josa', 'Punctuation', 'Determiner' or 'URL' is
    skipped.

    Returns:
        A list with one string per remaining pair, the pair's elements
        joined by ``'/'``.
    """
    excluded_tags = ('Josa', 'Punctuation', 'Determiner', 'URL')
    return [
        '/'.join(pair)
        for pair in Twitter().pos(doc, norm=True, stem=True)
        if pair[1] not in excluded_tags
    ]

In [7]:
# Collapse each corpus into one space-separated string and decode it from
# UTF-8.  Calling `.decode` on the joined value relies on Python 2 byte
# strings; the `text` / `text2` columns supply the documents.
frozen_str = ' '.join(df_frozen['text'].values).decode('utf-8')
unfrozen_str = ' '.join(df_unfrozen['text2'].values).decode('utf-8')

In [8]:
# Tokenise each whole-corpus string with tokenize_filtered, which omits pairs
# tagged Josa, Punctuation, Determiner or URL.  Each result is a flat list of
# 'x/y' strings covering the entire corpus.
frozen_words = tokenize_filtered(frozen_str)
unfrozen_words = tokenize_filtered(unfrozen_str)

In [9]:
# Per-corpus token frequencies: start from an empty Counter and feed it the
# corresponding token list.
frozen_count = Counter()
frozen_count.update(frozen_words)

unfrozen_count = Counter()
unfrozen_count.update(unfrozen_words)

In [10]:
# Signed frequency gap per token: the token's count in one corpus minus its
# count in the other corpus.  Tokens missing from the other corpus keep their
# full count, which `.get(token, 0)` expresses directly.
#
# The lookup goes through the other corpus' Counter (constant time) rather
# than testing membership in its full token list, which rescanned the whole
# list for every distinct token.  The resulting dictionaries are unchanged
# because each Counter holds exactly the tokens of its list.
diff_frozen = {
    token: count - unfrozen_count.get(token, 0)
    for token, count in frozen_count.items()
}

diff_unfrozen = {
    token: count - frozen_count.get(token, 0)
    for token, count in unfrozen_count.items()
}

In [13]:
def _rank_by_diff(diff):
    """Return the (token, diff) pairs of ``diff``, largest diff first."""
    return sorted(diff.items(), key=lambda pair: pair[1], reverse=True)


# Tokens most over-represented in each corpus come first.
frozen_sort = _rank_by_diff(diff_frozen)
unfrozen_sort = _rank_by_diff(diff_unfrozen)

### 워드클라우드 생성

In [19]:
import pytagcloud

# Keep the 20 entries at the head of each ranking (largest diff first).
tag_frozen = frozen_sort[:20]
# tag_unfrozen is prepared here but not used by the statements below.
tag_unfrozen = unfrozen_sort[:20]

# Build tag descriptors from the (token, diff) pairs of the frozen corpus.
# NOTE(review): the tokens are Korean -- confirm the font pytagcloud uses by
# default can render Hangul.
taglist = pytagcloud.make_tags(tag_frozen, maxsize=80)

# Render the tags to 'wordcloud.jpg' on a 900x600 canvas.
pytagcloud.create_tag_image(taglist, 'wordcloud.jpg', size=(900, 600))

### Stop words 파일 생성

In [36]:
# Pull the document columns out as arrays, then collapse each into a single
# space-separated string decoded from UTF-8 (`.decode` on the joined value
# relies on Python 2 byte strings).
# NOTE(review): `np` is not imported in the cells visible here; this assumes
# `import numpy as np` ran earlier.
frozen = np.array(df_frozen['text'])
unfrozen = np.array(df_unfrozen['text2'])

frozen_str = ' '.join(frozen).decode('utf-8')
unfrozen_str = ' '.join(unfrozen).decode('utf-8')

In [38]:
# Tokenise the frozen corpus with the unfiltered tokenizer, then tally how
# often each token occurs.
words = tokenize_basic(frozen_str)
counter = Counter()
counter.update(words)

In [39]:
# The 100 most frequent tokens become the stop-word list; only the token is
# kept, its count is discarded.
stop_words = [token for token, _count in counter.most_common(100)]

In [71]:
import io

# Persist the stop words as a single comma-separated line in 'stopwords.txt'.
#
# * The file handle gets its own name so it no longer rebinds `words`, the
#   token list produced earlier by tokenize_basic.
# * io.open with an explicit encoding writes text as UTF-8 on both Python 2
#   and Python 3, instead of pushing manually encoded bytes through a
#   text-mode handle (which only works on Python 2).
# * The u',' separator keeps the joined value a text string even when
#   stop_words is empty.
with io.open('stopwords.txt', 'w', encoding='utf-8') as stop_file:
    stop_file.write(u','.join(stop_words))