In [1]:
import re
import requests
import pandas as pd
import numpy as np
from bs4 import BeautifulSoup
from konlpy.tag import Okt
from konlpy.tag import Kkma
from konlpy.tag import Komoran

In [2]:
education = pd.read_csv("교육청민원.csv")
traffic = pd.read_csv("국토교통부민원.csv")
sea = pd.read_csv("해양수산부민원.csv")

In [3]:
# 질문과 내용을 합침

e_complaint = pd.DataFrame(education['qnaTitl'] + ' ' + education['qstnCntnCl'], columns = ['complaint'])
t_complaint = pd.DataFrame(traffic['qnaTitl'] + ' ' + traffic['qstnCntnCl'], columns = ['complaint'])
s_complaint = pd.DataFrame(sea['qnaTitl'] + ' ' + sea['qstnCntnCl'], columns = ['complaint'])

In [4]:
# 라벨링

e_complaint['label'] = 0
t_complaint['label'] = 1
s_complaint['label'] = 2

In [5]:
합침

df = pd.concat([e_complaint, t_complaint, s_complaint], axis = 0)
df = df.reset_index().drop('index', axis = 1)

In [6]:
# 불용어 긁어오기

stopwords = []

url = 'https://www.ranks.nl/stopwords/korean'
raw = requests.get(url, verify = False)

soup = BeautifulSoup(raw.text, 'html.parser')

soup = soup.find_all('td')

for i in soup:
    for j in i:
        if j.string == None:
            continue
        else:
            stopwords.append(str(j))

stopwords.append('경우')
stopwords.append('인가요')
stopwords.append('되나요')
stopwords.append('있나요')
stopwords.append('하나요')
stopwords.append('하는')
stopwords.append('궁금합니다')
            
stopwords[:5]



['아', '휴', '아이구', '아이쿠', '아이고']

In [9]:
# 정규표현식으로 한글만 남기기

def clean(dataframe):
    return [re.sub(r'[^ 가-힣]', '', c) for c in dataframe]

In [10]:
df['complaint'] = clean(df['complaint'])

In [11]:
# 여러가지 한국어 처리 함수 만들기

def okt(text):
    okt = Okt()
    text = okt.morphs(text)
    return [c for c in text if len(c) > 1]

def kkma(text):
    kkma = Kkma()
    text = kkma.morphs(text)
    return [c for c in text if len(c) > 1]

def komoran(text):
    komoran = Komoran()
    text = komoran.morphs(text)
    return [c for c in text if len(c) > 1]

In [12]:
X = df['complaint']
y = df['label']

In [13]:
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.3, random_state=1, stratify=y) 

In [14]:
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LogisticRegression
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import RandomizedSearchCV

In [17]:
tfidf = TfidfVectorizer()

#불용어하고

param_grid = [{'vect__ngram_range': [(1, 1)],
               'vect__stop_words': [stopwords, None],
               'vect__tokenizer': [okt, kkma, komoran],
               'clf__penalty': ['l1', 'none'],
               'clf__C': [1.0, 5.0, 10.0]}
              ]

lr_tfidf = Pipeline([('vect', tfidf),
                     ('clf', LogisticRegression())])


gs_lr_tfidf = RandomizedSearchCV(lr_tfidf,
                                 param_grid,
                                 scoring='accuracy',
                                 cv = 5,
                                 n_iter = 5,
                                 verbose = 2,
                                 n_jobs = -1)

In [18]:
gs_lr_tfidf

RandomizedSearchCV(cv=5,
                   estimator=Pipeline(steps=[('vect', TfidfVectorizer()),
                                             ('clf', LogisticRegression())]),
                   n_iter=5, n_jobs=-1,
                   param_distributions=[{'clf__C': [1.0, 5.0, 10.0],
                                         'clf__penalty': ['l1', 'none'],
                                         'vect__ngram_range': [(1, 1)],
                                         'vect__stop_words': [['아', '휴', '아이구',
                                                               '아이쿠', '아이고',
                                                               '어', '나', '우리',
                                                               '저희', '따라', '의해',
                                                               '을', '를', '에',
                                                               '의', '가', '으로',
                                                               '로', '에게', '뿐이다',
              

In [None]:
gs_lr_tfidf.fit(X_train, y_train)

Fitting 5 folds for each of 5 candidates, totalling 25 fits
