- 루신, 쏠라 : 구글에서 검색어에 대한 형태소를 분석하기 위해 만든 라이브러리
- konlpy : 루신과 쏠라를 기반으로 만들어진 한국어 형태소 분석 도구
- 페이스북, 트위터, 서울대 등등 각각의 단체에서 만든 분석기를 모아 놓았다.
- soynlp : 명사를 추출하는 데 목적이 있고 신조어에 대응이 가능하다. 워드클라우드나 명사만 추출하고자 할 때 사용한다.
- konlpy : 문장의 모든 품사 등을 추출하고자 할 때 사용한다.

In [1]:
# 기본
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

# Suppress all warnings so they do not clutter the notebook output.
import warnings
warnings.filterwarnings('ignore')

# Global matplotlib defaults, applied in a single update call.
plt.rcParams.update({
    'font.family': 'Malgun Gothic',   # alternative kept from the original: 'AppleGothic'
    'font.size': 16,
    'figure.figsize': (20, 10),
    'axes.unicode_minus': False,
})

# 시간 관리를 위한 라이브러리
import datetime

# 데이터 수집
from pandas_datareader import data
import requests
from bs4 import BeautifulSoup
import re
import time
import os
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import Select

# 화면 청소
from IPython.display import clear_output

# 데이터 전처리 알고리즘
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import Normalizer
from sklearn.preprocessing import MinMaxScaler
from sklearn.preprocessing import RobustScaler

# 학습용과 검증용으로 나누는 함수
from sklearn.model_selection import train_test_split

# 교차검증
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import cross_validate
from sklearn.model_selection import KFold
from sklearn.model_selection import StratifiedKFold

# 하이퍼 파라미터 튜닝
from sklearn.model_selection import GridSearchCV

# 분류 알고리즘
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import AdaBoostClassifier
from sklearn.ensemble import GradientBoostingClassifier
from lightgbm import LGBMClassifier
from xgboost import XGBClassifier
from sklearn.ensemble import VotingClassifier

# 회귀 알고리즘
from sklearn.linear_model import LinearRegression
from sklearn.linear_model import Ridge
from sklearn.linear_model import Lasso
from sklearn.linear_model import ElasticNet
from sklearn.svm import SVR
from sklearn.neighbors import KNeighborsRegressor
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.ensemble import AdaBoostRegressor
from sklearn.ensemble import GradientBoostingRegressor
from lightgbm import LGBMRegressor
from xgboost import XGBRegressor
from sklearn.ensemble import VotingRegressor

# 분류용 평가 함수
from sklearn.metrics import accuracy_score
from sklearn.metrics import precision_score
from sklearn.metrics import recall_score
from sklearn.metrics import f1_score
from sklearn.metrics import roc_auc_score

# 회귀용 평가 함수
from sklearn.metrics import r2_score
from sklearn.metrics import mean_squared_error

# 차원 축소
from sklearn.decomposition import PCA
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis

# 군집
from sklearn.cluster import KMeans
from sklearn.cluster import MeanShift

# 시계열 분석
from pmdarima.arima import auto_arima
from fbprophet import Prophet

# 형태소 분석도구
from soynlp.utils import DoublespaceLineCorpus
from soynlp.noun import LRNounExtractor_v2
from soynlp.noun import LRNounExtractor
from soynlp.noun import NewsNounExtractor

from konlpy.tag import Okt, Hannanum, Kkma, Mecab, Komoran

# 워드 클라우드
from collections import Counter
import pytagcloud
from IPython.display import Image

# 상관관계
from patsy import dmatrices
import statsmodels.api as sm
from statsmodels.stats.outliers_influence import variance_inflation_factor

# Recommendation (surprise package)
from surprise import SVD, Dataset, Reader, accuracy
# NOTE: the unaliased import below rebinds `train_test_split`, replacing the
# sklearn function of the same name imported earlier in this cell. It is kept
# so existing code keeps working; new code should use the explicit aliases.
from surprise.model_selection import train_test_split
from surprise.model_selection import train_test_split as surprise_train_test_split
from sklearn.model_selection import train_test_split as sklearn_train_test_split

pygame 2.1.2 (SDL 2.0.18, Python 3.8.12)
Hello from the pygame community. https://www.pygame.org/contribute.html


In [3]:
# Instantiate one morphological analyzer of each kind (Okt, Kkma, Hannanum,
# Komoran), in that order, so their results can be compared side by side.
tagger1, tagger2, tagger3, tagger4 = Okt(), Kkma(), Hannanum(), Komoran()

In [4]:
# Sample sentence to split into morphemes.
str1 = '동해물과 백두산이 마르고 닳도록'

# Print each analyzer's morpheme list, one line per analyzer, in tagger order.
for analyzer in (tagger1, tagger2, tagger3, tagger4):
    print(analyzer.morphs(str1))

['동', '해물', '과', '백두산', '이', '마르고', '닳도록']
['동하', '어', '물', '과', '백두산', '이', '마르', '고', '닳', '도록']
['동해물', '과', '백두산', '이', '마르', '고', '닳', '도록']
['동해물과 백두산이', '마르고', '닳', '도록']


In [7]:
# Second sample sentence; rebinds str1 from the previous cell.
str1 = '하느님이 보우하사 우리나라 만세'

# Run the same sentence through every analyzer and print the morphemes
# each one produces, one result per line.
for analyzer in (tagger1, tagger2, tagger3, tagger4):
    print(analyzer.morphs(str1))

['하느님', '이', '보우', '하사', '우리나라', '만세']
['하느님', '이', '보우', '하사', '우리나라', '만세']
['하느님', '이', '보우하사', '우리나라', '만세']
['하느님', '이', '보우', '하사', '우리나라', '만세']
