<a href="https://colab.research.google.com/github/YooNayoung/ESAA/blob/main/8_7%268_10.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# **07.문서 군집화 소개와 실습(Opinion Review 데이터 세트)**

## **문서 군집화 개념**
- 비슷한 텍스트 구성의 문서를 군집화하는 것
- 텍스트 분류 기반의 문서 분류와 유사하나 학습 데이터 세트가 필요없는 비지도학습 기반으로 동작

## **Opinion Review 데이터 세트를 이용한 문서 군집화 수행하기** 


In [1]:
import pandas as pd
import glob ,os

path = r'/content/drive/MyDrive/ESAA(OB)/topics'                     
# path로 지정한 디렉토리 밑에 있는 모든 .data 파일들의 파일명을 리스트로 취합
all_files = glob.glob(os.path.join(path, "*.data"))    
filename_list = []
opinion_text = []

# 개별 파일들의 파일명은 filename_list 리스트로 취합, 
# 개별 파일들의 파일내용은 DataFrame로딩 후 다시 string으로 변환하여 opinion_text 리스트로 취합 
for file_ in all_files:
    # 개별 파일을 읽어서 DataFrame으로 생성 
    df = pd.read_table(file_,index_col=None, header=0,encoding='latin1')
    
    # 절대경로로 주어진 file 명을 가공. 만일 Linux에서 수행시에는 아래 \\를 / 변경. 맨 마지막 .data 확장자도 제거
    filename_ = file_.split('\\')[-1]
    filename = filename_.split('.')[0]

    #파일명 리스트와 파일내용 리스트에 파일명과 파일 내용을 추가. 
    filename_list.append(filename)
    opinion_text.append(df.to_string())

# 파일명 리스트와 파일내용 리스트를  DataFrame으로 생성
document_df = pd.DataFrame({'filename':filename_list, 'opinion_text':opinion_text})
document_df.head()

Unnamed: 0,filename,opinion_text
0,/content/drive/MyDrive/ESAA(OB)/topics/food_sw...,...
1,/content/drive/MyDrive/ESAA(OB)/topics/gas_mil...,...
2,/content/drive/MyDrive/ESAA(OB)/topics/bathroo...,...
3,/content/drive/MyDrive/ESAA(OB)/topics/battery...,...
4,/content/drive/MyDrive/ESAA(OB)/topics/battery...,...


TF-IDF 피처 벡터화

In [2]:
from nltk.stem import WordNetLemmatizer
import nltk
import string

remove_punct_dict = dict((ord(punct), None) for punct in string.punctuation)
lemmar = WordNetLemmatizer()

def LemTokens(tokens):
    return [lemmar.lemmatize(token) for token in tokens]

def LemNormalize(text):
    return LemTokens(nltk.word_tokenize(text.lower().translate(remove_punct_dict)))

In [5]:
nltk.download('punkt'); nltk.download('wordnet')
from sklearn.feature_extraction.text import TfidfVectorizer

tfidf_vect = TfidfVectorizer(tokenizer=LemNormalize, stop_words='english' , \
                             ngram_range=(1,2), min_df=0.05, max_df=0.85 )

#opinion_text 컬럼값으로 feature vectorization 수행
feature_vect = tfidf_vect.fit_transform(document_df['opinion_text'])

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Unzipping corpora/wordnet.zip.


  % sorted(inconsistent)


군집화 수행 (K-평균)

In [6]:
from sklearn.cluster import KMeans

# 5개 집합으로 군집화 수행. 예제를 위해 동일한 클러스터링 결과 도출용 random_state=0 
km_cluster = KMeans(n_clusters=5, max_iter=10000, random_state=0)
km_cluster.fit(feature_vect)
cluster_label = km_cluster.labels_
cluster_centers = km_cluster.cluster_centers_

In [7]:
document_df['cluster_label'] = cluster_label
document_df.head()

Unnamed: 0,filename,opinion_text,cluster_label
0,/content/drive/MyDrive/ESAA(OB)/topics/food_sw...,...,1
1,/content/drive/MyDrive/ESAA(OB)/topics/gas_mil...,...,2
2,/content/drive/MyDrive/ESAA(OB)/topics/bathroo...,...,1
3,/content/drive/MyDrive/ESAA(OB)/topics/battery...,...,4
4,/content/drive/MyDrive/ESAA(OB)/topics/battery...,...,4


In [8]:
# Cluster #0 : 호텔에 대한 리뷰로 군집화 
document_df[document_df['cluster_label']==0].sort_values(by='filename')

Unnamed: 0,filename,opinion_text,cluster_label
5,/content/drive/MyDrive/ESAA(OB)/topics/buttons...,...,0
12,/content/drive/MyDrive/ESAA(OB)/topics/eyesigh...,...,0
8,/content/drive/MyDrive/ESAA(OB)/topics/feature...,...,0
23,/content/drive/MyDrive/ESAA(OB)/topics/fonts_a...,...,0
21,/content/drive/MyDrive/ESAA(OB)/topics/keyboar...,...,0
20,/content/drive/MyDrive/ESAA(OB)/topics/navigat...,...,0
29,/content/drive/MyDrive/ESAA(OB)/topics/price_a...,...,0
48,/content/drive/MyDrive/ESAA(OB)/topics/screen_...,...,0
26,/content/drive/MyDrive/ESAA(OB)/topics/screen_...,...,0
42,/content/drive/MyDrive/ESAA(OB)/topics/size_as...,...,0


In [9]:
# Cluster #1 : 킨들, 아이팟, 넷북 등의 포터블 전자기기에 대한 리뷰로 군집화 
document_df[document_df['cluster_label']==1].sort_values(by='filename')

Unnamed: 0,filename,opinion_text,cluster_label
2,/content/drive/MyDrive/ESAA(OB)/topics/bathroo...,...,1
9,/content/drive/MyDrive/ESAA(OB)/topics/food_ho...,...,1
0,/content/drive/MyDrive/ESAA(OB)/topics/food_sw...,...,1
15,/content/drive/MyDrive/ESAA(OB)/topics/free_be...,...,1
13,/content/drive/MyDrive/ESAA(OB)/topics/locatio...,...,1
19,/content/drive/MyDrive/ESAA(OB)/topics/locatio...,...,1
35,/content/drive/MyDrive/ESAA(OB)/topics/parking...,...,1
38,/content/drive/MyDrive/ESAA(OB)/topics/price_h...,...,1
37,/content/drive/MyDrive/ESAA(OB)/topics/room_ho...,...,1
27,/content/drive/MyDrive/ESAA(OB)/topics/rooms_b...,...,1


In [10]:
# Cluster #2 : 킨들, 아이팟, 넷북이 포함되어 있지만 주로 차량용 네비게이션으로 군집화 
document_df[document_df['cluster_label']==2].sort_values(by='filename')

Unnamed: 0,filename,opinion_text,cluster_label
18,/content/drive/MyDrive/ESAA(OB)/topics/comfort...,...,2
17,/content/drive/MyDrive/ESAA(OB)/topics/comfort...,...,2
1,/content/drive/MyDrive/ESAA(OB)/topics/gas_mil...,...,2
7,/content/drive/MyDrive/ESAA(OB)/topics/interio...,...,2
14,/content/drive/MyDrive/ESAA(OB)/topics/interio...,...,2
6,/content/drive/MyDrive/ESAA(OB)/topics/mileage...,...,2
45,/content/drive/MyDrive/ESAA(OB)/topics/perform...,...,2
30,/content/drive/MyDrive/ESAA(OB)/topics/quality...,...,2
36,/content/drive/MyDrive/ESAA(OB)/topics/seats_h...,...,2
39,/content/drive/MyDrive/ESAA(OB)/topics/transmi...,...,2


In [11]:
# Cluster #2 : 대부분 호텔에 대한 리뷰로 군집화 
document_df[document_df['cluster_label']==3].sort_values(by='filename')

Unnamed: 0,filename,opinion_text,cluster_label
16,/content/drive/MyDrive/ESAA(OB)/topics/accurac...,...,3
11,/content/drive/MyDrive/ESAA(OB)/topics/directi...,...,3
22,/content/drive/MyDrive/ESAA(OB)/topics/display...,...,3
46,/content/drive/MyDrive/ESAA(OB)/topics/satelli...,...,3
28,/content/drive/MyDrive/ESAA(OB)/topics/screen_...,...,3
32,/content/drive/MyDrive/ESAA(OB)/topics/speed_g...,...,3
25,/content/drive/MyDrive/ESAA(OB)/topics/updates...,...,3
43,/content/drive/MyDrive/ESAA(OB)/topics/voice_g...,...,3


In [12]:
# Cluster #4 : 토요타와 혼다 등의 자동차에 대한 리뷰로 군집화 
document_df[document_df['cluster_label']==4].sort_values(by='filename')

Unnamed: 0,filename,opinion_text,cluster_label
3,/content/drive/MyDrive/ESAA(OB)/topics/battery...,...,4
10,/content/drive/MyDrive/ESAA(OB)/topics/battery...,...,4
4,/content/drive/MyDrive/ESAA(OB)/topics/battery...,...,4
49,/content/drive/MyDrive/ESAA(OB)/topics/perform...,...,4
33,/content/drive/MyDrive/ESAA(OB)/topics/sound_i...,headphone jack i got a clear case for it a...,4


중심 개수를 5개에서 3개로 낮춰 3개 그룹으로 군집화 

In [13]:
from sklearn.cluster import KMeans

# 3개의 집합으로 군집화 
km_cluster = KMeans(n_clusters=3, max_iter=10000, random_state=0)
km_cluster.fit(feature_vect)
cluster_label = km_cluster.labels_


# 소속 클러스터를 cluster_label 컬럼으로 할당하고 cluster_label 값으로 정렬
document_df['cluster_label'] = cluster_label
document_df.sort_values(by='cluster_label')

Unnamed: 0,filename,opinion_text,cluster_label
0,/content/drive/MyDrive/ESAA(OB)/topics/food_sw...,...,0
44,/content/drive/MyDrive/ESAA(OB)/topics/service...,...,0
41,/content/drive/MyDrive/ESAA(OB)/topics/service...,...,0
38,/content/drive/MyDrive/ESAA(OB)/topics/price_h...,...,0
37,/content/drive/MyDrive/ESAA(OB)/topics/room_ho...,...,0
35,/content/drive/MyDrive/ESAA(OB)/topics/parking...,...,0
34,/content/drive/MyDrive/ESAA(OB)/topics/service...,...,0
31,/content/drive/MyDrive/ESAA(OB)/topics/staff_s...,...,0
27,/content/drive/MyDrive/ESAA(OB)/topics/rooms_b...,...,0
24,/content/drive/MyDrive/ESAA(OB)/topics/rooms_s...,...,0


## **군집별 핵심 단어 추출하기** 

In [14]:
cluster_centers = km_cluster.cluster_centers_
print('cluster_centers shape :',cluster_centers.shape)
print(cluster_centers)

cluster_centers shape : (3, 4611)
[[0.         0.00099499 0.00174637 ... 0.         0.00183397 0.00144581]
 [0.00712192 0.         0.         ... 0.00767704 0.         0.        ]
 [0.00729385 0.00077126 0.         ... 0.         0.         0.        ]]


In [15]:
# 군집별 top n 핵심단어, 그 단어의 중심 위치 상대값, 대상 파일명들을 반환함. 
def get_cluster_details(cluster_model, cluster_data, feature_names, clusters_num, top_n_features=10):
    cluster_details = {}
    
    # cluster_centers array 의 값이 큰 순으로 정렬된 index 값을 반환
    # 군집 중심점(centroid)별 할당된 word 피처들의 거리값이 큰 순으로 값을 구하기 위함.  
    centroid_feature_ordered_ind = cluster_model.cluster_centers_.argsort()[:,::-1]
    
    #개별 군집별로 iteration하면서 핵심단어, 그 단어의 중심 위치 상대값, 대상 파일명 입력
    for cluster_num in range(clusters_num):
        # 개별 군집별 정보를 담을 데이터 초기화. 
        cluster_details[cluster_num] = {}
        cluster_details[cluster_num]['cluster'] = cluster_num
        
        # cluster_centers_.argsort()[:,::-1] 로 구한 index 를 이용하여 top n 피처 단어를 구함. 
        top_feature_indexes = centroid_feature_ordered_ind[cluster_num, :top_n_features]
        top_features = [ feature_names[ind] for ind in top_feature_indexes ]
        
        # top_feature_indexes를 이용해 해당 피처 단어의 중심 위치 상댓값 구함 
        top_feature_values = cluster_model.cluster_centers_[cluster_num, top_feature_indexes].tolist()
        
        # cluster_details 딕셔너리 객체에 개별 군집별 핵심 단어와 중심위치 상대값, 그리고 해당 파일명 입력
        cluster_details[cluster_num]['top_features'] = top_features
        cluster_details[cluster_num]['top_features_value'] = top_feature_values
        filenames = cluster_data[cluster_data['cluster_label'] == cluster_num]['filename']
        filenames = filenames.values.tolist()
        cluster_details[cluster_num]['filenames'] = filenames
        
    return cluster_details

In [16]:
def print_cluster_details(cluster_details):
    for cluster_num, cluster_detail in cluster_details.items():
        print('####### Cluster {0}'.format(cluster_num))
        print('Top features:', cluster_detail['top_features'])
        print('Reviews 파일명 :',cluster_detail['filenames'][:7])
        print('==================================================')

In [17]:
feature_names = tfidf_vect.get_feature_names()

cluster_details = get_cluster_details(cluster_model=km_cluster, cluster_data=document_df,\
                                  feature_names=feature_names, clusters_num=3, top_n_features=10 )
print_cluster_details(cluster_details)

####### Cluster 0
Top features: ['room', 'hotel', 'service', 'staff', 'food', 'location', 'bathroom', 'clean', 'price', 'parking']
Reviews 파일명 : ['/content/drive/MyDrive/ESAA(OB)/topics/food_swissotel_chicago', '/content/drive/MyDrive/ESAA(OB)/topics/bathroom_bestwestern_hotel_sfo', '/content/drive/MyDrive/ESAA(OB)/topics/food_holiday_inn_london', '/content/drive/MyDrive/ESAA(OB)/topics/location_bestwestern_hotel_sfo', '/content/drive/MyDrive/ESAA(OB)/topics/free_bestwestern_hotel_sfo', '/content/drive/MyDrive/ESAA(OB)/topics/location_holiday_inn_london', '/content/drive/MyDrive/ESAA(OB)/topics/rooms_swissotel_chicago']
####### Cluster 1
Top features: ['screen', 'battery', 'keyboard', 'battery life', 'kindle', 'direction', 'life', 'voice', 'video', 'feature']
Reviews 파일명 : ['/content/drive/MyDrive/ESAA(OB)/topics/battery-life_amazon_kindle', '/content/drive/MyDrive/ESAA(OB)/topics/battery-life_netbook_1005ha', '/content/drive/MyDrive/ESAA(OB)/topics/buttons_amazon_kindle', '/content/dr



# **Text Analysis 실습 _ 캐글 Mercari Price Suggestion Challenge**
- train_id: 데이터 id
- name: 제품명
- item_condition_id: 판매자가 제공하는 제품 상태
- category_name: 카테고리 명
- brand_name: 브랜드 이름
- price: 제품 가격, 예측을 위한 타깃 속성
- shipping: 배송비 무료 여부. 1이면 무료, 0이면 유료

## **데이터 전처리** 

In [21]:
!apt-get install p7zip
!p7zip -d -f -k ../content/drive/MyDrive/ESAA(OB)/train.tsv.7z

Reading package lists... Done
Building dependency tree       
Reading state information... Done
p7zip is already the newest version (16.02+dfsg-6).
p7zip set to manually installed.
The following packages were automatically installed and are no longer required:
  libnvidia-common-460 nsight-compute-2020.2.0
Use 'apt autoremove' to remove them.
0 upgraded, 0 newly installed, 0 to remove and 42 not upgraded.
/bin/bash: -c: line 0: syntax error near unexpected token `('
/bin/bash: -c: line 0: `p7zip -d -f -k ../content/drive/MyDrive/ESAA(OB)/train.tsv.7z'


In [None]:
from sklearn.linear_model import Ridge , LogisticRegression
from sklearn.model_selection import train_test_split , cross_val_score
from sklearn.feature_extraction.text import CountVectorizer , TfidfVectorizer
import pandas as pd

mercari_df= pd.read_csv('/content/drive/MyDrive/ESAA(OB)/train.tsv',sep='\t')
print(mercari_df.shape)
mercari_df.head(3)

In [None]:
print(mercari_df.info())

In [None]:
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline

y_train_df = mercari_df['price']
plt.figure(figsize=(6,4))
sns.distplot(y_train_df,kde=False)

In [None]:
import numpy as np

y_train_df = np.log1p(y_train_df)
sns.distplot(y_train_df,kde=False)

In [None]:
mercari_df['price'] = np.log1p(mercari_df['price'])
mercari_df['price'].head(3)

In [None]:
print('Shipping 값 유형:\n',mercari_df['shipping'].value_counts())
print('item_condition_id 값 유형:\n',mercari_df['item_condition_id'].value_counts())

In [None]:
boolean_cond= mercari_df['item_description']=='No description yet'
mercari_df[boolean_cond]['item_description'].count()

In [None]:
# apply lambda에서 호출되는 대,중,소 분할 함수 생성, 대,중,소 값을 리스트 반환
def split_cat(category_name):
    try:
        return category_name.split('/')
    except:
        return ['Other_Null' , 'Other_Null' , 'Other_Null']

# 위의 split_cat( )을 apply lambda에서 호출하여 대,중,소 컬럼을 mercari_df에 생성. 
mercari_df['cat_dae'], mercari_df['cat_jung'], mercari_df['cat_so'] = \
                        zip(*mercari_df['category_name'].apply(lambda x : split_cat(x)))

# 대분류만 값의 유형과 건수를 살펴보고, 중분류, 소분류는 값의 유형이 많으므로 분류 갯수만 추출
print('대분류 유형 :\n', mercari_df['cat_dae'].value_counts())
print('중분류 갯수 :', mercari_df['cat_jung'].nunique())
print('소분류 갯수 :', mercari_df['cat_so'].nunique())

In [None]:
mercari_df['brand_name'] = mercari_df['brand_name'].fillna(value='Other_Null')
mercari_df['category_name'] = mercari_df['category_name'].fillna(value='Other_Null')
mercari_df['item_description'] = mercari_df['item_description'].fillna(value='Other_Null')

# 각 컬럼별로 Null값 건수 확인. 모두 0가 나와야 합니다.
mercari_df.isnull().sum()

## **피처 인코딩과 피처 벡터화** 

In [None]:
print('brand name 의 유형 건수 :', mercari_df['brand_name'].nunique())
print('brand name sample 5건 : \n', mercari_df['brand_name'].value_counts()[:5])

In [None]:
print('name 의 종류 갯수 :', mercari_df['name'].nunique())
print('name sample 7건 : \n', mercari_df['name'][:7])

In [None]:
pd.set_option('max_colwidth', 200)

# item_description의 평균 문자열 개수
print('item_description 평균 문자열 개수:',mercari_df['item_description'].str.len().mean())

mercari_df['item_description'][:2]

In [None]:
# name 속성에 대한 feature vectorization 변환
cnt_vec = CountVectorizer()
X_name = cnt_vec.fit_transform(mercari_df.name)

# item_description 에 대한 feature vectorization 변환 
tfidf_descp = TfidfVectorizer(max_features = 50000, ngram_range= (1,3) , stop_words='english')
X_descp = tfidf_descp.fit_transform(mercari_df['item_description'])

print('name vectorization shape:',X_name.shape)
print('item_description vectorization shape:',X_descp.shape)

In [None]:
from sklearn.preprocessing import LabelBinarizer

# brand_name, item_condition_id, shipping 각 피처들을 희소 행렬 원-핫 인코딩 변환
lb_brand_name= LabelBinarizer(sparse_output=True)
X_brand = lb_brand_name.fit_transform(mercari_df['brand_name'])

lb_item_cond_id = LabelBinarizer(sparse_output=True)
X_item_cond_id = lb_item_cond_id.fit_transform(mercari_df['item_condition_id'])

lb_shipping= LabelBinarizer(sparse_output=True)
X_shipping = lb_shipping.fit_transform(mercari_df['shipping'])

# cat_dae, cat_jung, cat_so 각 피처들을 희소 행렬 원-핫 인코딩 변환
lb_cat_dae = LabelBinarizer(sparse_output=True)
X_cat_dae= lb_cat_dae.fit_transform(mercari_df['cat_dae'])

lb_cat_jung = LabelBinarizer(sparse_output=True)
X_cat_jung = lb_cat_jung.fit_transform(mercari_df['cat_jung'])

lb_cat_so = LabelBinarizer(sparse_output=True)
X_cat_so = lb_cat_so.fit_transform(mercari_df['cat_so']

In [None]:
print(type(X_brand), type(X_item_cond_id), type(X_shipping))
print('X_brand_shape:{0}, X_item_cond_id shape:{1}'.format(X_brand.shape, X_item_cond_id.shape))
print('X_shipping shape:{0}, X_cat_dae shape:{1}'.format(X_shipping.shape, X_cat_dae.shape))
print('X_cat_jung shape:{0}, X_cat_so shape:{1}'.format(X_cat_jung.shape, X_cat_so.shape))

In [None]:
from  scipy.sparse import hstack
import gc

sparse_matrix_list = (X_name, X_descp, X_brand, X_item_cond_id,
            X_shipping, X_cat_dae, X_cat_jung, X_cat_so)

# 사이파이 sparse 모듈의 hstack 함수를 이용하여 앞에서 인코딩과 Vectorization을 수행한 데이터 셋을 모두 결합. 
X_features_sparse= hstack(sparse_matrix_list).tocsr()
print(type(X_features_sparse), X_features_sparse.shape)

# 데이터 셋이 메모리를 많이 차지하므로 사용 용도가 끝났으면 바로 메모리에서 삭제. 
del X_features_sparse
gc.collect()

## **릿지 회귀 모델 구축 및 평가**

In [None]:
def rmsle(y , y_pred):
    # underflow, overflow를 막기 위해 log가 아닌 log1p로 rmsle 계산 
    return np.sqrt(np.mean(np.power(np.log1p(y) - np.log1p(y_pred), 2)))

def evaluate_org_price(y_test , preds): 
    
    # 원본 데이터는 log1p로 변환되었으므로 exmpm1으로 원복 필요. 
    preds_exmpm = np.expm1(preds)
    y_test_exmpm = np.expm1(y_test)
    
    # rmsle로 RMSLE 값 추출
    rmsle_result = rmsle(y_test_exmpm, preds_exmpm)
    return rmsle_result

In [None]:
import gc 
from  scipy.sparse import hstack

def model_train_predict(model,matrix_list):
    # scipy.sparse 모듈의 hstack 을 이용하여 sparse matrix 결합
    X= hstack(matrix_list).tocsr()     
    
    X_train, X_test, y_train, y_test=train_test_split(X, mercari_df['price'], 
                                                      test_size=0.2, random_state=156)
    
    # 모델 학습 및 예측
    model.fit(X_train , y_train)
    preds = model.predict(X_test)
    
    del X , X_train , X_test , y_train 
    gc.collect()
    
    return preds , y_test

In [None]:
linear_model = Ridge(solver = "lsqr", fit_intercept=False)

sparse_matrix_list = (X_name, X_brand, X_item_cond_id,
                      X_shipping, X_cat_dae, X_cat_jung, X_cat_so)
linear_preds , y_test = model_train_predict(model=linear_model ,matrix_list=sparse_matrix_list)
print('Item Description을 제외했을 때 rmsle 값:', evaluate_org_price(y_test , linear_preds))

sparse_matrix_list = (X_descp, X_name, X_brand, X_item_cond_id,
                      X_shipping, X_cat_dae, X_cat_jung, X_cat_so)
linear_preds , y_test = model_train_predict(model=linear_model , matrix_list=sparse_matrix_list)
print('Item Description을 포함한 rmsle 값:',  evaluate_org_price(y_test ,linear_preds))

## **LightGBM 회귀 모델 구축과 앙상블을 이용한 최종 예측 평가**

In [None]:
from lightgbm import LGBMRegressor

sparse_matrix_list = (X_descp, X_name, X_brand, X_item_cond_id,
                      X_shipping, X_cat_dae, X_cat_jung, X_cat_so)

lgbm_model = LGBMRegressor(n_estimators=200, learning_rate=0.5, num_leaves=125, random_state=156)
lgbm_preds , y_test = model_train_predict(model = lgbm_model , matrix_list=sparse_matrix_list)
print('LightGBM rmsle 값:',  evaluate_org_price(y_test , lgbm_preds))

In [None]:
preds = lgbm_preds * 0.45 + linear_preds * 0.55
print('LightGBM과 Ridge를 ensemble한 최종 rmsle 값:',  evaluate_org_price(y_test , preds))