### スクレイピングで取得する項目は以下とする

<ul>
    <li>評価</li>
    <li>ホテル名</li>
    <li>投稿内容</li>
    <li>目的</li>
    <li>同伴者</li>
    <li>Date</li>
</ul>

In [75]:
import datetime
import numpy as np
import pandas as pd
import re
import requests
import sys
import time

from bs4 import BeautifulSoup
from scipy.sparse import hstack
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, f1_score
from sklearn.model_selection import RandomizedSearchCV
from sklearn.naive_bayes import MultinomialNB
from sklearn.svm import SVC

from nlp_modules import text_to_word

### モデル構築のためのCSV準備

コマンドプロンプトで
scraping.pyを起動することでsample_csvフォルダに各ホテルが生成される

（エリアURLを指定することでできるが、おそらくいくつかのCSV書き出し後にブロックされる。。）

### データ分析

In [78]:
data1 = pd.read_csv('./sample_csv/151064_reputation.csv')
data2 = pd.read_csv('./sample_csv/179617_reputation.csv')

In [79]:
data1 = data1.drop('Unnamed: 0', axis=1)
data2 = data2.drop('Unnamed: 0', axis=1)

In [80]:
data = pd.concat([data1, data2]).reset_index(drop=True)
data['Date'] = pd.to_datetime(data['Date'], format='%Y年%m月')
data.shape

(593, 6)

In [81]:
# 前処理
data['Comments'] = [word.strip() for word in data['Comments']]
data['Comment_words'] = data['Comments'].apply(lambda x: text_to_word(x))

In [53]:
data_copy = data.copy()
X = data_copy[['Comment_words', 'Purposes', 'companions']]
X = pd.get_dummies(X, columns=['Purposes', 'companions'], drop_first=True)
y = data_copy['Reputations']

In [54]:
tfidf = TfidfVectorizer(min_df=4, max_df=.7)
x = tfidf.fit_transform(X['Comment_words'])
X_sparse = X.drop('Comment_words', axis=1).astype(pd.SparseDtype("int", np.nan))

In [55]:
X = hstack((x, X_sparse))

In [61]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)

nb = MultinomialNB()
nb.fit(X_train, y_train)

y_pred = nb.predict(X_test)
print(accuracy_score(y_true=y_test, y_pred=y_pred))
print(f1_score(y_true=y_test, y_pred=y_pred, average='macro'))

0.5714285714285714
0.17315837339223075


In [76]:
param_dst = {'C': np.arange(0.01, 10, 0.3), 'kernel': ['rbf', 'poly'], 'gamma': [0.01, 0.1]}

RS = RandomizedSearchCV(estimator=SVC(), param_distributions=param_dst, n_iter=50, cv=10, random_state=123)
RS.fit(X_train, y_train)
y_pred2 = RS.predict(X_test)

print('f1', f1_score(y_true=y_test, y_pred=y_pred, average='macro'))
print('acc', accuracy_score(y_true=y_test, y_pred=y_pred2))



f1 0.17315837339223075
acc 0.6134453781512605


In [77]:
RS.best_params_

{'kernel': 'rbf', 'gamma': 0.1, 'C': 4.51}