In [None]:
%matplotlib inline

In [None]:
# Univariate LSTM: predicting a sine wave (a tutorial that forecasts data with an obvious regular pattern)

In [None]:
# データ

In [None]:
import numpy as np
import matplotlib.pyplot as plt

In [None]:
# Sine-wave generator.
def sin(length=100, curve=4):
    """Return `length` samples of a sine wave covering `curve * pi` radians.

    Samples are taken at the integer positions 0 .. length-1, so the phase of
    sample i is `curve * pi * i / length`; the end point itself is excluded.
    """
    positions = np.arange(length)
    phase = (curve * np.pi) * positions / length
    return np.sin(phase)

In [None]:
# Visual check of the clean sine wave generated with the default parameters.
clean_wave = sin()
plt.plot(clean_wave)
plt.show()

In [None]:
# Sine wave with additive uniform noise.
def noised_sin(length=100, curve=4, noise_rate=0.1, low=-1.0, high=1.0):
    """Return a sine wave with uniform random noise added to every sample.

    The clean signal comes from `sin(length, curve)`.  Each sample then receives
    an independent draw from `np.random.uniform(low, high)` scaled by
    `noise_rate`.  The draw uses NumPy's global random state, so the output is
    only repeatable if the caller seeds that state beforehand.
    """
    clean = sin(length=length, curve=curve)
    jitter = np.random.uniform(low=low, high=high, size=len(clean))
    return clean + noise_rate * jitter

In [None]:
# Visual check of the noisy sine wave generated with the default parameters.
noisy_wave = noised_sin()
plt.plot(noisy_wave)
plt.show()

In [None]:
# Shape a series into supervised sliding-window training data.
def create_reccurent_dataset(vector_data, step=25):
    """Split a series into sliding windows and their next-step targets.

    Window i is `vector_data[i:i + step]` and its target is
    `vector_data[i + step]`, for i in 0 .. len(vector_data) - step - 1.

    Returns:
        (windows, targets) as NumPy arrays reshaped to
        (n_windows, step, 1) and (n_windows, 1) respectively.
    """
    window_count = len(vector_data) - step
    windows = [vector_data[start:start + step] for start in range(window_count)]
    next_values = [vector_data[start + step] for start in range(window_count)]
    reshape_data = np.array(windows).reshape(len(windows), step, 1)
    reshape_label = np.array(next_values).reshape(len(windows), 1)
    return reshape_data, reshape_label

In [None]:
# Check shapes and contents on a tiny, hand-verifiable sequence (1..9, window of 3).
toy_sequence = list(range(1, 10))
train_data, train_label = create_reccurent_dataset(toy_sequence, step=3)

In [None]:
# Training inputs: the series cut into `step`-long windows, stored as a 3-D array.
# Why 3-D? The last axis leaves room for extra features that may help the
# forecast (temperature, humidity, day of week, ...). Here there is one feature.
# Layout: (number of samples, time steps per sample, features)
for inspected in (train_data.shape, train_data):
    print(inspected)

In [None]:
# Target labels: the value that should follow each window, stored as a 2-D array.
for inspected in (train_label.shape, train_label):
    print(inspected)

In [None]:
# Forecasting a noisy sine wave: build the series, then window it.
# The noise is drawn from NumPy's global random state, which is not seeded here,
# so every run produces a different series.
wave_params = dict(length=100, curve=4, noise_rate=0.1)
data = noised_sin(**wave_params)
train_data, train_label = create_reccurent_dataset(data, step=25)

In [None]:
# Display the shape of the windowed inputs: (samples, steps, features).
train_data.shape

In [None]:
# Display the shape of the targets: (samples, 1).
train_label.shape

In [None]:
# モデル構築

In [None]:
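# Kinds of sequence-learning models for time-series data
# RNN, LSTM (long short-term memory), [QRNN](https://qiita.com/icoxfog417/items/d77912e10a7c60ae680e), and others
# LSTM variants
#   LSTM (1D)   e.g. takes a one-dimensional time series as input
#   ConvLSTM2D  e.g. takes video (a time series of 2-D frames) as input
#   ConvLSTM3D  e.g. takes changes of a 3-D space over time as input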

In [None]:
from tensorflow.python.keras.models import Sequential
from tensorflow.python.keras.layers import Dense, Activation, LSTM
from tensorflow.python.keras.optimizers import Adam
from tensorflow.python.keras.callbacks import EarlyStopping

In [None]:
step_count = 25 # number of time steps in one training sample (25 here)
feature_count = 1 # number of features per time step; this data has a single value
hidden_unit_count = 300 # passed as the first argument of LSTM below, i.e. the number of LSTM units / output dimensionality (not a number of layers) https://keras.io/ja/layers/recurrent/#lstm

In [None]:
# One LSTM layer followed by a single-unit dense head with an explicit linear
# activation, trained on mean squared error with Adam.
model = Sequential([
    LSTM(hidden_unit_count, batch_input_shape=(None, step_count, feature_count), return_sequences=False),
    Dense(1),
    Activation('linear'),
])
optimizer = Adam(lr=0.001)
model.compile(optimizer=optimizer, loss='mean_squared_error')

In [None]:
# Display the model's output shape as a sanity check.
model.output_shape

In [None]:
# Early stopping: end training once the monitored validation loss stops improving.
# This limits overfitting and removes the need to hand-pick an epoch count.
# Original author's expectation: training should stop after roughly 30-50 epochs.
early_stopping = EarlyStopping(
    monitor='val_loss',
    patience=20,
    mode='auto',
)

In [None]:
# Training. The last 10% of the samples is held out for validation, which is
# what the early-stopping callback monitors.
model.fit(
    x=train_data,
    y=train_label,
    epochs=100,
    batch_size=300,
    validation_split=0.1,
    callbacks=[early_stopping],
)

In [None]:
# Inference / check: predict on the training windows themselves, so this shows
# how well the model fits the data it was trained on, not how it generalizes.
predictions = model.predict(train_data)

In [None]:
# Overlay the model's predictions (red) on the training targets (blue).
plt.figure()
for plot_series, plot_color, plot_label in (
    (predictions, 'r', 'predictions'),
    (train_label, 'b', 'training_data'),
):
    plt.plot(range(len(plot_series)), plot_series, color=plot_color, label=plot_label)
plt.legend()
plt.show()

In [None]:
# Autoregressive forecast: start from the last training window and repeatedly
# feed the model its own prediction as the newest time step.
latest_data = train_data[len(train_data)-1]
results = []
for _ in range(50):
    # Inference. The window length is inferred from the data (-1) instead of
    # being hard-coded, so this cell keeps working if the step size changes.
    test_data = np.reshape(latest_data, (1, -1, 1))
    batch_predict = model.predict(test_data)

    # Accumulate the single predicted value.
    results.append(batch_predict[0, 0])

    # Slide the window: drop the oldest value, append the new prediction.
    # np.delete / np.append without an axis work on the flattened window.
    latest_data = np.delete(latest_data, 0)
    latest_data = np.append(latest_data, batch_predict)

# Column vector of forecasts, shape (50, 1), float64 like the training data.
results = np.array(results, dtype=np.float64).reshape(-1, 1)

In [None]:
# Plot fit, targets and the autoregressive forecast together.
# The forecast is drawn starting at the index of the last training target.
plt.figure()
plt.plot(range(len(predictions)), predictions, color='r', label='predictions')
plt.plot(range(len(train_label)), train_label, color='b', label='training_data')
future_start = len(train_label) - 1
plt.plot(range(future_start, future_start + len(results)), results, color='g', label='future_predictions')
plt.legend()
plt.show()

In [None]:
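# Multivariate LSTM: forecasting ice-cream sales (data with a fairly regular pattern; the forecast uses temperature and other weather features in addition to past sales)
# Reference data (reformatted here for easier use): https://oku.edu.mie-u.ac.jp/~okumura/stat/160118.html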

import pandas

In [None]:
# Load the weather / ice-cream sales table (path is relative to the working directory).
# Note: this rebinds `data`, which held the noisy sine series earlier in the notebook.
data = pandas.read_csv('volume/datasets/tokyo-weather-and-ice-sales.csv')
data

In [None]:
# CSV columns: year, month, avg_max_temperature, sum_precipitation_mm, avg_humidity_per, 25c_days, ice_sale
# Plotted series, in order:
#   avg_max_temperature  - average daily maximum temperature
#   sum_precipitation_mm - total precipitation
#   avg_humidity_per     - average humidity
#   25c_days             - number of days at or above 25 degrees C
#   ice_sale             - ice-cream sales (yen)
for column_name in ('avg_max_temperature', 'sum_precipitation_mm', 'avg_humidity_per', '25c_days', 'ice_sale'):
    plt.plot(data[column_name], label=column_name)
plt.legend()
plt.show()

In [None]:
# Ice-cream sales (yen) are forecast from: average maximum temperature, total
# precipitation, average humidity and the number of days at or above 25 C.

def _column_vector(column_name):
    """Return `data[column_name]` as a NumPy column vector of shape (n_rows, 1)."""
    values = data[column_name].values.tolist()
    return np.array(values).reshape((len(values), 1))

# Each print shows the resulting shape (the original notes expect (120, 1)).
temps = _column_vector('avg_max_temperature')
print(temps.shape)

precs = _column_vector('sum_precipitation_mm')
print(precs.shape)

humidities = _column_vector('avg_humidity_per')
print(humidities.shape)

up25days = _column_vector('25c_days')
print(up25days.shape)

icesales = _column_vector('ice_sale')
print(icesales.shape)

In [None]:
# Min-max normalization: rescale every feature independently into the 0-1 range.
# The same scaler object is re-fitted for each column, so after this cell it
# only holds the statistics of the last column (ice-cream sales).
from sklearn.preprocessing import MinMaxScaler
scaler = MinMaxScaler(feature_range=(0, 1))

(
    normalized_temps,
    normalized_precs,
    normalized_humidities,
    normalized_up25days,
    normalized_icesales,
) = (
    scaler.fit_transform(raw_column)
    for raw_column in (temps, precs, humidities, up25days, icesales)
)

In [None]:
# Peek at the first ten normalized sales values.
normalized_icesales[0:10]

In [None]:
# Monthly data with a yearly cycle, so each window is 12 steps long.
# Aside from the original author: inspect the data often. Typing `icesales`
# instead of `normalized_icesales` left that one feature roughly 300x larger
# than the rest and made training get stuck.
temps_train_data, temps_train_label = create_reccurent_dataset(normalized_temps, step=12)
precs_train_data, precs_train_label = create_reccurent_dataset(normalized_precs, step=12)
up25days_train_data, up25days_train_label = create_reccurent_dataset(normalized_up25days, step=12)
humidities_train_data, humidities_train_label = create_reccurent_dataset(normalized_humidities, step=12)
icesales_train_data, icesales_train_label = create_reccurent_dataset(normalized_icesales, step=12)

# Print the window shape and the label shape for each feature in turn.
for feature_windows, feature_labels in (
    (temps_train_data, temps_train_label),
    (precs_train_data, precs_train_label),
    (up25days_train_data, up25days_train_label),
    (humidities_train_data, humidities_train_label),
    (icesales_train_data, icesales_train_label),
):
    print(feature_windows.shape)
    print(feature_labels.shape)

In [None]:
# Model input layout = (samples, steps, features); here (samples, 12, 5).
# Inputs and labels both carry the same five features, stacked along the last
# axis in this order. The column indices must match the ones the plotting cells
# use (index 2 is plotted as humidity, index 3 as 25c_days):
#   0: average maximum temperature
#   1: total precipitation
#   2: average humidity
#   3: days at or above 25 C
#   4: ice-cream sales (yen)
train_data = np.c_[temps_train_data, precs_train_data, humidities_train_data, up25days_train_data, icesales_train_data]
train_label = np.c_[temps_train_label, precs_train_label, humidities_train_label, up25days_train_label, icesales_train_label]
print(train_data.shape)
print(train_label.shape)

In [None]:
# Model definition: 12-step windows of 5 features in, 5 features out.
step_count = 12
feature_count = 5
hidden_unit_count = 120

model = Sequential([
    LSTM(hidden_unit_count, batch_input_shape=(None, step_count, feature_count)),
    Dense(feature_count),
    Activation('linear'),
])
optimizer = Adam(lr=0.003)
model.compile(optimizer=optimizer, loss='mean_squared_error')
early_stopping = EarlyStopping(monitor='val_loss', patience=20, mode='auto')

In [None]:
# Training. batch_size exceeds the number of samples, so each epoch is
# presumably a single full-batch update - confirm against the printed shapes.
model.fit(
    x=train_data,
    y=train_label,
    epochs=200,
    batch_size=2000,
    validation_split=0.1,
    callbacks=[early_stopping],
)

In [None]:
# Inference / check on the training windows; show the first five predicted rows.
predictions = model.predict(train_data)
predictions[0:5]

In [None]:
# Average maximum temperature: labels (light red) vs. predictions (red), column 0.
plt.figure()
for plot_series, plot_color, plot_label in (
    (train_label, (1.0, 0.7, 0.7), 'train_label temp'),
    (predictions, (1.0, 0.0, 0.0), 'predictions temp'),
):
    plt.plot(range(len(plot_series)), plot_series[:, 0], color=plot_color, label=plot_label)
plt.legend()
plt.show()

In [None]:
# Total precipitation: labels (light blue) vs. predictions (blue), column 1.
plt.figure()
for plot_series, plot_color, plot_label in (
    (train_label, (0.7, 0.7, 1.0), 'train_label prec'),
    (predictions, (0.0, 0.0, 1.0), 'predictions prec'),
):
    plt.plot(range(len(plot_series)), plot_series[:, 1], color=plot_color, label=plot_label)
plt.legend()
plt.show()

In [None]:
# Average humidity: labels (light cyan) vs. predictions (cyan), plotted from column 2.
plt.figure()
for plot_series, plot_color, plot_label in (
    (train_label, (0.7, 1.0, 1.0), 'train_label humidity'),
    (predictions, (0.0, 1.0, 1.0), 'predictions humidity'),
):
    plt.plot(range(len(plot_series)), plot_series[:, 2], color=plot_color, label=plot_label)
plt.legend()
plt.show()

In [None]:
# Days at or above 25 C: labels (light yellow) vs. predictions (yellow), plotted from column 3.
plt.figure()
for plot_series, plot_color, plot_label in (
    (train_label, (1.0, 1.0, 0.7), 'train_label 25c_days'),
    (predictions, (1.0, 1.0, 0.0), 'predictions 25c_days'),
):
    plt.plot(range(len(plot_series)), plot_series[:, 3], color=plot_color, label=plot_label)
plt.legend()
plt.show()

In [None]:
# Ice-cream sales: labels (light cyan) vs. predictions (cyan), column 4.
plt.figure()
for plot_series, plot_color, plot_label in (
    (train_label, (0.7, 1.0, 1.0), 'train_label ice_sale'),
    (predictions, (0.0, 1.0, 1.0), 'predictions ice_sale'),
):
    plt.plot(range(len(plot_series)), plot_series[:, 4], color=plot_color, label=plot_label)
plt.legend()
plt.show()

In [None]:
# All features in one figure: for each column, the training labels (light
# shade) and the model's predictions (saturated shade).
# Each entry: (column, label color, prediction color, label legend, prediction legend).
# ice_sale uses magenta so it stays distinguishable from humidity's cyan when
# both are drawn in the same axes.
plt.figure()
for column, label_color, prediction_color, label_legend, prediction_legend in (
    (0, (1.0, 0.7, 0.7), (1.0, 0.0, 0.0), 'train_label temp', 'predictions temp'),
    (1, (0.7, 0.7, 1.0), (0.0, 0.0, 1.0), 'train_label prec', 'predictions prec'),
    (2, (0.7, 1.0, 1.0), (0.0, 1.0, 1.0), 'train_label humidity', 'predictions humidity'),
    (3, (1.0, 1.0, 0.7), (1.0, 1.0, 0.0), 'train_label 25c_days', 'predictions 25c_days'),
    (4, (1.0, 0.7, 1.0), (1.0, 0.0, 1.0), 'train_label ice_sale', 'predictions ice_sale'),
):
    plt.plot(range(0, len(train_label)), train_label[:, column], color=label_color, label=label_legend)
    plt.plot(range(0, len(predictions)), predictions[:, column], color=prediction_color, label=prediction_legend)
plt.legend()
plt.show()

In [None]:
# Autoregressive forecast for the next 5 months.
# Seed with the last training window, wrapped in a batch axis:
# (1, steps, features).
latest_data = np.array([train_data[len(train_data)-1]])
results = []
for _ in range(5):
    # Inference: one output row holding one value per feature.
    batch_predict = model.predict(latest_data)
    # Accumulate the predicted feature vector.
    results.append(batch_predict[0])
    # Slide the window along the time axis (axis=1): drop the oldest step, then
    # append the prediction as the newest step. np.delete's second argument is
    # the INDEX to remove, so it must be 0 here, not the predicted values.
    latest_data = np.delete(latest_data, 0, axis=1)
    latest_data = np.append(latest_data, np.array([batch_predict]), axis=1)
results = np.array(results)
results.shape

In [None]:
# Training labels for every feature (light shades) followed by the
# autoregressive forecast (saturated shades) in one figure.
# Each entry: (column, label color, forecast color, label legend, forecast legend).
# ice_sale uses magenta so it stays distinguishable from humidity's cyan.
plt.figure()

feature_styles = (
    (0, (1.0, 0.7, 0.7), (1.0, 0.0, 0.0), 'train_label temp', 'future temp'),
    (1, (0.7, 0.7, 1.0), (0.0, 0.0, 1.0), 'train_label prec', 'future prec'),
    (2, (0.7, 1.0, 1.0), (0.0, 1.0, 1.0), 'train_label humidity ', 'future humidity'),
    (3, (1.0, 1.0, 0.7), (1.0, 1.0, 0.0), 'train_label 25c_days', 'future 25c_days'),
    (4, (1.0, 0.7, 1.0), (1.0, 0.0, 1.0), 'train_label ice_sale', 'future ice_sales'),
)

for column, label_color, _, label_legend, _ in feature_styles:
    plt.plot(range(0, len(train_label)), train_label[:, column], color=label_color, label=label_legend)

# The forecast is drawn starting at the index of the last training label.
future_start = len(train_label) - 1
for column, _, future_color, _, future_legend in feature_styles:
    plt.plot(range(future_start, future_start + len(results)), results[:, column], color=future_color, label=future_legend)

plt.legend()
plt.show()

In [None]:
# pybitflyer データ取得

In [None]:
import pybitflyer

In [None]:
# Fetch a single BTC/JPY ticker snapshot through pybitflyer (no credentials are passed).
# Note: this rebinds `data`, which held the weather DataFrame in the previous section.
api = pybitflyer.API()
data = api.ticker(product_code='BTC_JPY')

In [None]:
# Display the raw ticker response.
data

In [None]:
print(data['ltp']) # last traded price
print(data['best_ask']) # best ask: lowest price currently offered by sellers (standard bid/ask terminology)
print(data['best_bid']) # best bid: highest price currently offered by buyers

In [None]:
# 単変量のLSTM
# まずは最終取引価格だけで最終取引価格を予測

In [None]:
import time
from datetime import datetime

In [None]:
# Poll the BTC/JPY ticker `count` times, one second apart, keeping every
# raw response in `btc_jpy_data`.
btc_jpy_data = []
count = 5
api = pybitflyer.API()
for _ in range(count):
    tick = api.ticker(product_code='BTC_JPY')
    # Log the index of this sample and its last traded price.
    print('tick={} ltp={}'.format(len(btc_jpy_data), tick['ltp']))
    btc_jpy_data.append(tick)
    time.sleep(1)

In [None]:
# Display the ticker samples collected by the polling loop.
btc_jpy_data

In [None]:
# Load previously saved ticker samples from file instead.
# This replaces whatever the live polling cell stored in `btc_jpy_data`.
import json
with open('volume/datasets/BTC_JPY.json', mode='r') as dataset_file:
    btc_jpy_data = json.load(dataset_file)

In [None]:
# Display the ticker samples loaded from file.
btc_jpy_data

In [None]:
# Keep only the last traded price (ltp), one single-element row per sample,
# i.e. a list of [ltp] lists ready for the scaler.
data = [[d['ltp']] for d in btc_jpy_data]

In [None]:
# Display the extracted ltp rows.
data

In [None]:
# Plot the raw last-traded-price series.
plt.figure()
sample_index = range(len(data))
plt.plot(sample_index, data, color='r', label='data')
plt.legend()
plt.show()

In [None]:
# Min-max normalization: rescale all values into the 0-1 range.
# The scaler is fitted on the full series here, before any train/validation split.
from sklearn.preprocessing import MinMaxScaler
scaler = MinMaxScaler(feature_range=(0, 1))
normalized_data = scaler.fit_transform(data)
print(normalized_data)

In [None]:
# Window the normalized prices: 5 consecutive ticks in, the next tick out.
train_data, train_label = create_reccurent_dataset(normalized_data, step=5)
for windowed in (train_data, train_label):
    print(windowed.shape)

In [None]:
# Model definition.
# Author's note: tuning the learning rate seems to be the key. With 0.00001
# training was still unstable after 200 steps; with 0.0001 EarlyStopping
# triggers after about 50 steps.
step_count = 5
feature_count = 1
hidden_unit_count = 300
model = Sequential([
    LSTM(hidden_unit_count, batch_input_shape=(None, step_count, feature_count), return_sequences=False),
    Dense(feature_count),
    Activation('linear'),
])
# Learning rates listed by the original author: 0.001 -> 0.003 -> 0.03 -> 0.01
optimizer = Adam(lr=0.0001)
model.compile(optimizer=optimizer, loss='mean_squared_error')
# patience=0: stop at the first epoch whose validation loss does not improve.
early_stopping = EarlyStopping(monitor='val_loss', patience=0, mode='auto')

In [None]:
# Training, with the last 10% of the windows held out for validation.
model.fit(
    x=train_data,
    y=train_label,
    epochs=200,
    batch_size=2000,
    validation_split=0.1,
    callbacks=[early_stopping],
)

In [None]:
# Probe: a steadily decreasing window (values are on the normalized 0-1 scale).
model.predict(np.array([[[1.0],[0.8],[0.6],[0.4],[0.2]]]))[0][0]

In [None]:
# Probe: a constant window at the minimum of the normalized range.
model.predict(np.array([[[0.0],[0.0],[0.0],[0.0],[0.0]]]))[0][0]

In [None]:
# Probe: a constant window at the maximum of the normalized range.
model.predict(np.array([[[1.0],[1.0],[1.0],[1.0],[1.0]]]))[0][0]

In [None]:
# Probe: a steadily increasing window.
model.predict(np.array([[[0.2],[0.4],[0.6],[0.8],[1.0]]]))[0][0]

In [None]:
# Inference / check: predict on the training windows themselves.
predictions = model.predict(train_data)

In [None]:
# Overlay the model's predictions (red) on the training labels (blue).
plt.figure()
for plot_series, plot_color, plot_label in (
    (predictions, 'r', 'predictions'),
    (train_label, 'b', 'train_label'),
):
    plt.plot(range(len(plot_series)), plot_series, color=plot_color, label=plot_label)
plt.legend()
plt.show()

In [None]:
# Autoregressive forecast: start from the last training window and repeatedly
# feed the model its own prediction as the newest time step.
latest_data = train_data[len(train_data)-1]
results = []
for _ in range(50):
    # Inference. The window length is inferred from the data (-1) instead of
    # being hard-coded, so this cell keeps working if the step size changes.
    test_data = np.reshape(latest_data, (1, -1, 1))
    batch_predict = model.predict(test_data)
    # Accumulate the single predicted value.
    results.append(batch_predict[0, 0])
    # Slide the window: drop the oldest value, append the new prediction.
    # np.delete / np.append without an axis work on the flattened window.
    latest_data = np.delete(latest_data, 0)
    latest_data = np.append(latest_data, batch_predict)

# Column vector of forecasts, shape (count, 1), float64 like the training data.
results = np.array(results, dtype=np.float64).reshape(-1, 1)

In [None]:
# Plot fit, labels and the autoregressive forecast together.
# The forecast is drawn starting at the index of the last training label.
plt.figure()
plt.plot(range(len(predictions)), predictions, color='r', label='predictions')
plt.plot(range(len(train_label)), train_label, color='b', label='train_label')
future_start = len(train_label) - 1
plt.plot(range(future_start, future_start + len(results)), results, color='g', label='future_predictions')
plt.legend()
plt.show()

In [None]:
# 多変量のLSTM
# 利用する特徴 ltp, best_ask, best_bid

In [None]:
def _ticker_column(field):
    """Return one ticker field from every sample as an (n_samples, 1) array."""
    return np.array([[sample[field]] for sample in btc_jpy_data])

# One column vector per feature used by the multivariate model.
ltp_data = _ticker_column('ltp')
best_ask_data = _ticker_column('best_ask')
best_bid_data = _ticker_column('best_bid')

In [None]:
# Min-max normalization: each feature is rescaled into 0-1 independently.
# The same scaler object is re-fitted per feature, so afterwards it only holds
# the statistics of the last one (best_bid). The raw arrays are overwritten.
scaler = MinMaxScaler(feature_range=(0, 1))
ltp_data, best_ask_data, best_bid_data = (
    scaler.fit_transform(raw_column)
    for raw_column in (ltp_data, best_ask_data, best_bid_data)
)

In [None]:
# Plot the three normalized ticker features together.
plt.figure()
for plot_series, plot_color, plot_label in (
    (ltp_data, 'r', 'ltp'),
    (best_ask_data, 'g', 'best_ask'),
    (best_bid_data, 'b', 'best_bid'),
):
    plt.plot(range(len(plot_series)), plot_series, color=plot_color, label=plot_label)
plt.legend()
plt.show()

In [None]:
# Window each feature separately: 5 consecutive ticks in, the next tick out.
(
    (ltp_train_data, ltp_train_label),
    (best_ask_train_data, best_ask_train_label),
    (best_bid_train_data, best_bid_train_label),
) = (
    create_reccurent_dataset(feature_series, step=5)
    for feature_series in (ltp_data, best_ask_data, best_bid_data)
)

In [None]:
# Shapes of the per-feature input windows.
for feature_windows in (ltp_train_data, best_ask_train_data, best_bid_train_data):
    print(feature_windows.shape)

In [None]:
# Shapes of the per-feature labels.
for feature_labels in (ltp_train_label, best_ask_train_label, best_bid_train_label):
    print(feature_labels.shape)

In [None]:
# Model input layout = (samples, steps, features)
#   univariate section above: (samples, 5, 1)
#   this section:             (samples, 5, 3)
# Inputs and labels both carry ltp, best_ask, best_bid, in that column order.
stacked_windows = (ltp_train_data, best_ask_train_data, best_bid_train_data)
stacked_labels = (ltp_train_label, best_ask_train_label, best_bid_train_label)
train_data = np.c_[stacked_windows]
train_label = np.c_[stacked_labels]
for stacked in (train_data, train_label):
    print(stacked.shape)

In [None]:
# Model definition: 5-step windows of 3 features in, 3 features out.
step_count = 5
feature_count = 3
hidden_unit_count = 300

model = Sequential([
    LSTM(hidden_unit_count, batch_input_shape=(None, step_count, feature_count), return_sequences=False),
    Dense(feature_count),
    Activation('linear'),
])
optimizer = Adam(lr=0.001)
model.compile(optimizer=optimizer, loss='mean_squared_error')
early_stopping = EarlyStopping(monitor='val_loss', patience=20, mode='auto')

In [None]:
# Training, with the last 10% of the windows held out for validation.
model.fit(
    x=train_data,
    y=train_label,
    epochs=200,
    batch_size=2000,
    validation_split=0.1,
    callbacks=[early_stopping],
)

In [None]:
# Inference / check: predict on the training windows themselves.
predictions = model.predict(train_data)

In [None]:
# Training labels (light shades) first, then predictions (saturated shades),
# for columns 0..2 = ltp, best_ask, best_bid.
plt.figure()
for plot_series, plot_column, plot_color, plot_label in (
    (train_label, 0, (1.0, 0.7, 0.7), 'train_label ltp'),
    (train_label, 1, (0.7, 1.0, 0.7), 'train_label best_ask'),
    (train_label, 2, (0.7, 0.7, 1.0), 'train_label best_bid'),
    (predictions, 0, (1.0, 0.0, 0.0), 'predictions ltp'),
    (predictions, 1, (0.0, 1.0, 0.0), 'predictions best_ask'),
    (predictions, 2, (0.0, 0.0, 1.0), 'predictions best_bid'),
):
    plt.plot(range(len(plot_series)), plot_series[:, plot_column], color=plot_color, label=plot_label)
plt.legend()
plt.show()

In [None]:
# Autoregressive forecast of the next 50 ticks.
# Seed with the last training window, wrapped in a batch axis:
# (1, steps, features).
latest_data = np.array([train_data[len(train_data)-1]])
results = []
for _ in range(50):
    # Inference: one output row holding one value per feature.
    batch_predict = model.predict(latest_data)
    # Accumulate the predicted feature vector.
    results.append(batch_predict[0])
    # Slide the window along the time axis (axis=1): drop the oldest step, then
    # append the prediction as the newest step. np.delete's second argument is
    # the INDEX to remove, so it must be 0 here, not the predicted values.
    latest_data = np.delete(latest_data, 0, axis=1)
    latest_data = np.append(latest_data, np.array([batch_predict]), axis=1)
results = np.array(results)
results.shape

In [None]:
# Training labels (light shades) followed by the autoregressive forecast
# (saturated shades); the forecast starts at the index of the last label.
plt.figure()
for plot_column, plot_color, plot_label in (
    (0, (1.0, 0.7, 0.7), 'train_label ltp'),
    (1, (0.7, 1.0, 0.7), 'train_label best_ask'),
    (2, (0.7, 0.7, 1.0), 'train_label best_bid'),
):
    plt.plot(range(len(train_label)), train_label[:, plot_column], color=plot_color, label=plot_label)
future_start = len(train_label) - 1
future_index = range(future_start, future_start + len(results))
for plot_column, plot_color, plot_label in (
    (0, (1.0, 0.0, 0.0), 'future ltp'),
    (1, (0.0, 1.0, 0.0), 'future best_ask'),
    (2, (0.0, 0.0, 1.0), 'future best_bid'),
):
    plt.plot(future_index, results[:, plot_column], color=plot_color, label=plot_label)
plt.legend()
plt.show()

In [None]:
# TODO: リアルタイム予測 and 学習
# 特徴量を増やすと良いかもしれない 最大、最小、ローソク足の最大最小、関連ニュース数(良し悪し別)、曜日

In [None]:
# TODO: RNN  word2vec

In [None]:
# 参考
# https://deepage.net/bigdata/machine_learning/2016/09/02/word2vec_power_of_word_vector.html
# https://qiita.com/Hironsan/items/513b9f93752ecee9e670
# https://qiita.com/Hironsan/items/11b388575a058dc8a46a
# https://qiita.com/kenta1984/items/93b64768494f971edf86

In [None]:
import gensim

In [None]:
# Load pre-trained word vectors stored in the text (non-binary) word2vec format.
# Note: this rebinds `model`, which held the Keras model in the previous sections.
model = gensim.models.KeyedVectors.load_word2vec_format('volume/models/word2vec.vec', binary=False)

In [None]:
# Author's note: `model.wv` appears to be just an alias of `model` itself.
# Show the dimensionality of the vector for "ピザ" (pizza).
model['ピザ'].shape

In [None]:
# Display the raw embedding vector for "ピザ" (pizza).
model['ピザ']

In [None]:
# Most similar words together with their similarity scores.
model.most_similar(positive=['ピザ']) # or model.similar_by_word('ピザ')

In [None]:
# Same query with the word supplied as a negative example instead of a positive one.
model.most_similar(negative=['ピザ'])

In [None]:
# Pairwise similarity between "ピザ" (pizza) and a few other words
# (hamburger, bread, spoon, chair).
for other_word in ('ハンバーガー', 'パン', 'スプーン', 'イス'):
    print(model.similarity('ピザ', other_word))

In [None]:
# Look up similar words (with similarity scores) starting from a raw vector
# rather than from a word.
vector = model['ピザ']
model.similar_by_vector(vector)

In [None]:
# Vector arithmetic: "王" (king) - "男" (man) + "女" (woman), then look up the
# words nearest to the resulting vector.
vector = model['王'] - model['男'] + model['女']
model.similar_by_vector(vector)