In [None]:
stations = pd.read_csv('input/stations_tokai3ken_edited.csv', index_col=0)

In [None]:
X = stations.drop(['飲食店事業所数'], axis=1)
y = stations['飲食店事業所数']

In [None]:
X = pd.get_dummies(X)

In [None]:
X

In [None]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42)

In [None]:
import lightgbm as lgb
from sklearn.model_selection import cross_val_score
model = lgb.LGBMRegressor()
score = cross_val_score(model, X_train, y_train, cv=5)

In [None]:
score

In [None]:
model.fit(X_train, y_train)

In [None]:
# Rank features by the fitted model's importances and chart the 20 highest.
plt.figure(figsize=(16, 10))
feature_importance = pd.Series(
    model.feature_importances_,
    index=X.columns,
    name='feature_importance',
).sort_values(ascending=False)
top_features = feature_importance.iloc[:20]
sns.barplot(x=top_features.index, y=top_features.values)

In [None]:
predict = model.predict(X_test)

In [None]:
from sklearn.metrics import r2_score
r2_score(predict, y_test)

In [None]:
fig, ax = plt.subplots()
ax.scatter(y_test, predict, edgecolors=(0, 0, 0))
ax.plot([y_test.min(), y_test.max()], [y_test.min(), y_test.max()], 'k--', lw=1)
ax.set_xlabel('Measured')
ax.set_ylabel('Predicted')
plt.show()

In [None]:
fig, ax = plt.subplots()
ax.scatter(predict, y_test, edgecolors=(0, 0, 0))
ax.plot([predict.min(), predict.max()], [predict.min(), predict.max()], 'k--', lw=1)
ax.set_xlabel('Predicted')
ax.set_ylabel('Measured')
plt.show()

# 路線ID捨てる（正確な予測用）

In [None]:
stations = pd.read_csv('input/stations_tokai3ken_edited.csv', index_col=0)

In [None]:
X = stations.drop(['飲食店事業所数', '路線ID'], axis=1)
y = stations['飲食店事業所数']

In [None]:
#X

In [None]:
#y

In [None]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42)

In [None]:
y_test.iloc[1]

In [None]:
#print(len(X_train),len( X_test), len(y_train), len(y_test))

In [None]:
# Move Tajimi, Toki, (Mizunami,) Ena and Nakatsugawa into the test set.
# NOTE(review): Mizunami is parenthesised in the original note and is not
# moved here — presumably the seeded split already put it in the test set; confirm.
tono_ids = ['g_1189', 'g_1190', 'g_1194', 'g_1196']

# Select by label list instead of rebuilding from individual row Series:
# this keeps each column's dtype and the target Series' name intact.
X_tono = X_train.loc[tono_ids]
y_tono = y_train.loc[tono_ids]

# Append the selected stations to the test partition ...
X_test = pd.concat([X_test, X_tono], axis=0)
y_test = pd.concat([y_test, y_tono], axis=0)

# ... and remove them from the training partition.
X_train = X_train.drop(tono_ids, axis=0)
y_train = y_train.drop(tono_ids, axis=0)

In [None]:
#print(len(X_train),len( X_test), len(y_train), len(y_test))

In [None]:
import lightgbm as lgb
from sklearn.grid_search import GridSearchCV
gbm = lgb.LGBMRegressor()
#param = {'learning_rate':[0.05, 0.1, 0.2], 'max_depth':[-1, 5, 10], 'n_estimators':[100, 300, 1000]}
param = {'learning_rate':[0.2], 'max_depth':[5], 'n_estimators':[1000]}
model = GridSearchCV(gbm, param, cv=5)

In [None]:
model.fit(X_train, y_train)

In [None]:
model.score(X_train, y_train)

In [None]:
model.best_params_

In [None]:
predict = model.predict(X_test)

In [None]:
from sklearn.metrics import r2_score
r2_score(predict, y_test)

In [None]:
fig, ax = plt.subplots()
ax.scatter(y_test, predict, edgecolors=(0, 0, 0))
ax.plot([y_test.min(), y_test.max()], [y_test.min(), y_test.max()], 'k--', lw=1)
ax.set_xlabel('Measured')
ax.set_ylabel('Predicted')
plt.show()

In [None]:
fig, ax = plt.subplots()
ax.scatter(predict, y_test, edgecolors=(0, 0, 0))
ax.plot([predict.min(), predict.max()], [predict.min(), predict.max()], 'k--', lw=1)
ax.set_xlabel('Predicted')
ax.set_ylabel('Measured')
plt.show()

In [None]:
error = pd.Series(data=predict-y_test.values, index=y_test.index)

In [None]:
error

In [None]:
#多治見、土岐、瑞浪、恵那、中津川
print(error['g_1189'], error['g_1190'], error['g_1191'], error['g_1194'], error['g_1196'])
#瑞浪、飲食店の数べつに少なくないやん！（中津川は多過ぎるけど）

# 路線ID、産業指標捨てる（どの説明変数が効いてるかチェック用）

In [None]:
stations = pd.read_csv('input/stations_tokai3ken_edited.csv', index_col=0)

In [None]:
X = stations.drop(['生徒学生数', '飲食店事業所数', '路線ID', '小売事業所数', '小売業年間商品販売額', '全産業事業所数', '買回り品(事業所数比率)', '最寄り品(事業所数比率)', '全産業従業者総数'], axis=1)
y = stations['飲食店事業所数']

In [None]:
X

In [None]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42)

In [None]:
# Move Tajimi, Toki, (Mizunami,) Ena and Nakatsugawa into the test set.
# NOTE(review): Mizunami is parenthesised in the original note and is not
# moved here — presumably the seeded split already put it in the test set; confirm.
tono_ids = ['g_1189', 'g_1190', 'g_1194', 'g_1196']

# Select by label list instead of rebuilding from individual row Series:
# this keeps each column's dtype and the target Series' name intact.
X_tono = X_train.loc[tono_ids]
y_tono = y_train.loc[tono_ids]

# Append the selected stations to the test partition ...
X_test = pd.concat([X_test, X_tono], axis=0)
y_test = pd.concat([y_test, y_tono], axis=0)

# ... and remove them from the training partition.
X_train = X_train.drop(tono_ids, axis=0)
y_train = y_train.drop(tono_ids, axis=0)

In [None]:
import lightgbm as lgb
from sklearn.model_selection import cross_val_score
model = lgb.LGBMRegressor(learning_rate=0.2, max_depth=5, n_estimators=1000)
score = cross_val_score(model, X_train, y_train, cv=5)

In [None]:
score

In [None]:
model.fit(X_train, y_train)

In [None]:
# Rank every feature by the fitted model's importances and chart them all,
# most important first.
plt.figure(figsize=(16, 10))
feature_importance = pd.Series(
    model.feature_importances_,
    index=X.columns,
    name='feature_importance',
).sort_values(ascending=False)
sns.barplot(
    x=feature_importance.index,
    y=feature_importance.values,
)

#LGBMのパラメータいじると、順番変わってくる...

In [None]:
predict = model.predict(X_test)

In [None]:
from sklearn.metrics import r2_score
r2_score(predict, y_test)

In [None]:
fig, ax = plt.subplots()
ax.scatter(y_test, predict, edgecolors=(0, 0, 0))
ax.plot([y_test.min(), y_test.max()], [y_test.min(), y_test.max()], 'k--', lw=1)
ax.set_xlabel('Measured')
ax.set_ylabel('Predicted')
plt.show()

In [None]:
fig, ax = plt.subplots()
ax.scatter(predict, y_test, edgecolors=(0, 0, 0))
ax.plot([predict.min(), predict.max()], [predict.min(), predict.max()], 'k--', lw=1)
ax.set_xlabel('Predicted')
ax.set_ylabel('Measured')
plt.show()

In [None]:
error = pd.Series(data=predict-y_test.values, index=y_test.index)

In [None]:
error

In [None]:
#多治見、土岐、瑞浪、恵那、中津川
print(error['g_1189'], error['g_1190'], error['g_1191'], error['g_1194'], error['g_1196'])
#瑞浪、飲食店やや少ない？

# 路線ID、産業指標、人口指標（人口総数・男性人口・世帯数）捨てる（どの説明変数が効いてるかチェック用）

In [None]:
stations = pd.read_csv('input/stations_tokai3ken_edited.csv', index_col=0)

In [None]:
X = stations.drop(['生徒学生数', '飲食店事業所数', '路線ID', '小売事業所数', '小売業年間商品販売額', '全産業事業所数', '買回り品(事業所数比率)', '最寄り品(事業所数比率)', '全産業従業者総数', '人口総数' , '男性人口' , '世帯数'], axis=1)
y = stations['飲食店事業所数']

In [None]:
X

In [None]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42)

In [None]:
# Move Tajimi, Toki, (Mizunami,) Ena and Nakatsugawa into the test set.
# NOTE(review): Mizunami is parenthesised in the original note and is not
# moved here — presumably the seeded split already put it in the test set; confirm.
tono_ids = ['g_1189', 'g_1190', 'g_1194', 'g_1196']

# Select by label list instead of rebuilding from individual row Series:
# this keeps each column's dtype and the target Series' name intact.
X_tono = X_train.loc[tono_ids]
y_tono = y_train.loc[tono_ids]

# Append the selected stations to the test partition ...
X_test = pd.concat([X_test, X_tono], axis=0)
y_test = pd.concat([y_test, y_tono], axis=0)

# ... and remove them from the training partition.
X_train = X_train.drop(tono_ids, axis=0)
y_train = y_train.drop(tono_ids, axis=0)

In [None]:
import lightgbm as lgb
from sklearn.model_selection import cross_val_score
model = lgb.LGBMRegressor(learning_rate=0.2, max_depth=5, n_estimators=1000)
score = cross_val_score(model, X_train, y_train, cv=5)

In [None]:
score

In [None]:
model.fit(X_train, y_train)

In [None]:
# Rank every feature by the fitted model's importances and chart them all,
# most important first.
plt.figure(figsize=(16, 10))
feature_importance = pd.Series(
    model.feature_importances_,
    index=X.columns,
    name='feature_importance',
).sort_values(ascending=False)
sns.barplot(
    x=feature_importance.index,
    y=feature_importance.values,
)

#LGBMのパラメータいじると、順番変わってくる...

In [None]:
predict = model.predict(X_test)

In [None]:
from sklearn.metrics import r2_score
r2_score(predict, y_test)

In [None]:
fig, ax = plt.subplots()
ax.scatter(y_test, predict, edgecolors=(0, 0, 0))
ax.plot([y_test.min(), y_test.max()], [y_test.min(), y_test.max()], 'k--', lw=1)
ax.set_xlabel('Measured')
ax.set_ylabel('Predicted')
plt.show()

In [None]:
fig, ax = plt.subplots()
ax.scatter(predict, y_test, edgecolors=(0, 0, 0))
ax.plot([predict.min(), predict.max()], [predict.min(), predict.max()], 'k--', lw=1)
ax.set_xlabel('Predicted')
ax.set_ylabel('Measured')
plt.show()

In [None]:
error = pd.Series(data=predict-y_test.values, index=y_test.index)

In [None]:
error

In [None]:
#多治見、土岐、瑞浪、恵那、中津川
print(error['g_1189'], error['g_1190'], error['g_1191'], error['g_1194'], error['g_1196'])
#瑞浪、飲食店の数ふつう？

In [None]:
print(y_test['g_1189'], y_test['g_1190'], y_test['g_1191'], y_test['g_1194'], y_test['g_1196'])

In [None]:
#全産業ー飲食店を加える？