In [None]:
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from scipy import stats

In [None]:
# Load the stations CSV; the first column of the file becomes the DataFrame index.
X = pd.read_csv('input/stations_tokai3ken_edited.csv', index_col=0)

In [None]:
# Set aside the route ID and the regression target before trimming the features.
rosen_ID = X['路線ID']
y = X['飲食店事業所数']

# Columns removed from the feature matrix: the target itself, the route ID and
# the other columns listed here.
excluded_columns = [
    '小売事業所数', '小売業年間商品販売額', '全産業事業所数', '生徒学生数', '飲食店事業所数',
    '買回り品(事業所数比率)', '最寄り品(事業所数比率)', '全産業従業者総数', '路線ID',
]
X = X.drop(columns=excluded_columns)

# Persist the trimmed features and the target.
X.to_csv('./variables/simple.csv')
y.to_csv('./variables/target.csv')

In [None]:
# Shift both count columns up by one. They are the denominators of the ratio
# features computed from X -- presumably this is to avoid dividing by zero.
for count_column in ('人口総数', '世帯数'):
    X[count_column] = X[count_column] + 1

In [None]:
# Ratio features: new column name -> (numerator column, denominator column).
ratio_specs = {
    '乗降客数（日）/人口総数': ('乗降客数（日）', '人口総数'),
    '男性人口/人口総数': ('男性人口', '人口総数'),
    '0～14歳人口/人口総数': ('0～14歳人口', '人口総数'),
    '15～64歳人口/人口総数': ('15～64歳人口', '人口総数'),
    '65歳以上人口/人口総数': ('65歳以上人口', '人口総数'),
    '昼間人口/人口総数': ('昼間人口', '人口総数'),
    '人口総数/世帯数': ('人口総数', '世帯数'),
    '1人世帯数/世帯数': ('1人世帯数', '世帯数'),
}

# Columns are appended in the order listed above.
for ratio_name, (numerator, denominator) in ratio_specs.items():
    X[ratio_name] = X[numerator] / X[denominator]

# Persist the feature matrix including the ratio columns.
X.to_csv('./variables/devision.csv')

In [None]:
# Assemble three selected features plus the target for plotting
plot_data = pd.concat([X['昼間人口'], X['乗降客数（日）'], X['1人世帯数'], y], axis=1)

# Create the pairgrid object.
# `height` is the current name of the per-facet size argument; the old `size`
# keyword was renamed in seaborn 0.9 and is rejected by recent releases.
grid = sns.PairGrid(data=plot_data, height=3, diag_sharey=False)

# Upper triangle: scatter plots
grid.map_upper(plt.scatter, alpha=0.2)

# Diagonal: univariate kernel density estimates (not histograms)
grid.map_diag(sns.kdeplot)

# Lower triangle: bivariate density plots
grid.map_lower(sns.kdeplot, cmap=plt.cm.OrRd_r)

In [None]:
# Assemble every feature column plus the target for plotting
plot_data = pd.concat([X, y], axis=1)

# Create the pairgrid object.
# `height` is the current name of the per-facet size argument; the old `size`
# keyword was renamed in seaborn 0.9 and is rejected by recent releases.
grid = sns.PairGrid(data=plot_data, height=3, diag_sharey=False)

# Upper triangle: scatter plots
grid.map_upper(plt.scatter, alpha=0.2)

# Diagonal: univariate kernel density estimates (not histograms)
grid.map_diag(sns.kdeplot)

# Lower triangle: bivariate density plots
grid.map_lower(sns.kdeplot, cmap=plt.cm.OrRd_r)

In [None]:
# Correlation heatmap of all features together with the target ('飲食店事業所数').
corrmat = pd.concat([X, y], axis=1).corr()
plt.subplots(figsize=(12,9))
sns.heatmap(corrmat, square=True, cmap='RdBu_r')
# Alternative palette that was tried: cmap='Blues'

In [None]:
from sklearn.preprocessing import PolynomialFeatures

# Expand X with every degree-2 polynomial and interaction term.
poly_transformer = PolynomialFeatures(degree=2)
poly = poly_transformer.fit_transform(X)

# `get_feature_names` was deprecated in scikit-learn 1.0 and removed in 1.2.
# Prefer `get_feature_names_out` and fall back only on older releases.
if hasattr(poly_transformer, 'get_feature_names_out'):
    poly_columns = poly_transformer.get_feature_names_out(X.columns)
else:
    poly_columns = poly_transformer.get_feature_names(X.columns)

# The rebuilt frame gets a default 0..n-1 index; the index loaded from the CSV
# is not carried over.
X = pd.DataFrame(poly, columns=poly_columns)
# Drop the constant bias term, which PolynomialFeatures names '1'.
X = X.drop('1', axis=1)
del poly

X.to_csv('./variables/dev_polynomial.csv')

In [None]:
from sklearn.random_projection import GaussianRandomProjection
from sklearn.random_projection import SparseRandomProjection
from sklearn.decomposition import PCA, FastICA
from sklearn.decomposition import TruncatedSVD

In [None]:
n_comp = 12

# Fit the five reducers on the current feature matrix, each with a fixed seed.
# Truncated SVD
tsvd = TruncatedSVD(n_components=n_comp, random_state=123)
tsvd_results = tsvd.fit_transform(X)

# Principal component analysis
pca = PCA(n_components=n_comp, random_state=123)
pca_results = pca.fit_transform(X)

# Independent component analysis
ica = FastICA(n_components=n_comp, random_state=123)
ica_results = ica.fit_transform(X)

# Gaussian random projection
grp = GaussianRandomProjection(n_components=n_comp, eps=0.1, random_state=123)
grp_results = grp.fit_transform(X)

# Sparse random projection
srp = SparseRandomProjection(n_components=n_comp, dense_output=True, random_state=123)
srp_results = srp.fit_transform(X)

# Append the components to X, interleaved per component number:
# pca_1, ica_1, tsvd_1, grp_1, srp_1, pca_2, ...
component_sets = (
    ('pca_', pca_results),
    ('ica_', ica_results),
    ('tsvd_', tsvd_results),
    ('grp_', grp_results),
    ('srp_', srp_results),
)
for comp_idx in range(n_comp):
    for prefix, components in component_sets:
        X[prefix + str(comp_idx + 1)] = components[:, comp_idx]

X.to_csv('./variables/dev_poly_dimreduction.csv')

In [None]:
# Display the feature matrix with the decomposition components appended.
X

In [None]:
# Give y a 0..n-1 index. X was rebuilt from a plain array earlier and therefore
# has a default positional index; pd.concat(axis=1) aligns on index labels, so
# y needs the same index to line up row by row.
y.index = range(len(y))

In [None]:
# Pairwise correlations between every feature column and the target.
corrmat = pd.concat([X, y], axis=1).corr()

In [None]:
# First ten entries of the target's correlations sorted ascending (the lowest values).
corrmat['飲食店事業所数'].sort_values().head(10)

In [None]:
# Last ten entries of the target's correlations sorted ascending (the highest
# values; the target's correlation with itself is part of this series).
corrmat['飲食店事業所数'].sort_values().tail(10)

# 特徴量選択

In [None]:
from sklearn.model_selection import train_test_split

# Hold out a test split with a fixed seed; the split proportions are the library defaults.
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=123)

In [None]:
# NOTE(review): cross_val_score is imported but not used in the cells visible here.
from sklearn.model_selection import cross_val_score
from lightgbm import LGBMRegressor

# LightGBM regressor with default hyper-parameters and a fixed seed.
lgb = LGBMRegressor(random_state=1234)

In [None]:
# Fit the regressor on the training split.
lgb.fit(X_train, y_train)

In [None]:
 def plot_feature_importances(df, n_feat):
     """Draw a horizontal bar chart of the most important features.

     Args:
         df: DataFrame with the columns 'feature', 'importance' and
             'importance_normalized'.
         n_feat: number of top-ranked features to show.

     The rows are ranked by 'importance' (descending) and the bar lengths are
     taken from 'importance_normalized'. The figure is shown with plt.show();
     nothing is returned and the caller's frame is not modified.
     """
     # Rank by importance and keep the rows that will be drawn.
     ranked = df.sort_values('importance', ascending=False).reset_index(drop=True)
     top = ranked.head(n_feat)

     # Bar positions run from high to low so the most important feature ends
     # up at the top of the chart.
     positions = list(reversed(range(len(top))))

     # The height scales with the number of features but must stay positive:
     # n_feat // 3 is 0 for fewer than three features, which matplotlib rejects.
     fig_height = max(1, n_feat // 3)
     fig, ax = plt.subplots(figsize=(10, fig_height))

     ax.barh(positions, top['importance_normalized'],
             align='center', edgecolor='k')

     # Label each bar with its feature name.
     ax.set_yticks(positions)
     ax.set_yticklabels(top['feature'])

     ax.set_xlabel('Normalized Importance')
     ax.set_title('Feature Importances')
     plt.show()
    

n_feat = len(X_test.columns)

# Build the table column-wise so 'importance' keeps its numeric dtype.
# Transposing a two-row frame of names and numbers makes every column object dtype.
df = pd.DataFrame({'feature': list(X_test.columns),
                   'importance': lgb.feature_importances_})

# Share of the total importance carried by each feature.
df['importance_normalized'] = df['importance'] / df['importance'].sum()
plot_feature_importances(df, n_feat)

In [None]:
# Predict the target for the held-out split.
pred = lgb.predict(X_test)

In [None]:
# Root-mean-squared error of the predictions on the held-out split.
np.sqrt(np.mean((y_test.values-pred)**2))

In [None]:
from sklearn.metrics import r2_score
# Coefficient of determination (R^2) on the held-out split.
r2_score(y_test, pred)

In [None]:
# Predicted-vs-measured scatter with a dashed y = x reference line that spans
# the range of the measured values.
measured_range = [y_test.min(), y_test.max()]

fig, ax = plt.subplots()
ax.scatter(pred, y_test, alpha=0.5)
ax.plot(measured_range, measured_range, 'k--', lw=1)
ax.set_xlabel('Predicted')
ax.set_ylabel('Measured')
plt.show()

In [None]:
def plot_actual_predicted(actual, predicted):
    """Plot actual and predicted values in ascending order of the actual value.

    Args:
        actual: 1-D sequence (list, array or Series) of observed values.
        predicted: 1-D sequence of predictions in the same order as ``actual``.

    The pairs are matched by position, sorted by the actual value and drawn as
    two scatter series over the rank: predictions in green, actuals in blue.
    The figure is shown with plt.show(); nothing is returned.
    """
    # np.asarray drops any pandas index so the inputs are paired by position.
    # Building the frame from two Series would align them on index labels
    # instead, which can mismatch pairs or introduce NaN rows.
    tmp = pd.DataFrame({'actual': np.asarray(actual),
                        'predicted': np.asarray(predicted)}).sort_values(['actual'])
    ranks = range(tmp.shape[0])
    plt.scatter(ranks, tmp['predicted'], color='green', label='predicted')
    plt.scatter(ranks, tmp['actual'], color='blue', label='actual')
    plt.xlabel('Rank by actual value')
    plt.legend()
    plt.show()

# Compare the held-out targets with the model's predictions.
plot_actual_predicted(y_test, pred)

In [None]:
# Keep only the features whose normalized importance exceeds 0.005.
selected_features = df.loc[df['importance_normalized'] > 0.005, 'feature'].tolist()

# .copy() makes X an independent frame, so adding columns to it afterwards
# cannot trigger pandas' SettingWithCopyWarning.
X = X[selected_features].copy()

X.to_csv('./variables/dev_poly_dim_featureimportance.csv')

In [None]:
# Attach the route ID by position (.values bypasses index alignment).
# NOTE(review): this assumes X still has the same row order as the loaded CSV -- confirm.
X['rosenID'] = rosen_ID.values

In [None]:
# One-hot encode the route ID explicitly. Without `columns=`, get_dummies only
# converts object/category columns, so a route ID parsed as a number would be
# left untouched and treated as an ordinary numeric feature.
X = pd.get_dummies(X, columns=['rosenID'])

X.to_csv('./variables/dev_poly_dim_imp_rosenID.csv')

In [None]:
from sklearn.model_selection import train_test_split

# Split the reduced, one-hot encoded features again with the same seed as before.
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=123)

In [None]:
# NOTE(review): cross_val_score is imported but not used in the cells visible here.
from sklearn.model_selection import cross_val_score
from lightgbm import LGBMRegressor

# Fresh LightGBM regressor (default hyper-parameters, fixed seed); rebinds `lgb`.
lgb = LGBMRegressor(random_state=1234)

In [None]:
# Fit the regressor on the new training split.
lgb.fit(X_train, y_train)

In [None]:
# Predict the target for the new held-out split.
pred = lgb.predict(X_test)

In [None]:
# Root-mean-squared error of the predictions on the held-out split.
np.sqrt(np.mean((y_test.values-pred)**2))

In [None]:
from sklearn.metrics import r2_score
# Coefficient of determination (R^2) on the held-out split.
r2_score(y_test, pred)

In [None]:
# Predicted-vs-measured scatter with a dashed y = x reference line that spans
# the range of the measured values.
measured_range = [y_test.min(), y_test.max()]

fig, ax = plt.subplots()
ax.scatter(pred, y_test, alpha=0.5)
ax.plot(measured_range, measured_range, 'k--', lw=1)
ax.set_xlabel('Predicted')
ax.set_ylabel('Measured')
plt.show()

In [None]:
# NOTE(review): this re-declares a helper of the same name from an earlier
# cell; the later definition wins, so one of the two can be removed.
def plot_actual_predicted(actual, predicted):
    """Plot actual and predicted values in ascending order of the actual value.

    Args:
        actual: 1-D sequence (list, array or Series) of observed values.
        predicted: 1-D sequence of predictions in the same order as ``actual``.

    The pairs are matched by position, sorted by the actual value and drawn as
    two scatter series over the rank: predictions in green, actuals in blue.
    The figure is shown with plt.show(); nothing is returned.
    """
    # np.asarray drops any pandas index so the inputs are paired by position.
    # Building the frame from two Series would align them on index labels
    # instead, which can mismatch pairs or introduce NaN rows.
    tmp = pd.DataFrame({'actual': np.asarray(actual),
                        'predicted': np.asarray(predicted)}).sort_values(['actual'])
    ranks = range(tmp.shape[0])
    plt.scatter(ranks, tmp['predicted'], color='green', label='predicted')
    plt.scatter(ranks, tmp['actual'], color='blue', label='actual')
    plt.xlabel('Rank by actual value')
    plt.legend()
    plt.show()

# Compare the held-out targets with the second model's predictions.
plot_actual_predicted(y_test, pred)

# 乗降客数（日）

In [None]:
# Distribution of the '乗降客数（日）' column.
# NOTE(review): X was narrowed by the feature-importance filter in an earlier
# cell; this lookup raises KeyError if that column was not kept -- confirm.
sns.distplot(X['乗降客数（日）'])

In [None]:
# Normal probability (Q-Q) plot of the '乗降客数（日）' column.
# NOTE(review): X was narrowed by the feature-importance filter in an earlier
# cell; this lookup raises KeyError if that column was not kept -- confirm.
stats.probplot(X['乗降客数（日）'], dist="norm", plot=plt)
plt.show()

In [None]:
# Distribution of log(1 + x) of the '乗降客数（日）' column.
# NOTE(review): X was narrowed by the feature-importance filter in an earlier
# cell; this lookup raises KeyError if that column was not kept -- confirm.
sns.distplot(np.log1p(X['乗降客数（日）']))

In [None]:
# Normal probability (Q-Q) plot of log(1 + x) of the '乗降客数（日）' column.
# NOTE(review): X was narrowed by the feature-importance filter in an earlier
# cell; this lookup raises KeyError if that column was not kept -- confirm.
stats.probplot(np.log1p(X['乗降客数（日）']), dist="norm", plot=plt)
plt.show()

In [None]:
# Target versus '乗降客数（日）'. The target column was dropped from X when y was
# split off, so it is read from y; X["飲食店事業所数"] raises KeyError.
# NOTE(review): the feature lookup still depends on that column having
# survived the earlier feature-importance filter -- confirm.
plt.figure(figsize=(6, 4))
plt.plot(X["乗降客数（日）"], y, 'o')

In [None]:
# log(1 + target) versus log(1 + '乗降客数（日）'). The target column was dropped
# from X when y was split off, so it is read from y; X["飲食店事業所数"] raises KeyError.
# NOTE(review): the feature lookup still depends on that column having
# survived the earlier feature-importance filter -- confirm.
plt.figure(figsize=(6, 4))
plt.plot(np.log1p(X["乗降客数（日）"]), np.log1p(y), 'o')

# 人口総数

In [None]:
# Distribution of the '人口総数' column.
# NOTE(review): X was narrowed by the feature-importance filter in an earlier
# cell; this lookup raises KeyError if that column was not kept -- confirm.
sns.distplot(X['人口総数'])

In [None]:
# Normal probability (Q-Q) plot of the '人口総数' column.
# NOTE(review): X was narrowed by the feature-importance filter in an earlier
# cell; this lookup raises KeyError if that column was not kept -- confirm.
stats.probplot(X['人口総数'], dist="norm", plot=plt)
plt.show()

In [None]:
# Distribution of log(1 + x) of the '人口総数' column.
# NOTE(review): X was narrowed by the feature-importance filter in an earlier
# cell; this lookup raises KeyError if that column was not kept -- confirm.
sns.distplot(np.log1p(X['人口総数']))

In [None]:
# Normal probability (Q-Q) plot of log(1 + x) of the '人口総数' column.
# NOTE(review): X was narrowed by the feature-importance filter in an earlier
# cell; this lookup raises KeyError if that column was not kept -- confirm.
stats.probplot(np.log1p(X['人口総数']), dist="norm", plot=plt)
plt.show()

In [None]:
# Target versus '人口総数'. The target column was dropped from X when y was
# split off, so it is read from y; X["飲食店事業所数"] raises KeyError.
# NOTE(review): the feature lookup still depends on that column having
# survived the earlier feature-importance filter -- confirm.
plt.figure(figsize=(6, 4))
plt.plot(X["人口総数"], y, 'o')

In [None]:
# log(1 + target) versus log(1 + '人口総数'). The target column was dropped
# from X when y was split off, so it is read from y; X["飲食店事業所数"] raises KeyError.
# NOTE(review): the feature lookup still depends on that column having
# survived the earlier feature-importance filter -- confirm.
plt.figure(figsize=(6, 4))
plt.plot(np.log1p(X["人口総数"]), np.log1p(y), 'o')

# 昼間人口

In [None]:
# Distribution of the '昼間人口' column.
# NOTE(review): X was narrowed by the feature-importance filter in an earlier
# cell; this lookup raises KeyError if that column was not kept -- confirm.
sns.distplot(X['昼間人口'])

In [None]:
# Normal probability (Q-Q) plot of the '昼間人口' column.
# NOTE(review): X was narrowed by the feature-importance filter in an earlier
# cell; this lookup raises KeyError if that column was not kept -- confirm.
stats.probplot(X['昼間人口'], dist="norm", plot=plt)
plt.show()

In [None]:
# Distribution of log(1 + x) of the '昼間人口' column.
# NOTE(review): X was narrowed by the feature-importance filter in an earlier
# cell; this lookup raises KeyError if that column was not kept -- confirm.
sns.distplot(np.log1p(X['昼間人口']))

In [None]:
# Normal probability (Q-Q) plot of log(1 + x) of the '昼間人口' column.
# NOTE(review): X was narrowed by the feature-importance filter in an earlier
# cell; this lookup raises KeyError if that column was not kept -- confirm.
stats.probplot(np.log1p(X['昼間人口']), dist="norm", plot=plt)
plt.show()

In [None]:
# Target versus '昼間人口'. The target column was dropped from X when y was
# split off, so it is read from y; X["飲食店事業所数"] raises KeyError.
# NOTE(review): the feature lookup still depends on that column having
# survived the earlier feature-importance filter -- confirm.
plt.figure(figsize=(6, 4))
plt.plot(X["昼間人口"], y, 'o')

In [None]:
# log(1 + target) versus log(1 + '昼間人口'). The target column was dropped
# from X when y was split off, so it is read from y; X["飲食店事業所数"] raises KeyError.
# NOTE(review): the feature lookup still depends on that column having
# survived the earlier feature-importance filter -- confirm.
plt.figure(figsize=(6, 4))
plt.plot(np.log1p(X["昼間人口"]), np.log1p(y), 'o')

# 1人世帯数

In [None]:
from scipy import stats

In [None]:
# Distribution of the '1人世帯数' column.
# NOTE(review): X was narrowed by the feature-importance filter in an earlier
# cell; this lookup raises KeyError if that column was not kept -- confirm.
sns.distplot(X['1人世帯数'])

In [None]:
# Normal probability (Q-Q) plot of the '1人世帯数' column.
# NOTE(review): X was narrowed by the feature-importance filter in an earlier
# cell; this lookup raises KeyError if that column was not kept -- confirm.
stats.probplot(X['1人世帯数'], dist="norm", plot=plt)
plt.show()

In [None]:
# Distribution of log(1 + x) of the '1人世帯数' column.
# NOTE(review): X was narrowed by the feature-importance filter in an earlier
# cell; this lookup raises KeyError if that column was not kept -- confirm.
sns.distplot(np.log1p(X['1人世帯数']))

In [None]:
# Normal probability (Q-Q) plot of log(1 + x) of the '1人世帯数' column.
# NOTE(review): X was narrowed by the feature-importance filter in an earlier
# cell; this lookup raises KeyError if that column was not kept -- confirm.
stats.probplot(np.log1p(X['1人世帯数']), dist="norm", plot=plt)
plt.show()

In [None]:
# Target versus '1人世帯数'. The target column was dropped from X when y was
# split off, so it is read from y; X["飲食店事業所数"] raises KeyError.
# NOTE(review): the feature lookup still depends on that column having
# survived the earlier feature-importance filter -- confirm.
plt.figure(figsize=(6, 4))
plt.plot(X["1人世帯数"], y, 'o')

In [None]:
# log(1 + target) versus log(1 + '1人世帯数'). The target column was dropped
# from X when y was split off, so it is read from y; X["飲食店事業所数"] raises KeyError.
# NOTE(review): the feature lookup still depends on that column having
# survived the earlier feature-importance filter -- confirm.
plt.figure(figsize=(6, 4))
plt.plot(np.log1p(X["1人世帯数"]), np.log1p(y), 'o')

# 飲食店事業所数

In [None]:
# Distribution of the target. It lives in y: the '飲食店事業所数' column was
# dropped from X when the target was split off, so X['飲食店事業所数'] raises KeyError.
sns.distplot(y)

In [None]:
# Normal probability (Q-Q) plot of the target. It lives in y: the '飲食店事業所数'
# column was dropped from X when the target was split off.
stats.probplot(y, dist="norm", plot=plt)
plt.show()

In [None]:
# Distribution of log(1 + target). The target lives in y: the '飲食店事業所数'
# column was dropped from X when the target was split off.
sns.distplot(np.log1p(y))

In [None]:
# Normal probability (Q-Q) plot of log(1 + target). The target lives in y: the
# '飲食店事業所数' column was dropped from X when the target was split off.
stats.probplot(np.log1p(y), dist="norm", plot=plt)
plt.show()