In [124]:
import pandas as pd
from pandas import Series,DataFrame

import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.cluster import KMeans

In [125]:
# Load the eight input tables from the local csv/ directory.
# The names on the left are bound, in order, to the files listed below.
ar_df, asi_df, avd_df, hr_df, hsi_df, sir_df, sample_df, di_df = (
    pd.read_csv(csv_path)
    for csv_path in (
        'csv/air_reserve.csv',
        'csv/air_store_info.csv',
        'csv/air_visit_data.csv',
        'csv/hpg_reserve.csv',
        'csv/hpg_store_info.csv',
        'csv/store_id_relation.csv',
        'csv/sample_submission.csv',
        'csv/date_info.csv',
    )
)

In [None]:
# Preview the first rows of the table loaded from csv/air_reserve.csv.
ar_df.head()

In [None]:
# Preview the first rows of the table loaded from csv/air_store_info.csv.
asi_df.head()

In [None]:
# Preview the first rows of the table loaded from csv/air_visit_data.csv.
avd_df.head()

In [None]:
# Preview the first rows of the table loaded from csv/hpg_reserve.csv.
hr_df.head()

In [None]:
# Preview the first rows of the table loaded from csv/hpg_store_info.csv.
hsi_df.head()

In [None]:
# Preview the first rows of the table loaded from csv/store_id_relation.csv.
sir_df.head()

In [None]:
# Preview the first rows of the table loaded from csv/sample_submission.csv.
sample_df.head()

In [None]:
# Preview the first rows of the table loaded from csv/date_info.csv.
di_df.head()

In [None]:
# longitude-latitude cluster
# Scatter the coordinates of every store in hpg_store_info to eyeball how the
# stores group geographically (longitude on x, latitude on y).
xh = hsi_df['longitude']
yh = hsi_df['latitude']
fig, ax = plt.subplots()
ax.scatter(xh, yh)
# Title and axis labels let the figure stand on its own when the notebook is skimmed.
ax.set(title='hpg store locations', xlabel='longitude', ylabel='latitude')
plt.show()

In [None]:
# Scatter the coordinates of every store in air_store_info
# (longitude on x, latitude on y).
xa = asi_df['longitude']
ya = asi_df['latitude']
fig, ax = plt.subplots()
ax.scatter(xa, ya)
# Title and axis labels let the figure stand on its own when the notebook is skimmed.
ax.set(title='air store locations', xlabel='longitude', ylabel='latitude')
plt.show()

In [None]:
# location counting
# Split each air_area_name on its first two spaces into three parts and store
# them as province / city / street columns on asi_df.
# `n` is passed by keyword: positional use beyond `pat` is deprecated in pandas.
asi_location_df = asi_df['air_area_name'].str.split(' ', n=2, expand=True)
# Raises if the split did not yield exactly three columns, like the original constructor did.
asi_location_df.columns = ['province', 'city', 'street']
# Removing any earlier copies of the three columns first makes this cell safe to re-run.
asi_df = asi_df.drop(columns=['province', 'city', 'street'], errors='ignore').join(asi_location_df)
# air_area_name is intentionally kept: the "process area_name" cell further
# down reads it from the merged frame, so dropping it here would break a
# top-to-bottom run of the notebook.

In [None]:
# Split each hpg_area_name on its first two spaces into three parts and store
# them as province / city / street columns on hsi_df.
# `n` is passed by keyword: positional use beyond `pat` is deprecated in pandas.
hpg_location_df = hsi_df['hpg_area_name'].str.split(' ', n=2, expand=True)
# Raises if the split did not yield exactly three columns, like the original constructor did.
hpg_location_df.columns = ['province', 'city', 'street']
# Removing any earlier copies of the three columns first makes this cell safe to re-run.
hsi_df = hsi_df.drop(columns=['province', 'city', 'street'], errors='ignore').join(hpg_location_df)
# hpg_area_name is intentionally kept: the "process area_name" cell further
# down reads it from the merged frame, so dropping it here would break a
# top-to-bottom run of the notebook.

In [None]:
# Count the distinct province / city / street values that occur in either
# store table, then add the three counts together.
# The trailing numbers are the values noted by the original author.
province_count = pd.concat([asi_df['province'], hsi_df['province']]).nunique() # 13
city_count = pd.concat([asi_df['city'], hsi_df['city']]).nunique() # 85
street_count = pd.concat([asi_df['street'], hsi_df['street']]).nunique() # 191
location_total = sum((province_count, city_count, street_count)) # 289
print('location_total:', location_total)

In [131]:
# merge air_store_info & hpg_store_info dataset
# Two outer joins: air stores -> id relation table (on air_store_id), then
# -> hpg stores (on hpg_store_id), so stores from either side are retained.
# Every missing cell is then filled with 0; later cells test `!= 0` / `> 0`
# to detect "value present".
link_df = asi_df.merge(sir_df, how='outer', on='air_store_id')
rrvf_df = link_df.merge(hsi_df, how='outer', on='hpg_store_id').fillna(0)

In [132]:
# process latitude & longitude overlapping
# The merge left two copies of each coordinate (`_x` and `_y` suffixes).
# A copy counts as present when it is > 0; the sum of both copies divided by
# the number of present copies gives their mean when both exist and the single
# available value otherwise.
count_latitude_x = rrvf_df['latitude_x'].gt(0).astype(int)
count_latitude_y = rrvf_df['latitude_y'].gt(0).astype(int)
count_latitude = count_latitude_x.add(count_latitude_y)
mean_latitude = rrvf_df['latitude_x'].add(rrvf_df['latitude_y']).div(count_latitude)

count_longitude_x = rrvf_df['longitude_x'].gt(0).astype(int)
count_longitude_y = rrvf_df['longitude_y'].gt(0).astype(int)
count_longitude = count_longitude_x.add(count_longitude_y)
mean_longitude = rrvf_df['longitude_x'].add(rrvf_df['longitude_y']).div(count_longitude)

# The four suffixed columns are superseded by mean_latitude / mean_longitude.
rrvf_df = rrvf_df.drop(columns=['latitude_x', 'latitude_y', 'longitude_x', 'longitude_y'])

In [133]:
# Collect the averaged coordinates under their final column names and attach
# them to the merged table (rows are matched on the shared index).
ll_df = pd.concat(
    [mean_latitude.rename('latitude'), mean_longitude.rename('longitude')],
    axis=1,
)
rrvf_df = rrvf_df.join(ll_df)

In [None]:
# counting continuous holiday duration
# Build record_arr as a list of [first_date_of_run, run_length] pairs, one per
# unbroken run of rows whose holiday_flg equals 1, in the row order of di_df.
# The list starts empty instead of being seeded with a hard-coded
# ['2016-01-01', 0] entry, so a table that does not open on a holiday no longer
# yields a bogus zero-length run; a table that does open on a holiday gives the
# same result as before.
record_arr = []
recording = False  # True while the previous row belonged to a holiday run
for calendar_date, holiday_flg in zip(di_df['calendar_date'], di_df['holiday_flg']):
    # Compare by value: `is 1` tests object identity, which only happens to
    # work for cached small Python ints and fails for numpy integers.
    if holiday_flg == 1:
        if recording:
            record_arr[-1][1] += 1
        else:
            record_arr.append([calendar_date, 1])
            recording = True
    else:
        recording = False
record_arr

In [None]:
# cluster
# Partition the stores into 9 groups with k-means on their
# (latitude, longitude) pairs and colour the scatter plot by assigned cluster.
X = rrvf_df[['latitude', 'longitude']].values
# n_init is pinned explicitly: scikit-learn changed its default from 10 to
# 'auto' in 1.4, so leaving it out makes the clustering version-dependent.
kmeans = KMeans(n_clusters=9, n_init=10, random_state=42)
y_pred = kmeans.fit_predict(X)
fig, ax = plt.subplots()
# Column 1 of X is longitude (x axis), column 0 is latitude (y axis).
ax.scatter(X[:, 1], X[:, 0], c=y_pred)
ax.set(title='k-means clusters of store locations', xlabel='longitude', ylabel='latitude')
plt.show()

In [134]:
# construct hpg-air genre map
# For every hpg genre, pick the air genre it is most often paired with among
# the rows that carry both labels. genre_map ends up as an array whose rows are
# [hpg_genre, air_genre].
genre = rrvf_df[['air_genre_name', 'hpg_genre_name']]
# 0 is the fill value for a missing genre. Both conditions go into ONE mask;
# chaining genre[mask_a][mask_b] applies a full-length mask to an already
# filtered frame and makes pandas warn that the key will be reindexed.
has_both_genres = (genre['air_genre_name'] != 0) & (genre['hpg_genre_name'] != 0)
union_genre = genre[has_both_genres]
group_genre = union_genre.groupby(['hpg_genre_name','air_genre_name']).size()
genre_map = []
for hpg_genre in group_genre.index.levels[0]:
    # idxmax() returns the index label (the air genre name). argmax() returns
    # an integer position in current pandas, which would corrupt the map.
    target_air_genre = group_genre.loc[hpg_genre].idxmax()
    if hpg_genre == 'Italian':
        # Manual override kept from the original notebook.
        target_air_genre = 'Italian/French'
    genre_map.append([hpg_genre, target_air_genre])
genre_map = np.array(genre_map)

  This is separate from the ipykernel package so we can avoid doing imports until


In [135]:
# process genre
# Produce one genre per row of rrvf_df:
#   * the air genre when the row has one (value != 0);
#   * otherwise the hpg genre translated through genre_map, or the raw hpg
#     genre when genre_map has no entry for it;
#   * otherwise (neither genre present) the 0 placeholder.
# Emitting a value for EVERY row keeps `genre` the same length as rrvf_df;
# skipping rows would shift all later genres when the list is joined back by
# position.
# Look-ups go through a dict keyed on the hpg column only, so an hpg genre
# whose name also appears as an air genre in genre_map cannot match the wrong row.
hpg_to_air = {str(hpg_name): str(air_name) for hpg_name, air_name in genre_map}
air_genre = rrvf_df['air_genre_name']
hpg_genre = rrvf_df['hpg_genre_name']
mapped_hpg_genre = hpg_genre.map(lambda name: hpg_to_air.get(name, name))
# Keep the air genre where present, fall back to the mapped hpg genre elsewhere.
genre = air_genre.where(air_genre != 0, mapped_hpg_genre).tolist()

In [136]:
# Attach the unified genre as a single column (matched on the row index),
# retire the two per-platform genre columns it replaces, and preview the result.
genre_df = pd.DataFrame({'genre': genre})
rrvf_df = rrvf_df.join(genre_df).drop(columns=['air_genre_name', 'hpg_genre_name'])
rrvf_df.head()

Unnamed: 0,air_store_id,air_area_name,hpg_store_id,hpg_area_name,latitude,longitude,genre
0,air_0f0cdeee6c9bf3d7,Hyōgo-ken Kōbe-shi Kumoidōri,0,0,34.695124,135.197852,Italian/French
1,air_fee8dcf4d619598e,Hyōgo-ken Kōbe-shi Kumoidōri,0,0,34.695124,135.197852,Italian/French
2,air_a17f0778617c76e2,Hyōgo-ken Kōbe-shi Kumoidōri,0,0,34.695124,135.197852,Italian/French
3,air_83db5aff8f50478e,Tōkyō-to Minato-ku Shibakōen,0,0,35.658068,139.751599,Italian/French
4,air_99c3eae84130c1cb,Tōkyō-to Minato-ku Shibakōen,0,0,35.658068,139.751599,Italian/French


In [141]:
# process area_name
# Produce one area name per row of rrvf_df: the air area name when present
# (value != 0), otherwise the hpg area name. Rows with neither keep the 0
# placeholder so that `area_name` stays the same length as rrvf_df; skipping
# them would shift every later entry when the list is joined back by position.
air_area = rrvf_df['air_area_name']
hpg_area = rrvf_df['hpg_area_name']
merged_area = air_area.where(air_area != 0, hpg_area)
missing_area_count = int((merged_area == 0).sum())
if missing_area_count:
    # Replaces the bare '????' marker with a message that says what went wrong.
    print(f'{missing_area_count} row(s) have neither an air nor an hpg area name')
area_name = merged_area.tolist()

In [143]:
# Attach the unified area name as a single column (matched on the row index),
# retire the two per-platform area columns it replaces, and preview the result.
area_name_df = pd.DataFrame({'area_name': area_name})
rrvf_df = rrvf_df.join(area_name_df).drop(columns=['air_area_name', 'hpg_area_name'])
rrvf_df.head()

Unnamed: 0,air_store_id,hpg_store_id,latitude,longitude,genre,area_name
0,air_0f0cdeee6c9bf3d7,0,34.695124,135.197852,Italian/French,Hyōgo-ken Kōbe-shi Kumoidōri
1,air_fee8dcf4d619598e,0,34.695124,135.197852,Italian/French,Hyōgo-ken Kōbe-shi Kumoidōri
2,air_a17f0778617c76e2,0,34.695124,135.197852,Italian/French,Hyōgo-ken Kōbe-shi Kumoidōri
3,air_83db5aff8f50478e,0,35.658068,139.751599,Italian/French,Tōkyō-to Minato-ku Shibakōen
4,air_99c3eae84130c1cb,0,35.658068,139.751599,Italian/French,Tōkyō-to Minato-ku Shibakōen


In [145]:
# Load rrvf.csv from the working directory and preview its first rows.
# NOTE(review): presumably an earlier export of the merged store table - confirm.
# NOTE(review): the recorded preview shows two unnamed leading columns, which
# suggests the file was written with its index more than once - verify.
otherrrvf_df = pd.read_csv('rrvf.csv')
otherrrvf_df.head()

Unnamed: 0.1,Unnamed: 0,air_store_id,hpg_store_id,genre,area_name
0,0,air_0f0cdeee6c9bf3d7,0,Italian/French,Hyōgo-ken Kōbe-shi Kumoidōri
1,1,air_fee8dcf4d619598e,0,Italian/French,Hyōgo-ken Kōbe-shi Kumoidōri
2,2,air_a17f0778617c76e2,0,Italian/French,Hyōgo-ken Kōbe-shi Kumoidōri
3,3,air_83db5aff8f50478e,0,Italian/French,Tōkyō-to Minato-ku Shibakōen
4,4,air_99c3eae84130c1cb,0,Italian/French,Tōkyō-to Minato-ku Shibakōen
