In [182]:
import pandas as pd

In [183]:
import glob

In [184]:
source_csv = glob.glob('../lvrland_crawler/lvr_src/*/*/*/*/*_lvr_land_*.csv')
print(len(source_csv))

108


In [202]:
h0 = []
h1 = []
for s in source_csv:
    sp = s.split('/')
    year, season, city, trade_type = sp[3], sp[4], sp[5], sp[6]
    new_column = f'{year}_{season}_{city}_{trade_type}'
    temp_df_h0 = pd.read_csv(s)
    temp_df_h1 = pd.read_csv(s, header=1)

    temp_df_h0['df_name'] = new_column
    temp_df_h1['df_name'] = new_column
    
    h0.append(temp_df_h0)
    h1.append(temp_df_h1)

In [186]:
# dataframe
df_source = pd.concat(h0, axis=0,ignore_index=True)
df_lvrland = pd.concat(h1, axis=0,ignore_index=True)

In [187]:
# 主要用途為住家用
is_self_residence = df_lvrland['main use']=='住家用'
is_self_residence

0          True
1         False
2          True
3          True
4          True
          ...  
626536     True
626537    False
626538    False
626539    False
626540    False
Name: main use, Length: 626541, dtype: bool

In [188]:
# 建物型態為住宅大樓
is_self_residence_building = df_lvrland['building state'].fillna('0').str.startswith('住宅大樓')
is_self_residence_building

0         False
1         False
2         False
3         False
4         False
          ...  
626536     True
626537    False
626538    False
626539    False
626540    False
Name: building state, Length: 626541, dtype: bool

In [189]:
import cn2an

In [190]:
# 樓層資料包含有中文數字+層數, 數字, 地下層，將其轉為integer
def convert_floor2int(x):
    if type(x) == str and not x.isdigit():
        if x != '地下層':
            x = cn2an.cn2an(x.strip("層"))
            return x
    return 0
        

In [191]:
# 總樓層數大於等於十三層
is_gte_floor_13 = df_lvrland['total floor number'].fillna(0).map(convert_floor2int) >= 13
is_gte_floor_13

0         False
1         False
2         False
3         False
4         False
          ...  
626536    False
626537    False
626538    False
626539    False
626540    False
Name: total floor number, Length: 626541, dtype: bool

In [237]:
# 3 filter dataframe
df_residence_gte_13 = df_lvrland[(is_self_residence & is_self_residence_building & is_gte_floor_13)]

In [281]:
# filter.csv

# import numpy as np
# output = np.vstack([df_residence_gte_13.columns.values, df_residence_gte_13.values])
# pd.DataFrame(output, columns=df_source.columns).to_csv('filter.csv', index=0)
pd.DataFrame(df_residence_gte_13).to_csv('filter.csv', index=0)

In [194]:
# 總件數
total_case = df_residence_gte_13.shape[0]
total_case

114033

In [195]:
# 總車位數(透過交易筆棟數)
total_berth = df_residence_gte_13['transaction pen number'].map(lambda x: int(x.split('車位')[-1])).sum()
total_berth

92471

In [212]:
# 平均總價元
avg_total_price = df_residence_gte_13['total price NTD'].mean(0).round()
avg_total_price

16940655.0

In [215]:
# 平均車位總價元(有車位才算，價格大於0)
avg_total_berth_price = df_residence_gte_13[df_residence_gte_13['the berth total price NTD']> 0]['the berth total price NTD'].mean().round()
avg_total_berth_price

2312811.0

In [223]:
# count.csv
data = {
    '總件數': [total_case],
    '總車位數': [total_berth],
    '平均總價元': [avg_total_price],
    '平均車位總價元': [avg_total_berth_price]
}

pd.DataFrame(data, columns=['總件數','總車位數','平均總價元', '平均車位總價元']).to_csv('count.csv', index=0)

In [285]:
from elasticsearch import Elasticsearch, helpers
es = Elasticsearch(
    ['localhost'],
    port=9200

)

In [288]:
mapping = {
  "settings": {
    "number_of_shards": 1
  },
  "mappings": {
    "properties": {
      "Building present situation pattern - room": {
        "type": "long"
      },
      "The villages and towns urban district": {
        "type": "keyword"
      },
      "Whether there is manages the organization": {
        "type": "keyword"
      },
      "berth shifting total area square meter": {
        "type": "double"
      },
      "building present situation pattern - compartmented": {
        "type": "keyword"
      },
      "building present situation pattern - hall": {
        "type": "long"
      },
      "building present situation pattern - health": {
        "type": "long"
      },
      "building shifting total area": {
        "type": "double"
      },
      "building state": {
        "type": "keyword"
      },
      "construction to complete the years": {
        "type": "double"
      },
      "df_name": {
        "type": "keyword"
      },
      "land sector position building sector house number plate": {
        "type": "keyword"
      },
      "land shifting total area square meter": {
        "type": "double"
      },
      "main building materials": {
        "type": "keyword"
      },
      "main use": {
        "type": "keyword"
      },
      "serial number": {
        "type": "keyword"
      },
      "shifting level": {
        "type": "keyword"
      },
      "the berth category": {
        "type": "keyword"
      },
      "the berth total price NTD": {
        "type": "long"
      },
      "the note": {
        "type": "text"
      },
      "the unit price (NTD / square meter)": {
        "type": "double"
      },
      "the use zoning or compiles and checks": {
        "type": "keyword"
      },
      "total floor number": {
        "type": "keyword"
      },
      "total price NTD": {
        "type": "long"
      },
      "transaction pen number": {
        "type": "keyword"
      },
      "transaction sign": {
        "type": "keyword"
      },
      "transaction year month and day": {
        "type": "long"
      }
    }
  }
}

In [None]:
# create index
response = es.indices.create(
    index="csv_data",
    body=mapping,
    ignore=400 # ignore 400 already exists code
)


In [None]:
# 傳送至 Elasticsearch
import csv
with open('filter.csv') as f:
    reader = csv.DictReader(f)
    helpers.bulk(es, reader, index='csv_data')