In [164]:
import pandas as pd
import numpy as np
import os
from glob import glob
import matplotlib.pyplot as plt

In [165]:
# Load the prefecture lookup table and display it to see which prefectures the data covers.
prefecture_code_path = 'prefecture_code.csv'
prefectures = pd.read_csv(prefecture_code_path)
prefectures

Unnamed: 0,Code,JpName,EnName
0,1,北海道,Hokkaido
1,2,青森県,Aomori
2,3,岩手県,Iwate
3,4,宮城県,Miyagi
4,5,秋田県,Akita
5,6,山形県,Yamagata
6,7,福島県,Fukushima
7,8,茨城県,Ibaraki
8,9,栃木県,Tochigi
9,10,群馬県,Gunma


The trade_prices CSV files cover a total of 47 prefectures. Next, we open one of these files to check how many columns it has and which of them might be relevant.

In [166]:
# Open one of the trade_prices CSV files to inspect its columns before loading them all.
# low_memory=False makes pandas infer each column's dtype from the whole file at once
# instead of chunk by chunk, which avoids mixed-type inference on this wide file.
df = pd.read_csv('trade_prices/01.csv', low_memory=False)
df.head()

  exec(code_obj, self.user_global_ns, self.user_ns)


Unnamed: 0,No,Type,Region,MunicipalityCode,Prefecture,Municipality,DistrictName,NearestStation,TimeToNearestStation,MinTimeToNearestStation,...,Classification,Breadth,CityPlanning,CoverageRatio,FloorAreaRatio,Period,Year,Quarter,Renovation,Remarks
0,1,Residential Land(Land Only),Residential Area,1101,Hokkaido,"Chuo Ward,Sapporo City",Asahigaoka,Maruyamakoen,28,28.0,...,City Road,10.2,Category I Exclusively Low-story Residential Zone,40.0,80.0,4th quarter 2018,2018,4,,
1,2,Residential Land(Land Only),Residential Area,1101,Hokkaido,"Chuo Ward,Sapporo City",Asahigaoka,Maruyamakoen,27,27.0,...,City Road,7.2,Category I Exclusively Low-story Residential Zone,40.0,80.0,4th quarter 2018,2018,4,,
2,3,"Pre-owned Condominiums, etc.",,1101,Hokkaido,"Chuo Ward,Sapporo City",Asahigaoka,Maruyamakoen,20,20.0,...,,,Neighborhood Commercial Zone,80.0,200.0,4th quarter 2018,2018,4,Not yet,
3,4,Residential Land(Land Only),Residential Area,1101,Hokkaido,"Chuo Ward,Sapporo City",Asahigaoka,Maruyamakoen,26,26.0,...,Private Road,4.0,Category I Exclusively Low-story Residential Zone,40.0,80.0,2nd quarter 2018,2018,2,,
4,5,Residential Land(Land Only),Residential Area,1101,Hokkaido,"Chuo Ward,Sapporo City",Asahigaoka,Maruyamakoen,23,23.0,...,City Road,7.2,Category I Exclusively Low-story Residential Zone,40.0,80.0,2nd quarter 2018,2018,2,,


In [167]:
# Report the (rows, columns) shape followed by the column labels of the sample file.
print(df.shape, df.columns, sep='\n')

(186238, 38)
Index(['No', 'Type', 'Region', 'MunicipalityCode', 'Prefecture',
       'Municipality', 'DistrictName', 'NearestStation',
       'TimeToNearestStation', 'MinTimeToNearestStation',
       'MaxTimeToNearestStation', 'TradePrice', 'FloorPlan', 'Area',
       'AreaIsGreaterFlag', 'UnitPrice', 'PricePerTsubo', 'LandShape',
       'Frontage', 'FrontageIsGreaterFlag', 'TotalFloorArea',
       'TotalFloorAreaIsGreaterFlag', 'BuildingYear', 'PrewarBuilding',
       'Structure', 'Use', 'Purpose', 'Direction', 'Classification', 'Breadth',
       'CityPlanning', 'CoverageRatio', 'FloorAreaRatio', 'Period', 'Year',
       'Quarter', 'Renovation', 'Remarks'],
      dtype='object')


From this, the file contains 38 columns and 186,238 entries. Note also that the 'Prefecture' column holds the prefecture's name rather than its prefecture code. All of the trade_prices files are now loaded into a single pandas DataFrame.

In [None]:
# Explicit dtype for every column so that each file is parsed the same way.
# NOTE(review): pd.concat may fall back to object dtype for a categorical column whose
# categories differ between files — verify df.dtypes after loading if the category
# dtype matters downstream.
# NOTE(review): 'TotalFloorAreaIsGreaterFlag' is read as float while the other *Flag
# columns are read as bool — confirm this is intentional.
trade_price_dtypes = {
    'No': 'int64',  'Type': 'category', 'Region': 'category', 'MunicipalityCode': 'category', 'Prefecture': 'category',
    'Municipality': 'category', 'DistrictName': 'category', 'NearestStation': 'category', 'TimeToNearestStation': 'string',
    'MinTimeToNearestStation': 'string', 'MaxTimeToNearestStation': 'string', 'TradePrice': 'float', 'FloorPlan': 'category',
    'Area': 'float', 'AreaIsGreaterFlag': 'bool', 'UnitPrice': 'float', 'PricePerTsubo': 'float', 'LandShape': 'category',
    'Frontage': 'float', 'FrontageIsGreaterFlag': 'bool', 'TotalFloorArea': 'float', 'TotalFloorAreaIsGreaterFlag': 'float',
    'BuildingYear': 'float', 'PrewarBuilding': 'bool', 'Structure': 'category', 'Use': 'category', 'Purpose': 'category',
    'Direction': 'category', 'Classification': 'category', 'Breadth': 'float', 'CityPlanning': 'category',
    'CoverageRatio': 'float', 'FloorAreaRatio': 'float', 'Period': 'category', 'Year': 'int64', 'Quarter': 'int64',
    'Renovation': 'category', 'Remarks': 'string'
}

# Sort the paths: glob() returns files in arbitrary order, and with ignore_index=True
# the row order (and therefore the index) of the combined frame depends on it.
trade_price_files = sorted(glob('trade_prices/*.csv'))

# Read every file and stack the rows into a single DataFrame bound to df.
df = pd.concat(
    (pd.read_csv(file, dtype=trade_price_dtypes) for file in trade_price_files),
    ignore_index=True,
)

In [None]:
# Preview the first five rows of the DataFrame.
df.head(n=5)

Of the columns shown above, 'No' can be omitted from the dataset, as it serves no purpose in predicting real estate prices.

TimeToNearestStation, MinTimeToNearestStation and MaxTimeToNearestStation are related to one another: TimeToNearestStation is a string describing the range of time needed to reach the nearest station. This column is therefore also omitted.

As the 'Period' column only repeats the information already held in the 'Year' and 'Quarter' columns, it can also be removed.

The 'Remarks' column is rarely filled in, so it is also removed from the dataset.

In [None]:
# Remove the columns judged unnecessary for the analysis, then preview the result.
unused_columns = ['No', 'TimeToNearestStation', 'Period', 'Remarks']
df.drop(labels=unused_columns, axis=1, inplace=True)
df.head()

In [None]:
# Per-column count of missing values, followed by the DataFrame's shape.
missing_per_column = df.isna().sum()
print(missing_per_column)
print(df.shape)

Columns in which more than 1 million records are missing are omitted. These columns are:
- FloorPlan
- Structure
- Use
- Purpose
- Renovation
- Region
- UnitPrice
- PricePerTsubo
- LandShape
- Frontage
- TotalFloorArea
- BuildingYear
- Direction
- Classification
- Breadth

In [None]:
# Drop the columns selected for removal because of their missing values.
# Each label appears exactly once in the list.
sparse_columns = [
    'FloorPlan', 'Structure', 'Use', 'Purpose', 'Renovation', 'Region',
    'UnitPrice', 'PricePerTsubo', 'LandShape', 'Frontage', 'TotalFloorArea',
    'BuildingYear', 'Direction', 'Classification', 'Breadth',
]
df = df.drop(columns=sparse_columns)
df.head()

In [None]:
# Re-check the per-column missing-value counts and the shape of the DataFrame.
print(df.isna().sum(), df.shape, sep='\n')

Any remaining rows that contain null values are removed from the dataset.

In [None]:
# Keep only the rows with no missing value in any column, then print the
# per-column null counts and the new shape.
df = df.dropna(axis=0, how='any')
remaining_nulls = df.isna().sum()
print(remaining_nulls)
print(df.shape)

From this, approximately 3.07 million records remain to be used for training and testing the model.

The model would be attempting to predict the TradePrice column based on the values of the other columns.

In [None]:
# Pairwise correlation between columns; when two columns have a correlation value of
# more than 0.5, one of the two is omitted.
# Only numeric and boolean columns are passed to corr(): the frame also holds
# category/string columns, which recent pandas versions reject instead of silently skipping.
df.select_dtypes(include=['number', 'bool']).corr()

From the correlation table above, the column pairs
- Area : AreaIsGreaterFlag
- FloorAreaRatio : CoverageRatio

are moderately to highly correlated, so the AreaIsGreaterFlag and CoverageRatio columns are dropped from the dataset.

In [None]:
# Remove one column from each correlated pair, then preview the remaining columns.
correlated_columns = ['AreaIsGreaterFlag', 'CoverageRatio']
df.drop(labels=correlated_columns, axis=1, inplace=True)
df.head()

Now, as the data contains prices of real estate from 47 different prefectures, we want to know whether the average price differs between prefectures.

In [None]:
# Express TradePrice in millions of yen instead of yen.
# Note: the column is overwritten, so running this cell a second time divides again.
df['TradePrice'] = df['TradePrice'] / 1_000_000

In [None]:
# Mean trade price per prefecture, sorted from highest to lowest.
prefectureTP = (
    df.groupby('Prefecture', as_index=False)['TradePrice']
    .mean()
    .sort_values('TradePrice', ascending=False)
)

# Horizontal bar chart; the y-axis is inverted so the first (highest) row is drawn at the top.
fig, ax = plt.subplots(figsize=(10, 22))
ax.barh(prefectureTP['Prefecture'], prefectureTP['TradePrice'])
ax.invert_yaxis()
ax.margins(y=0.01)
ax.set_title('Average Prices per Prefecture', fontsize=20)
ax.set_xlabel('Million Yen', fontsize=15)
plt.yticks(fontsize=12)
plt.show()

From the horizontal bar graph above, it appears that one of the driving factors for the price of a property is its location. For further analysis, the municipalities within a prefecture are compared to see whether the municipality also affects the mean price.

In [None]:
# Number of distinct municipalities in the whole dataset, then within Tokyo only.
# dropna=False counts a missing value as its own entry, matching len(unique()).
tokyo_rows = df[df['Prefecture'] == 'Tokyo']
print(df['Municipality'].nunique(dropna=False))
print(tokyo_rows['Municipality'].nunique(dropna=False))

As the total number of municipalities per prefecture is too high to compare them all, a sample is taken instead: five prefectures are chosen, and within each one the property prices of its municipalities are compared with each other.

In [None]:
# Mean trade price per municipality within Tokyo, sorted from highest to lowest.
# Only 'TradePrice' is aggregated: averaging every column fails on the text/categorical
# columns in recent pandas versions. observed=True keeps only the municipalities that
# occur in the filtered rows when 'Municipality' is categorical.
MuniTP = (
    df.loc[df['Prefecture'] == 'Tokyo']
    .groupby('Municipality', as_index=False, observed=True)['TradePrice']
    .mean()
    .sort_values('TradePrice', ascending=False)
)

# Horizontal bar chart; the y-axis is inverted so the first (highest) row is drawn at the top.
fig, ax = plt.subplots(figsize=(10,22))
ax.barh(MuniTP['Municipality'], MuniTP['TradePrice'])
ax.invert_yaxis()
ax.margins(y=0.01)
ax.set_xlabel('Million Yen', fontsize = 15)
ax.set_title('Average Prices per Municipality in Tokyo', fontsize=20)
plt.yticks(fontsize=12)
plt.show()

In [None]:
# Mean trade price per municipality within Akita Prefecture, sorted from highest to lowest.
# Only 'TradePrice' is aggregated: averaging every column fails on the text/categorical
# columns in recent pandas versions. observed=True keeps only the municipalities that
# occur in the filtered rows when 'Municipality' is categorical.
MuniTP = (
    df.loc[df['Prefecture'] == 'Akita Prefecture']
    .groupby('Municipality', as_index=False, observed=True)['TradePrice']
    .mean()
    .sort_values('TradePrice', ascending=False)
)

# Horizontal bar chart; the y-axis is inverted so the first (highest) row is drawn at the top.
fig, ax = plt.subplots(figsize=(10,12))
ax.barh(MuniTP['Municipality'], MuniTP['TradePrice'])
ax.invert_yaxis()
ax.margins(y=0.01)
ax.set_xlabel('Million Yen', fontsize = 15)
ax.set_title('Average Prices per Municipality in Akita Prefecture', fontsize=20)
plt.yticks(fontsize=12)
plt.show()

In [None]:
# Mean trade price per municipality within Hokkaido, sorted from highest to lowest.
# Only 'TradePrice' is aggregated: averaging every column fails on the text/categorical
# columns in recent pandas versions. observed=True keeps only the municipalities that
# occur in the filtered rows when 'Municipality' is categorical.
MuniTP = (
    df.loc[df['Prefecture'] == 'Hokkaido']
    .groupby('Municipality', as_index=False, observed=True)['TradePrice']
    .mean()
    .sort_values('TradePrice', ascending=False)
)

# Horizontal bar chart; the y-axis is inverted so the first (highest) row is drawn at the top.
fig, ax = plt.subplots(figsize=(10,30))
ax.barh(MuniTP['Municipality'], MuniTP['TradePrice'])
ax.invert_yaxis()
ax.margins(y=0.01)
ax.set_xlabel('Million Yen', fontsize = 15)
ax.set_title('Average Prices per Municipality in Hokkaido', fontsize=20)
plt.yticks(fontsize=12)
plt.show()

In [None]:
# Mean trade price per municipality within Fukushima Prefecture, sorted from highest to lowest.
# Only 'TradePrice' is aggregated: averaging every column fails on the text/categorical
# columns in recent pandas versions. observed=True keeps only the municipalities that
# occur in the filtered rows when 'Municipality' is categorical.
MuniTP = (
    df.loc[df['Prefecture'] == 'Fukushima Prefecture']
    .groupby('Municipality', as_index=False, observed=True)['TradePrice']
    .mean()
    .sort_values('TradePrice', ascending=False)
)

# Horizontal bar chart; the y-axis is inverted so the first (highest) row is drawn at the top.
fig, ax = plt.subplots(figsize=(10,30))
ax.barh(MuniTP['Municipality'], MuniTP['TradePrice'])
ax.invert_yaxis()
ax.margins(y=0.01)
ax.set_xlabel('Million Yen', fontsize = 15)
ax.set_title('Average Prices per Municipality in Fukushima Prefecture', fontsize=20)
plt.yticks(fontsize=12)
plt.show()

In [None]:
# Mean trade price per municipality within Hiroshima Prefecture, sorted from highest to lowest.
# Only 'TradePrice' is aggregated: averaging every column fails on the text/categorical
# columns in recent pandas versions. observed=True keeps only the municipalities that
# occur in the filtered rows when 'Municipality' is categorical.
MuniTP = (
    df.loc[df['Prefecture'] == 'Hiroshima Prefecture']
    .groupby('Municipality', as_index=False, observed=True)['TradePrice']
    .mean()
    .sort_values('TradePrice', ascending=False)
)

# Horizontal bar chart; the y-axis is inverted so the first (highest) row is drawn at the top.
fig, ax = plt.subplots(figsize=(10,15))
ax.barh(MuniTP['Municipality'], MuniTP['TradePrice'])
ax.invert_yaxis()
ax.margins(y=0.01)
ax.set_xlabel('Million Yen', fontsize = 15)
ax.set_title('Average Prices per Municipality in Hiroshima Prefecture', fontsize=20)
plt.yticks(fontsize=12)
plt.show()