## Harga Harta Kediaman

single

In [212]:
import pandas as pd
from io import BytesIO
import requests
import numpy as np

# Sample URLs
urls = [
    "https://napic2.jpph.gov.my/storage/app/media//3-penerbitan/pasaran-harta-tanah/laporan-pasaran-harta-tahunan/2022/WP KL/Jadual Harga dan Sewa WPKL LPH 2022.xlsx",
    # "https://napic2.jpph.gov.my/storage/app/media//3-penerbitan/pasaran-harta-tanah/laporan-pasaran-harta-tahunan/2022/WP LABUAN/Jadual Harga dan Sewa  Labuan LPH 2022.xlsx",
    # Add more URLs here
]

# Create an empty list to store DataFrames
dfs = []

# Specify the sheet names you want to read
target_sheets = ['.1']  # Sheets ending with '.1' or '.3'

# Iterate through each URL
for url in urls:
    # Get the content of the file from the URL
    response = requests.get(url)
    
    # Read sheets that match the criteria
    xls = pd.ExcelFile(BytesIO(response.content))
    sheet_names = [sheet for sheet in xls.sheet_names if sheet.endswith(tuple(target_sheets))]
    
    # Extract relevant information from the URL
    url_parts = url.split('/')
    url_file = f"{url_parts[-3]}/{url_parts[-2]}"
    
    # Iterate through each sheet
    for sheet in sheet_names:
        # Read the sheet into a DataFrame
        df = pd.read_excel(xls, sheet_name=sheet, skiprows=5)#,
        df.columns = ['district_mukim','sample_size','avg_landsm','avg_floorsm','price_2021','price_2022','avg_pricePerct']
        df = df.iloc[1:]
        
        # Drop rows with all NaN values
        df = df.dropna(how='all')
        
        # Check if df[1] is NaN and use df[0] value for a new column 'location'
        df['location'] = np.where(pd.isna(df[df.columns[1]]), df[df.columns[0]], df[df.columns[1]])
        df['location'] = df['location'].astype(str)
                
        # Identify strings containing numbers and replace with NaN
        pattern = r'\d'  # Regular expression to match any digit
        df['location'] = df['location'].where(~df['location'].str.contains(pattern), np.nan)

        # Forward-fill the 'location' column
        df['location'] = df['location'].fillna(method='ffill')
        
        # Create a new column 'types' at rows where df[0] is in uppercase
        df['types'] = np.where(df[df.columns[0]].str.isupper(), df[df.columns[0]], np.nan)
        
        # Forward-fill the 'types' column until a different value is encountered
        df['types'] = df['types'].fillna(method='ffill')

        # Remove rows where 'district_mukim' is equal to 'location'
        df = df[df[df.columns[0]] != df[df.columns[7]]]
        
        # Assuming df is your DataFrame
        df = df.dropna(subset=['sample_size'])
                
        # Add a new column 'filename' with the extracted information
        df['source'] = url_file
        
        # Append the DataFrame to the list
        dfs.append(df)

# Concatenate all DataFrames into a single DataFrame
final_result_df = pd.concat(dfs, ignore_index=True)

# Display the final result DataFrame
final_result_df.head(30)


Unnamed: 0,district_mukim,sample_size,avg_landsm,avg_floorsm,price_2021,price_2022,avg_pricePerct,location,types,source
0,Salak South Low Cost Housing,1.0,102.19,81.75,,200000,ND,Mukim Kuala Lumpur,SINGLE STOREY LOW - COST TERRACE,2022/WP KL
1,Bandar Baru Sri Petaling,1.0,143.0,86.405,,610000,ND,Mukim Petaling,SINGLE STOREY LOW - COST TERRACE,2022/WP KL
2,Kepong Baru,1.0,132.847,107.3,,530000,ND,Mukim Batu,SINGLE STOREY MEDIUM - LOW COST TERRACE,2022/WP KL
3,Taman Sri Segambut,1.0,136.56,96.575,,485000,ND,Mukim Batu,SINGLE STOREY MEDIUM - LOW COST TERRACE,2022/WP KL
4,Wangsa Melawati,1.0,108.0,107.95,,630000,ND,Mukim Hulu Kelang,SINGLE STOREY MEDIUM - LOW COST TERRACE,2022/WP KL
5,Salak South Garden,1.0,143.0,86.86,,560000,ND,Mukim Petaling,SINGLE STOREY MEDIUM - LOW COST TERRACE,2022/WP KL
6,Million Garden,1.0,124.114,83.61,,700000,ND,SINGLE STOREY TERRACE,SINGLE STOREY TERRACE,2022/WP KL
7,Taman Golden,1.0,139.35,104.795,,447000,ND,SINGLE STOREY TERRACE,SINGLE STOREY TERRACE,2022/WP KL
8,Ipoh Road Garden,1.0,134.0,100.905,,450000,ND,Mukim Batu,SINGLE STOREY TERRACE,2022/WP KL
9,Kepong Baru,19.0,140.379526,103.085211,"490,000 - 830,000","500,000 - 778,000",Stable,Mukim Batu,SINGLE STOREY TERRACE,2022/WP KL


Bulk

In [None]:
"https://napic2.jpph.gov.my/storage/app/media//3-penerbitan/pasaran-harta-tanah/laporan-pasaran-harta-tahunan/2022/WP KL/Jadual Harga dan Sewa WPKL LPH 2022.xlsx",
"https://napic2.jpph.gov.my/storage/app/media//3-penerbitan/pasaran-harta-tanah/laporan-pasaran-harta-tahunan/2022/WP LABUAN/Jadual Harga dan Sewa  Labuan LPH 2022.xlsx",
"https://napic2.jpph.gov.my/storage/app/media//3-penerbitan/pasaran-harta-tanah/laporan-pasaran-harta-tahunan/2022/WP PUTRAJAYA/Jadual Harga dan Sewa Putrajaya LPH 2022.xlsx",
"https://napic2.jpph.gov.my/storage/app/media//3-penerbitan/pasaran-harta-tanah/laporan-pasaran-harta-tahunan/2022/KEDAH/Jadual Harga dan Sewa Kedah LPH 2022.xlsx",
"https://napic2.jpph.gov.my/storage/app/media//3-penerbitan/pasaran-harta-tanah/laporan-pasaran-harta-tahunan/2022/PERLIS/Jadual Harga dan Sewa Perlis LPH 2022.xlsx",
"https://napic2.jpph.gov.my/storage/app/media//3-penerbitan/pasaran-harta-tanah/laporan-pasaran-harta-tahunan/2022/PULAU PINANG/Jadual Harga dan Sewa Pulau Pinang LPH 2022.xls
"https://napic2.jpph.gov.my/storage/app/media//3-penerbitan/pasaran-harta-tanah/laporan-pasaran-harta-tahunan/2022/SELANGOR/Jadual Harga dan Sewa Selangor LPH 2022.xlsx",
"https://napic2.jpph.gov.my/storage/app/media//3-penerbitan/pasaran-harta-tanah/laporan-pasaran-harta-tahunan/2022/PAHANG/Jadual Harga dan Sewa Pahang LPH 2022.xlsx",
"https://napic2.jpph.gov.my/storage/app/media//3-penerbitan/pasaran-harta-tanah/laporan-pasaran-harta-tahunan/2022/MELAKA/Jadual Harga dan Sewa Melaka LPH 2022.xlsx",
"https://napic2.jpph.gov.my/storage/app/media//3-penerbitan/pasaran-harta-tanah/laporan-pasaran-harta-tahunan/2022/NEGERI SEMBILAN/Jadual Harga dan Sewa N. Sembilan LPH 2022.xlsx",
"https://napic2.jpph.gov.my/storage/app/media//3-penerbitan/pasaran-harta-tanah/laporan-pasaran-harta-tahunan/2022/PERAK/Jadual Harga dan Sewa Perak LPH 2022.xlsx",
"https://napic2.jpph.gov.my/storage/app/media//3-penerbitan/pasaran-harta-tanah/laporan-pasaran-harta-tahunan/2022/TERENGGANU/Jadual Harga dan Sewa Terengganu LPH 2022.xlsx",
"https://napic2.jpph.gov.my/storage/app/media//3-penerbitan/pasaran-harta-tanah/laporan-pasaran-harta-tahunan/2022/KELANTAN/Jadual Harga dan Sewa Kelantan LPH 2022.xlsx",
"https://napic2.jpph.gov.my/storage/app/media//3-penerbitan/pasaran-harta-tanah/laporan-pasaran-harta-tahunan/2022/JOHOR/Jadual Harga dan Sewa Johor LPH 2022.xlsx",
"https://napic2.jpph.gov.my/storage/app/media//3-penerbitan/pasaran-harta-tanah/laporan-pasaran-harta-tahunan/2022/SABAH/Jadual Harga dan Sewa Sabah LPH 2022.xlsx",
"https://napic2.jpph.gov.my/storage/app/media//3-penerbitan/pasaran-harta-tanah/laporan-pasaran-harta-tahunan/2022/SARAWAK/Jadual Harga dan Sewa Sarawak LPH 2022.xlsx",

In [165]:
import pandas as pd
from io import BytesIO
import requests
import numpy as np

# Sample URLs
urls = [
    "https://napic2.jpph.gov.my/storage/app/media//3-penerbitan/pasaran-harta-tanah/laporan-pasaran-harta-tahunan/2022/WP KL/Jadual Harga dan Sewa WPKL LPH 2022.xlsx",
    "https://napic2.jpph.gov.my/storage/app/media//3-penerbitan/pasaran-harta-tanah/laporan-pasaran-harta-tahunan/2022/WP LABUAN/Jadual Harga dan Sewa  Labuan LPH 2022.xlsx",
    "https://napic2.jpph.gov.my/storage/app/media//3-penerbitan/pasaran-harta-tanah/laporan-pasaran-harta-tahunan/2022/WP PUTRAJAYA/Jadual Harga dan Sewa Putrajaya LPH 2022.xlsx",
    "https://napic2.jpph.gov.my/storage/app/media//3-penerbitan/pasaran-harta-tanah/laporan-pasaran-harta-tahunan/2022/KEDAH/Jadual Harga dan Sewa Kedah LPH 2022.xlsx",
    "https://napic2.jpph.gov.my/storage/app/media//3-penerbitan/pasaran-harta-tanah/laporan-pasaran-harta-tahunan/2022/PERLIS/Jadual Harga dan Sewa Perlis LPH 2022.xlsx",
    # "https://napic2.jpph.gov.my/storage/app/media//3-penerbitan/pasaran-harta-tanah/laporan-pasaran-harta-tahunan/2022/PULAU PINANG/Jadual Harga dan Sewa Pulau Pinang LPH 2022.xlsx",
    "https://napic2.jpph.gov.my/storage/app/media//3-penerbitan/pasaran-harta-tanah/laporan-pasaran-harta-tahunan/2022/SELANGOR/Jadual Harga dan Sewa Selangor LPH 2022.xlsx",
    "https://napic2.jpph.gov.my/storage/app/media//3-penerbitan/pasaran-harta-tanah/laporan-pasaran-harta-tahunan/2022/PAHANG/Jadual Harga dan Sewa Pahang LPH 2022.xlsx",
    "https://napic2.jpph.gov.my/storage/app/media//3-penerbitan/pasaran-harta-tanah/laporan-pasaran-harta-tahunan/2022/MELAKA/Jadual Harga dan Sewa Melaka LPH 2022.xlsx",
    "https://napic2.jpph.gov.my/storage/app/media//3-penerbitan/pasaran-harta-tanah/laporan-pasaran-harta-tahunan/2022/NEGERI SEMBILAN/Jadual Harga dan Sewa N. Sembilan LPH 2022.xlsx",
    "https://napic2.jpph.gov.my/storage/app/media//3-penerbitan/pasaran-harta-tanah/laporan-pasaran-harta-tahunan/2022/PERAK/Jadual Harga dan Sewa Perak LPH 2022.xlsx",
    # "https://napic2.jpph.gov.my/storage/app/media//3-penerbitan/pasaran-harta-tanah/laporan-pasaran-harta-tahunan/2022/TERENGGANU/Jadual Harga dan Sewa Terengganu LPH 2022.xlsx",
    "https://napic2.jpph.gov.my/storage/app/media//3-penerbitan/pasaran-harta-tanah/laporan-pasaran-harta-tahunan/2022/KELANTAN/Jadual Harga dan Sewa Kelantan LPH 2022.xlsx",
    "https://napic2.jpph.gov.my/storage/app/media//3-penerbitan/pasaran-harta-tanah/laporan-pasaran-harta-tahunan/2022/JOHOR/Jadual Harga dan Sewa Johor LPH 2022.xlsx",
    # "https://napic2.jpph.gov.my/storage/app/media//3-penerbitan/pasaran-harta-tanah/laporan-pasaran-harta-tahunan/2022/SABAH/Jadual Harga dan Sewa Sabah LPH 2022.xlsx",
    "https://napic2.jpph.gov.my/storage/app/media//3-penerbitan/pasaran-harta-tanah/laporan-pasaran-harta-tahunan/2022/SARAWAK/Jadual Harga dan Sewa Sarawak LPH 2022.xlsx"
    ]

# Create an empty list to store DataFrames
dfs = []

# Specify the sheet names you want to read
target_sheets = ['.1']  # Sheets ending with '.1' or '.3'

# Iterate through each URL
for url in urls:
    # Get the content of the file from the URL
    response = requests.get(url)
    
    # Read sheets that match the criteria
    xls = pd.ExcelFile(BytesIO(response.content), engine='openpyxl')
    sheet_names = [sheet for sheet in xls.sheet_names if sheet.endswith(tuple(target_sheets))]
    
    # Extract relevant information from the URL
    url_parts = url.split('/')
    url_file = f"{url_parts[-3]}/{url_parts[-2]}"
    
    # Iterate through each sheet
    for sheet in sheet_names:
        # Read the sheet into a DataFrame
        df = pd.read_excel(xls, sheet_name=sheet, skiprows=5)#,
        df.columns = ['district_mukim','sample_size','avg_landsm','avg_floorsm','price_2021','price_2022','avg_pricePerct']
        df = df.iloc[1:]
        
        # Drop rows with all NaN values
        df = df.dropna(how='all')
        
        # Check if df[1] is NaN and use df[0] value for a new column 'location'
        df['location'] = np.where(pd.isna(df[df.columns[1]]), df[df.columns[0]], df[df.columns[1]])
        df['location'] = df['location'].astype(str)
                
        # Identify strings containing numbers and replace with NaN
        pattern = r'\d'  # Regular expression to match any digit
        df['location'] = df['location'].where(~df['location'].str.contains(pattern), np.nan)

        # Forward-fill the 'location' column
        df['location'] = df['location'].fillna(method='ffill')
        
        # Create a new column 'types' at rows where df[0] is in uppercase
        df['types'] = np.where(df[df.columns[0]].str.isupper(), df[df.columns[0]], np.nan)
        
        # Forward-fill the 'types' column until a different value is encountered
        df['types'] = df['types'].fillna(method='ffill')

        # Remove rows where 'district_mukim' is equal to 'location'
        df = df[df[df.columns[0]] != df[df.columns[7]]]
        
        # Assuming df is your DataFrame
        df = df.dropna(subset=['sample_size'])
                
        # Add a new column 'filename' with the extracted information
        df['source'] = url_file
        
        # Append the DataFrame to the list
        dfs.append(df)

# Concatenate all DataFrames into a single DataFrame
final_result_df = pd.concat(dfs, ignore_index=True)

# Display the final result DataFrame
final_result_df


  warn("""Cannot parse header or footer so it will be ignored""")


Unnamed: 0,district_mukim,sample_size,avg_landsm,avg_floorsm,price_2021,price_2022,avg_pricePerct,location,types,source
0,Salak South Low Cost Housing,1.0,102.19,81.750000,,200000,ND,Mukim Kuala Lumpur,SINGLE STOREY LOW - COST TERRACE,2022/WP KL
1,Bandar Baru Sri Petaling,1.0,143.0,86.405000,,610000,ND,Mukim Petaling,SINGLE STOREY LOW - COST TERRACE,2022/WP KL
2,Kepong Baru,1.0,132.847,107.300000,,530000,ND,Mukim Batu,SINGLE STOREY MEDIUM - LOW COST TERRACE,2022/WP KL
3,Taman Sri Segambut,1.0,136.56,96.575000,,485000,ND,Mukim Batu,SINGLE STOREY MEDIUM - LOW COST TERRACE,2022/WP KL
4,Wangsa Melawati,1.0,108.0,107.950000,,630000,ND,Mukim Hulu Kelang,SINGLE STOREY MEDIUM - LOW COST TERRACE,2022/WP KL
...,...,...,...,...,...,...,...,...,...,...
7044,Bay Resort Condominium,1.0,,320.340000,,1190000,ND,Bahagian Miri,CONDOMINIUM,2022/SARAWAK
7045,Serene Height,3.0,,45.666667,,"250,000 - 388,000",ND,Bahagian Miri,CONDOMINIUM,2022/SARAWAK
7046,Bintulu Beach Resort Condominium,2.0,,129.000000,,"520,000 - 550,000",ND,Bahagian Bintulu,CONDOMINIUM,2022/SARAWAK
7047,Dd Palm Spring Condominium,1.0,,204.380000,,920000,ND,Bahagian Bintulu,CONDOMINIUM,2022/SARAWAK


In [166]:
woPPTGSBH = final_result_df

In [214]:
woPPTGSBH

Unnamed: 0,district_mukim,sample_size,avg_landsm,avg_floorsm,price_2021,price_2022,avg_pricePerct,location,types,source
0,Salak South Low Cost Housing,1.0,102.19,81.750000,,200000,ND,Mukim Kuala Lumpur,SINGLE STOREY LOW - COST TERRACE,2022/WP KL
1,Bandar Baru Sri Petaling,1.0,143.0,86.405000,,610000,ND,Mukim Petaling,SINGLE STOREY LOW - COST TERRACE,2022/WP KL
2,Kepong Baru,1.0,132.847,107.300000,,530000,ND,Mukim Batu,SINGLE STOREY MEDIUM - LOW COST TERRACE,2022/WP KL
3,Taman Sri Segambut,1.0,136.56,96.575000,,485000,ND,Mukim Batu,SINGLE STOREY MEDIUM - LOW COST TERRACE,2022/WP KL
4,Wangsa Melawati,1.0,108.0,107.950000,,630000,ND,Mukim Hulu Kelang,SINGLE STOREY MEDIUM - LOW COST TERRACE,2022/WP KL
...,...,...,...,...,...,...,...,...,...,...
7044,Bay Resort Condominium,1.0,,320.340000,,1190000,ND,Bahagian Miri,CONDOMINIUM,2022/SARAWAK
7045,Serene Height,3.0,,45.666667,,"250,000 - 388,000",ND,Bahagian Miri,CONDOMINIUM,2022/SARAWAK
7046,Bintulu Beach Resort Condominium,2.0,,129.000000,,"520,000 - 550,000",ND,Bahagian Bintulu,CONDOMINIUM,2022/SARAWAK
7047,Dd Palm Spring Condominium,1.0,,204.380000,,920000,ND,Bahagian Bintulu,CONDOMINIUM,2022/SARAWAK


In [198]:
woPPTGSBH.shape[0]

7049

In [167]:
!pip install --upgrade xlrd
!pip install --upgrade openpyxl




[notice] A new release of pip is available: 23.3.1 -> 23.3.2
[notice] To update, run: python.exe -m pip install --upgrade pip





[notice] A new release of pip is available: 23.3.1 -> 23.3.2
[notice] To update, run: python.exe -m pip install --upgrade pip


PPinang

In [171]:
import pandas as pd
from io import BytesIO
import requests
import numpy as np
import xlrd
import openpyxl

# Sample URLs
urls = [
    "https://napic2.jpph.gov.my/storage/app/media//3-penerbitan/pasaran-harta-tanah/laporan-pasaran-harta-tahunan/2022/PULAU%20PINANG/Jadual%20Harga%20dan%20Sewa%20Pulau%20Pinang%20LPH%202022.xls"
    # "https://napic2.jpph.gov.my/storage/app/media//3-penerbitan/pasaran-harta-tanah/laporan-pasaran-harta-tahunan/2022/TERENGGANU/Jadual%20Harga%20dan%20Sewa%20Terengganu%20LPH%202022.xlsx"
    # "https://napic2.jpph.gov.my/storage/app/media//3-penerbitan/pasaran-harta-tanah/laporan-pasaran-harta-tahunan/2022/SABAH/Jadual%20Harga%20dan%20Sewa%20Sabah%20LPH%202022.xlsx"
 ]

# Create an empty list to store DataFrames
dfs = []

# Specify the sheet names you want to read
target_sheets = ['.1']  # Sheets ending with '.1' or '.3'

# Iterate through each URL
for url in urls:
    # Get the content of the file from the URL
    response = requests.get(url)
    
    # Read sheets that match the criteria
    xls = pd.ExcelFile(BytesIO(response.content), engine='xlrd')
    sheet_names = [sheet for sheet in xls.sheet_names if sheet.endswith(tuple(target_sheets))]
    
    # Extract relevant information from the URL
    url_parts = url.split('/')
    url_file = f"{url_parts[-3]}/{url_parts[-2]}"
    
    # Iterate through each sheet
    for sheet in sheet_names:
        # Read the sheet into a DataFrame
        df = pd.read_excel(xls, sheet_name=sheet, skiprows=5)#,
        df.columns = ['district_mukim','sample_size','avg_landsm','avg_floorsm','price_2021','price_2022','avg_pricePerct']
        df = df.iloc[1:]
        
        # Drop rows with all NaN values
        df = df.dropna(how='all')
        
        # Check if df[1] is NaN and use df[0] value for a new column 'location'
        df['location'] = np.where(pd.isna(df[df.columns[1]]), df[df.columns[0]], df[df.columns[1]])
        df['location'] = df['location'].astype(str)
                
        # Identify strings containing numbers and replace with NaN
        pattern = r'\d'  # Regular expression to match any digit
        df['location'] = df['location'].where(~df['location'].str.contains(pattern), np.nan)

        # Forward-fill the 'location' column
        df['location'] = df['location'].fillna(method='ffill')
        
        # Create a new column 'types' at rows where df[0] is in uppercase
        df['types'] = np.where(df[df.columns[0]].str.isupper(), df[df.columns[0]], np.nan)
        
        # Forward-fill the 'types' column until a different value is encountered
        df['types'] = df['types'].fillna(method='ffill')

        # Remove rows where 'district_mukim' is equal to 'location'
        df = df[df[df.columns[0]] != df[df.columns[7]]]
        
        # Assuming df is your DataFrame
        df = df.dropna(subset=['sample_size'])
                
        # Add a new column 'filename' with the extracted information
        df['source'] = url_file
        
        # Append the DataFrame to the list
        dfs.append(df)

# Concatenate all DataFrames into a single DataFrame
final_result_df = pd.concat(dfs, ignore_index=True)

# Display the final result DataFrame
woPPonly = final_result_df

In [215]:
woPPonly

Unnamed: 0,district_mukim,sample_size,avg_landsm,avg_floorsm,price_2021,price_2022,avg_pricePerct,location,types,source
0,Bertam Perdana,5.0,102,61.00,"200,000 - 225,000","184,000 - 235,000",Stable,Seberang Perai Utara,SINGLE STOREY LOW-COST TERRACE,2022/PULAU%20PINANG
1,Taman Cempedak,1.0,102,63.00,"145,000 - 160,000",155000,Stable,Seberang Perai Utara,SINGLE STOREY LOW-COST TERRACE,2022/PULAU%20PINANG
2,Taman Merbau Jaya,1.0,102,65.00,200000,218000,9,Seberang Perai Utara,SINGLE STOREY LOW-COST TERRACE,2022/PULAU%20PINANG
3,Taman Sri Menerong,3.0,93,55.00,"150,000 - 178,000","170,000 - 185,000",10.8,Seberang Perai Utara,SINGLE STOREY LOW-COST TERRACE,2022/PULAU%20PINANG
4,Taman Alma Jaya (Fasa 1),1.0,93,60.76,,240000,ND,Seberang Perai Tengah,SINGLE STOREY LOW-COST TERRACE,2022/PULAU%20PINANG
...,...,...,...,...,...,...,...,...,...,...
917,Residensi Sejati,3.0,-,116.00,"465,000 - 563,000","465,000 - 570,000",Stable,Seberang Perai Tengah,CONDOMINIUM,2022/PULAU%20PINANG
918,The Prominence,2.0,-,118.00,"410,000 - 450,000",450000,10.8,Seberang Perai Tengah,CONDOMINIUM,2022/PULAU%20PINANG
919,,3.0,-,140.00,"470,000 - 550,000","460,000 - 600,000",Stable,Seberang Perai Tengah,CONDOMINIUM,2022/PULAU%20PINANG
920,The Signature Condominium,2.0,-,101.00,"390,000 - 435,000","430,000 - 450,000",8.1,Seberang Perai Tengah,CONDOMINIUM,2022/PULAU%20PINANG


In [199]:
woPPonly.shape[0]

922

Terengganu

In [189]:
import pandas as pd
from io import BytesIO
import requests
import numpy as np
import xlrd
import openpyxl

# Sample URLs
urls = [
    "https://napic2.jpph.gov.my/storage/app/media//3-penerbitan/pasaran-harta-tanah/laporan-pasaran-harta-tahunan/2022/TERENGGANU/Jadual%20Harga%20dan%20Sewa%20Terengganu%20LPH%202022.xlsx"
    # "https://napic2.jpph.gov.my/storage/app/media//3-penerbitan/pasaran-harta-tanah/laporan-pasaran-harta-tahunan/2022/SABAH/Jadual%20Harga%20dan%20Sewa%20Sabah%20LPH%202022.xlsx"
 ]

# Create an empty list to store DataFrames
dfs = []

# Specify the sheet names you want to read
target_sheets = ['.1']  # Sheets ending with '.1' or '.3'

# Iterate through each URL
for url in urls:
    # Get the content of the file from the URL
    response = requests.get(url)
    
    # Read sheets that match the criteria
    xls = pd.ExcelFile(BytesIO(response.content))
    sheet_names = [sheet for sheet in xls.sheet_names if sheet.endswith(tuple(target_sheets))]
    
    # Extract relevant information from the URL
    url_parts = url.split('/')
    url_file = f"{url_parts[-3]}/{url_parts[-2]}"
    
    # Iterate through each sheet
    for sheet in sheet_names:
        # Read the sheet into a DataFrame
        df = pd.read_excel(xls, sheet_name=sheet, skiprows=4)#,
        df = df.iloc[1:,:7]
        df.columns = ['district_mukim','sample_size','avg_landsm','avg_floorsm','price_2021','price_2022','avg_pricePerct']
        
        # Drop rows with all NaN values
        df = df.dropna(how='all')
        
        # Check if df[1] is NaN and use df[0] value for a new column 'location'
        df['location'] = np.where(pd.isna(df[df.columns[1]]), df[df.columns[0]], df[df.columns[1]])
        df['location'] = df['location'].astype(str)
                
        # Identify strings containing numbers and replace with NaN
        pattern = r'\d'  # Regular expression to match any digit
        df['location'] = df['location'].where(~df['location'].str.contains(pattern), np.nan)

        # Forward-fill the 'location' column
        df['location'] = df['location'].fillna(method='ffill')
        
        # Create a new column 'types' at rows where df[0] is in uppercase
        df['types'] = np.where(df[df.columns[0]].str.isupper(), df[df.columns[0]], np.nan)
        
        # Forward-fill the 'types' column until a different value is encountered
        df['types'] = df['types'].fillna(method='ffill')

        # Remove rows where 'district_mukim' is equal to 'location'
        df = df[df[df.columns[0]] != df[df.columns[7]]]
        
        # Assuming df is your DataFrame
        # df = df.dropna(subset=['sample_size'])
                
        # Add a new column 'filename' with the extracted information
        df['source'] = url_file
        
        # Append the DataFrame to the list
        dfs.append(df)

# Concatenate all DataFrames into a single DataFrame
final_result_df = pd.concat(dfs, ignore_index=True)

# Display the final result DataFrame
woTRGonly = final_result_df

In [216]:
woTRGonly

Unnamed: 0,district_mukim,sample_size,avg_landsm,avg_floorsm,price_2021,price_2022,avg_pricePerct,location,types,source
0,Belara,1.0,111,90.655000,150000,160000,6.666667,Kuala Terengganu,SINGLE STOREY LOW - COST TERRACE,2022/TERENGGANU
1,Alor Jambu,1.0,107,100.100000,170000,170000,Stable,Kuala Nerus,SINGLE STOREY LOW - COST TERRACE,2022/TERENGGANU
2,Gong Badak,5.0,122.4,101.292000,"150,000 - 190,000","150,000 - 210,000",6.132075,Kuala Nerus,SINGLE STOREY LOW - COST TERRACE,2022/TERENGGANU
3,Mengabang Lekor,1.0,103,78.030000,120000,130000,8.333333,Kuala Nerus,SINGLE STOREY LOW - COST TERRACE,2022/TERENGGANU
4,Seberang Takir,3.0,111.333333,87.138333,200000,"190,000 - 220,000",Stable,Kuala Nerus,SINGLE STOREY LOW - COST TERRACE,2022/TERENGGANU
...,...,...,...,...,...,...,...,...,...,...
301,Pangsapuri Ladang Tok Pelam (Tingkat 20),1.0,-,78.000000,,310000,ND,Kuala Terengganu,PANGSAPURI,2022/TERENGGANU
302,Pangsapuri Taman Puncak Kemajuan (Tingkat 5),1.0,-,122.630000,,380000,ND,Kuala Terengganu,PANGSAPURI,2022/TERENGGANU
303,Desa Ibai Apartment (Tingkat 2),1.0,-,125.000000,,365000,ND,Kuala Terengganu,CONDOMINIUM,2022/TERENGGANU
304,Icon Residence (Tingkat 16),1.0,-,115.000000,,570000,ND,Kuala Terengganu,CONDOMINIUM,2022/TERENGGANU


In [195]:
import pandas as pd
from io import BytesIO
import requests
import numpy as np
import xlrd
import openpyxl

# Sample URLs
urls = [
    "https://napic2.jpph.gov.my/storage/app/media//3-penerbitan/pasaran-harta-tanah/laporan-pasaran-harta-tahunan/2022/SABAH/Jadual%20Harga%20dan%20Sewa%20Sabah%20LPH%202022.xlsx"
 ]

# Create an empty list to store DataFrames
dfs = []

# Specify the sheet names you want to read
target_sheets = ['.1']  # Sheets ending with '.1' or '.3'

# Iterate through each URL
for url in urls:
    # Get the content of the file from the URL
    response = requests.get(url)
    
    # Read sheets that match the criteria
    xls = pd.ExcelFile(BytesIO(response.content))
    sheet_names = [sheet for sheet in xls.sheet_names if sheet.endswith(tuple(target_sheets))]
    
    # Extract relevant information from the URL
    url_parts = url.split('/')
    url_file = f"{url_parts[-3]}/{url_parts[-2]}"
    
    # Iterate through each sheet
    for sheet in sheet_names:
        # Read the sheet into a DataFrame
        df = pd.read_excel(xls, sheet_name=sheet, skiprows=5)#,
        df = df.iloc[1:,:7]
        df.columns = ['district_mukim','sample_size','avg_landsm','avg_floorsm','price_2021','price_2022','avg_pricePerct']
        
        # Drop rows with all NaN values
        df = df.dropna(how='all')
        
        # Check if df[1] is NaN and use df[0] value for a new column 'location'
        df['location'] = np.where(pd.isna(df[df.columns[1]]), df[df.columns[0]], df[df.columns[1]])
        df['location'] = df['location'].astype(str)
                
        # Identify strings containing numbers and replace with NaN
        pattern = r'\d'  # Regular expression to match any digit
        df['location'] = df['location'].where(~df['location'].str.contains(pattern), np.nan)

        # Forward-fill the 'location' column
        df['location'] = df['location'].fillna(method='ffill')
        
        # Create a new column 'types' at rows where df[0] is in uppercase
        df['types'] = np.where(df[df.columns[0]].str.isupper(), df[df.columns[0]], np.nan)
        
        # Forward-fill the 'types' column until a different value is encountered
        df['types'] = df['types'].fillna(method='ffill')

        # Remove rows where 'district_mukim' is equal to 'location'
        df = df[df[df.columns[0]] != df[df.columns[7]]]
        
        # Assuming df is your DataFrame
        # df = df.dropna(subset=['sample_size'])
                
        # Add a new column 'filename' with the extracted information
        df['source'] = url_file
        df = df.dropna(subset='district_mukim', axis=0)
        
        # Append the DataFrame to the list
        dfs.append(df)

# Concatenate all DataFrames into a single DataFrame
final_result_df = pd.concat(dfs, ignore_index=True)

# Display the final result DataFrame
woSBHonly = final_result_df

In [217]:
woSBHonly

Unnamed: 0,district_mukim,sample_size,avg_landsm,avg_floorsm,price_2021,price_2022,avg_pricePerct,location,types,source
0,Taman Emas Takapan,1.0,209.0,84.00,,235000,ND,Beaufort,SINGLE STOREY TERRACED,2022/SABAH
1,Taman Selagon,1.0,212.0,78.00,,250000,ND,Beaufort,SINGLE STOREY TERRACED,2022/SABAH
2,Taman Sri Panglima,3.0,149.0,60.00,"210,000 - 220,000","180,000 - 250,000",Stable,Beaufort,SINGLE STOREY TERRACED,2022/SABAH
3,Taman Adika,1.0,186.0,79.00,240000,260000,8.333333,Keningau,SINGLE STOREY TERRACED,2022/SABAH
4,Taman Adika Phase 3,1.0,181.0,77.00,280000,260000,-7.142857,Keningau,SINGLE STOREY TERRACED,2022/SABAH
...,...,...,...,...,...,...,...,...,...,...
536,The Riverside Residence,2.0,,89.00,,"420,000 - 465,000",ND,Penampang,R55,2022/SABAH
537,Kingfisher Putatan Condominium,1.0,,94.00,,485000,ND,Putatan,R55,2022/SABAH
538,Sri Utama Condominium,1.0,,115.93,340000,362500,6.617647,Sandakan,R55,2022/SABAH
539,Utama South Condominium,4.0,,103.21,350000,"330,000 - 400,000",3.928571,Sandakan,R55,2022/SABAH


All

In [200]:
alloc = pd.concat([woPPTGSBH, woPPonly, woTRGonly, woSBHonly], axis=0)
alloc.head()

Unnamed: 0,district_mukim,sample_size,avg_landsm,avg_floorsm,price_2021,price_2022,avg_pricePerct,location,types,source
0,Salak South Low Cost Housing,1.0,102.19,81.75,,200000,ND,Mukim Kuala Lumpur,SINGLE STOREY LOW - COST TERRACE,2022/WP KL
1,Bandar Baru Sri Petaling,1.0,143.0,86.405,,610000,ND,Mukim Petaling,SINGLE STOREY LOW - COST TERRACE,2022/WP KL
2,Kepong Baru,1.0,132.847,107.3,,530000,ND,Mukim Batu,SINGLE STOREY MEDIUM - LOW COST TERRACE,2022/WP KL
3,Taman Sri Segambut,1.0,136.56,96.575,,485000,ND,Mukim Batu,SINGLE STOREY MEDIUM - LOW COST TERRACE,2022/WP KL
4,Wangsa Melawati,1.0,108.0,107.95,,630000,ND,Mukim Hulu Kelang,SINGLE STOREY MEDIUM - LOW COST TERRACE,2022/WP KL


In [201]:
alloc.shape[0]

8818

In [202]:
print(woPPTGSBH.shape[0])
print(woPPonly.shape[0])
print(woTRGonly.shape[0])
print(woSBHonly.shape[0])

7049
922
306
541


In [203]:
7049+922+306+541

8818

In [219]:
alloc[['price_2021', 'price_2022']] = alloc[['price_2021', 'price_2022']].replace({',': ''}, regex=True)

In [220]:
alloc.head()

Unnamed: 0,district_mukim,sample_size,avg_landsm,avg_floorsm,price_2021,price_2022,avg_pricePerct,location,types,source
0,Salak South Low Cost Housing,1.0,102.19,81.75,,200000,ND,Mukim Kuala Lumpur,SINGLE STOREY LOW - COST TERRACE,2022/WP KL
1,Bandar Baru Sri Petaling,1.0,143.0,86.405,,610000,ND,Mukim Petaling,SINGLE STOREY LOW - COST TERRACE,2022/WP KL
2,Kepong Baru,1.0,132.847,107.3,,530000,ND,Mukim Batu,SINGLE STOREY MEDIUM - LOW COST TERRACE,2022/WP KL
3,Taman Sri Segambut,1.0,136.56,96.575,,485000,ND,Mukim Batu,SINGLE STOREY MEDIUM - LOW COST TERRACE,2022/WP KL
4,Wangsa Melawati,1.0,108.0,107.95,,630000,ND,Mukim Hulu Kelang,SINGLE STOREY MEDIUM - LOW COST TERRACE,2022/WP KL


In [221]:
alloc.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 8818 entries, 0 to 540
Data columns (total 10 columns):
 #   Column          Non-Null Count  Dtype  
---  ------          --------------  -----  
 0   district_mukim  8595 non-null   object 
 1   sample_size     8818 non-null   float64
 2   avg_landsm      7754 non-null   object 
 3   avg_floorsm     8818 non-null   float64
 4   price_2021      4676 non-null   object 
 5   price_2022      8818 non-null   object 
 6   avg_pricePerct  8818 non-null   object 
 7   location        8818 non-null   object 
 8   types           8818 non-null   object 
 9   source          8818 non-null   object 
dtypes: float64(2), object(8)
memory usage: 757.8+ KB


In [222]:
alloc.to_csv('napic_jual2022.csv', index=False)

In [223]:
alloc.columns

Index(['district_mukim', 'sample_size', 'avg_landsm', 'avg_floorsm',
       'price_2021', 'price_2022', 'avg_pricePerct', 'location', 'types',
       'source'],
      dtype='object')

In [224]:
alloc.groupby('source').size()

source
2022/JOHOR               966
2022/KEDAH               918
2022/KELANTAN            326
2022/MELAKA              453
2022/NEGERI SEMBILAN     647
2022/PAHANG              588
2022/PERAK              1295
2022/PERLIS               91
2022/PULAU%20PINANG      922
2022/SABAH               541
2022/SARAWAK             720
2022/TERENGGANU          306
2022/WP KL               995
2022/WP LABUAN            36
2022/WP PUTRAJAYA         14
dtype: int64

In [225]:
# alloc.query("types == 'FLAT'")
flat_mask = alloc['types'].str.contains('FLAT')
flat_df = alloc[flat_mask]
flat_df

Unnamed: 0,district_mukim,sample_size,avg_landsm,avg_floorsm,price_2021,price_2022,avg_pricePerct,location,types,source
315,Flat DBKL Jalan Hang Tuah,2.0,,42.00,100000,8500 - 126000,5.5,RUMAH PANGSA /FLAT KOS RENDAH,RUMAH PANGSA /FLAT KOS RENDAH,2022/WP KL
316,PKNS Flat(Jalan Raja Muda Musa),1.0,,78.00,,450000,ND,RUMAH PANGSA /FLAT KOS RENDAH,RUMAH PANGSA /FLAT KOS RENDAH,2022/WP KL
317,PPR Pudu Ulu,1.0,,60.00,,170000,ND,RUMAH PANGSA /FLAT KOS RENDAH,RUMAH PANGSA /FLAT KOS RENDAH,2022/WP KL
318,Pudu Impian 1,3.0,,62.00,160000 - 185000,190000,6.082725,RUMAH PANGSA /FLAT KOS RENDAH,RUMAH PANGSA /FLAT KOS RENDAH,2022/WP KL
319,Rancangan Perumahan Sentul Fasa 3,2.0,,49.00,150000,145000 - 150000,Stable,RUMAH PANGSA /FLAT KOS RENDAH,RUMAH PANGSA /FLAT KOS RENDAH,2022/WP KL
...,...,...,...,...,...,...,...,...,...,...
387,Taman Fajar,2.0,,63.00,150000,140000 - 168000,2.666667,Kota Kinabalu,LOW - COST FLAT,2022/SABAH
388,Taman Pasir Putih Phase 3C,4.0,,56.00,160000 - 165000,160000 - 180000,6.923077,Putatan,LOW - COST FLAT,2022/SABAH
389,Taman Telipok Ria,17.0,,57.00,140000 - 160000,135000 - 170000,Stable,Tuaran,LOW - COST FLAT,2022/SABAH
390,Taman Sejati,6.0,,53.02,110000 - 130000,110000 - 130000,Stable,Sandakan,MEDIUM - COST FLAT,2022/SABAH


In [226]:
flat_df.to_csv('flat_df_harga.csv', index=False)

In [227]:
flat_sewa = pd.read_csv('flat_df_sewa.csv')

In [231]:
combine = pd.concat([flat_df,flat_sewa], axis=0).reset_index()

In [234]:
combine.columns

Index(['index', 'district_mukim', 'sample_size', 'avg_landsm', 'avg_floorsm',
       'price_2021', 'price_2022', 'avg_pricePerct', 'location', 'types',
       'source', 'rental_2021', 'rental_2022'],
      dtype='object')

In [236]:
sugscope = combine[['district_mukim','location', 'source','types', 'price_2021', 'price_2022']]

In [237]:
sugscope.to_csv('cadangan_strategi_lowcostarea_NAPIC2022.csv', index=False)