# 1. Merge all files

In [35]:
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import numpy as np

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler,MinMaxScaler,PowerTransformer,LabelEncoder,OneHotEncoder,RobustScaler
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error,r2_score
from sklearn.ensemble import RandomForestRegressor,GradientBoostingRegressor
from sklearn.tree import DecisionTreeRegressor

from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import accuracy_score
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline

# from xgboost import XGBRegressor
from datetime import datetime, timedelta
import os
import re


In [36]:
folder_path = "D:\OneDrive\KiotViet\Python_for_work\KFinance\sondn_kfinance\Model_Prediction_RentalCost\CSV_crawl_data"
file_list = os.listdir(folder_path)

merged_df = pd.DataFrame()

# Loop qua từng file CSV và merge vào DataFrame
for file in file_list:
    if "final_bds_" in file:
        file_path = os.path.join(folder_path, file)
        df = pd.read_csv(file_path)
        merged_df = pd.concat([merged_df, df], ignore_index=True)

print(len(merged_df))

26061


In [37]:
crawl = merged_df[['City','District','Ward','area_value','price_value_trieu','price_per_m2_final']].drop_duplicates()

In [38]:
crawl

Unnamed: 0,City,District,Ward,area_value,price_value_trieu,price_per_m2_final
0,Trà Vinh,Trà Vinh,Phường 4,24.0,9.0,375000.000000
1,Hà Nội,Quận Cầu Giấy,Phường Dịch Vọng Hậu,500.0,245.0,490000.000000
2,Hà Nội,Quận Bắc Từ Liêm,Phường Cổ Nhuế 1,550.0,40.0,72727.272727
3,Hà Nội,Quận Nam Từ Liêm,Phường Mỹ Đình 1,100.0,50.0,500000.000000
4,Hà Nội,Quận Thanh Xuân,Phường Nhân Chính,175.0,85.0,485714.285714
...,...,...,...,...,...,...
24959,Hà Nội,Quận Cầu Giấy,Phường Trung Hòa,90.0,65.0,722222.222222
24960,Hồ Chí Minh,Quận 1,Phường Tân Định,520.0,100.0,192307.692308
24961,Hà Nội,Quận Hoàn Kiếm,Phường Hàng Bồ,50.0,35.0,700000.000000
24963,Hồ Chí Minh,Quận Tân Phú,Phường Tân Sơn Nhì,100.0,22.0,220000.000000


In [39]:
# merged_df = merged_df.drop_duplicates()
crawl.to_csv(r"D:\OneDrive\KiotViet\Python_for_work\KFinance\sondn_kfinance\Model_Prediction_RentalCost\CSV_crawl_data\all_bds_final.csv", index=False, mode = 'w', encoding="utf-8-sig")

# 2. Check data 

In [40]:
df = pd.read_csv(r"D:\OneDrive\KiotViet\Python_for_work\KFinance\sondn_kfinance\Model_Prediction_RentalCost\CSV_crawl_data\all_bds_final.csv")
# df.head()
print(f"The total rows of dataframe is {len(df)}")

The total rows of dataframe is 15061


In [41]:
df.head(1)

Unnamed: 0,City,District,Ward,area_value,price_value_trieu,price_per_m2_final
0,Trà Vinh,Trà Vinh,Phường 4,24.0,9.0,375000.0


In [42]:
pattern_city = r'(?i)(Thành phố|Tỉnh)\s+'
df['City'] = df['City'].str.replace(pattern_city, '', regex=True)
df['City'] = df['City'].str.normalize('NFC')  # Chuẩn hóa Unicode NFC
df['City'] = df['City'].replace('Hà Nội', 'Hà Nội', regex=True)
df['City'] = df['City'].replace('Bà Rịa-Vũng Tàu', 'Bà Rịa Vũng Tàu', regex=True)
df['City'] = df['City'].replace('Thừa Thiên-Huế', 'Thừa Thiên Huế', regex=True)
df['City']


0           Trà Vinh
1             Hà Nội
2             Hà Nội
3             Hà Nội
4             Hà Nội
            ...     
15056         Hà Nội
15057    Hồ Chí Minh
15058         Hà Nội
15059    Hồ Chí Minh
15060         Hà Nội
Name: City, Length: 15061, dtype: object

In [43]:
# Tìm kiếm các giá trị có chứa số cột District
has_number = df['District'].str.contains('\d')
df['District'] = df.apply(lambda row: re.sub(r'(?i)(Quận|Huyện|Thị Xã|Thị Trấn)\s+', '', row['District']).strip() if not has_number[row.name] else row['District'], axis=1)
df['District'] = df['District'].replace('Phan Rang - Tháp Chàm', 'Phan Rang-Tháp Chàm', regex=True)
df['District']


0           Trà Vinh
1           Cầu Giấy
2        Bắc Từ Liêm
3        Nam Từ Liêm
4         Thanh Xuân
            ...     
15056       Cầu Giấy
15057         Quận 1
15058      Hoàn Kiếm
15059        Tân Phú
15060      Hoàng Mai
Name: District, Length: 15061, dtype: object

In [44]:
# Tìm kiếm các giá trị có chứa số cột District
has_number_ward = df['Ward'].str.contains('\d')
df['Ward'] = df.apply(lambda row: re.sub(r'(?i)(Xã|Phường)\s+', '', row['Ward']).strip() if not has_number_ward[row.name] else row['Ward'], axis=1)
df['Ward']


0                Phường 4
1           Dịch Vọng Hậu
2        Phường Cổ Nhuế 1
3        Phường Mỹ Đình 1
4              Nhân Chính
               ...       
15056           Trung Hòa
15057            Tân Định
15058             Hàng Bồ
15059         Tân Sơn Nhì
15060                 NaN
Name: Ward, Length: 15061, dtype: object

In [45]:
# pattern_street = r'(?i)(Đường|Phố|Quốc Lộ|QL.)\s+'
# pattern_street2 = r'.*(Đường|Phố|Quốc Lộ|QL\.)\s+'
# df['Street'] = df['Street'].str.replace(pattern_street2, '', regex=True)
# df['Street'] = df['Street'].str.replace(pattern_street, '', regex=True)
# df['Street'] = df['Street'].replace('Mâu Thân', 'Mậu Thân', regex=True)
# df['Street'] = df['Street'].str.replace(r'uỳ', 'ùy', regex=True)
# df['Street'] = df['Street'].str.replace(r'uý', 'úy', regex=True)
# df['Street'] = df['Street'].str.replace(r'uỹ', 'úy', regex=True)
# df['Street'] = df['Street'].str.replace(r'uỵ', 'ụy', regex=True)
# df['Street'] = df['Street'].str.replace(r'uỷ', 'ủy', regex=True)
# df['Street']

In [46]:
df = df.dropna(subset=['price_per_m2_final','City','District'])
df.__len__()

15004

In [47]:
# df_center = df.loc[(df['City']=="Hà Nội") | (df['City'] == "Hồ Chí Minh")]
# df_center.__len__()

# 3. Model - test

In [48]:
final = df.dropna()
final.__len__()

8087

In [49]:
final.to_csv(r"D:\OneDrive\KiotViet\Python_for_work\KFinance\sondn_kfinance\Model_Prediction_RentalCost\CSV_crawl_data\all_bds_final_clean.csv", index=False, mode = 'w', encoding="utf-8-sig")

In [50]:
final = pd.read_csv(r"D:\OneDrive\KiotViet\Python_for_work\KFinance\sondn_kfinance\Model_Prediction_RentalCost\CSV_crawl_data\all_bds_final_clean.csv")


In [51]:
print(final.columns)
# df = df[["area_value", "price_per_m2_final", "district", "city", "Type"]]

Index(['City', 'District', 'Ward', 'area_value', 'price_value_trieu',
       'price_per_m2_final'],
      dtype='object')


In [52]:
# df_street = df_street.dropna(subset=['price_per_m2_final','City','District','price_value_trieu'])

In [53]:
final['area_value'].describe()

count      8087.000000
mean        676.495207
std        9075.234707
min           1.000000
25%          80.000000
50%         145.000000
75%         330.000000
max      550000.000000
Name: area_value, dtype: float64

In [54]:
# df_street = df_street[(df_street['area_value'] > 10) & (df_street['area_value'] < 400)]

In [55]:
# df_street['price_per_m2_final'].describe()

In [56]:
# df_street = df_street[(df_street['price_per_m2_final'] > 1000) & (df_street['price_per_m2_final'] < 10000000)]

In [57]:
# df_street.info()

In [58]:
X = final.drop('price_per_m2_final', axis=1)
y = final['price_per_m2_final']

# X2 = df.drop('price_per_m2_final', axis=1)
# y2 = df['price_per_m2_final']

# X3 = df_center.drop('price_per_m2_final', axis=1)
# y3 = df_center['price_per_m2_final']

# X4 = df_street.drop('price_value_trieu', axis=1)
# y4 = df_street['price_value_trieu']


In [59]:
# best_random_state = None
# best_score = float('-inf')

# # Repeat with different random_state values
# for random_state in range(100):
#     # Split the data
#     X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, random_state=random_state)
    
#     # Define preprocessing steps
#     numeric_features = ['area_value']
#     categorical_features = ['City', 'District', 'Ward','Street']
#     # categorical_features = ['City', 'District', 'Ward']
#     numeric_transformer = Pipeline(steps=[('scaler', RobustScaler())])
#     categorical_transformer = Pipeline(steps=[('onehot', OneHotEncoder(handle_unknown='ignore'))])
#     preprocessor = ColumnTransformer(transformers=[('num', numeric_transformer, numeric_features),
#                                                    ('cat', categorical_transformer, categorical_features)])
    
#     # Build the pipeline
#     model = Pipeline(steps=[('preprocessor', preprocessor),
#                             ('regressor', LinearRegression())])
    
#     # Train the model
#     model.fit(X_train, y_train)
    
#     # Evaluate performance
#     y_pred = model.predict(X_test)
#     score = r2_score(y_test, y_pred)

#     # Update best random_state if needed
#     if score > best_score:
#         best_score = score
#         best_random_state = random_state
#     mse = mean_squared_error(y_test, y_pred)
#     rmse = mse ** 0.5

# print("Best random_state:", best_random_state)
# print("Best R-squared score:", best_score)
# print("Best RMSE score:", rmse)

In [60]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25,random_state=21)

#Tranform lại data 
numeric_features = ['area_value']
categorical_features = ['City', 'District', 'Ward']

numeric_transformer = Pipeline(steps=[('scaler', RobustScaler())])
categorical_transformer = Pipeline(steps=[('onehot', OneHotEncoder(handle_unknown='ignore'))])

preprocessor = ColumnTransformer(
    transformers=[
        ('num', numeric_transformer, numeric_features),
        ('cat', categorical_transformer, categorical_features)])


In [61]:
# from sklearn.model_selection import RepeatedKFold
# from sklearn.model_selection import cross_val_score
# # list các mô hình được lựa chọn
# models = [LinearRegression(), RandomForestRegressor(), DecisionTreeRegressor(), XGBRegressor(), GradientBoostingRegressor()]


# # Xác định KFold
# cv = RepeatedKFold(n_splits=10, n_repeats=3, random_state=7)

# all_scores = []
# # Đánh giá toàn bộ các mô hình trên tập K-Fold đã chia
# for check in models:
#   completed_pl = Pipeline(
#     steps=[("preprocessor", preprocessor), ("regressor", check)]
#   )

#   mse_scores = -cross_val_score(completed_pl, X_train, y_train, cv=cv, n_jobs=-1,scoring='neg_mean_squared_error')
#   # mse = mean_squared_error(y_test, y_pred)
#   rmse = np.sqrt(mse_scores)
#   all_scores.append(rmse)

In [62]:
# # Draw bboxplot 
# model_names = ['LinearRegression', 'RandomForestRegressor', 'DecisionTreeRegressor', 'XGBRegressor', 'GradientBoostingRegressor']

# plt.figure(figsize=(8, 4))
# plt.boxplot(all_scores)
# plt.xlabel('Model', fontsize=8)
# plt.ylabel('RMSE', fontsize=8)
# plt.xticks(np.arange(len(model_names))+1, model_names, rotation=90, fontsize=8)
# plt.title("Scores Metrics", fontsize=18)

In [63]:
# # Xây dựng pipeline với mô hình Regression
# model = Pipeline(steps=[('preprocessor', preprocessor),
#                         ('regressor', RandomForestRegressor())])

# model.fit(X_train, y_train)

# score = model.score(X_test, y_test)
# y_pred = model.predict(X_test)
# mse = mean_squared_error(y_test, y_pred)
# rmse = mse ** 0.5

# print(f'R^2 Score: {score}')
# print("RMSE score:", rmse)

In [64]:
from sklearn.model_selection import GridSearchCV

# Define pipeline
model = Pipeline(steps=[('preprocessor', preprocessor),
                        ('regressor', RandomForestRegressor())])

# Define param grid
param_grid = {
    'regressor__n_estimators': [100, 200], #Số cây trong mô hình randomforest
    'regressor__max_depth': [None, 10, 20], #Độ sâu mỗi cây
    'regressor__min_samples_split': [2, 20], #Số lượng mẫu tối thiểu trong 1 nút cây
    'regressor__min_samples_leaf': [1, 2] #Số lượng mẫu tối thiểu mà một lá cây phải có
    }

# Initialize GridSearchCV
grid_search = GridSearchCV(estimator=model, param_grid=param_grid, cv=5, scoring='neg_mean_squared_error', verbose=2, n_jobs=-1)

# Fit GridSearchCV on training data
grid_search.fit(X_train, y_train)


# Print best parameters and score
print("Best Parameters:", grid_search.best_params_)

# Use best model to predict on test set
best_model = grid_search.best_estimator_
score = best_model.score(X_test, y_test) 
y_pred = best_model.predict(X_test)
mse = mean_squared_error(y_test, y_pred)
rmse = mse ** 0.5

print(f'R^2 Score: {score}')
print("RMSE score:", rmse)

Fitting 5 folds for each of 24 candidates, totalling 120 fits
Best Parameters: {'regressor__max_depth': 10, 'regressor__min_samples_leaf': 1, 'regressor__min_samples_split': 20, 'regressor__n_estimators': 100}
R^2 Score: 0.304596601032032
RMSE score: 41918422.908080615


In [65]:
model

In [66]:
#Kiểm tra mô hình bằng k-fold cross valid
from sklearn.model_selection import cross_val_score, KFold

# Define the KFold cross-validation strategy
kf = KFold(n_splits=5, shuffle=True, random_state=21)

# Perform cross-validation on the model
cv_scores = cross_val_score(best_model, X_train, y_train, cv=kf)

# Calculate the mean and standard deviation of the cross-validation scores
mean_cv_score = np.mean(cv_scores)
std_cv_score = np.std(cv_scores)

print("Mean Cross-Validation Score:", mean_cv_score)
print("Standard Deviation of Cross-Validation Scores:", std_cv_score)

Mean Cross-Validation Score: 0.10578400083798707
Standard Deviation of Cross-Validation Scores: 0.5402720256524907


In [67]:
import pickle

# Lưu mô hình vào file
model_filename_pickle = r'D:\OneDrive\KiotViet\Python_for_work\KFinance\sondn_kfinance\Model_Prediction_RentalCost\model\RandomForestRegressor_rental_cost.pkl'
with open(model_filename_pickle, 'wb') as file:
    pickle.dump(best_model, file)

# Để sử dụng mô hình từ file pickle
with open(model_filename_pickle, 'rb') as file:
    loaded_model_pickle = pickle.load(file)


In [74]:
new_data_1 = {
            'area_value': [20],
            'City': ['Hà Nội'],
            'District': ['Bắc Từ Liêm'],
            'Ward': ['Liên Mạc']
            }

new_df_1 = pd.DataFrame(new_data_1)

new_prediction_pickle = loaded_model_pickle.predict(new_df_1)
print(f'Predicted Price per m2 using loaded pickle model: {new_prediction_pickle}')

Predicted Price per m2 using loaded pickle model: [554804.12245137]


In [69]:
from unidecode import unidecode

In [76]:
train_groupby = X_train[['City', 'District', 'Ward','area_value']]
train_groupby_no_area = train_groupby.rename(columns={'area_value': 'median_area'})
train_groupby_no_area = train_groupby_no_area.groupby(['City', 'District', 'Ward']).median().reset_index()
train_groupby_no_area['Concatenated'] = train_groupby_no_area[['City', 'District', 'Ward']].apply(
    lambda row: ''.join(unidecode(str(val)).replace(' ', '_') for val in row), axis=1
)
train_groupby_no_area

Unnamed: 0,City,District,Ward,median_area,Concatenated
0,Ahmedabad,Ahmedabad,Park Hill,100.0,AhmedabadAhmedabadPark_Hill
1,An Giang,Châu Phú,Bình Mỹ,100.0,An_GiangChau_PhuBinh_My
2,An Giang,Long Xuyên,Mỹ Bình,3493.0,An_GiangLong_XuyenMy_Binh
3,An Giang,Long Xuyên,Mỹ Hòa,60.0,An_GiangLong_XuyenMy_Hoa
4,An Giang,Long Xuyên,Mỹ Xuyên,80.0,An_GiangLong_XuyenMy_Xuyen
...,...,...,...,...,...
850,Đồng Nai,Trảng Bom,Xã Hố Nai 3,4000.0,Dong_NaiTrang_BomXa_Ho_Nai_3
851,Đồng Nai,Trảng Bom,Đông Hòa,8250.0,Dong_NaiTrang_BomDong_Hoa
852,Đồng Nai,Vĩnh Cửu,Thiện Tân,1454.0,Dong_NaiVinh_CuuThien_Tan
853,Đồng Nai,Vĩnh Cửu,Thạnh Phú,247.0,Dong_NaiVinh_CuuThanh_Phu


In [77]:
train_groupby_percentiles = train_groupby.groupby(['City', 'District', 'Ward']).describe().reset_index()
train_groupby_percentiles['Concatenated'] = train_groupby_percentiles[['City', 'District', 'Ward']].apply(
    lambda row: ''.join(unidecode(str(val)).replace(' ', '_') for val in row), axis=1
)
# train_groupby_percentiles['Concatenated'] = train_groupby_percentiles['Concatenated'].str.replace(' ','')
train_groupby_percentiles

Unnamed: 0_level_0,City,District,Ward,area_value,area_value,area_value,area_value,area_value,area_value,area_value,area_value,Concatenated
Unnamed: 0_level_1,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,count,mean,std,min,25%,50%,75%,max,Unnamed: 12_level_1
0,Ahmedabad,Ahmedabad,Park Hill,1.0,100.000000,,100.0,100.0,100.0,100.0,100.0,AhmedabadAhmedabadPark_Hill
1,An Giang,Châu Phú,Bình Mỹ,1.0,100.000000,,100.0,100.0,100.0,100.0,100.0,An_GiangChau_PhuBinh_My
2,An Giang,Long Xuyên,Mỹ Bình,1.0,3493.000000,,3493.0,3493.0,3493.0,3493.0,3493.0,An_GiangLong_XuyenMy_Binh
3,An Giang,Long Xuyên,Mỹ Hòa,1.0,60.000000,,60.0,60.0,60.0,60.0,60.0,An_GiangLong_XuyenMy_Hoa
4,An Giang,Long Xuyên,Mỹ Xuyên,1.0,80.000000,,80.0,80.0,80.0,80.0,80.0,An_GiangLong_XuyenMy_Xuyen
...,...,...,...,...,...,...,...,...,...,...,...,...
850,Đồng Nai,Trảng Bom,Xã Hố Nai 3,7.0,4942.857143,3688.657002,1000.0,2500.0,4000.0,7300.0,10000.0,Dong_NaiTrang_BomXa_Ho_Nai_3
851,Đồng Nai,Trảng Bom,Đông Hòa,2.0,8250.000000,8131.727984,2500.0,5375.0,8250.0,11125.0,14000.0,Dong_NaiTrang_BomDong_Hoa
852,Đồng Nai,Vĩnh Cửu,Thiện Tân,2.0,1454.000000,1479.267386,408.0,931.0,1454.0,1977.0,2500.0,Dong_NaiVinh_CuuThien_Tan
853,Đồng Nai,Vĩnh Cửu,Thạnh Phú,1.0,247.000000,,247.0,247.0,247.0,247.0,247.0,Dong_NaiVinh_CuuThanh_Phu


In [79]:

# Chọn chỉ mục cho các percentiles 20 và 80
percentiles_from = train_groupby_percentiles.xs('min', level=1, axis=1).rename(columns={'area_value': 'area_from'})
percentiles_upto = train_groupby_percentiles.xs('75%', level=1, axis=1).rename(columns={'area_value': 'area_upto'})

# Gộp các percentiles vào một DataFrame mới
percentiles_df = pd.concat([percentiles_from, percentiles_upto,train_groupby_percentiles[['Concatenated']]], axis=1)
percentiles_df.columns = ['_'.join(col).strip() if isinstance(col, tuple) else col.strip() for col in percentiles_df.columns.values]
percentiles_df.columns = [col[:-1] if col.endswith('_') and col != 'Concatenated' else col for col in percentiles_df.columns]

# Hiển thị DataFrame mới
percentiles_df

Unnamed: 0,area_from,area_upto,Concatenated
0,100.0,100.0,AhmedabadAhmedabadPark_Hill
1,100.0,100.0,An_GiangChau_PhuBinh_My
2,3493.0,3493.0,An_GiangLong_XuyenMy_Binh
3,60.0,60.0,An_GiangLong_XuyenMy_Hoa
4,80.0,80.0,An_GiangLong_XuyenMy_Xuyen
...,...,...,...
850,1000.0,7300.0,Dong_NaiTrang_BomXa_Ho_Nai_3
851,2500.0,11125.0,Dong_NaiTrang_BomDong_Hoa
852,408.0,1977.0,Dong_NaiVinh_CuuThien_Tan
853,247.0,247.0,Dong_NaiVinh_CuuThanh_Phu


In [80]:
train_groupby_percentiles = pd.merge(train_groupby_no_area,percentiles_df, on='Concatenated')
train_groupby_percentiles = train_groupby_percentiles[['City','District','Ward','area_from','area_upto','median_area','Concatenated']]
train_groupby_percentiles.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 855 entries, 0 to 854
Data columns (total 7 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   City          855 non-null    object 
 1   District      855 non-null    object 
 2   Ward          855 non-null    object 
 3   area_from     855 non-null    float64
 4   area_upto     855 non-null    float64
 5   median_area   855 non-null    float64
 6   Concatenated  855 non-null    object 
dtypes: float64(3), object(4)
memory usage: 46.9+ KB


In [81]:
# Tạo DataFrame mới với các cột 'City', 'District', 'Ward', 'Street' từ train_groupby_percentiles
train_df_subset = train_groupby_percentiles[['City','District','Ward','area_from','area_upto','median_area','Concatenated']].copy()

# Dự đoán giá trị cho area_value min
train_df_subset['area_value'] = train_df_subset['area_from']
predicted_prices_from = best_model.predict(train_df_subset)
train_df_subset['price_from'] = predicted_prices_from  * train_df_subset['area_from'] / 1000000

# Dự đoán giá trị cho area_value max
train_df_subset['area_value'] = train_df_subset['area_upto']
predicted_prices_upto = best_model.predict(train_df_subset)
train_df_subset['price_upto'] = predicted_prices_upto * train_df_subset['area_upto'] / 1000000

#Tính sai số
train_df_subset['price_range'] = rmse * train_df_subset['area_upto']/1000000

# Kiểm tra nếu area_from = area_upto
train_df_subset.loc[train_df_subset['area_from'] == train_df_subset['area_upto'], 'price_upto'] += train_df_subset['price_range']


# Hiển thị DataFrame mới
train_df_subset


Unnamed: 0,City,District,Ward,area_from,area_upto,median_area,Concatenated,area_value,price_from,price_upto,price_range
0,Ahmedabad,Ahmedabad,Park Hill,100.0,100.0,100.0,AhmedabadAhmedabadPark_Hill,100.0,44.451724,4236.294015,4191.842291
1,An Giang,Châu Phú,Bình Mỹ,100.0,100.0,100.0,An_GiangChau_PhuBinh_My,100.0,44.451724,4236.294015,4191.842291
2,An Giang,Long Xuyên,Mỹ Bình,3493.0,3493.0,3493.0,An_GiangLong_XuyenMy_Binh,3493.0,1261.256955,147682.308173,146421.051218
3,An Giang,Long Xuyên,Mỹ Hòa,60.0,60.0,60.0,An_GiangLong_XuyenMy_Hoa,60.0,26.671035,2541.776409,2515.105374
4,An Giang,Long Xuyên,Mỹ Xuyên,80.0,80.0,80.0,An_GiangLong_XuyenMy_Xuyen,80.0,35.561379,3389.035212,3353.473833
...,...,...,...,...,...,...,...,...,...,...,...
850,Đồng Nai,Trảng Bom,Xã Hố Nai 3,1000.0,7300.0,4000.0,Dong_NaiTrang_BomXa_Ho_Nai_3,7300.0,361.081293,2635.893436,306004.487229
851,Đồng Nai,Trảng Bom,Đông Hòa,2500.0,11125.0,8250.0,Dong_NaiTrang_BomDong_Hoa,11125.0,902.703232,4017.029381,466342.454852
852,Đồng Nai,Vĩnh Cửu,Thiện Tân,408.0,1977.0,1454.0,Dong_NaiVinh_CuuThien_Tan,1977.0,168.458097,713.857716,82872.722089
853,Đồng Nai,Vĩnh Cửu,Thạnh Phú,247.0,247.0,247.0,Dong_NaiVinh_CuuThanh_Phu,247.0,109.795759,10463.646217,10353.850458


In [82]:
test = train_df_subset[train_df_subset['Concatenated'] == 'Ho_Chi_MinhBinh_ThanhPhuong_13Dang_Thuy_Tram']
test

Unnamed: 0,City,District,Ward,area_from,area_upto,median_area,Concatenated,area_value,price_from,price_upto,price_range


In [None]:
# train_df_subset['price_from'] =train_df_subset['Predicted Price (area_from)']
# train_df_subset['price_upto'] =train_df_subset['Predicted Price (area_upto)']

In [None]:
final_prediction = train_df_subset[['City',	'District',	'Ward',	'Street','price_from',	'price_upto','median_area','Concatenated']]

In [None]:
# Kiểm tra và điều chỉnh giá trị của price_from và price_upto
final_prediction.loc[final_prediction['price_from'] > final_prediction['price_upto'], ['price_from', 'price_upto']] = final_prediction.loc[final_prediction['price_from'] > final_prediction['price_upto'], ['price_upto', 'price_from']]

# # Hiển thị DataFrame mới
# print(final_prediction)

In [None]:
final_prediction.to_csv(r"D:\OneDrive\KiotViet\Python_for_work\KFinance\sondn_kfinance\Model_Prediction_RentalCost\CSV_crawl_data\final_prediction_bds.csv", index=False, mode = 'w', encoding="utf-8-sig")