# 1. Merge all files

In [2]:
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import numpy as np

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler,MinMaxScaler,PowerTransformer,LabelEncoder,OneHotEncoder,RobustScaler
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error,r2_score
from sklearn.ensemble import RandomForestRegressor,GradientBoostingRegressor
from sklearn.tree import DecisionTreeRegressor

from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import accuracy_score
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline

# from xgboost import XGBRegressor
from datetime import datetime, timedelta
import os
import re


In [3]:
folder_path = "D:\OneDrive\KiotViet\Python_for_work\KFinance\sondn_kfinance\Model_Prediction_RentalCost\CSV_crawl_data"
file_list = os.listdir(folder_path)

merged_df = pd.DataFrame()

# Loop qua từng file CSV và merge vào DataFrame
for file in file_list:
    if "final_bds_" in file:
        file_path = os.path.join(folder_path, file)
        df = pd.read_csv(file_path)
        merged_df = pd.concat([merged_df, df], ignore_index=True)

print(len(merged_df))

26061


In [4]:
# merged_df = merged_df.drop_duplicates()
merged_df.to_csv(r"D:\OneDrive\KiotViet\Python_for_work\KFinance\sondn_kfinance\Model_Prediction_RentalCost\CSV_crawl_data\all_bds_final.csv", index=False, mode = 'w', encoding="utf-8-sig")

# 2. Check data 

In [5]:
df = pd.read_csv(r"D:\OneDrive\KiotViet\Python_for_work\KFinance\sondn_kfinance\Model_Prediction_RentalCost\CSV_crawl_data\all_bds_final.csv")
# df.head()
print(f"The total rows of dataframe is {len(df)}")

The total rows of dataframe is 26061


In [6]:
df.head(1)

Unnamed: 0,City,District,Ward,Street,address,area_value,price_value_trieu,price_per_m2_final
0,Trà Vinh,Trà Vinh,Phường 4,Đường Chu Văn An,"Đường Chu Văn An, Phường 4, Thành phố Trà Vinh...",24.0,9.0,375000.0


In [7]:
pattern_city = r'(?i)(Thành phố|Tỉnh)\s+'
df['City'] = df['City'].str.replace(pattern_city, '', regex=True)
df['City'] = df['City'].str.normalize('NFC')  # Chuẩn hóa Unicode NFC
df['City'] = df['City'].replace('Hà Nội', 'Hà Nội', regex=True)
df['City'] = df['City'].replace('Bà Rịa-Vũng Tàu', 'Bà Rịa Vũng Tàu', regex=True)
df['City'] = df['City'].replace('Thừa Thiên-Huế', 'Thừa Thiên Huế', regex=True)
df['City']


0           Trà Vinh
1             Hà Nội
2             Hà Nội
3             Hà Nội
4             Hà Nội
            ...     
26056    Hồ Chí Minh
26057    Hồ Chí Minh
26058    Hồ Chí Minh
26059         Hà Nội
26060         Hà Nội
Name: City, Length: 26061, dtype: object

In [8]:
# Tìm kiếm các giá trị có chứa số cột District
has_number = df['District'].str.contains('\d')
df['District'] = df.apply(lambda row: re.sub(r'(?i)(Quận|Huyện|Thị Xã|Thị Trấn)\s+', '', row['District']).strip() if not has_number[row.name] else row['District'], axis=1)
df['District'] = df['District'].replace('Phan Rang - Tháp Chàm', 'Phan Rang-Tháp Chàm', regex=True)
df['District']


0            Trà Vinh
1            Cầu Giấy
2         Bắc Từ Liêm
3         Nam Từ Liêm
4          Thanh Xuân
             ...     
26056         Thủ Đức
26057        Tân Bình
26058        Tân Bình
26059    Hai Bà Trưng
26060        Cầu Giấy
Name: District, Length: 26061, dtype: object

In [9]:
# Tìm kiếm các giá trị có chứa số cột District
has_number_ward = df['Ward'].str.contains('\d')
df['Ward'] = df.apply(lambda row: re.sub(r'(?i)(Xã|Phường)\s+', '', row['Ward']).strip() if not has_number_ward[row.name] else row['Ward'], axis=1)
df['Ward']


0                Phường 4
1           Dịch Vọng Hậu
2        Phường Cổ Nhuế 1
3        Phường Mỹ Đình 1
4              Nhân Chính
               ...       
26056         Long Trường
26057            Phường 4
26058           Phường 13
26059            Vĩnh Tuy
26060           Dịch Vọng
Name: Ward, Length: 26061, dtype: object

In [10]:
pattern_street = r'(?i)(Đường|Phố|Quốc Lộ|QL.)\s+'
pattern_street2 = r'.*(Đường|Phố|Quốc Lộ|QL\.)\s+'
df['Street'] = df['Street'].str.replace(pattern_street2, '', regex=True)
df['Street'] = df['Street'].str.replace(pattern_street, '', regex=True)
df['Street'] = df['Street'].replace('Mâu Thân', 'Mậu Thân', regex=True)
df['Street'] = df['Street'].str.replace(r'uỳ', 'ùy', regex=True)
df['Street'] = df['Street'].str.replace(r'uý', 'úy', regex=True)
df['Street'] = df['Street'].str.replace(r'uỹ', 'úy', regex=True)
df['Street'] = df['Street'].str.replace(r'uỵ', 'ụy', regex=True)
df['Street'] = df['Street'].str.replace(r'uỷ', 'ủy', regex=True)
df['Street']

0            Chu Văn An
1             Xuân Thủy
2         Phạm Văn Đồng
3                   NaN
4        Hoàng Đạo Thúy
              ...      
26056        Trường Lưu
26057           Út Tịch
26058            Ấp Bắc
26059         Lạc Trung
26060               NaN
Name: Street, Length: 26061, dtype: object

In [11]:
df = df.dropna(subset=['price_per_m2_final','City','District'])
df.__len__()

25952

In [12]:
# df_center = df.loc[(df['City']=="Hà Nội") | (df['City'] == "Hồ Chí Minh")]
# df_center.__len__()

# 3. Model - test

In [13]:
df_street = df.dropna()
df_street.__len__()

16486

In [14]:
df_street.to_csv(r"D:\OneDrive\KiotViet\Python_for_work\KFinance\sondn_kfinance\Model_Prediction_RentalCost\CSV_crawl_data\all_bds_final_clean.csv", index=False, mode = 'w', encoding="utf-8-sig")

In [15]:
df_street = pd.read_csv(r"D:\OneDrive\KiotViet\Python_for_work\KFinance\sondn_kfinance\Model_Prediction_RentalCost\CSV_crawl_data\all_bds_final_clean.csv")


In [16]:
print(df_street.columns)
# df = df[["area_value", "price_per_m2_final", "district", "city", "Type"]]

Index(['City', 'District', 'Ward', 'Street', 'address', 'area_value',
       'price_value_trieu', 'price_per_m2_final'],
      dtype='object')


In [17]:
# df_street = df_street.dropna(subset=['price_per_m2_final','City','District','price_value_trieu'])

In [18]:
df_street['area_value'].describe()

count     16486.000000
mean        426.620614
std        4963.234328
min           1.000000
25%          70.000000
50%         142.000000
75%         320.000000
max      550000.000000
Name: area_value, dtype: float64

In [19]:
df_street = df_street[(df_street['area_value'] > 10) & (df_street['area_value'] < 400)]

In [20]:
df_street['price_per_m2_final'].describe()

count    1.258500e+04
mean     1.192820e+06
std      5.529018e+07
min      8.333333e+03
25%      1.923077e+05
50%      3.500000e+05
75%      5.200000e+05
max      6.000000e+09
Name: price_per_m2_final, dtype: float64

In [21]:
df_street = df_street[(df_street['price_per_m2_final'] > 1000) & (df_street['price_per_m2_final'] < 10000000)]

In [22]:
df_street.info()

<class 'pandas.core.frame.DataFrame'>
Index: 12567 entries, 0 to 16484
Data columns (total 8 columns):
 #   Column              Non-Null Count  Dtype  
---  ------              --------------  -----  
 0   City                12567 non-null  object 
 1   District            12567 non-null  object 
 2   Ward                12567 non-null  object 
 3   Street              12567 non-null  object 
 4   address             12567 non-null  object 
 5   area_value          12567 non-null  float64
 6   price_value_trieu   12567 non-null  float64
 7   price_per_m2_final  12567 non-null  float64
dtypes: float64(3), object(5)
memory usage: 883.6+ KB


In [23]:
X = df_street.drop('price_per_m2_final', axis=1)
y = df_street['price_per_m2_final']

# X2 = df.drop('price_per_m2_final', axis=1)
# y2 = df['price_per_m2_final']

# X3 = df_center.drop('price_per_m2_final', axis=1)
# y3 = df_center['price_per_m2_final']

# X4 = df_street.drop('price_value_trieu', axis=1)
# y4 = df_street['price_value_trieu']


In [24]:
# best_random_state = None
# best_score = float('-inf')

# # Repeat with different random_state values
# for random_state in range(100):
#     # Split the data
#     X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, random_state=random_state)
    
#     # Define preprocessing steps
#     numeric_features = ['area_value']
#     categorical_features = ['City', 'District', 'Ward','Street']
#     # categorical_features = ['City', 'District', 'Ward']
#     numeric_transformer = Pipeline(steps=[('scaler', RobustScaler())])
#     categorical_transformer = Pipeline(steps=[('onehot', OneHotEncoder(handle_unknown='ignore'))])
#     preprocessor = ColumnTransformer(transformers=[('num', numeric_transformer, numeric_features),
#                                                    ('cat', categorical_transformer, categorical_features)])
    
#     # Build the pipeline
#     model = Pipeline(steps=[('preprocessor', preprocessor),
#                             ('regressor', LinearRegression())])
    
#     # Train the model
#     model.fit(X_train, y_train)
    
#     # Evaluate performance
#     y_pred = model.predict(X_test)
#     score = r2_score(y_test, y_pred)

#     # Update best random_state if needed
#     if score > best_score:
#         best_score = score
#         best_random_state = random_state
#     mse = mean_squared_error(y_test, y_pred)
#     rmse = mse ** 0.5

# print("Best random_state:", best_random_state)
# print("Best R-squared score:", best_score)
# print("Best RMSE score:", rmse)

In [25]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25,random_state=21)

#Tranform lại data 
numeric_features = ['area_value']
categorical_features = ['City', 'District', 'Ward','Street']

numeric_transformer = Pipeline(steps=[('scaler', RobustScaler())])
categorical_transformer = Pipeline(steps=[('onehot', OneHotEncoder(handle_unknown='ignore'))])

preprocessor = ColumnTransformer(
    transformers=[
        ('num', numeric_transformer, numeric_features),
        ('cat', categorical_transformer, categorical_features)])


In [26]:
# from sklearn.model_selection import RepeatedKFold
# from sklearn.model_selection import cross_val_score
# # list các mô hình được lựa chọn
# models = [LinearRegression(), RandomForestRegressor(), DecisionTreeRegressor(), XGBRegressor(), GradientBoostingRegressor()]


# # Xác định KFold
# cv = RepeatedKFold(n_splits=10, n_repeats=3, random_state=7)

# all_scores = []
# # Đánh giá toàn bộ các mô hình trên tập K-Fold đã chia
# for check in models:
#   completed_pl = Pipeline(
#     steps=[("preprocessor", preprocessor), ("regressor", check)]
#   )

#   mse_scores = -cross_val_score(completed_pl, X_train, y_train, cv=cv, n_jobs=-1,scoring='neg_mean_squared_error')
#   # mse = mean_squared_error(y_test, y_pred)
#   rmse = np.sqrt(mse_scores)
#   all_scores.append(rmse)

In [27]:
# # Draw bboxplot 
# model_names = ['LinearRegression', 'RandomForestRegressor', 'DecisionTreeRegressor', 'XGBRegressor', 'GradientBoostingRegressor']

# plt.figure(figsize=(8, 4))
# plt.boxplot(all_scores)
# plt.xlabel('Model', fontsize=8)
# plt.ylabel('RMSE', fontsize=8)
# plt.xticks(np.arange(len(model_names))+1, model_names, rotation=90, fontsize=8)
# plt.title("Scores Metrics", fontsize=18)

In [28]:
# # Xây dựng pipeline với mô hình Regression
# model = Pipeline(steps=[('preprocessor', preprocessor),
#                         ('regressor', RandomForestRegressor())])

# model.fit(X_train, y_train)

# score = model.score(X_test, y_test)
# y_pred = model.predict(X_test)
# mse = mean_squared_error(y_test, y_pred)
# rmse = mse ** 0.5

# print(f'R^2 Score: {score}')
# print("RMSE score:", rmse)

In [29]:
from sklearn.model_selection import GridSearchCV

# Define pipeline
model = Pipeline(steps=[('preprocessor', preprocessor),
                        ('regressor', RandomForestRegressor())])

# Define param grid
param_grid = {
    'regressor__n_estimators': [100, 200], #Số cây trong mô hình randomforest
    'regressor__max_depth': [None, 10, 20], #Độ sâu mỗi cây
    'regressor__min_samples_split': [2, 20], #Số lượng mẫu tối thiểu trong 1 nút cây
    'regressor__min_samples_leaf': [1, 2] #Số lượng mẫu tối thiểu mà một lá cây phải có
    }

# Initialize GridSearchCV
grid_search = GridSearchCV(estimator=model, param_grid=param_grid, cv=5, scoring='neg_mean_squared_error', verbose=2, n_jobs=-1)

# Fit GridSearchCV on training data
grid_search.fit(X_train, y_train)


# Print best parameters and score
print("Best Parameters:", grid_search.best_params_)

# Use best model to predict on test set
best_model = grid_search.best_estimator_
score = best_model.score(X_test, y_test) 
y_pred = best_model.predict(X_test)
mse = mean_squared_error(y_test, y_pred)
rmse = mse ** 0.5

print(f'R^2 Score: {score}')
print("RMSE score:", rmse)

Fitting 5 folds for each of 24 candidates, totalling 120 fits
Best Parameters: {'regressor__max_depth': None, 'regressor__min_samples_leaf': 1, 'regressor__min_samples_split': 20, 'regressor__n_estimators': 200}
R^2 Score: 0.5472841807693252
RMSE score: 251318.90384228269


In [30]:
model

In [31]:
#Kiểm tra mô hình bằng k-fold cross valid
from sklearn.model_selection import cross_val_score, KFold

# Define the KFold cross-validation strategy
kf = KFold(n_splits=5, shuffle=True, random_state=21)

# Perform cross-validation on the model
cv_scores = cross_val_score(best_model, X_train, y_train, cv=kf)

# Calculate the mean and standard deviation of the cross-validation scores
mean_cv_score = np.mean(cv_scores)
std_cv_score = np.std(cv_scores)

print("Mean Cross-Validation Score:", mean_cv_score)
print("Standard Deviation of Cross-Validation Scores:", std_cv_score)

Mean Cross-Validation Score: 0.43323348518488614
Standard Deviation of Cross-Validation Scores: 0.03881456163167712


In [32]:
import pickle

# Lưu mô hình vào file
model_filename_pickle = r'D:\OneDrive\KiotViet\Python_for_work\KFinance\sondn_kfinance\Model_Prediction_RentalCost\model\RandomForestRegressor_rental_cost.pkl'
with open(model_filename_pickle, 'wb') as file:
    pickle.dump(best_model, file)

# Để sử dụng mô hình từ file pickle
with open(model_filename_pickle, 'rb') as file:
    loaded_model_pickle = pickle.load(file)


In [33]:
new_data_1 = {
            'area_value': [10],
            'City': ['Hà Nội'],
            'District': ['Tây Hồ'],
            'Ward': [''],
            'Street': ['']
            }

new_df_1 = pd.DataFrame(new_data_1)

new_prediction_pickle = loaded_model_pickle.predict(new_df_1)
print(f'Predicted Price per m2 using loaded pickle model: {new_prediction_pickle}')

Predicted Price per m2 using loaded pickle model: [447228.05468666]


In [34]:
from unidecode import unidecode

ModuleNotFoundError: No module named 'unidecode'

In [None]:
train_groupby = X_train[['City', 'District', 'Ward', 'Street','area_value']]
train_groupby_no_area = train_groupby.rename(columns={'area_value': 'median_area'})
train_groupby_no_area = train_groupby_no_area.groupby(['City', 'District', 'Ward', 'Street']).median().reset_index()
train_groupby_no_area['Concatenated'] = train_groupby_no_area[['City', 'District', 'Ward', 'Street']].apply(
    lambda row: ''.join(unidecode(str(val)).replace(' ', '_') for val in row), axis=1
)
train_groupby_no_area

Unnamed: 0,City,District,Ward,Street,median_area,Concatenated
0,An Giang,Châu Phú,Bình Mỹ,Lê Đại Hành,100.0,An_GiangChau_PhuBinh_MyLe_Dai_Hanh
1,An Giang,Long Xuyên,Mỹ Bình,Tôn Đức Thắng,112.0,An_GiangLong_XuyenMy_BinhTon_Duc_Thang
2,An Giang,Long Xuyên,Mỹ Hòa,Hà Hoàng Hổ,60.0,An_GiangLong_XuyenMy_HoaHa_Hoang_Ho
3,An Giang,Long Xuyên,Mỹ Xuyên,Hà Hoàng Hổ,80.0,An_GiangLong_XuyenMy_XuyenHa_Hoang_Ho
4,Bà Rịa Vũng Tàu,Bà Rịa,Long Tâm,Hoàng Hoa Thám,151.0,Ba_Ria_Vung_TauBa_RiaLong_TamHoang_Hoa_Tham
...,...,...,...,...,...,...
2112,Đồng Nai,Biên Hòa,Tân Phong,Đồng Khởi,100.0,Dong_NaiBien_HoaTan_PhongDong_Khoi
2113,Đồng Nai,Biên Hòa,Tân Tiến,Lưu Văn Viết,104.0,Dong_NaiBien_HoaTan_TienLuu_Van_Viet
2114,Đồng Nai,Biên Hòa,Tân Tiến,Nguyễn Ái Quốc,47.5,Dong_NaiBien_HoaTan_TienNguyen_Ai_Quoc
2115,Đồng Nai,Biên Hòa,Tân Tiến,Phan Trung,246.0,Dong_NaiBien_HoaTan_TienPhan_Trung


In [None]:
train_groupby_percentiles = train_groupby.groupby(['City', 'District', 'Ward', 'Street']).describe().reset_index()
train_groupby_percentiles['Concatenated'] = train_groupby_percentiles[['City', 'District', 'Ward', 'Street']].apply(
    lambda row: ''.join(unidecode(str(val)).replace(' ', '_') for val in row), axis=1
)
# train_groupby_percentiles['Concatenated'] = train_groupby_percentiles['Concatenated'].str.replace(' ','')
train_groupby_percentiles

Unnamed: 0_level_0,City,District,Ward,Street,area_value,area_value,area_value,area_value,area_value,area_value,area_value,area_value,Concatenated
Unnamed: 0_level_1,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,count,mean,std,min,25%,50%,75%,max,Unnamed: 13_level_1
0,An Giang,Châu Phú,Bình Mỹ,Lê Đại Hành,1.0,100.000000,,100.0,100.00,100.0,100.00,100.0,An_GiangChau_PhuBinh_MyLe_Dai_Hanh
1,An Giang,Long Xuyên,Mỹ Bình,Tôn Đức Thắng,1.0,112.000000,,112.0,112.00,112.0,112.00,112.0,An_GiangLong_XuyenMy_BinhTon_Duc_Thang
2,An Giang,Long Xuyên,Mỹ Hòa,Hà Hoàng Hổ,1.0,60.000000,,60.0,60.00,60.0,60.00,60.0,An_GiangLong_XuyenMy_HoaHa_Hoang_Ho
3,An Giang,Long Xuyên,Mỹ Xuyên,Hà Hoàng Hổ,1.0,80.000000,,80.0,80.00,80.0,80.00,80.0,An_GiangLong_XuyenMy_XuyenHa_Hoang_Ho
4,Bà Rịa Vũng Tàu,Bà Rịa,Long Tâm,Hoàng Hoa Thám,2.0,151.000000,26.870058,132.0,141.50,151.0,160.50,170.0,Ba_Ria_Vung_TauBa_RiaLong_TamHoang_Hoa_Tham
...,...,...,...,...,...,...,...,...,...,...,...,...,...
2112,Đồng Nai,Biên Hòa,Tân Phong,Đồng Khởi,5.0,96.200000,9.066422,80.0,100.00,100.0,100.00,101.0,Dong_NaiBien_HoaTan_PhongDong_Khoi
2113,Đồng Nai,Biên Hòa,Tân Tiến,Lưu Văn Viết,1.0,104.000000,,104.0,104.00,104.0,104.00,104.0,Dong_NaiBien_HoaTan_TienLuu_Van_Viet
2114,Đồng Nai,Biên Hòa,Tân Tiến,Nguyễn Ái Quốc,2.0,47.500000,24.748737,30.0,38.75,47.5,56.25,65.0,Dong_NaiBien_HoaTan_TienNguyen_Ai_Quoc
2115,Đồng Nai,Biên Hòa,Tân Tiến,Phan Trung,5.0,226.200000,70.194017,145.0,160.00,246.0,280.00,300.0,Dong_NaiBien_HoaTan_TienPhan_Trung


In [None]:

# Chọn chỉ mục cho các percentiles 20 và 80
percentiles_from = train_groupby_percentiles.xs('min', level=1, axis=1).rename(columns={'area_value': 'area_from'})
percentiles_upto = train_groupby_percentiles.xs('75%', level=1, axis=1).rename(columns={'area_value': 'area_upto'})

# Gộp các percentiles vào một DataFrame mới
percentiles_df = pd.concat([percentiles_from, percentiles_upto,train_groupby_percentiles[['Concatenated']]], axis=1)
percentiles_df.columns = ['_'.join(col).strip() if isinstance(col, tuple) else col.strip() for col in percentiles_df.columns.values]
percentiles_df.columns = [col[:-1] if col.endswith('_') and col != 'Concatenated' else col for col in percentiles_df.columns]

# Hiển thị DataFrame mới
percentiles_df

Unnamed: 0,area_from,area_upto,Concatenated
0,100.0,100.00,An_GiangChau_PhuBinh_MyLe_Dai_Hanh
1,112.0,112.00,An_GiangLong_XuyenMy_BinhTon_Duc_Thang
2,60.0,60.00,An_GiangLong_XuyenMy_HoaHa_Hoang_Ho
3,80.0,80.00,An_GiangLong_XuyenMy_XuyenHa_Hoang_Ho
4,132.0,160.50,Ba_Ria_Vung_TauBa_RiaLong_TamHoang_Hoa_Tham
...,...,...,...
2112,80.0,100.00,Dong_NaiBien_HoaTan_PhongDong_Khoi
2113,104.0,104.00,Dong_NaiBien_HoaTan_TienLuu_Van_Viet
2114,30.0,56.25,Dong_NaiBien_HoaTan_TienNguyen_Ai_Quoc
2115,145.0,280.00,Dong_NaiBien_HoaTan_TienPhan_Trung


In [None]:
train_groupby_percentiles = pd.merge(train_groupby_no_area,percentiles_df, on='Concatenated')
train_groupby_percentiles = train_groupby_percentiles[['City','District','Ward','Street','area_from','area_upto','median_area','Concatenated']]
train_groupby_percentiles.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 2117 entries, 0 to 2116
Data columns (total 8 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   City          2117 non-null   object 
 1   District      2117 non-null   object 
 2   Ward          2117 non-null   object 
 3   Street        2117 non-null   object 
 4   area_from     2117 non-null   float64
 5   area_upto     2117 non-null   float64
 6   median_area   2117 non-null   float64
 7   Concatenated  2117 non-null   object 
dtypes: float64(3), object(5)
memory usage: 148.9+ KB


In [None]:
# Tạo DataFrame mới với các cột 'City', 'District', 'Ward', 'Street' từ train_groupby_percentiles
train_df_subset = train_groupby_percentiles[['City','District','Ward','Street','area_from','area_upto','median_area','Concatenated']].copy()

# Dự đoán giá trị cho area_value min
train_df_subset['area_value'] = train_df_subset['area_from']
predicted_prices_from = best_model.predict(train_df_subset)
train_df_subset['price_from'] = predicted_prices_from  * train_df_subset['area_from'] / 1000000

# Dự đoán giá trị cho area_value max
train_df_subset['area_value'] = train_df_subset['area_upto']
predicted_prices_upto = best_model.predict(train_df_subset)
train_df_subset['price_upto'] = predicted_prices_upto * train_df_subset['area_upto'] / 1000000

#Tính sai số
train_df_subset['price_range'] = rmse * train_df_subset['area_upto']/1000000

# Kiểm tra nếu area_from = area_upto
train_df_subset.loc[train_df_subset['area_from'] == train_df_subset['area_upto'], 'price_upto'] += train_df_subset['price_range']


# Hiển thị DataFrame mới
train_df_subset


Unnamed: 0,City,District,Ward,Street,area_from,area_upto,median_area,Concatenated,area_value,price_from,price_upto,price_range
0,An Giang,Châu Phú,Bình Mỹ,Lê Đại Hành,100.0,100.00,100.0,An_GiangChau_PhuBinh_MyLe_Dai_Hanh,100.00,14.361003,39.500733,25.139729
1,An Giang,Long Xuyên,Mỹ Bình,Tôn Đức Thắng,112.0,112.00,112.0,An_GiangLong_XuyenMy_BinhTon_Duc_Thang,112.00,17.661458,45.817955,28.156497
2,An Giang,Long Xuyên,Mỹ Hòa,Hà Hoàng Hổ,60.0,60.00,60.0,An_GiangLong_XuyenMy_HoaHa_Hoang_Ho,60.00,8.051264,23.135102,15.083838
3,An Giang,Long Xuyên,Mỹ Xuyên,Hà Hoàng Hổ,80.0,80.00,80.0,An_GiangLong_XuyenMy_XuyenHa_Hoang_Ho,80.00,16.988054,37.099838,20.111783
4,Bà Rịa Vũng Tàu,Bà Rịa,Long Tâm,Hoàng Hoa Thám,132.0,160.50,151.0,Ba_Ria_Vung_TauBa_RiaLong_TamHoang_Hoa_Tham,160.50,26.789284,30.521667,40.349265
...,...,...,...,...,...,...,...,...,...,...,...,...
2112,Đồng Nai,Biên Hòa,Tân Phong,Đồng Khởi,80.0,100.00,100.0,Dong_NaiBien_HoaTan_PhongDong_Khoi,100.00,13.832549,16.745525,25.139729
2113,Đồng Nai,Biên Hòa,Tân Tiến,Lưu Văn Viết,104.0,104.00,104.0,Dong_NaiBien_HoaTan_TienLuu_Van_Viet,104.00,16.423962,42.569280,26.145318
2114,Đồng Nai,Biên Hòa,Tân Tiến,Nguyễn Ái Quốc,30.0,56.25,47.5,Dong_NaiBien_HoaTan_TienNguyen_Ai_Quoc,56.25,9.644641,14.502789,14.141098
2115,Đồng Nai,Biên Hòa,Tân Tiến,Phan Trung,145.0,280.00,246.0,Dong_NaiBien_HoaTan_TienPhan_Trung,280.00,29.196161,44.987131,70.391242


In [None]:
test = train_df_subset[train_df_subset['Concatenated'] == 'Ho_Chi_MinhBinh_ThanhPhuong_13Dang_Thuy_Tram']
test

Unnamed: 0,City,District,Ward,Street,area_from,area_upto,median_area,Concatenated,area_value,price_from,price_upto,price_range
899,Hồ Chí Minh,Bình Thạnh,Phường 13,Đặng Thùy Trâm,77.0,131.75,113.5,Ho_Chi_MinhBinh_ThanhPhuong_13Dang_Thuy_Tram,131.75,19.297826,32.10073,33.121593


In [None]:
# train_df_subset['price_from'] =train_df_subset['Predicted Price (area_from)']
# train_df_subset['price_upto'] =train_df_subset['Predicted Price (area_upto)']

In [None]:
final_prediction = train_df_subset[['City',	'District',	'Ward',	'Street','price_from',	'price_upto','median_area','Concatenated']]

In [None]:
# Kiểm tra và điều chỉnh giá trị của price_from và price_upto
final_prediction.loc[final_prediction['price_from'] > final_prediction['price_upto'], ['price_from', 'price_upto']] = final_prediction.loc[final_prediction['price_from'] > final_prediction['price_upto'], ['price_upto', 'price_from']]

# # Hiển thị DataFrame mới
# print(final_prediction)

In [None]:
final_prediction.to_csv(r"D:\OneDrive\KiotViet\Python_for_work\KFinance\sondn_kfinance\Model_Prediction_RentalCost\CSV_crawl_data\final_prediction_bds.csv", index=False, mode = 'w', encoding="utf-8-sig")