In [83]:
import os, sys
import pandas as pd
import numpy as np

import matplotlib.pyplot as plt
import seaborn as sns
import plotly.express as px

# path 설정
sys.path.append(os.path.dirname(os.path.dirname(os.path.abspath(""))))
print(sys.path[-1])

d:\base\boostcamp\apart\level2-competitiveds-recsys-06


In [84]:
raw_path: str = "../../data/raw"
processed_path: str = "../../data/processed"
train: pd.DataFrame = pd.read_csv(os.path.join(raw_path,"train.csv"))
test: pd.DataFrame = pd.read_csv(os.path.join(raw_path,"test.csv"))
train['_type'] = 'train'
test['_type'] = 'test'
train['original_index'] = train.index
test['original_index'] = test.index

In [85]:
# train에서 중복 제거
from src.pre_process.feature_duplicate import FeatureDuplication
train = FeatureDuplication(train).get_data()

# train, test 합치기
data = pd.concat([train, test], axis=0).reset_index(drop=True)

# apt_idx 생성
from src.pre_process.feature_add import FeatureAddition
df = FeatureAddition(data).get_data()

# contract_ymd 생성
df['contract_ymd'] = pd.to_datetime(df['contract_year_month'].astype(str) + df['contract_day'].astype(str), format='%Y%m%d')
df

Unnamed: 0,area_m2,contract_year_month,contract_day,contract_type,floor,built_year,latitude,longitude,age,deposit,_type,original_index,index,apt_idx,area,area_price,area_m2_price,contract_ymd
0,84.9981,201906,25,2,9,2019,37.054314,127.045216,0,17000.0,train,0,,0,25.8,658.914729,200.004471,2019-06-25
1,84.9981,202003,26,2,20,2019,37.054314,127.045216,1,23000.0,train,1,,0,25.8,891.472868,270.594284,2020-03-26
2,84.9981,202003,28,2,8,2019,37.054314,127.045216,1,23000.0,train,2,,0,25.8,891.472868,270.594284,2020-03-28
3,59.3400,201907,15,2,1,1986,36.964647,127.055847,33,5000.0,train,3,,1,18.0,277.777778,84.260195,2019-07-15
4,59.8100,201904,12,2,6,1995,36.972390,127.084514,24,1800.0,train,4,,2,18.1,99.447514,30.095302,2019-04-12
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1951395,115.5101,202402,27,0,17,2010,37.528394,126.659398,14,,test,150167,150167.0,17869,35.0,,,2024-02-27
1951396,142.8738,202403,2,0,4,2010,37.528394,126.659398,14,,test,150168,150168.0,17869,43.3,,,2024-03-02
1951397,142.8738,202403,16,1,13,2010,37.528394,126.659398,14,,test,150169,150169.0,17869,43.3,,,2024-03-16
1951398,114.9285,202403,22,1,2,2010,37.528394,126.659398,14,,test,150170,150170.0,17869,34.8,,,2024-03-22


In [86]:
# 2. 'apt_area_idx'를 고유 식별자로 생성 ('apt_idx'와 'area_m2' 기반)
df['apt_area_idx'] = df.groupby(['apt_idx', 'area_m2']).ngroup()

# 3. 'apt_area_idx'와 'contract_ymd' 기준으로 데이터프레임 정렬
df = df.sort_values(['apt_area_idx', 'contract_ymd']).reset_index(drop=True)

# 4. 그룹별 'deposit'의 평균과 중앙값을 계산하고, 더 높은 값을 'apt_area_deposit_criteria'에 저장
grouped = df.groupby('apt_area_idx')['deposit']
deposit_avg = grouped.transform('mean')
deposit_median = grouped.transform('median')
df['apt_area_deposit_criteria'] = np.maximum(deposit_avg, deposit_median)

# 5. 'apt_area_deposit_criteria'의 50%를 'deposit_threshold'로 설정
df['deposit_threshold'] = df['apt_area_deposit_criteria'] * 0.5

# 6. 각 그룹 내에서 최근 예치금 찾기
def find_recent_deposit(group):
    group = group.sort_values('contract_ymd')
    recent_deposits = []
    recent_dates = []
    
    for i, row in group.iterrows():
        # 현재 행 이전의 조건에 맞는 데이터 필터링
        past_data = group[(group['contract_ymd'] < row['contract_ymd']) & 
                          (group['_type'] == 'train') & 
                          (group['deposit'] >= row['deposit_threshold'])]
        
        if not past_data.empty:
            # 가장 최근의 데이터를 선택
            recent_row = past_data.iloc[-1]
            recent_deposits.append(recent_row['deposit'])
            recent_dates.append(recent_row['contract_ymd'])
        else:
            recent_deposits.append(np.nan)
            recent_dates.append(pd.NaT)
    
    group['recent_deposit'] = recent_deposits
    group['recent_contract_ymd'] = recent_dates
    
    return group

df = df.groupby('apt_area_idx').apply(find_recent_deposit).reset_index(drop=True)

# 7. 새로운 열들의 데이터 타입 설정 (이미 설정되어 있으므로 생략 가능)
df['recent_deposit'] = df['recent_deposit'].astype(float)
df['recent_contract_ymd'] = pd.to_datetime(df['recent_contract_ymd'])

# 8. (선택 사항) 중간 계산 열 삭제
df.drop(['deposit_threshold'], axis=1, inplace=True)

# df를 csv로 저장
df.to_csv(os.path.join(processed_path, "recent_deposit_for_arima2.csv"), index=False)

df

  df = df.groupby('apt_area_idx').apply(find_recent_deposit).reset_index(drop=True)


Unnamed: 0,area_m2,contract_year_month,contract_day,contract_type,floor,built_year,latitude,longitude,age,deposit,_type,original_index,index,apt_idx,area,area_price,area_m2_price,contract_ymd,apt_area_idx,apt_area_deposit_criteria,recent_deposit,recent_contract_ymd
0,71.3755,202002,8,2,18,2019,37.054314,127.045216,1,20000.0,train,30397,,0,21.6,925.925926,280.208195,2020-02-08,0,29283.096774,,NaT
1,71.3755,202007,8,2,8,2019,37.054314,127.045216,1,26000.0,train,196417,,0,21.6,1203.703704,364.270653,2020-07-08,0,29283.096774,20000.0,2020-02-08
2,71.3755,202007,16,2,20,2019,37.054314,127.045216,1,31500.0,train,196419,,0,21.6,1458.333333,441.327907,2020-07-16,0,29283.096774,26000.0,2020-07-08
3,71.3755,202011,2,2,2,2019,37.054314,127.045216,1,40000.0,train,196423,,0,21.6,1851.851852,560.416389,2020-11-02,0,29283.096774,31500.0,2020-07-16
4,71.3755,202012,13,2,25,2019,37.054314,127.045216,1,35000.0,train,196425,,0,21.6,1620.370370,490.364341,2020-12-13,0,29283.096774,40000.0,2020-11-02
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1951395,64.1000,202401,23,0,10,2015,37.499548,127.144838,9,,test,139727,139727.0,18672,19.4,,,2024-01-23,59561,,,NaT
1951396,75.6200,202401,16,0,3,2004,37.463234,126.642899,20,,test,142914,142914.0,18673,22.9,,,2024-01-16,59562,,,NaT
1951397,75.6200,202401,16,0,3,2004,37.463234,126.642899,20,,test,142915,142915.0,18673,22.9,,,2024-01-16,59562,,,NaT
1951398,49.9000,202403,25,0,13,2015,37.459061,126.674991,9,,test,143189,143189.0,18674,15.1,,,2024-03-25,59563,,,NaT


In [87]:
df['apt_area_deposit_criteria'].isna().sum()

np.int64(2626)

In [88]:
df.drop(columns=['index'], inplace=True)
df.head()

Unnamed: 0,area_m2,contract_year_month,contract_day,contract_type,floor,built_year,latitude,longitude,age,deposit,_type,original_index,apt_idx,area,area_price,area_m2_price,contract_ymd,apt_area_idx,apt_area_deposit_criteria,recent_deposit,recent_contract_ymd
0,71.3755,202002,8,2,18,2019,37.054314,127.045216,1,20000.0,train,30397,0,21.6,925.925926,280.208195,2020-02-08,0,29283.096774,,NaT
1,71.3755,202007,8,2,8,2019,37.054314,127.045216,1,26000.0,train,196417,0,21.6,1203.703704,364.270653,2020-07-08,0,29283.096774,20000.0,2020-02-08
2,71.3755,202007,16,2,20,2019,37.054314,127.045216,1,31500.0,train,196419,0,21.6,1458.333333,441.327907,2020-07-16,0,29283.096774,26000.0,2020-07-08
3,71.3755,202011,2,2,2,2019,37.054314,127.045216,1,40000.0,train,196423,0,21.6,1851.851852,560.416389,2020-11-02,0,29283.096774,31500.0,2020-07-16
4,71.3755,202012,13,2,25,2019,37.054314,127.045216,1,35000.0,train,196425,0,21.6,1620.37037,490.364341,2020-12-13,0,29283.096774,40000.0,2020-11-02


In [89]:
test_nan_recent_deposit_count = df[(df['_type'] == 'test') & (df['recent_deposit'].isna())].shape[0]
test_nan_recent_deposit_count

2626

In [90]:
train_nan_recent_deposit_count = df[(df['_type'] == 'train') & (df['recent_deposit'].isna())].shape[0]
train_nan_recent_deposit_count

65231

In [91]:
df['recent_contract_ymd'] = pd.to_datetime(df['recent_contract_ymd'])

In [92]:
df[df['apt_area_idx']==58000]

Unnamed: 0,area_m2,contract_year_month,contract_day,contract_type,floor,built_year,latitude,longitude,age,deposit,_type,original_index,apt_idx,area,area_price,area_m2_price,contract_ymd,apt_area_idx,apt_area_deposit_criteria,recent_deposit,recent_contract_ymd
1929784,95.997,202010,22,2,11,2020,37.454205,126.706936,0,49000.0,train,1666270,18025,29.1,1683.848797,510.432618,2020-10-22,58000,49000.0,,NaT
1929785,95.997,202102,20,2,29,2020,37.454205,126.706936,1,52000.0,train,1666286,18025,29.1,1786.941581,541.683594,2021-02-20,58000,49000.0,49000.0,2020-10-22
1929786,95.997,202102,27,2,10,2020,37.454205,126.706936,1,45000.0,train,1666289,18025,29.1,1546.391753,468.764649,2021-02-27,58000,49000.0,52000.0,2021-02-20
1929787,95.997,202103,25,2,27,2020,37.454205,126.706936,1,49000.0,train,1666296,18025,29.1,1683.848797,510.432618,2021-03-25,58000,49000.0,45000.0,2021-02-27
1929788,95.997,202103,29,2,20,2020,37.454205,126.706936,1,50000.0,train,1666301,18025,29.1,1718.213058,520.84961,2021-03-29,58000,49000.0,49000.0,2021-03-25
1929789,95.997,202105,13,2,19,2020,37.454205,126.706936,1,52000.0,train,1703166,18025,29.1,1786.941581,541.683594,2021-05-13,58000,49000.0,50000.0,2021-03-29
1929790,95.997,202106,5,2,26,2020,37.454205,126.706936,1,58000.0,train,1703173,18025,29.1,1993.127148,604.185547,2021-06-05,58000,49000.0,52000.0,2021-05-13
1929791,95.997,202106,5,0,26,2020,37.454205,126.706936,1,58000.0,train,1703174,18025,29.1,1993.127148,604.185547,2021-06-05,58000,49000.0,52000.0,2021-05-13
1929792,95.997,202107,8,0,29,2020,37.454205,126.706936,1,52000.0,train,1703180,18025,29.1,1786.941581,541.683594,2021-07-08,58000,49000.0,58000.0,2021-06-05
1929793,95.997,202210,8,1,12,2020,37.454205,126.706936,2,52000.0,train,1746134,18025,29.1,1786.941581,541.683594,2022-10-08,58000,49000.0,52000.0,2021-07-08


In [93]:
test_df = df[df['_type'] == 'test']
test_df['recent_deposit'].isna().sum()

np.int64(2626)

In [94]:
train_deposit_avg = df[df['_type'] == 'train']['deposit'].mean()
train_deposit_avg

np.float64(38162.229423482204)

In [103]:
# recent_deposit가 결측치인 경우 deposit 값에 임의 값 채워넣기
test_df.loc[test_df['recent_deposit'].isna(), 'deposit'] = 39000

In [104]:
test_df.isna().sum()

area_m2                           0
contract_year_month               0
contract_day                      0
contract_type                     0
floor                             0
built_year                        0
latitude                          0
longitude                         0
age                               0
deposit                           0
_type                             0
original_index                    0
apt_idx                           0
area                              0
area_price                   150172
area_m2_price                150172
contract_ymd                      0
apt_area_idx                      0
apt_area_deposit_criteria      2626
recent_deposit                 2626
recent_contract_ymd            2626
dtype: int64

In [96]:
arima_df = pd.read_csv(os.path.join(processed_path, "month_df_arima_index.csv"))
arima_df

Unnamed: 0,contract_ymd,avg_deposit,interest_rate,arima_deposit_index,sarimax_deposit_index
0,2019-04-01,30447.425958,1.85,100.000000,100.000000
1,2019-05-01,31132.991601,1.85,102.251637,102.251637
2,2019-06-01,31111.287554,1.78,102.180354,102.180354
3,2019-07-01,31621.427691,1.68,103.855832,103.855832
4,2019-08-01,31712.702769,1.52,104.155612,104.155612
...,...,...,...,...,...
58,2024-02-01,,3.62,142.092659,130.045165
59,2024-03-01,,3.59,142.295862,128.376847
60,2024-04-01,,3.54,143.308404,131.548033
61,2024-05-01,,3.56,144.355662,135.561647


In [97]:
# test_df의 전체 행 개수
total_rows = test_df.shape[0]

# deposit이 NaN인 행 개수
nan_deposit_count = test_df['deposit'].isna().sum()

total_rows, nan_deposit_count

(150172, np.int64(150172))

In [46]:
# 'arima_df'의 'contract_ymd'를 datetime 형식으로 변환
arima_df['contract_ymd'] = pd.to_datetime(arima_df['contract_ymd'])

# 연도와 월을 기준으로 'sarimax_deposit_index' 매핑 준비
sarimax_map = arima_df.set_index(arima_df['contract_ymd'].dt.to_period('M'))['sarimax_deposit_index']

# 'test_df'에 현재 시점과 최근 계약 시점의 연도-월 기반 'sarimax_deposit_index' 추가
test_df['current_period'] = test_df['contract_ymd'].dt.to_period('M')
test_df['recent_period'] = test_df['recent_contract_ymd'].dt.to_period('M')

test_df['sarimax_current'] = test_df['current_period'].map(sarimax_map)
test_df['sarimax_recent'] = test_df['recent_period'].map(sarimax_map)

# 'sarimax_current'과 'sarimax_recent'을 사용하여 조정 계수 계산
test_df['index_factor'] = test_df['sarimax_current'] / test_df['sarimax_recent']

# 'recent_deposit'을 조정하여 'adjusted_deposit' 계산
test_df['adjusted_deposit'] = test_df['recent_deposit'] * test_df['index_factor']

# 'deposit'이 NaN인 경우 'adjusted_deposit'으로 채움
test_df['deposit'] = test_df['deposit'].fillna(test_df['adjusted_deposit'])

# 불필요한 중간 컬럼 제거
test_df.drop(['current_period', 'recent_period', 'sarimax_current', 'sarimax_recent', 'index_factor', 'adjusted_deposit'], axis=1, inplace=True)

test_df


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  test_df['current_period'] = test_df['contract_ymd'].dt.to_period('M')
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  test_df['recent_period'] = test_df['recent_contract_ymd'].dt.to_period('M')
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  test_df['sarimax_current'] = test_df['current_period'].map(

Unnamed: 0,index,area_m2,contract_year_month,contract_day,contract_type,floor,built_year,latitude,longitude,age,deposit,contract_ymd,recent_deposit,apt_idx,area,area_price,area_m2_price,_type,apt_area_idx,apt_area_deposit_criteria,recent_contract_ymd
93,1813606,71.3755,202401,13,2,21,2019,37.054314,127.045216,5,25428.932926,2024-01-13,27000.0,0,21.6,-46.250000,-13.996399,test,0,29283.096774,2023-12-09
94,1813607,71.3755,202401,13,2,21,2019,37.054314,127.045216,5,25428.932926,2024-01-13,27000.0,0,21.6,-46.250000,-13.996399,test,0,29283.096774,2023-12-09
95,1813608,71.3755,202401,13,0,21,2019,37.054314,127.045216,5,25428.932926,2024-01-13,27000.0,0,21.6,-46.250000,-13.996399,test,0,29283.096774,2023-12-09
96,1815964,71.3755,202401,14,1,1,2019,37.054314,127.045216,5,25428.932926,2024-01-14,27000.0,0,21.6,-46.250000,-13.996399,test,0,29283.096774,2023-12-09
97,1841081,71.3755,202402,5,0,8,2019,37.054314,127.045216,5,24703.247335,2024-02-05,27000.0,0,21.6,-46.250000,-13.996399,test,0,29283.096774,2023-12-09
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1951395,1827612,64.1000,202401,23,0,10,2015,37.499548,127.144838,9,39000.000000,2024-01-23,,18672,19.4,-51.494845,-15.585023,test,59561,,NaT
1951396,1818783,75.6200,202401,16,0,3,2004,37.463234,126.642899,20,39000.000000,2024-01-16,,18673,22.9,-43.624454,-13.210791,test,59562,,NaT
1951397,1818784,75.6200,202401,16,0,3,2004,37.463234,126.642899,20,39000.000000,2024-01-16,,18673,22.9,-43.624454,-13.210791,test,59562,,NaT
1951398,1882153,49.9000,202403,25,0,13,2015,37.459061,126.674991,9,39000.000000,2024-03-25,,18674,15.1,-66.158940,-20.020040,test,59563,,NaT


In [101]:
test_df.isna().sum()

area_m2                           0
contract_year_month               0
contract_day                      0
contract_type                     0
floor                             0
built_year                        0
latitude                          0
longitude                         0
age                               0
deposit                        2626
_type                             0
original_index                    0
apt_idx                           0
area                              0
area_price                   150172
area_m2_price                150172
contract_ymd                      0
apt_area_idx                      0
apt_area_deposit_criteria      2626
recent_deposit                 2626
recent_contract_ymd            2626
dtype: int64

In [100]:
# 'deposit'이 NaN인 경우 'recent_deposit'으로 채움
test_df['deposit'] = test_df['deposit'].fillna(test_df['recent_deposit'])
test_df.loc[test_df['recent_deposit'].isna(), 'deposit'] = 39000

test_df

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  test_df['deposit'] = test_df['deposit'].fillna(test_df['recent_deposit'])


Unnamed: 0,area_m2,contract_year_month,contract_day,contract_type,floor,built_year,latitude,longitude,age,deposit,_type,original_index,apt_idx,area,area_price,area_m2_price,contract_ymd,apt_area_idx,apt_area_deposit_criteria,recent_deposit,recent_contract_ymd
93,71.3755,202401,13,2,21,2019,37.054314,127.045216,5,27000.0,test,77299,0,21.6,,,2024-01-13,0,29283.096774,27000.0,2023-12-09
94,71.3755,202401,13,2,21,2019,37.054314,127.045216,5,27000.0,test,77300,0,21.6,,,2024-01-13,0,29283.096774,27000.0,2023-12-09
95,71.3755,202401,13,0,21,2019,37.054314,127.045216,5,27000.0,test,77301,0,21.6,,,2024-01-13,0,29283.096774,27000.0,2023-12-09
96,71.3755,202401,14,1,1,2019,37.054314,127.045216,5,27000.0,test,77302,0,21.6,,,2024-01-14,0,29283.096774,27000.0,2023-12-09
97,71.3755,202402,5,0,8,2019,37.054314,127.045216,5,27000.0,test,77305,0,21.6,,,2024-02-05,0,29283.096774,27000.0,2023-12-09
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1951395,64.1000,202401,23,0,10,2015,37.499548,127.144838,9,,test,139727,18672,19.4,,,2024-01-23,59561,,,NaT
1951396,75.6200,202401,16,0,3,2004,37.463234,126.642899,20,,test,142914,18673,22.9,,,2024-01-16,59562,,,NaT
1951397,75.6200,202401,16,0,3,2004,37.463234,126.642899,20,,test,142915,18673,22.9,,,2024-01-16,59562,,,NaT
1951398,49.9000,202403,25,0,13,2015,37.459061,126.674991,9,,test,143189,18674,15.1,,,2024-03-25,59563,,,NaT


In [105]:
test_frame = pd.read_csv(os.path.join(raw_path, "test.csv"))

In [49]:
# index에서 1801228을 빼서 0부터 시작하도록 설정
test_df['index'] = test_df['index'] - 1801228

# index의 min과 max값 확인
min_index = test_df['index'].min()
max_index = test_df['index'].max()

min_index, max_index

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  test_df['index'] = test_df['index'] - 1801228


(np.int64(0), np.int64(150171))

In [106]:
# test_frame의 index와 test_df의 original_index를 매칭하여 deposit 값을 넣어줌
test_frame = test_frame.merge(test_df[['original_index', 'deposit']], left_on='index', right_on='original_index', how='left')

# 필요 없는 original_index 열 삭제
test_frame.drop(columns=['original_index'], inplace=True)

test_frame

Unnamed: 0,index,area_m2,contract_year_month,contract_day,contract_type,floor,built_year,latitude,longitude,age,deposit
0,0,84.961,202404,12,1,14,2016,36.965423,127.048779,8,22000.0
1,1,59.9,202404,13,0,4,1997,36.963105,127.040678,27,7900.0
2,2,39.27,202404,29,0,5,1990,36.957089,127.047449,34,6300.0
3,3,39.27,202405,3,0,1,1990,36.957089,127.047449,34,6300.0
4,4,46.98,202406,2,0,4,1990,36.957089,127.047449,34,5500.0


In [107]:
test_frame.isna().sum()

index                  0
area_m2                0
contract_year_month    0
contract_day           0
contract_type          0
floor                  0
built_year             0
latitude               0
longitude              0
age                    0
deposit                0
dtype: int64

In [109]:
# # merged_df의 index와 deposit 열만 추출
# recent_data = test_frame[['index', 'deposit']]

# # 결과를 CSV 파일로 저장
# recent_data.to_csv(os.path.join(processed_path, "submission_recent_data_2_nan_39000.csv"), index=False)

In [111]:
from sklearn.metrics import mean_absolute_error

# Load the CSV files
submission_sarimax_nan_39000 = pd.read_csv(os.path.join(processed_path, "submission_sarimax_nan_39000.csv"))
submission_arima_nan_39000 = pd.read_csv(os.path.join(processed_path, "submission_arima_nan_39000.csv"))
submission_recent_data_nan_39000 = pd.read_csv(os.path.join(processed_path, "submission_recent_data_nan_39000.csv"))
submission_recent_data_2_nan_39000 = pd.read_csv(os.path.join(processed_path, "submission_recent_data_2_nan_39000.csv"))
# Load the output.csv file
output_df = pd.read_csv(os.path.join(processed_path, "output.csv"))

# Calculate the mean of the deposit values from the three dataframes
mean_submission_sarimax_nan_39000 = submission_sarimax_nan_39000['deposit'].mean()
mean_submission_arima_nan_39000 = submission_arima_nan_39000['deposit'].mean()
mean_submission_recent_data_nan_390000 = submission_recent_data_nan_39000['deposit'].mean()
submission_recent_data_2_nan_39000 = submission_recent_data_2_nan_39000['deposit'].mean()
mean_output_df = output_df['deposit'].mean()

mean_submission_sarimax_nan_39000, mean_submission_arima_nan_39000, mean_submission_recent_data_nan_390000, submission_recent_data_2_nan_39000, mean_output_df

(np.float64(37411.0054149278),
 np.float64(40280.08875146025),
 np.float64(39388.219534933276),
 np.float64(39304.16630929867),
 np.float64(39396.78549459324))