In [17]:
import pandas as pd
import os 
from functools import reduce

In [18]:
# Project-relative folder holding the monthly CSVs. Run the notebook from the project
# root, as the other "data/..." reads in this notebook already assume.
folder_path = 'data/monthly_data'

# os.listdir returns names in arbitrary order, so sort them to keep the order of the
# loaded frames (and therefore the column order of the later merge) stable across runs.
csv_files = sorted(f for f in os.listdir(folder_path) if f.endswith('.csv'))

csv_dfs = []

for file in csv_files:
    file_path = os.path.join(folder_path, file)
    df = pd.read_csv(file_path)

    # Parse the date column so every frame is joined on real datetimes, not strings.
    df['observation_date'] = pd.to_datetime(df['observation_date'])

    csv_dfs.append(df)

In [19]:
# Read the three quarterly CSVs in one pass; the order of the paths matches the order
# of the names on the left-hand side.
house_index, recession_indicator, gdp = (
    pd.read_csv(csv_path)
    for csv_path in (
        "data/USSTHPI-quarterly.csv",
        "data/USRECQ-quarterly.csv",
        "data/GDPC1-quarterly.csv",
    )
)

In [20]:
def expand_quarterly_to_monthly(df, value_col, distribute=True):
    """
    Expand a quarterly dataset to monthly frequency.

    Each input row yields three monthly rows: one dated at the row's
    'observation_date' and one each at one and two calendar months after it.
    The input DataFrame is left unmodified.

    Parameters:
        df (pd.DataFrame): Input DataFrame with 'observation_date' and value column.
        value_col (str): Name of the column containing quarterly values.
        distribute (bool): If True (default), each month receives one third of the
            quarterly value, i.e. the value is spread evenly over the quarter. If
            False, the quarterly value is repeated unchanged for each of the three
            months, which suits level-type series such as indexes or 0/1 flags.

    Returns:
        pd.DataFrame: Monthly DataFrame with columns 'observation_date' and
        value_col, sorted by date with a fresh 0..n-1 index.
    """
    # Work on parsed copies instead of writing the converted dates back into `df`,
    # so the caller's frame is not changed as a side effect.
    quarter_start = pd.to_datetime(df['observation_date']).reset_index(drop=True)
    quarterly_values = df[value_col].reset_index(drop=True)
    monthly_values = quarterly_values / 3 if distribute else quarterly_values

    # One frame per month-within-quarter (offsets 0, 1, 2), built from whole columns
    # rather than one small DataFrame per input row.
    monthly_parts = [
        pd.DataFrame({
            'observation_date': quarter_start + pd.DateOffset(months=month_offset),
            value_col: monthly_values,
        })
        for month_offset in range(3)
    ]

    monthly_df = pd.concat(monthly_parts, ignore_index=True)
    # mergesort is stable, so rows sharing a date keep a deterministic order.
    monthly_df = monthly_df.sort_values('observation_date', kind='mergesort')
    return monthly_df.reset_index(drop=True)

In [21]:
# Expand each quarterly frame to monthly rows, in the same order as the names on the
# left-hand side.
# NOTE(review): expand_quarterly_to_monthly divides every quarterly value by 3. USSTHPI
# is presumably an index level and USRECQ presumably a 0/1 indicator, so dividing them
# may not be intended -- confirm.
house_index_monthly, recession_indicator_monthly, gpd_monthly = (
    expand_quarterly_to_monthly(quarterly_frame, series_name)
    for quarterly_frame, series_name in (
        (house_index, "USSTHPI"),
        (recession_indicator, "USRECQ"),
        (gdp, "GDPC1"),
    )
)

In [22]:
# Load the daily file and parse its date column in a single expression.
T10Y3M = pd.read_csv("data/T10Y3M-dailycsv.csv").assign(
    observation_date=lambda daily: pd.to_datetime(daily['observation_date'])
)

# Average the rows falling in each calendar month ('MS' labels every bucket by its
# month start); another aggregation such as sum or max could replace mean here.
T10Y3M_monthly = (
    T10Y3M
    .resample('MS', on='observation_date')
    .mean()
    .reset_index()
)

In [23]:
# Every frame that takes part in the merge: the monthly CSV frames first, followed by
# the expanded quarterly series and the monthly-averaged daily series.
all_dfs = [
    *csv_dfs,
    house_index_monthly,
    recession_indicator_monthly,
    gpd_monthly,
    T10Y3M_monthly,
]

In [26]:
# --- Step 3: Outer-join every frame in all_dfs on observation_date ---
# Fold the list left to right; an outer join keeps dates that appear in any frame.
merged_df = reduce(
    lambda combined, incoming: combined.merge(incoming, on='observation_date', how='outer'),
    all_dfs,
)

# --- Step 4: Order chronologically and renumber the rows from 0 ---
merged_df = merged_df.sort_values('observation_date', ignore_index=True)

# Preview the first 50 rows
print(merged_df.head(50))

   observation_date  HOUST  MSACSRNSA  FEDFUNDS   PCEPI     USSTHPI    USRECQ  \
0        2006-01-01   2273        5.9      4.29  83.534  122.873333  0.000000   
1        2006-02-01   2119        6.1      4.49  83.584  122.873333  0.000000   
2        2006-03-01   1969        5.1      4.59  83.746  122.873333  0.000000   
3        2006-04-01   1821        5.6      4.79  84.135  124.130000  0.000000   
4        2006-05-01   1942        5.5      4.94  84.361  124.130000  0.000000   
5        2006-06-01   1802        5.8      4.99  84.569  124.130000  0.000000   
6        2006-07-01   1737        6.9      5.24  84.858  125.150000  0.000000   
7        2006-08-01   1650        6.5      5.25  85.125  125.150000  0.000000   
8        2006-09-01   1720        7.0      5.25  84.902  125.150000  0.000000   
9        2006-10-01   1491        7.5      5.25  84.702  126.433333  0.000000   
10       2006-11-01   1570        7.8      5.25  84.731  126.433333  0.000000   
11       2006-12-01   1649  