In [23]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score
from sklearn.metrics import roc_auc_score
from sklearn.model_selection import StratifiedKFold, cross_val_score
from sklearn.preprocessing import StandardScaler

In [2]:
# Swiler's file paths for each dataset. 
gdp_data = pd.read_csv('/Users/swilerboyd/Documents/GitHub/cs526_proj/datasets/quarterly/GDP.csv', parse_dates=['DATE'])
cpat_tax = pd.read_csv('/Users/swilerboyd/Documents/GitHub/cs526_proj/datasets/quarterly/CPATAX.csv', parse_dates=['DATE'])
durable_goods = pd.read_csv('/Users/swilerboyd/Documents/GitHub/cs526_proj/datasets/quarterly/manufacturers_new_orders_durable_goods_excluding_defense.csv', parse_dates=['DATE'])
housing_starts = pd.read_csv('/Users/swilerboyd/Documents/GitHub/cs526_proj/datasets/quarterly/housing start.csv', parse_dates=['DATE'])
industrial_production = pd.read_csv('/Users/swilerboyd/Documents/GitHub/cs526_proj/datasets/quarterly/industrial_production_total_index.csv', parse_dates=['DATE'])
personal_consumption = pd.read_csv('/Users/swilerboyd/Documents/GitHub/cs526_proj/datasets/quarterly/PCECTPI.csv', parse_dates=['DATE'])
t10y2y = pd.read_csv('/Users/swilerboyd/Documents/GitHub/cs526_proj/datasets/quarterly/T10Y2Y.csv', parse_dates=['DATE'])
sp500_vix = pd.read_csv('/Users/swilerboyd/Documents/GitHub/cs526_proj/datasets/quarterly/sp500_vix_quarterly.csv', parse_dates=['Date'])
gtrend_recession = pd.read_csv('/Users/swilerboyd/Documents/GitHub/cs526_proj/datasets/quarterly/gtrend_recession.csv', parse_dates=['Month'])
gtrend_unemployment = pd.read_csv('/Users/swilerboyd/Documents/GitHub/cs526_proj/datasets/quarterly/gtrend_unemployment.csv', parse_dates=['Month'])
nonderiv_insider_activity = pd.read_csv('/Users/swilerboyd/Documents/GitHub/cs526_proj/datasets/quarterly/nonderiv_insider_activity.csv', parse_dates=['transactionDate'])

In [None]:
#Henry's file paths for each dataset.

In [5]:
# Merge datasets on their respective date columns
data = gdp_data.copy()  # Start with GDP data

# Merge other datasets using 'DATE' or appropriate columns
data = data.merge(cpat_tax, on='DATE', how='left')
data = data.merge(durable_goods, on='DATE', how='left')
data = data.merge(housing_starts, on='DATE', how='left')
data = data.merge(industrial_production, on='DATE', how='left')
data = data.merge(personal_consumption, on='DATE', how='left')
data = data.merge(t10y2y, on='DATE', how='left')
data = data.merge(sp500_vix.rename(columns={'Date': 'DATE'}), on='DATE', how='left')
data = data.merge(gtrend_recession.rename(columns={'Month': 'DATE'}), on='DATE', how='left')
data = data.merge(gtrend_unemployment.rename(columns={'Month': 'DATE'}), on='DATE', how='left')
data = data.merge(nonderiv_insider_activity.rename(columns={'transactionDate': 'DATE'}), on='DATE', how='left')
print(data.head())

        DATE      GDP  CPATAX  ADXDNO  HOUST  IPB50001SQ  PCECTPI  T10Y2Y  \
0 1947-01-01  243.164   9.959     NaN    NaN     13.7361   11.557     NaN   
1 1947-04-01  245.968  13.603     NaN    NaN     13.7450   11.649     NaN   
2 1947-07-01  249.585  13.868     NaN    NaN     13.7719   11.866     NaN   
3 1947-10-01  259.745  14.455     NaN    NaN     14.1482   12.162     NaN   
4 1948-01-01  265.742  17.971     NaN    NaN     14.2916   12.297     NaN   

   S&P 500  VIX  recession: (United States)  \
0      NaN  NaN                         NaN   
1      NaN  NaN                         NaN   
2      NaN  NaN                         NaN   
3      NaN  NaN                         NaN   
4      NaN  NaN                         NaN   

   Unemployment benefits: (United States)  net_buys  
0                                     NaN       NaN  
1                                     NaN       NaN  
2                                     NaN       NaN  
3                                     

In [6]:
# Create a recession indicator
data['GDP_diff'] = data['GDP'].diff()
data['Recession'] = np.where(data['GDP_diff'] < 0, 1, 0)

# Count the total number of recession periods
recession_count = data['Recession'].sum()

# Filter the dataset for recession periods
recession_periods = data[data['Recession'] == 1][['DATE', 'GDP']]

In [30]:
# Prepare data for modeling
# This model uses the variables that are complete 1947-2023
X = data[['CPATAX', 'PCECTPI', 'IPB50001SQ']].dropna()
y = data['Recession'].loc[X.index]  # Ensure y matches the non-NaN rows of X

# Get the dates corresponding to the non-NaN rows in X
dates = data.loc[X.index, 'DATE']  # Assuming 'DATE' is the column for the dates in your dataset

print(dates)

# Scale the features using StandardScaler
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

# Define the model (you can use LogisticRegression or RandomForest)
model = LogisticRegression(max_iter=1000, class_weight='balanced')

# Initialize Stratified k-fold
skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

# Cross-validate using StratifiedKFold
cv_scores = cross_val_score(model, X_scaled, y, cv=skf, scoring='accuracy')

print('Stratified k-Fold Cross-Validation Accuracy Scores:', cv_scores)
print('Mean Accuracy:', np.mean(cv_scores))



0     1947-01-01
1     1947-04-01
2     1947-07-01
3     1947-10-01
4     1948-01-01
         ...    
305   2023-04-01
306   2023-07-01
307   2023-10-01
308   2024-01-01
309   2024-04-01
Name: DATE, Length: 310, dtype: datetime64[ns]
Stratified k-Fold Cross-Validation Accuracy Scores: [0.62903226 0.58064516 0.62903226 0.62903226 0.53225806]
Mean Accuracy: 0.6


In [28]:
# Prepare data for modeling
X = data[['CPATAX', 'PCECTPI', 'IPB50001SQ', 'HOUST', 'ADXDNO', 'T10Y2Y', 'S&P 500']].dropna()
y = data['Recession'].loc[X.index]  # Ensure y matches the non-NaN rows of X

# Get the dates corresponding to the non-NaN rows in X
dates = data.loc[X.index, 'DATE']  # Assuming 'DATE' is the column for the dates in your dataset
print(dates)

# Scale the features using StandardScaler
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

# Define the model (you can use LogisticRegression or RandomForest)
model = LogisticRegression(max_iter=1000, class_weight='balanced')

# Initialize Stratified k-fold
skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

# Cross-validate using StratifiedKFold
cv_scores = cross_val_score(model, X_scaled, y, cv=skf, scoring='accuracy')

print('Stratified k-Fold Cross-Validation Accuracy Scores:', cv_scores)
print('Mean Accuracy:', np.mean(cv_scores))

180   1992-01-01
181   1992-04-01
182   1992-07-01
183   1992-10-01
184   1993-01-01
         ...    
305   2023-04-01
306   2023-07-01
307   2023-10-01
308   2024-01-01
309   2024-04-01
Name: DATE, Length: 130, dtype: datetime64[ns]
Stratified k-Fold Cross-Validation Accuracy Scores: [0.80769231 0.92307692 0.88461538 0.88461538 0.80769231]
Mean Accuracy: 0.8615384615384615


In [29]:
# Prepare data for modeling
X = data[['CPATAX', 'PCECTPI', 'IPB50001SQ', 'HOUST', 'ADXDNO', 'T10Y2Y','S&P 500', 'VIX', 
          'recession: (United States)', 'Unemployment benefits: (United States)', 'net_buys']].dropna()
y = data['Recession'].loc[X.index]  # Ensure y matches the non-NaN rows of X

# Get the dates corresponding to the non-NaN rows in X
dates = data.loc[X.index, 'DATE']  # Assuming 'DATE' is the column for the dates in your dataset

print(dates)

# Scale the features using StandardScaler
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

# Define the model (you can use LogisticRegression or RandomForest)
model = LogisticRegression(max_iter=1000, class_weight='balanced')

# Initialize Stratified k-fold
skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

# Cross-validate using StratifiedKFold
cv_scores = cross_val_score(model, X_scaled, y, cv=skf, scoring='accuracy')

print('Stratified k-Fold Cross-Validation Accuracy Scores:', cv_scores)
print('Mean Accuracy:', np.mean(cv_scores))

228   2004-01-01
229   2004-04-01
230   2004-07-01
231   2004-10-01
232   2005-01-01
         ...    
301   2022-04-01
302   2022-07-01
303   2022-10-01
304   2023-01-01
305   2023-04-01
Name: DATE, Length: 78, dtype: datetime64[ns]
Stratified k-Fold Cross-Validation Accuracy Scores: [1.         0.8125     0.8125     0.8        0.93333333]
Mean Accuracy: 0.8716666666666667
