In [17]:
import numpy as np
import pandas as pd
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.utils.data import Dataset, DataLoader
from sklearn.model_selection import KFold, TimeSeriesSplit
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.feature_extraction.text import TfidfVectorizer
import nltk
nltk.download('punkt')
from nltk.tokenize import word_tokenize
from datetime import datetime, timedelta

[nltk_data] Downloading package punkt to /Users/brendan/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


## Generate Synthetic Financial Data
We'll create a synthetic dataset simulating earnings call transcripts and financial metrics

In [18]:
# Generate dates
np.random.seed(42)
start_date = datetime(2023, 1, 1)
dates = [start_date + timedelta(days=x) for x in range(100)]

# Generate synthetic data
companies = ['TechCorp', 'FinanceInc', 'HealthCare', 'RetailCo', 'EnergyCo']
sectors = ['Technology', 'Finance', 'Healthcare', 'Retail', 'Energy']
speakers = ['CEO', 'CFO', 'Analyst']

# Sample earnings call snippets
call_templates = [
    "Our {} quarter showed strong growth with revenue up {}%. We're investing in {}.",
    "Despite market challenges, we achieved {} growth in our {} segment.",
    "The {} initiative has yielded positive results, with margins expanding {}%."
]

def generate_call_text():
    template = np.random.choice(call_templates)
    return template.format(
        np.random.choice(['first', 'second', 'third', 'fourth']),
        np.random.randint(5, 25),
        np.random.choice(['AI', 'cloud', 'mobile', 'sustainability'])
    )

# Create DataFrame
data = {
    'date': dates,
    'company': [np.random.choice(companies) for _ in range(100)],
    'sector': [np.random.choice(sectors) for _ in range(100)],
    'speaker': [np.random.choice(speakers) for _ in range(100)],
    'transcript': [generate_call_text() for _ in range(100)],
    'revenue': np.random.normal(1000, 200, 100),
    'growth_rate': np.random.normal(10, 3, 100),
    'sentiment_score': np.random.uniform(0, 1, 100)
}

df = pd.DataFrame(data)
df['revenue'] = np.abs(df['revenue'])  # Ensure positive revenues

print("Sample of synthetic data:")
print(df.head())

Sample of synthetic data:
        date     company      sector  speaker  \
0 2023-01-01    RetailCo      Retail      CFO   
1 2023-01-02    EnergyCo  Technology      CFO   
2 2023-01-03  HealthCare      Retail      CFO   
3 2023-01-04    EnergyCo     Finance  Analyst   
4 2023-01-05    EnergyCo  Technology  Analyst   

                                          transcript      revenue  \
0  Our fourth quarter showed strong growth with r...  1073.381883   
1  Despite market challenges, we achieved first g...  1041.899373   
2  Despite market challenges, we achieved third g...   824.887642   
3  Our first quarter showed strong growth with re...   953.030333   
4  The third initiative has yielded positive resu...   802.554158   

   growth_rate  sentiment_score  
0    12.358102         0.899667  
1     9.138580         0.624102  
2     8.876016         0.539781  
3     7.543969         0.438745  
4     8.124285         0.577486  


## Feature Engineering
Implement various feature engineering techniques for:
- Numerical features
- Categorical features
- Time-based features
- Text features

In [19]:
class FeatureEngineer:
    def __init__(self):
        self.scaler = StandardScaler()
        self.label_encoder = LabelEncoder()
        self.tfidf = TfidfVectorizer(max_features=100)
        
    def engineer_numerical_features(self, df):
        # Copy to avoid modifying original
        df_processed = df.copy()
        
        # Scale numerical features
        numerical_cols = ['revenue', 'growth_rate']
        df_processed[numerical_cols] = self.scaler.fit_transform(df_processed[numerical_cols])
        
        # Create revenue bins
        df_processed['revenue_bin'] = pd.qcut(df_processed['revenue'], 
                                            q=3, 
                                            labels=['low', 'medium', 'high'])
        
        return df_processed
    
    def engineer_categorical_features(self, df):
        # One-hot encoding for sector
        sector_dummies = pd.get_dummies(df['sector'], prefix='sector')
        
        # Label encoding for speaker
        df['speaker_encoded'] = self.label_encoder.fit_transform(df['speaker'])
        
        return pd.concat([df, sector_dummies], axis=1)
    
    def engineer_time_features(self, df):
        df['date'] = pd.to_datetime(df['date'])
        df['quarter'] = df['date'].dt.quarter
        df['month'] = df['date'].dt.month
        df['day_of_week'] = df['date'].dt.dayofweek
        
        # Add rolling means
        df['rolling_growth'] = df.groupby('company')['growth_rate'].rolling(window=3).mean().reset_index(0, drop=True)
        
        return df
    
    def engineer_text_features(self, texts):
        return self.tfidf.fit_transform(texts)

# Apply feature engineering
fe = FeatureEngineer()
df_processed = df.pipe(fe.engineer_numerical_features)\
                 .pipe(fe.engineer_categorical_features)\
                 .pipe(fe.engineer_time_features)

text_features = fe.engineer_text_features(df['transcript'])

print("\nProcessed features shape:", df_processed.shape)
print("Text features shape:", text_features.shape)


Processed features shape: (100, 19)
Text features shape: (100, 48)


## Simple Neural Network for Financial Prediction
Create a basic neural network for **predicting sentiment** based on engineered features

In [20]:
class FinancialNN(nn.Module):
    def __init__(self, input_dim):
        super().__init__()
        self.layer1 = nn.Linear(input_dim, 64)
        self.layer2 = nn.Linear(64, 32)
        self.layer3 = nn.Linear(32, 1)
        self.dropout = nn.Dropout(0.2)
        
    def forward(self, x):
        x = F.relu(self.layer1(x))
        x = self.dropout(x)
        x = F.relu(self.layer2(x))
        x = self.dropout(x)
        x = torch.sigmoid(self.layer3(x))
        return x

# Prepare data for PyTorch
def prepare_data(df_processed, text_features):
    # Combine numerical and one-hot encoded features
    numerical_cols = ['revenue', 'growth_rate', 'speaker_encoded']
    categorical_cols = [col for col in df_processed.columns if col.startswith('sector_')]
    
    # Convert to float32 explicitly
    numerical_features = df_processed[numerical_cols].values.astype(np.float32)
    categorical_features = df_processed[categorical_cols].values.astype(np.float32)
    text_features_array = text_features.toarray().astype(np.float32)
    
    # Combine all features
    features = np.hstack([
        numerical_features,
        categorical_features,
        text_features_array
    ])
    
    targets = df_processed['sentiment_score'].values.astype(np.float32)
    
    return torch.FloatTensor(features), torch.FloatTensor(targets)

# Prepare data
X, y = prepare_data(df_processed, text_features)
print("Feature tensor shape:", X.shape)
print("Target tensor shape:", y.shape)

# Initialize model
model = FinancialNN(input_dim=X.shape[1])
print("\nModel structure:")
print(model)

Feature tensor shape: torch.Size([100, 56])
Target tensor shape: torch.Size([100])

Model structure:
FinancialNN(
  (layer1): Linear(in_features=56, out_features=64, bias=True)
  (layer2): Linear(in_features=64, out_features=32, bias=True)
  (layer3): Linear(in_features=32, out_features=1, bias=True)
  (dropout): Dropout(p=0.2, inplace=False)
)


## Cross Validation Implementation
Implement both k-fold and time series cross validation

In [21]:
class CrossValidator:
    def __init__(self, model, X, y):
        self.model = model
        self.X = X
        self.y = y
        self.kfold = KFold(n_splits=5, shuffle=True, random_state=42)
        self.time_split = TimeSeriesSplit(n_splits=5)
        
    def train_fold(self, train_idx, val_idx):
        X_train, X_val = self.X[train_idx], self.X[val_idx]
        y_train, y_val = self.y[train_idx], self.y[val_idx]
        
        criterion = nn.MSELoss()
        optimizer = torch.optim.Adam(self.model.parameters())
        
        # Simple training loop
        for epoch in range(10):  # 10 epochs per fold
            # Train
            self.model.train()
            optimizer.zero_grad()
            y_pred = self.model(X_train)
            loss = criterion(y_pred, y_train.unsqueeze(1))
            loss.backward()
            optimizer.step()
        
        # Validate
        self.model.eval()
        with torch.no_grad():
            y_pred = self.model(X_val)
            val_loss = criterion(y_pred, y_val.unsqueeze(1))
            
        return val_loss.item()
    
    def k_fold_cv(self):
        fold_scores = []
        for fold, (train_idx, val_idx) in enumerate(self.kfold.split(self.X)):
            fold_score = self.train_fold(train_idx, val_idx)
            fold_scores.append(fold_score)
            print(f"Fold {fold + 1} validation loss: {fold_score:.4f}")
        
        return np.mean(fold_scores), np.std(fold_scores)
    
    def time_series_cv(self):
        fold_scores = []
        for fold, (train_idx, val_idx) in enumerate(self.time_split.split(self.X)):
            fold_score = self.train_fold(train_idx, val_idx)
            fold_scores.append(fold_score)
            print(f"Time split {fold + 1} validation loss: {fold_score:.4f}")
        
        return np.mean(fold_scores), np.std(fold_scores)

# Run cross validation
cv = CrossValidator(model, X, y)
print("K-Fold Cross Validation:")
mean_score, std_score = cv.k_fold_cv()
print(f"\nMean validation loss: {mean_score:.4f} (±{std_score:.4f})")

print("\nTime Series Cross Validation:")
mean_score, std_score = cv.time_series_cv()
print(f"\nMean validation loss: {mean_score:.4f} (±{std_score:.4f})")

K-Fold Cross Validation:
Fold 1 validation loss: 0.0799
Fold 2 validation loss: 0.0801
Fold 3 validation loss: 0.0603
Fold 4 validation loss: 0.0582
Fold 5 validation loss: 0.0829

Mean validation loss: 0.0723 (±0.0107)

Time Series Cross Validation:
Time split 1 validation loss: 0.0536
Time split 2 validation loss: 0.0592
Time split 3 validation loss: 0.1079
Time split 4 validation loss: 0.0775
Time split 5 validation loss: 0.0822

Mean validation loss: 0.0761 (±0.0192)
