In [11]:
import pandas as pd

# Load the data from the uploaded CSV file
data_path = 'out.csv'
data = pd.read_csv(data_path)

# Display the first few rows of the dataset and its structure
#data.head(), data.info()


In [12]:
from sklearn.preprocessing import StandardScaler, MinMaxScaler
import numpy as np

data['mood_shifted'] = data['mood'].shift(1)
data.sort_values('date', inplace=True)

features = data.drop(['mood', 'id', 'screen', 'Unnamed: 0', 'mood_quantiles', 'date'], axis=1)

target = data['mood']
print(features.head(), features.info())

# Normalize the features
scaler = StandardScaler()
#features_scaled = scaler.fit_transform(features)
features_scaled = scaler.fit_transform(features.fillna(features.mean())) # Function to create sequences


# Convert to numpy arrays
features_np = np.array(features_scaled)
target_np = np.array(target)

# Create sequences for LSTM
def create_sequences(features, target, window_size):
    X, y = [], []
    for i in range(len(features) - window_size):
        X.append(features[i:(i + window_size)])
        y.append(target[i + window_size])
    return np.array(X), np.array(y)

# Assuming we use a window size of 10 days
window_size = 10
X, y = create_sequences(features_np, target_np, window_size)



<class 'pandas.core.frame.DataFrame'>
Index: 1268 entries, 0 to 877
Data columns (total 21 columns):
 #   Column                        Non-Null Count  Dtype  
---  ------                        --------------  -----  
 0   activity                      1268 non-null   float64
 1   circumplex.arousal            1268 non-null   float64
 2   circumplex.valence            1268 non-null   float64
 3   appCat.builtin                1268 non-null   float64
 4   appCat.communication          1268 non-null   float64
 5   appCat.entertainment          1268 non-null   float64
 6   appCat.finance                1268 non-null   float64
 7   appCat.game                   1268 non-null   float64
 8   appCat.office                 1268 non-null   float64
 9   appCat.other                  1268 non-null   float64
 10  appCat.social                 1268 non-null   float64
 11  appCat.travel                 1268 non-null   float64
 12  appCat.unknown                1268 non-null   float64
 13  appCat.ut

In [13]:
import torch
from torch.utils.data import DataLoader, TensorDataset, random_split, Subset
from sklearn.model_selection import TimeSeriesSplit
import numpy as np

# Convert numpy arrays to PyTorch tensors
X_tensor = torch.tensor(X, dtype=torch.float32)
y_tensor = torch.tensor(y, dtype=torch.float32)

# Create a TensorDataset
dataset = TensorDataset(X_tensor, y_tensor)

n_splits = 5
tscv = TimeSeriesSplit(n_splits=n_splits)

# Define the LSTM model class
class LSTMModel(torch.nn.Module):
    def __init__(self, input_dim, hidden_dim, output_dim=1, num_layers=1):
        super(LSTMModel, self).__init__()
        self.lstm = torch.nn.LSTM(input_dim, hidden_dim, num_layers, batch_first=True)
        self.linear = torch.nn.Linear(hidden_dim, output_dim)

    def forward(self, x):
        lstm_out, _ = self.lstm(x)
        out = self.linear(lstm_out[:, -1, :])
        return out

def train_model(model, train_loader, criterion, optimizer, num_epochs=10):
    for epoch in range(num_epochs):
        model.train()
        for X_batch, y_batch in train_loader:
            optimizer.zero_grad()
            y_pred = model(X_batch)
            y_pred = y_pred.squeeze(-1)  # Squeeze the prediction to remove the extra dimension
            loss = criterion(y_pred, y_batch)
            loss.backward()
            optimizer.step()

# Example function to evaluate the model
def evaluate(model, val_loader, criterion):
    model.eval()
    total_loss = 0
    with torch.no_grad():
        for X_batch, y_batch in val_loader:
            y_pred = model(X_batch)
            y_pred = y_pred.squeeze(-1)  # Squeeze the prediction to remove the extra dimension
            loss = criterion(y_pred, y_batch)
            total_loss += loss.item()
    return total_loss / len(val_loader)

# Model parameters (you need to define these correctly based on your problem)
input_dim = 21
hidden_dim = 50
output_dim = 1
batch_size = 32

# LSTM model instance
model = LSTMModel(input_dim, hidden_dim, output_dim)
optimizer = torch.optim.Adam(model.parameters(), lr=0.001)
criterion = torch.nn.MSELoss()

fold = 0
for train_index, val_index in tscv.split(X_tensor):
    train_dataset = Subset(dataset, train_index)
    val_dataset = Subset(dataset, val_index)

    # Create DataLoaders for training and validation sets
    train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=False)
    val_loader = DataLoader(val_dataset, batch_size=batch_size, shuffle=False)

    print(f"Training on {len(train_dataset)} samples, validating on {len(val_dataset)} samples.")

    # Train the model
    train_model(model, train_loader, criterion, optimizer)

    # Evaluate the model on the validation set
    validation_loss = evaluate(model, val_loader, criterion)
    print(f"Validation loss for fold {fold}: {validation_loss}")
    fold += 1


Training on 213 samples, validating on 209 samples.
Validation loss for fold 0: 2.068376302719116
Training on 422 samples, validating on 209 samples.
Validation loss for fold 1: 0.6382670402526855
Training on 631 samples, validating on 209 samples.
Validation loss for fold 2: 0.6080692623342786
Training on 840 samples, validating on 209 samples.
Validation loss for fold 3: 0.6002697135720935
Training on 1049 samples, validating on 209 samples.
Validation loss for fold 4: 0.4795581187520708
