In [None]:
import pandas as pd
import numpy as np
import re
import os
from tqdm.notebook import tqdm

In [None]:
# Show full cell contents and every row whenever a frame or series is displayed.
# NOTE(review): with max_rows=None, displaying a large frame prints all of its rows.
pd.set_option('display.max_colwidth', None)
pd.set_option('display.max_rows', None)

In [None]:
# Load the title/script/score table and keep only rows whose meta_score parses as a number.
data_df = pd.read_hdf("./data/in/title_script_summary_genres_score.h5")
# .copy() detaches the filtered frame from the loaded one, so the column assignments
# below (and in later cells) write to an independent frame instead of a slice and do
# not raise SettingWithCopyWarning.
data_df = data_df[pd.to_numeric(data_df['meta_score'], errors='coerce').notnull()].copy()
data_df['meta_score'] = pd.to_numeric(data_df['meta_score'])
#data_df = data_df[data_df['title'] != 'American Outlaws']

In [None]:
def clean_list(lst):
    """Return the string items of ``lst`` that contain at least one letter.

    ``None``, empty strings, non-string items and strings made only of digits,
    whitespace or punctuation are dropped; order is preserved.

    The letter test uses ``str.isalpha`` per character rather than the character
    class ``[a-öA-ö]``: that class is a code-point range which also covers ASCII
    punctuation such as ``[ \\ ] ^ _ { | } ~`` and the backtick, so items
    without any letter were kept.

    Args:
        lst: iterable of split segments (strings, possibly mixed with ``None``).

    Returns:
        list[str]: the segments that contain a letter.
    """
    return [
        item
        for item in lst
        if isinstance(item, str) and any(ch.isalpha() for ch in item)
    ]

In [None]:
# Split each script at every position that is followed by: a newline, a run of
# characters outside the a-ö range, and then either a newline or ":" plus whitespace.
# NOTE(review): presumably this targets upper-case headings / speaker names — confirm.
# NOTE(review): under Python's re.split semantics the capture group inside the
# lookahead is also emitted as separate items; clean_list is applied next to filter
# the items that carry no text.
data_df['split_script'] = data_df['script'].str.split(r'(?=\n[^a-ö]+(\n|\:\s+))')
data_df['split_script'] = data_df['split_script'].apply(clean_list)

In [None]:
# Collect the titles of scripts that contain at least one segment longer than
# 512 whitespace-separated words; the next cell drops those scripts.
# Each offending row contributes its title once (any() stops at the first long
# segment) instead of once per long segment, and the two columns are walked
# directly rather than through the slower DataFrame.iterrows().
remove_list = [
    title
    for title, segments in zip(data_df['title'], data_df['split_script'])
    if any(len(segment.split()) > 512 for segment in segments)
]

In [None]:
# Keep only the scripts whose title was not collected in remove_list.
data_df = data_df[~data_df['title'].isin(remove_list)]

In [None]:
from transformers import DistilBertTokenizer, DistilBertModel
import torch
import gc

# Run the garbage collector and release cached GPU memory before loading the model.
gc.collect()
torch.cuda.empty_cache()

# Initialize tokenizer and model
# The model is requested in bfloat16 with the "flash_attention_2" attention
# implementation and is moved to the CUDA device unconditionally, so this cell
# needs a GPU.
tokenizer = DistilBertTokenizer.from_pretrained('distilbert-base-uncased')
model = DistilBertModel.from_pretrained('distilbert-base-uncased',
                                        torch_dtype=torch.bfloat16,attn_implementation="flash_attention_2").to(torch.device('cuda'))

In [None]:
def encode(text, model, tokenizer, device):
    """Embed ``text`` as the first-token vector of the model's last hidden state.

    Args:
        text: input passed straight to ``tokenizer``.
        model: callable accepting the tokenizer output as keyword arguments and
            returning an object with a ``last_hidden_state`` tensor.
        tokenizer: callable producing a mapping of tensors; it is asked for
            PyTorch tensors, padding, and truncation to at most 512 tokens.
        device: device the tokenized tensors are moved to before the forward pass.

    Returns:
        The hidden state at sequence position 0 with all size-1 dimensions removed.
    """
    encoded = tokenizer(text, return_tensors="pt", padding=True, truncation=True, max_length=512)
    on_device = {}
    for name, tensor in encoded.items():
        on_device[name] = tensor.to(device)
    # Inference only: no autograd graph is recorded for the forward pass.
    with torch.no_grad():
        hidden_states = model(**on_device).last_hidden_state
    # Position 0 along the sequence axis is the first token ([CLS]).
    first_token = hidden_states[:, 0, :]
    return first_token.squeeze()

def encode_script(row, model, tokenizer, device):
    """Encode every segment in ``row['split_script']`` and attach the vectors to the row.

    Each segment is embedded with ``encode``, copied to the CPU as a NumPy array,
    and the CUDA cache is emptied after every segment. The resulting list is
    stored under ``row['script_vectors']``.

    Args:
        row: mapping-like record with a ``'split_script'`` iterable of segments.
        model, tokenizer, device: forwarded unchanged to ``encode``.

    Returns:
        The same ``row`` object with ``'script_vectors'`` set.
    """
    def _embed_on_host(segment):
        # Embed one segment, move the result off the GPU, then free cached GPU memory.
        embedding = encode(segment, model, tokenizer, device)
        host_array = embedding.cpu().numpy()
        torch.cuda.empty_cache()
        return host_array

    row['script_vectors'] = [_embed_on_host(segment) for segment in row['split_script']]
    return row

In [None]:
# Assuming data_df, model, and tokenizer are defined
# Use the GPU when one is available, otherwise fall back to the CPU.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model.to(device)
# NOTE(review): the model was loaded with torch_dtype=torch.bfloat16 in the cell
# above; half() converts its floating-point parameters to float16.
model.half()  # Convert model to half precision

# Registers the progress_apply method used by the encoding cell below.
tqdm.pandas()

In [None]:
# Run encode_script on every row (axis=1) with a progress bar; each returned row
# carries the extra 'script_vectors' entry.
vectorised_data_df = data_df.progress_apply(lambda x: encode_script(x, model, tokenizer, device), axis=1)

In [None]:
#vectorised_data_df.to_feather('vectorised_data_df.feather')

In [None]:
# Load previously saved vectors from disk; this replaces the frame computed above.
# NOTE(review): the cell that writes this feather file is commented out, so the file
# must already exist for this cell to run.
vectorised_data_df = pd.read_feather('vectorised_data_df.feather') 

In [None]:
# Display the title column (every row is shown because display.max_rows is None).
vectorised_data_df['title']

In [None]:
# Select the rows whose title is 'Matrix, The' and squeeze the row axis
# (this yields a Series when exactly one row matches).
script_series = vectorised_data_df[vectorised_data_df['title'] == 'Matrix, The'].squeeze(axis=0)
#sum_vector = script_series['script_vectors'].sum()
# Every segment vector gets weight 1 except positions 1, 2 and 3, which get weight 0
# and therefore do not contribute to the weighted average.
weights = [1]*len(script_series['script_vectors'])
weights[1] = 0
weights[2] = 0
weights[3] = 0
# NOTE(review): no axis is passed, so this relies on script_vectors being a 1-D
# container of per-segment vectors — confirm the layout after the feather round-trip.
weighted_average_vector = np.average(script_series['script_vectors'], weights=weights)
# Mean of the three vectors that were given weight 0 above.
# NOTE(review): indexing with a list assumes script_vectors is a NumPy array — confirm.
redundant_vector = script_series['script_vectors'][[1, 2, 3]].mean()

In [None]:
# Display the segment at position 5 of the selected script.
script_series['split_script'][5]

In [None]:
from sklearn.metrics.pairwise import cosine_similarity, euclidean_distances

In [None]:
# For every segment of the selected script, compute the cosine similarity of its
# vector to the weighted average vector and to the "redundant" vector, and store the
# ratio of the two keyed by the segment text.
# NOTE(review): identical segment texts share one key (the last occurrence wins), and
# the division fails when redundant_sim is exactly 0.
line_sim_dict = {}
for line, line_vector in list(zip(script_series['split_script'], script_series['script_vectors'])):
    # cosine_similarity works on 2-D inputs, hence the reshape to a single row.
    average_sim = cosine_similarity(line_vector.reshape(1, -1), weighted_average_vector.reshape(1, -1)).item()
    redundant_sim = cosine_similarity(line_vector.reshape(1, -1), redundant_vector.reshape(1, -1)).item()
    line_sim_dict[line] = average_sim/redundant_sim

In [None]:
# Display (segment, ratio) pairs ordered by ratio, largest first.
sorted(line_sim_dict.items(), key = lambda x:x[1], reverse=True)

In [None]:
# Model inputs: the per-script list of segment vectors; targets: the numeric meta score.
X = vectorised_data_df['script_vectors']
y = vectorised_data_df['meta_score']

In [None]:
import torch
from torch.nn.utils.rnn import pad_sequence
import numpy as np
# Display whether PyTorch can use CUDA in this kernel.
torch.cuda.is_available()

In [None]:
# Ensure each inner array is a properly typed NumPy array (not of type object)
# np.vstack stacks one script's segment vectors into a single 2-D float32 array.
standardized_X = [np.vstack(seq).astype(np.float32) for seq in X]

# Convert each NumPy array to a PyTorch tensor
tensor_X = [torch.tensor(seq) for seq in standardized_X]

# Pad sequences
# Shorter scripts are padded with 0.0 up to the longest one; batch_first=True puts
# the script index on the first axis of the result.
padded_X = pad_sequence(tensor_X, batch_first=True, padding_value=0.0)

In [None]:
# Alias for the padded tensor; the windowing call further down reads it under this name.
data = padded_X

In [None]:
import h5py
def create_windows(data, window_size):
    """Generate all stride-1 sliding windows for a single sequence.

    Args:
        data: tensor whose first axis is the position within the sequence.
        window_size: number of consecutive positions per window.

    Returns:
        Tensor of shape ``(n_windows, window_size, *data.shape[1:])`` where
        ``n_windows = data.shape[0] - window_size + 1``. When the sequence is
        shorter than ``window_size`` there are no windows, and an empty tensor
        with ``n_windows == 0`` (same dtype and device as ``data``) is returned
        instead of letting ``torch.stack`` fail on an empty list.
    """
    n_windows = data.shape[0] - window_size + 1
    if n_windows <= 0:
        # Sequence too short for even one window: return a well-shaped empty result.
        return data.new_empty((0, window_size) + tuple(data.shape[1:]))
    return torch.stack([data[i:i+window_size] for i in range(n_windows)])

def expand_labels(labels, window_size, sequence_length):
    """Repeat each label once per window of its sequence.

    Every sequence is taken to have ``sequence_length`` positions and therefore
    ``sequence_length - window_size + 1`` windows; each entry of ``labels`` is
    repeated that many times, keeping the original order.

    Args:
        labels: tensor with one label per sequence.
        window_size: window length used when windowing the sequences.
        sequence_length: common length of every sequence.

    Returns:
        Tensor in which consecutive runs of equal length hold each label.
    """
    repeats_per_sequence = (sequence_length - window_size) + 1
    return torch.repeat_interleave(labels, repeats_per_sequence)

def batched_window_creation_and_save(data, window_size, save_dir, batch_size=10):
    """Window the sequences batch by batch and dump each batch to a raw binary file.

    For every group of ``batch_size`` consecutive sequences, the windows of all
    sequences are concatenated along the first axis and written with
    ``ndarray.tofile`` to ``<save_dir>/batch_<k>.bin``, where ``k`` is the batch
    number. The files carry no shape or dtype information.

    Args:
        data: tensor whose first axis indexes sequences.
        window_size: window length passed to ``create_windows``.
        save_dir: output directory; created when missing.
        batch_size: number of sequences handled per output file.
    """
    os.makedirs(save_dir, exist_ok=True)
    num_sequences = data.shape[0]

    for start_idx in tqdm(range(0, num_sequences, batch_size)):
        batch_data = data[start_idx:min(start_idx + batch_size, num_sequences)]
        all_windows = [create_windows(seq, window_size) for seq in batch_data]
        if not all_windows:
            continue

        # One file per batch, numbered by batch position.
        file_path = os.path.join(save_dir, f'batch_{start_idx//batch_size}.bin')
        torch.cat(all_windows, dim=0).numpy().tofile(file_path)

def batched_window_creation_and_save_compressed(data, window_size, save_dir, batch_size=10):
    """Process data in batches, create windows, and save to disk in HDF5 format with compression.

    All batches go into a single file ``<save_dir>/windows.h5`` (overwritten if it
    exists); batch ``k`` is stored as dataset ``batch_<k>`` using LZF compression
    and chunks of at most 100 windows.

    The feature size used for the chunk shape is read from the window tensor itself
    rather than from a global ``feature_dim`` variable, so the function does not
    depend on notebook state defined in another cell.

    Args:
        data: tensor whose first axis indexes sequences.
        window_size: window length passed to ``create_windows``.
        save_dir: output directory; created when missing.
        batch_size: number of sequences stored per HDF5 dataset.
    """
    os.makedirs(save_dir, exist_ok=True)  # Ensure the directory exists or create it if it doesn't
    num_sequences = data.shape[0]

    # Open an HDF5 file
    hdf5_file = os.path.join(save_dir, 'windows.h5')
    with h5py.File(hdf5_file, 'w') as file:
        for start_idx in tqdm(range(0, num_sequences, batch_size)):
            end_idx = min(start_idx + batch_size, num_sequences)
            batch_data = data[start_idx:end_idx]
            all_windows = [create_windows(seq, window_size) for seq in batch_data]

            if all_windows:
                batch_windows_tensor = torch.cat(all_windows, dim=0)
                # Create a dataset in the HDF5 file with compression and chunking
                dataset_name = f'batch_{start_idx//batch_size}'
                # Chunk = up to 100 whole windows; the last axis comes from the data.
                chunks = (min(batch_windows_tensor.shape[0], 100),
                          window_size,
                          batch_windows_tensor.shape[-1])
                file.create_dataset(dataset_name,
                                    data=batch_windows_tensor.numpy(),
                                    compression="lzf",  # Change compression algorithm as needed
                                    chunks=chunks)  # Specify chunk size

def batched_window_creation_and_save_compressed_no_nulls(data, labels, window_size, save_dir, batch_size=10):
    """Process data in batches, create windows, and save to disk in HDF5 format with compression,
       skipping windows that are all zeros and their corresponding labels.

    ``labels`` must hold one entry per window position of every sequence, in
    sequence order and including the all-zero windows (the layout produced by
    ``expand_labels``). The label cursor therefore advances for every window,
    whether or not the window is kept; advancing it only for kept windows would
    shift the labels of later sequences onto earlier ones.

    Batch ``k`` is written to ``<save_dir>/windows.h5`` as the datasets
    ``batch_<k>_data`` and ``batch_<k>_labels`` (LZF compression, chunks of at most
    100 entries). Labels are stored as int64.

    Args:
        data: tensor whose first axis indexes sequences.
        labels: 1-D tensor of per-window labels as described above.
        window_size: window length passed to ``create_windows``.
        save_dir: output directory; created when missing.
        batch_size: number of sequences stored per pair of HDF5 datasets.

    Raises:
        ValueError: if ``labels`` has fewer entries than there are windows, which
            would otherwise leave saved windows without a label.
    """
    os.makedirs(save_dir, exist_ok=True)  # Ensure the directory exists or create it if it doesn't
    num_sequences = data.shape[0]
    global_label_index = 0  # Position in `labels` of the next window (kept or skipped)
    total_windows_saved = 0
    total_labels_used = 0

    hdf5_file = os.path.join(save_dir, 'windows.h5')
    with h5py.File(hdf5_file, 'w') as file:
        for start_idx in tqdm(range(0, num_sequences, batch_size)):
            end_idx = min(start_idx + batch_size, num_sequences)
            batch_data = data[start_idx:end_idx]
            all_windows = []
            valid_labels = []

            for seq in batch_data:
                windows = create_windows(seq, window_size)
                n_windows = windows.shape[0]
                window_labels = labels[global_label_index:global_label_index + n_windows]
                if window_labels.size(0) != n_windows:
                    raise ValueError(
                        f"labels has {labels.size(0)} entries, but at least "
                        f"{global_label_index + n_windows} are needed to label every window."
                    )
                # Advance past this sequence's labels regardless of how many windows are kept.
                global_label_index += n_windows
                if n_windows == 0:
                    continue

                # A window is kept when any of its values is non-zero.
                keep = (windows != 0).reshape(n_windows, -1).any(dim=1)
                if keep.any():
                    all_windows.append(windows[keep])
                    valid_labels.append(window_labels[keep].to(torch.int64))

            if all_windows:
                batch_windows_tensor = torch.cat(all_windows, dim=0)
                valid_labels_tensor = torch.cat(valid_labels, dim=0)
                total_windows_saved += batch_windows_tensor.shape[0]
                total_labels_used += valid_labels_tensor.shape[0]
                dataset_name = f'batch_{start_idx // batch_size}_data'
                labelset_name = f'batch_{start_idx // batch_size}_labels'
                chunks_data = (min(batch_windows_tensor.shape[0], 100), window_size, batch_data.shape[-1])
                chunks_labels = (min(valid_labels_tensor.shape[0], 100),)
                file.create_dataset(dataset_name,
                                    data=batch_windows_tensor.numpy(),
                                    compression="lzf",
                                    chunks=chunks_data)  # Specify chunk size for data
                file.create_dataset(labelset_name,
                                    data=valid_labels_tensor.numpy(),
                                    compression="lzf",
                                    chunks=chunks_labels)  # Specify chunk size for labels

    # Totals cover all batches, and are defined even when there were no sequences.
    print(f"Finished processing. Total non-zero windows processed: {total_windows_saved}. Total labels used: {total_labels_used}.")

In [None]:
# Example usage:
# The three sizes are read from the padded tensor instead of being typed in by hand:
# expand_labels() relies on sequence_length matching the padded length exactly, and a
# stale constant would silently misalign labels and windows.
num_sequences, sequence_length, feature_dim = padded_X.shape
window_size = 60  # Define your window size
# NOTE(review): absolute, machine-specific output location — adjust per environment.
save_dir = r'D:\saved_windows_compressed_no_all_nulls'  # Define the directory to save the windows



In [None]:
# Create labels
# One meta score per script, in the same row order as X.
labels = torch.tensor(y.to_numpy())

# Expand labels to match the number of windows per original sequence
# Every script is treated as having sequence_length positions, i.e.
# sequence_length - window_size + 1 windows, so each score is repeated that many times.
expanded_labels = expand_labels(labels, window_size, sequence_length)

In [None]:
# Display the number of per-window labels.
expanded_labels.shape

In [None]:

# Create windows and save them to disk
# Writes windows.h5 inside save_dir, skipping windows that contain only zeros.
batched_window_creation_and_save_compressed_no_nulls(data, expanded_labels, window_size, save_dir)


In [None]:
import pickle

In [None]:

from hdf5_dataset import HDF5Dataset

# Build the file path with os.path.join, the same way the writer function does;
# the previous '\windows.h5' literal contained the invalid escape sequence "\w" and
# only produced a valid path on Windows.
dataset = HDF5Dataset(os.path.join(save_dir, 'windows.h5'), batch_ratio=0.25)
# Check that the dataset can be pickled.
# Unpicklable members typically surface as TypeError or AttributeError rather than
# PicklingError, so all three are reported instead of crashing the cell.
try:
    pickle.dumps(dataset)
    print("Dataset is picklable!")
except (pickle.PicklingError, TypeError, AttributeError) as e:
    print("Dataset is not picklable:", e)

In [None]:
# Display the number of items the dataset exposes.
len(dataset)

In [None]:
from torch.utils.data import TensorDataset, DataLoader, random_split

In [None]:
%%time
#targets = torch.tensor(y.to_numpy())  # Dummy target labels
#targets = expanded_labels
#print(padded_X_windowed.shape, targets.shape)
# Create a TensorDataset
#dataset = TensorDataset(padded_X_windowed, targets)



# Total number of samples in your dataset
n_samples = len(dataset)

# Define split sizes
train_size = int(n_samples * 0.7)
val_size = int(n_samples * 0.2)
test_size = n_samples - train_size - val_size  # Ensures all samples are used

# Split the dataset
# A seeded generator makes the train/val/test membership reproducible across
# kernel restarts.
split_generator = torch.Generator().manual_seed(42)
train_dataset, val_dataset, test_dataset = random_split(
    dataset, [train_size, val_size, test_size], generator=split_generator)

# You can now create DataLoader instances to easily batch your data during training/testing
train_loader = DataLoader(train_dataset, batch_size=1, shuffle=True, num_workers=0)
val_loader = DataLoader(val_dataset, batch_size=1, shuffle=False, num_workers=0)
test_loader = DataLoader(test_dataset, batch_size=1, shuffle=False, num_workers=0)

# Example: Access a batch from train_loader
# The loop variables are deliberately not called `data` / `labels`: those names hold
# the padded input tensor and the label tensor used by earlier cells, and rebinding
# them here would break re-running those cells.
for sample_data, sample_labels in train_loader:
    print(sample_data.shape)
    print('Data:', sample_data)
    print('Labels:', sample_labels)
    break  # Only show one batch for brevity

In [None]:
import torch.nn as nn
import torch.nn.functional as F

class MultiLayerLSTM(nn.Module):
    """Stacked LSTM (optionally bidirectional) with a linear head producing one value per sequence.

    The head reads the top-layer output at the last time step. For a bidirectional
    LSTM it reads the forward half at the last time step concatenated with the
    backward half at the first time step. The output is returned unscaled.
    """

    def __init__(self, input_dim, hidden_dim, num_layers, bidirectional=False):
        """Build the LSTM (batch-first) and the single-output linear layer.

        Args:
            input_dim: feature size of each time step.
            hidden_dim: hidden size per direction.
            num_layers: number of stacked LSTM layers.
            bidirectional: whether to run the LSTM in both directions.
        """
        super().__init__()
        self.lstm = nn.LSTM(
            input_dim,
            hidden_dim,
            num_layers=num_layers,
            batch_first=True,
            bidirectional=bidirectional,
        )
        # A bidirectional LSTM emits forward and backward features side by side.
        num_directions = 2 if bidirectional else 1
        self.classifier = nn.Linear(hidden_dim * num_directions, 1)

    def forward(self, x):
        """Return one prediction per sequence for a ``(batch, time, features)`` input."""
        sequence_output, _ = self.lstm(x)
        final_step = sequence_output[:, -1, :]
        if not self.lstm.bidirectional:
            return self.classifier(final_step)

        hidden = self.lstm.hidden_size
        # Forward direction finishes at the last step, backward direction at the first.
        forward_summary = final_step[:, :hidden]
        backward_summary = sequence_output[:, 0, hidden:]
        return self.classifier(torch.cat((forward_summary, backward_summary), dim=-1))

In [None]:
# Assuming input_size, hidden_size, and output_size are known
input_size = 768  # This depends on your input feature size
hidden_size = 500  # You can adjust this size
hidden_layer_size = 3  # Passed as num_layers: the number of stacked LSTM layers

# NOTE(review): this rebinds `model`, the name earlier cells use for the DistilBERT encoder.
model = MultiLayerLSTM(input_size, hidden_size, hidden_layer_size)
# Mean squared error between predicted and actual score.
criterion = nn.MSELoss()
# NOTE(review): lr=0.5 is far above Adam's default of 1e-3 — confirm it is intended.
optimizer = torch.optim.Adam(model.parameters(), lr=0.5)

In [None]:
# Release cached GPU memory before training starts.
torch.cuda.empty_cache()

In [None]:
from torch.optim.lr_scheduler import ReduceLROnPlateau
# Multiply the learning rate by 0.5 once the value passed to scheduler.step() has not
# decreased ('min' mode) for more than 5 consecutive calls.
# NOTE(review): `verbose` is reported as deprecated by recent PyTorch releases —
# confirm against the installed version.
scheduler = ReduceLROnPlateau(optimizer, 'min', patience=5, factor=0.5, verbose=True)

In [None]:
def train_model(model, train_loader, val_loader, criterion, optimizer, scheduler, num_epochs=10):
    """Train ``model`` and report train/validation loss after every epoch.

    The model is moved to CUDA when available, otherwise to the CPU. For each
    loader item the leading dimension is removed with ``squeeze(0)`` before the
    forward pass (assumes the loaders yield a leading dimension of size 1 — TODO
    confirm against the dataset) and the labels are reshaped to a column of floats.
    Losses are weighted by the loader batch size and averaged over the loader's
    dataset length. After each epoch the scheduler is stepped with the
    validation loss.

    Args:
        model: network to optimise.
        train_loader, val_loader: iterables of ``(data, labels)`` tensors.
        criterion: loss callable taking ``(outputs, labels)``.
        optimizer: optimiser bound to ``model``'s parameters.
        scheduler: scheduler whose ``step`` accepts the validation loss.
        num_epochs: number of passes over ``train_loader``.
    """
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    model = model.to(device)

    def _to_device(batch, targets):
        # Move one loader item to the training device; labels become an (N, 1) float column.
        batch = batch.to(device, non_blocking=True)
        targets = targets.to(device, non_blocking=True).float().view(-1, 1)
        return batch, targets

    for epoch in range(num_epochs):
        # Training phase
        model.train()
        running_train = 0.0
        for batch, targets in tqdm(train_loader):
            optimizer.zero_grad()
            batch, targets = _to_device(batch, targets)
            loss = criterion(model(batch.squeeze(0)), targets)
            loss.backward()
            optimizer.step()
            running_train += loss.item() * batch.size(0)
        train_loss = running_train / len(train_loader.dataset)

        # Validation phase
        model.eval()
        running_val = 0.0
        with torch.no_grad():
            for batch, targets in tqdm(val_loader):
                batch, targets = _to_device(batch, targets)
                predictions = model(batch.squeeze(0))
                running_val += criterion(predictions, targets).item() * batch.size(0)
        val_loss = running_val / len(val_loader.dataset)

        print(f'Epoch {epoch+1}, Train Loss: {train_loss:.4f}, Val Loss: {val_loss:.4f}')

        # Step the scheduler with validation loss
        scheduler.step(val_loss)

# Make sure you create the model, criterion, optimizer, and scheduler before

In [None]:
import matplotlib.pyplot as plt
def evaluate_and_plot_model(model, test_loader):
    """Compute the average MSE on ``test_loader`` and scatter-plot predictions against labels.

    The device is chosen the same way as in ``train_model`` (CUDA when available,
    otherwise CPU) instead of requiring a GPU. Each loader item has its leading
    dimension removed with ``squeeze(0)`` before the forward pass, and labels are
    reshaped to a column of floats.

    Args:
        model: trained network to evaluate.
        test_loader: iterable of ``(data, labels)`` tensors.

    Returns:
        tuple: ``(average_loss, all_predictions, all_labels)`` where the last two
        are flat lists of the predicted and actual values.
    """
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    model = model.to(device)
    model.eval()  # Set the model to evaluation mode
    mse_loss = torch.nn.MSELoss()  # Initialize mean squared error loss
    total_loss = 0.0
    total_samples = 0

    all_predictions = []
    all_labels = []

    with torch.no_grad():
        for data, labels in tqdm(test_loader):
            data, labels = data.to(device, non_blocking=True), labels.to(device, non_blocking=True).float().view(-1, 1)
            outputs = model(data.squeeze(0))
            loss = mse_loss(outputs, labels)
            total_loss += loss.item() * data.size(0)  # Multiply loss by batch size for accurate total
            total_samples += data.size(0)

            all_predictions.extend(outputs.view(-1).cpu().numpy())
            all_labels.extend(labels.view(-1).cpu().numpy())

    average_loss = total_loss / total_samples
    print(f'Average MSE Loss: {average_loss}')

    # Plotting predictions vs actual labels
    plt.figure(figsize=(10, 6))
    plt.scatter(all_labels, all_predictions, alpha=0.5)
    plt.title('Predictions vs Actual Labels')
    plt.xlabel('Actual Labels')
    plt.ylabel('Predictions')
    plt.grid(True)
    plt.plot([min(all_labels), max(all_labels)], [min(all_labels), max(all_labels)], 'k--')  # A reference line for perfect predictions
    plt.show()

    return average_loss, all_predictions, all_labels

In [None]:
# Training the model
# Runs 15 epochs; the per-epoch train and validation losses are printed by train_model.
train_model(model, train_loader, val_loader, criterion, optimizer, scheduler, num_epochs=15)


In [None]:
# Evaluating the model
# evaluate_and_plot_model returns (average_loss, predictions, labels); unpack the tuple
# so that average_loss really is the scalar loss.
average_loss, all_predictions, all_labels = evaluate_and_plot_model(model, test_loader)

In [None]:
from torchviz import make_dot

# Assume 'model' is your LSTM model and 'train_loader' is your DataLoader
# Use the GPU when available, otherwise the CPU (same rule as train_model).
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model = model.to(device)
model.eval()  # Set the model to evaluation mode

# Get a batch of data; for visualization, one batch is enough
for inputs, targets in train_loader:
    inputs = inputs.to(device)  # Move inputs to the appropriate device
    # Drop the loader's leading dimension before the forward pass, exactly as the
    # training and evaluation loops do; the LSTM cannot take the 4-D loader batch.
    outputs = model(inputs.squeeze(0))     # Forward pass
    make_dot(outputs, params=dict(list(model.named_parameters()))).render("lstm_model", format="png")  # Visualize the model
    break  # We only need one batch for visualization