In [None]:
import pandas as pd
import numpy as np
from docx import Document

# List of video names we’ll be processing
video_names = ['B', 'C', 'D', 'E', 'F']

# Lists to store the retention and script data for each video
retention_data_list = []
script_data_list = []

# Function to extract the text from a .docx file
def extract_text_from_docx(docx_filename):
    doc = Document(docx_filename)
    full_text = []
    for para in doc.paragraphs:
        full_text.append(para.text)
    return "\n".join(full_text)

# Now let’s go through each video in the list
for video in video_names:
    retention_file_path = f'your_path/VIDEO {video}.xlsx'
    script_file_path = f'your_path/VIDEO {video}.docx'

    retention_data = pd.read_excel(retention_file_path).iloc[:100, :2]
    retention_data_list.append(retention_data)

    script_text = extract_text_from_docx(script_file_path)
    script_data_list.append(script_text)

Data Preprocessing

In [None]:
# Function to split the script text into smaller parts
def split_script_into_parts(script_text, num_parts=100):
    part_length = len(script_text) // num_parts
    script_parts = []
    
    # Loop through the number of parts and slice the script accordingly
    for i in range(num_parts):
        start_index = i * part_length
        end_index = (i + 1) * part_length if i < num_parts - 1 else len(script_text)
        script_parts.append(script_text[start_index:end_index])

    return script_parts

# Now, let’s process each script text
split_scripts_list = []

for script_text in script_data_list:
    split_script = split_script_into_parts(script_text)
    split_scripts_list.append(split_script)

In [None]:
from transformers import DistilBertTokenizer, DistilBertModel
import torch

# Loading the pre-trained DistilBERT tokenizer/model.
tokenizer = DistilBertTokenizer.from_pretrained('distilbert-base-uncased')
e_model = DistilBertModel.from_pretrained('distilbert-base-uncased')

In [None]:
# Function to get embeddings for a given script part
def get_embeddings(script_part):
    inputs = tokenizer(script_part, return_tensors='pt', truncation=True, padding=True, max_length=512)
    with torch.no_grad():
        outputs = e_model(**inputs)
    embeddings = outputs.last_hidden_state.mean(dim=1)  
    return embeddings.numpy()

In [None]:
embeddings_list = []

# Iterate over each script in the split_scripts_list
for split_script in split_scripts_list:
    video_embeddings = []
    # Iterate over each part in the split script
    for part in split_script:
        embedding = get_embeddings(part)
        video_embeddings.append(embedding)
    embeddings_list.append(np.array(video_embeddings))

In [None]:
# Loop through each video's embeddings in the embeddings_list
for i, video_embeddings in enumerate(embeddings_list):
    print(f"Video {video_names[i]} has {len(video_embeddings)} embeddings.")
    if len(video_embeddings) != 100:
        print(f"Error: Video {video_names[i]} does not have 100 parts!")

Video B has 100 embeddings.
Video C has 100 embeddings.
Video D has 100 embeddings.
Video E has 100 embeddings.
Video F has 100 embeddings.


In [None]:
# Function to combine embeddings with video position data
def combine_embeddings_with_position(embeddings, retention_data):
    combined_features = []

    for i, embedding in enumerate(embeddings):
        position = retention_data.iloc[i, 0] 
        flattened_embedding = embedding.flatten()

        combined_vector = np.concatenate([flattened_embedding, [position]])

        combined_features.append(combined_vector)

    return np.array(combined_features)

final_dataset_list = []

# Iterate through each video's embeddings and combine with its retention data
for i, video_embeddings in enumerate(embeddings_list):
    retention_data = retention_data_list[i].iloc[:100, :2]  

    combined_data = combine_embeddings_with_position(video_embeddings, retention_data)

    final_dataset_list.append(combined_data)

In [40]:
from sklearn.model_selection import train_test_split

X_all = []
Y_all = []

# Loop through each video data in the final dataset list
for i, video_data in enumerate(final_dataset_list):
    retention_data = retention_data_list[i].iloc[:100, 1]  
    X_all.append(video_data)
    Y_all.append(retention_data.values)

# Concatenate the lists to create the complete feature matrix (X_all) and target vector (Y_all)
X_all = np.concatenate(X_all, axis=0)  
Y_all = np.concatenate(Y_all, axis=0)  

# Split the dataset into training and testing sets (80% training, 20% testing)
X_train, X_test, Y_train, Y_test = train_test_split(X_all, Y_all, test_size=0.2, random_state=42)

Model training

In [None]:
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score

# Evaluation function to assess model performance
def eval(Y_test, Y_pred):
    model.fit(X_train, Y_train)
    Y_pred = model.predict(X_test)
    mae = mean_absolute_error(Y_test, Y_pred)
    mse = mean_squared_error(Y_test, Y_pred)
    r2 = r2_score(Y_test, Y_pred)
    mean_target = Y_test.mean()
    mean_squared_target = (Y_test ** 2).mean()
    mae_percentage = (mae / mean_target) * 100 
    mse_percentage = (mse / mean_squared_target) * 100 
    
    # Print the results of model evaluation
    print(f"Model: {type(model).__name__}")
    print(f"Mean Absolute Error (MAE): {mae:.4f}")
    print(f"Mean Squared Error (MSE): {mse:.4f}")
    print(f"R-squared (R²): {r2:.4f}")
    print(f"MAE as Percentage of Mean Target: {mae_percentage:.2f}%")
    print(f"MSE as Percentage of Mean Target Squared: {mse_percentage:.2f}%")

In [None]:
from sklearn.ensemble import GradientBoostingRegressor

# Instantiate the GradientBoostingRegressor model with 100 estimators (trees)
model = GradientBoostingRegressor(n_estimators=100, random_state=42)

# This trains the model to learn the patterns between features and target values
model.fit(X_train, Y_train)
Y_pred = model.predict(X_test)

# Call the eval function to evaluate the model's performance on the test set
eval(Y_test, Y_pred)

Model: GradientBoostingRegressor
Mean Absolute Error (MAE): 1.7302
Mean Squared Error (MSE): 5.4279
R-squared (R²): 0.9578
MAE as Percentage of Mean Target: 2.82%
MSE as Percentage of Mean Target Squared: 0.14%


Prediction for Video A


In [None]:
# Define the file paths
video_a_retention_path = 'your_path/VIDEO A.xlsx'
video_a_script_path = 'your_path/VIDEO A.docx'

video_a_retention = pd.read_excel(video_a_retention_path).iloc[:100, :2]

# Extract the script text from the Word document (video script)
video_a_script_text = extract_text_from_docx(video_a_script_path)
video_a_script_parts = split_script_into_parts(video_a_script_text)

In [None]:
video_a_embeddings = []

# Loop through each part of the script for 'Video A'
for part in video_a_script_parts:
    embedding = get_embeddings(part)
    video_a_embeddings.append(embedding)

video_a_embeddings = np.array(video_a_embeddings).squeeze()
print(f"Video A embeddings shape: {video_a_embeddings.shape}")

Video A embeddings shape: (100, 768)


In [None]:
# Extract the 'Video position (%)' values from the retention data and reshape it into a 2D array
video_a_positions = video_a_retention['Video position (%)'].values.reshape(-1, 1)
# Combine the embeddings and position data horizontally to create the feature set for prediction
video_a_features = np.hstack((video_a_embeddings, video_a_positions))

Final feature set shape: (100, 769)


In [None]:
# Use the trained model to predict retention percentages for 'Video A' based on the combined features
video_a_predictions = model.predict(video_a_features)
print(f"Predicted retention percentages for Video A (first 10):\n{video_a_predictions[:10]}")

Predicted retention percentages for Video A (first 10):
[98.51392039 87.4613551  83.23214272 82.95397661 79.43626736 79.73905597
 73.76885889 79.20397959 76.47252405 79.26680629]


New intro with GPT 4

In [None]:
from docx import Document

# Function to write content to a .docx file
def write_to_docx(file_path, content):
    doc = Document()
    
    for paragraph in content:
        doc.add_paragraph(paragraph)
    
    doc.save(file_path)

# Define the file path
file_path = "new.docx" 
video_a_retention_path = 'your_path/VIDEO A.xlsx'
video_a_script_path = 'your_path/new.docx'

# Read the first 4 rows of retention data from the Excel file
video_a_retention = pd.read_excel(video_a_retention_path).iloc[:4, :2]

In [None]:
from openai import OpenAI

# Function to call the OpenAI API and generate a response for the prompt
def llm_call(prompt):
    openai = OpenAI(
    api_key= 'your_key')

    message = [
        {"role":"user", "content": prompt}
    ]

    # Generate the response from GPT-3.5 model using the provided prompt
    response= openai.chat.completions.create(
        messages= message,
        model= "gpt-4"
    )

    return response.choices[0].message.content

# Initial video script (intro) to be improved
full_text = '''Embarrassingly Dumb Ways People Died [Part 8]
We’d all like to be remembered for something great. But, surprisingly often, incredibly dumb folks leave this world with a parting tale they’d probably prefer to be forgotten. That said, some of these spectacularly dim-witted, accidental exit strategies are irresistibly entertaining to hear about. So, without further ado, let’s check out some of the most Embarrassingly Dumb Ways People Died.
'''

# Define the prompt to instruct GPT to improve the intro
prompt = f"""The goal is to improve this video script intro to increase audience retention. Focus on:
- Capturing attention within the first 10 seconds.
- Using clear, vivid, and relatable language.
- Creating curiosity or excitement about the content.
- Ensuring the tone matches the audience (e.g., humorous, dramatic, or suspenseful).

Here is the current intro:
{full_text}

Rewrite the intro to maximize engagement and retention.
"""

# Initial old retention data to compare new retention against
r_into= np.array([98.51392039, 87.4613551,  83.23214272, 82.95397661])

# Loop through 60 iterations to improve the script and evaluate retention
for i in range(60):
    print(f"Ititration {i+1}\n")
    split_scripts_list = []
    video_a_embeddings = []
    # Call the OpenAI model to rewrite the script intro
    result= llm_call(prompt)
    print("Result= ", result)
    with open('content.txt', 'a', encoding="utf-8") as file:
        file.write(result)
    write_to_docx(file_path, [result])
    video_a_script_text = extract_text_from_docx(video_a_script_path)
    video_a_script_parts = split_script_into_parts(video_a_script_text, 4)   
    # Generate embeddings for each script part 
    for part in video_a_script_parts:
        embedding = get_embeddings(part)
        video_a_embeddings.append(embedding)
    video_a_embeddings = np.array(video_a_embeddings).squeeze()
    video_a_positions = video_a_retention['Video position (%)'].values.reshape(-1, 1)
    video_a_features = np.hstack((video_a_embeddings, video_a_positions))
    new= model.predict(video_a_features)
    # Compare the new retention with the old to check if the new script is an improvement
    if (r_into<new).all():
        print(f"**************Found better script with retention {new}**************")
        break
    else:
        print(f"Bad Retention {new}\n\n")

Ititration 1

Result=  Get ready to cringe, laugh, and maybe shed a tear for humanity's lack of common sense. Welcome to Part 8 of Embarrassingly Dumb Ways People Died. From epic fails to mind-boggling mishaps, you won't believe how these individuals managed to meet their unlikely fate. So, grab your popcorn and get ready for a wild ride as we dive into some of the most outrageous stories of accidental exits. Let's unravel the mystery of how the not-so-bright meet their untimely end.
Bad Retention [97.67611867 86.71235258 85.1554384  81.68914314]


Ititration 2

Result=  Get ready to cringe, laugh, and shake your head in disbelief as we dive into the world of the most Embarrassingly Dumb Ways People Died [Part 8]. We all strive to leave behind a legacy, but some individuals choose the path of ridiculousness in their final moments. From mind-boggling mishaps to jaw-dropping blunders, get ready to be both entertained and baffled. So grab a seat and prepare to be amazed by the incredibly 

New prediction for Video A

In [None]:
# Define file paths
video_a_retention_path = 'your_path/VIDEO A.xlsx'
video_a_script_path = 'your_path/with new intro.docx'

video_a_retention = pd.read_excel(video_a_retention_path).iloc[:100, :2]
video_a_script_text = extract_text_from_docx(video_a_script_path)
video_a_script_parts = split_script_into_parts(video_a_script_text)
video_a_embeddings = []

# Loop through each script part to generate embeddings
for part in video_a_script_parts:
    embedding = get_embeddings(part)
    video_a_embeddings.append(embedding)
video_a_embeddings = np.array(video_a_embeddings).squeeze()
video_a_positions = video_a_retention['Video position (%)'].values.reshape(-1, 1)
video_a_features = np.hstack((video_a_embeddings, video_a_positions))

# Use the trained machine learning model to predict retention percentages based on the features
video_a_predictions = model.predict(video_a_features)
print(f"Predicted retention percentages for Video new intro (first 10):\n{video_a_predictions[:7]}")

Predicted retention percentages for Video new intro (first 10):
[101.76653421  89.45836529  84.08765851  83.94456804  80.08805334
  78.63243904  79.61561589]
