In [1]:
import pandas as pd
from docx import Document

# Identifiers of the training videos; each one has a matching
# "VIDEO <id>.xlsx" (retention) and "VIDEO <id>.docx" (script) file.
video_names = ['B', 'C', 'D', 'E', 'F']

# Per-video inputs, appended in the same order as video_names.
retention_data_list = []
script_data_list = []

def extract_text_from_docx(docx_filename):
    """Return the text of every paragraph in a .docx file, joined by newlines."""
    paragraphs = Document(docx_filename).paragraphs
    return "\n".join(paragraph.text for paragraph in paragraphs)

# Load the retention spreadsheet and the script document of each training video.
for video in video_names:
    retention_file_path = f'C:/Users/bibas/Downloads/GenAI/pytorch/bam/t_2/VIDEO {video}.xlsx'
    script_file_path = f'C:/Users/bibas/Downloads/GenAI/pytorch/bam/t_2/VIDEO {video}.docx'

    # Only the first 100 rows and the first two columns of the sheet are kept.
    retention_data_list.append(pd.read_excel(retention_file_path).iloc[:100, :2])
    script_data_list.append(extract_text_from_docx(script_file_path))

In [2]:
# Peek at the first ten retention rows of the first loaded video (index 0 is VIDEO B).
print("Retention Data for VIDEO B:")
print(retention_data_list[0].head(10))

Retention Data for VIDEO B:
  Video position (%) Absolute audience retention (%)
0                  0                          103.63
1                  1                           87.91
2                  2                           84.54
3                  3                           81.15
4                  4                           81.26
5                  5                              80
6                  6                           80.35
7                  7                           79.02
8                  8                           79.04
9                  9                           78.65


In [3]:
# Show the retention table and the first 500 characters of the script
# for the second loaded video (index 1 is VIDEO C).
print(retention_data_list[1]) 
print(script_data_list[1][:500])  


   Video position (%) Absolute audience retention (%)
0                   0                          103.97
1                   1                           89.17
2                   2                           85.68
3                   3                           84.62
4                   4                           83.14
..                ...                             ...
95                 95                           52.08
96                 96                            51.9
97                 97                           51.53
98                 98                           44.35
99                 99                           29.98

[100 rows x 2 columns]
Embarrassingly Dumb Ways People Died - Darwin Awards Winners Part 2
Natural selection is mother nature’s way of making sure people get smarter - but even after 200,000 years of human evolution, some of us are still making really terrible choices. Let’s take a look at some of the most foolish examples now.
20. Way Too Hot Sprin

Data Preprocessing

In [4]:
import numpy as np

def split_script_into_parts(script_text, num_parts=100):
    """Cut ``script_text`` into ``num_parts`` consecutive character chunks.

    Every chunk is ``len(script_text) // num_parts`` characters long, except
    the last one, which also receives whatever remainder is left over.
    Chunks are cut by character count, so words may be split across chunks.
    """
    chunk_size = len(script_text) // num_parts

    # Start offsets of each chunk, followed by the end of the text so the
    # final chunk runs to the last character.
    boundaries = [index * chunk_size for index in range(num_parts)]
    boundaries.append(len(script_text))

    return [script_text[start:stop] for start, stop in zip(boundaries, boundaries[1:])]

# One list of script chunks per training video, in the order of script_data_list.
split_scripts_list = [
    split_script_into_parts(script_text) for script_text in script_data_list
]

In [5]:
# Preview: the first five chunks (truncated to 100 characters) of every video.
for video_name, parts in zip(video_names, split_scripts_list):
    print(f"Video {video_name} - First 5 parts of the script:")
    for part_number, part in enumerate(parts[:5], start=1):
        print(f"Part {part_number}: {part[:100]}...")
    print("\n" + "-"*50 + "\n")


Video B - First 5 parts of the script:
Part 1: When Charles Darwin developed his Theory of Evolution he discovered animals bred selectively, so the...
Part 2: the species were more likely to mate and pass on their DNA. Darwin called this Natural Selection. Bu...
Part 3: f so stupid they actually die, and so no longer have the ability to pass their DNA to future generat...
Part 4:  the Darwin Awards were created as a tongue in cheek way to remember those people who have removed t...
Part 5: rough their own stupidity. Coming up, are the top 20 dumbest of the dumb Darwin Award winners.

20. ...

--------------------------------------------------

Video C - First 5 parts of the script:
Part 1: Embarrassingly Dumb Ways People Died - Darwin Awards Winners Part 2
Natural selection is mother natu...
Part 2: - but even after 200,000 years of human evolution, some of us are still making really terrible choic...
Part 3: oolish examples now.
20. Way Too Hot Spring
Have you ever fancied a dip in a 

In [7]:
from transformers import DistilBertTokenizer, DistilBertModel
import torch

tokenizer = DistilBertTokenizer.from_pretrained('distilbert-base-uncased')
e_model = DistilBertModel.from_pretrained('distilbert-base-uncased')

  from .autonotebook import tqdm as notebook_tqdm


In [8]:
def get_embeddings(script_part):
    """Embed one piece of text as the mean of the model's last hidden states.

    The text is tokenized (truncated to at most 512 tokens), passed through
    ``e_model`` with gradient tracking disabled, and averaged over the token
    axis. The result is returned as a NumPy array.
    """
    encoded = tokenizer(
        script_part,
        return_tensors='pt',
        truncation=True,
        padding=True,
        max_length=512,
    )

    with torch.no_grad():
        hidden_states = e_model(**encoded).last_hidden_state

    # Average across tokens (dim=1) to get one vector per input text.
    return hidden_states.mean(dim=1).numpy()

In [9]:
# For every video, embed each script chunk and stack the results into one array.
embeddings_list = [
    np.array([get_embeddings(part) for part in split_script])
    for split_script in split_scripts_list
]

In [10]:
# Sanity check on the embedding produced for the first chunk of the first video.
first_video_embeddings = embeddings_list[0]
first_part_shape = first_video_embeddings[0].shape
print("Checking the shape of the first embedding for the first part of the first video:")
print(f"Embedding shape for the first part: {first_part_shape}")

Checking the shape of the first embedding for the first part of the first video:
Embedding shape for the first part: (1, 768)


In [11]:
# Every video is expected to contribute exactly 100 chunk embeddings.
for video_name, video_embeddings in zip(video_names, embeddings_list):
    embedding_count = len(video_embeddings)
    print(f"Video {video_name} has {embedding_count} embeddings.")
    if embedding_count != 100:
        print(f"Error: Video {video_name} does not have 100 parts!")

Video B has 100 embeddings.
Video C has 100 embeddings.
Video D has 100 embeddings.
Video E has 100 embeddings.
Video F has 100 embeddings.


In [12]:
def combine_embeddings_with_position(embeddings, retention_data):
    """Append the video position to each chunk embedding.

    For the i-th embedding, the value in row i of the first column of
    ``retention_data`` is appended to the flattened embedding. The rows are
    returned stacked in a single NumPy array.
    """
    feature_rows = [
        np.concatenate([vector.flatten(), [retention_data.iloc[row, 0]]])
        for row, vector in enumerate(embeddings)
    ]
    return np.array(feature_rows)

# Feature matrix per video: chunk embeddings plus the position column taken
# from the first 100 rows of that video's retention table.
final_dataset_list = [
    combine_embeddings_with_position(
        video_embeddings,
        retention_data_list[i].iloc[:100, :2],
    )
    for i, video_embeddings in enumerate(embeddings_list)
]

In [13]:
# Report how many per-video feature matrices were built and the shape of
# the first and last one.
print(f"Number of videos in final dataset: {len(final_dataset_list)}")

print(f"Shape of combined data for first video: {final_dataset_list[0].shape}")

print(f"Shape of combined data for last video: {final_dataset_list[-1].shape}")

Number of videos in final dataset: 5
Shape of combined data for first video: (100, 769)
Shape of combined data for last video: (100, 769)


In [14]:
from sklearn.model_selection import train_test_split

# Stack the per-video feature matrices into one design matrix.
X_all = np.concatenate(final_dataset_list, axis=0)

# Targets: second column of the first 100 retention rows of each video,
# in the same video order as the features.
Y_all = np.concatenate(
    [retention_data_list[i].iloc[:100, 1].values for i in range(len(final_dataset_list))],
    axis=0,
)

# NOTE(review): this is a random row-level split over the stacked rows, so
# rows of the same video can land in both the train and the test set.
# Confirm that is intended rather than holding out whole videos.
X_train, X_test, Y_train, Y_test = train_test_split(X_all, Y_all, test_size=0.2, random_state=42)

for split_name, split_values in (
    ("X_train", X_train),
    ("X_test", X_test),
    ("Y_train", Y_train),
    ("Y_test", Y_test),
):
    print(f"{split_name} shape: {split_values.shape}")

X_train shape: (400, 769)
X_test shape: (100, 769)
Y_train shape: (400,)
Y_test shape: (100,)


Model training

In [15]:
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score

def eval(Y_test, Y_pred, model_name=None):
    """Print regression metrics for predictions that were already computed.

    Unlike the earlier version, this function scores exactly the arrays it is
    given: it no longer refits the notebook-level ``model`` or re-predicts on
    the global ``X_test``, so it has no training side effect and can score any
    pair of targets and predictions.

    Parameters
    ----------
    Y_test : array-like
        Ground-truth target values.
    Y_pred : array-like
        Predictions for the same rows, in the same order as ``Y_test``.
    model_name : str, optional
        Label printed on the "Model:" line. When omitted, the class name of
        the notebook-level ``model`` object is used if one exists.

    Returns
    -------
    None
        The metrics are printed, not returned.

    Notes
    -----
    The function name shadows the ``eval`` builtin; it is kept because
    existing cells call it under this name.
    """
    if model_name is None:
        # Preserve the original "Model: <class name>" line without making a
        # global ``model`` a hard requirement.
        fitted_model = globals().get("model")
        model_name = type(fitted_model).__name__ if fitted_model is not None else "unknown"

    # Accept plain lists as well as arrays; .mean() below needs an ndarray.
    Y_test = np.asarray(Y_test, dtype=float)

    mae = mean_absolute_error(Y_test, Y_pred)
    mse = mean_squared_error(Y_test, Y_pred)
    r2 = r2_score(Y_test, Y_pred)

    # Scale-free versions of the errors, relative to the size of the target.
    mean_target = Y_test.mean()
    mean_squared_target = (Y_test ** 2).mean()

    mae_percentage = (mae / mean_target) * 100
    mse_percentage = (mse / mean_squared_target) * 100

    print(f"Model: {model_name}")
    print(f"Mean Absolute Error (MAE): {mae:.4f}")
    print(f"Mean Squared Error (MSE): {mse:.4f}")
    print(f"R-squared (R²): {r2:.4f}")
    print(f"MAE as Percentage of Mean Target: {mae_percentage:.2f}%")
    print(f"MSE as Percentage of Mean Target Squared: {mse_percentage:.2f}%")

In [16]:
from sklearn.ensemble import GradientBoostingRegressor

# Fit a gradient-boosted regressor on the training rows, then score its
# predictions on the held-out rows.
model = GradientBoostingRegressor(n_estimators=100, random_state=42).fit(X_train, Y_train)

Y_pred = model.predict(X_test)
eval(Y_test, Y_pred)

Model: GradientBoostingRegressor
Mean Absolute Error (MAE): 1.7302
Mean Squared Error (MSE): 5.4279
R-squared (R²): 0.9578
MAE as Percentage of Mean Target: 2.82%
MSE as Percentage of Mean Target Squared: 0.14%


Prediction for Video A


In [17]:
# Inputs for the video whose retention is to be predicted.
video_a_retention_path = 'C:/Users/bibas/Downloads/GenAI/pytorch/bam/t_2/VIDEO A.xlsx'
video_a_script_path = 'C:/Users/bibas/Downloads/GenAI/pytorch/bam/t_2/VIDEO A.docx'

# Same preprocessing as the training videos: first 100 rows, first two
# columns of the sheet, and the script cut into the default number of chunks.
video_a_retention = pd.read_excel(video_a_retention_path).iloc[:100, :2]
video_a_script_text = extract_text_from_docx(video_a_script_path)
video_a_script_parts = split_script_into_parts(video_a_script_text)

print(f"Retention data (first 7 rows):\n{video_a_retention.head(7)}")
for i, script_part in enumerate(video_a_script_parts[:6]):
    print(f"Script part {i}:\n{script_part}")


Retention data (first 7 rows):
  Video position (%) Absolute audience retention (%)
0                  0                             NaN
1                  1                             NaN
2                  2                             NaN
3                  3                             NaN
4                  4                             NaN
5                  5                             NaN
6                  6                           80.63
Script part 0:
Embarrassingly Dumb Ways People Died [Part 8]
We’d all like to be remembered for something great. But, surprisingly often, inc
Script part 1:
redibly dumb folks leave this world with a parting tale they’d probably prefer to be forgotten. That said, some of these spect
Script part 2:
acularly dim-witted, accidental exit strategies are irresistibly entertaining to hear about. So, without further ado, let’s ch
Script part 3:
eck out some of the most Embarrassingly Dumb Ways People Died.
Labels? What Labels? 
Whenever blowtorche

In [18]:
# Embed every chunk of the Video A script; squeeze() drops the length-1
# axis that get_embeddings returns for each single chunk.
video_a_embeddings = np.array(
    [get_embeddings(part) for part in video_a_script_parts]
).squeeze()

print(f"Video A embeddings shape: {video_a_embeddings.shape}")

Video A embeddings shape: (100, 768)


In [19]:
# Position column as an (n, 1) array so it can be appended to the embeddings.
video_a_positions = video_a_retention['Video position (%)'].values.reshape(-1, 1)

# Same layout as the training features: embedding columns, then position.
video_a_features = np.concatenate((video_a_embeddings, video_a_positions), axis=1)

print(f"Final feature set shape: {video_a_features.shape}")


Final feature set shape: (100, 769)


In [20]:
# Predict retention for every Video A row with the fitted regressor and
# show the first ten predictions.
video_a_predictions = model.predict(video_a_features)

print(f"Predicted retention percentages for Video A (first 10):\n{video_a_predictions[:10]}")

Predicted retention percentages for Video A (first 10):
[98.51392039 87.4613551  83.23214272 82.95397661 79.43626736 79.73905597
 73.76885889 79.20397959 76.47252405 79.26680629]


New intro with GPT-3.5 Turbo

In [32]:
from docx import Document

def write_to_docx(file_path, content):
    """Save a new Word document containing one paragraph per item of ``content``.

    ``content`` is iterated once; each item is added as its own paragraph and
    the document is then written to ``file_path``.
    """
    document = Document()
    for paragraph_text in content:
        document.add_paragraph(paragraph_text)
    document.save(file_path)

# Document that each generated intro is written to.
# NOTE(review): this path is relative to the working directory, while
# video_a_script_path below is absolute; they name the same file only when
# the notebook runs from that folder. Confirm.
file_path = "new.docx" 

video_a_retention_path = 'C:/Users/bibas/Downloads/GenAI/pytorch/bam/t_2/VIDEO A.xlsx'
video_a_script_path = 'C:/Users/bibas/Downloads/GenAI/pytorch/bam/t_2/new.docx'

# Only the first four rows are kept here, matching the four chunks the
# intro is split into further down.
video_a_retention = pd.read_excel(video_a_retention_path).iloc[:4, :2]

In [None]:
from openai import OpenAI

def llm_call(prompt, model="gpt-3.5-turbo"):
    """Send ``prompt`` as a single user message and return the reply text.

    Parameters
    ----------
    prompt : str
        Text sent as the content of one "user" message.
    model : str, optional
        Chat model name passed to the API. Defaults to "gpt-3.5-turbo", the
        model this notebook has always used.

    Returns
    -------
    str
        The content of the first choice in the chat completion response.

    Raises
    ------
    RuntimeError
        If the ``OPENAI_API_KEY`` environment variable is not set.
    """
    import os  # local import: keeps this cell runnable on its own

    # Read the key from the environment instead of embedding it in the
    # notebook, where it would be saved and shared with the file.
    api_key = os.environ.get("OPENAI_API_KEY")
    if not api_key:
        raise RuntimeError(
            "Set the OPENAI_API_KEY environment variable before calling llm_call()."
        )

    client = OpenAI(api_key=api_key)

    response = client.chat.completions.create(
        messages=[{"role": "user", "content": prompt}],
        model=model,
    )

    return response.choices[0].message.content


# Current intro of the Video A script; it is inserted into the prompt below.
full_text = '''Embarrassingly Dumb Ways People Died [Part 8]
We’d all like to be remembered for something great. But, surprisingly often, incredibly dumb folks leave this world with a parting tale they’d probably prefer to be forgotten. That said, some of these spectacularly dim-witted, accidental exit strategies are irresistibly entertaining to hear about. So, without further ado, let’s check out some of the most Embarrassingly Dumb Ways People Died.
'''

# Instruction sent to the language model on every iteration of the search loop.
prompt = f"""The goal is to improve this video script intro to increase audience retention. Focus on:
- Capturing attention within the first 10 seconds.
- Using clear, vivid, and relatable language.
- Creating curiosity or excitement about the content.
- Ensuring the tone matches the audience (e.g., humorous, dramatic, or suspenseful).

Here is the current intro:
{full_text}

Rewrite the intro to maximize engagement and retention.
"""
# print(prompt)

# Baseline to beat: the model's predicted retention for the first four
# positions of the original Video A script (copied from the earlier
# prediction output).
r_into= np.array([98.51392039, 87.4613551,  83.23214272, 82.95397661])

# The candidate intro is split into as many chunks as there are baseline values.
n_intro_parts = len(r_into)

# Position column for those chunks. Slicing here means the loop does not rely
# on an earlier cell having already cut video_a_retention down to four rows.
video_a_positions = (
    video_a_retention['Video position (%)'].values[:n_intro_parts].reshape(-1, 1)
)

for i in range(60):
    print(f"Ititration {i+1}\n")

    result = llm_call(prompt)
    print("Result= ", result)

    # Keep a log of every candidate; the blank line separates entries, which
    # would otherwise run together in the file.
    with open('content.txt', 'a', encoding="utf-8") as file:
        file.write(result + "\n\n")

    # Round-trip the candidate through a .docx so it goes through the same
    # extraction as the real scripts. Read back the file that was just
    # written: the previous code read video_a_script_path, which is a
    # different location unless the notebook runs from that folder.
    write_to_docx(file_path, [result])
    video_a_script_text = extract_text_from_docx(file_path)

    video_a_script_parts = split_script_into_parts(video_a_script_text, n_intro_parts)

    video_a_embeddings = np.array(
        [get_embeddings(part) for part in video_a_script_parts]
    ).squeeze()

    video_a_features = np.hstack((video_a_embeddings, video_a_positions))

    new = model.predict(video_a_features)

    # Accept the candidate only if it beats the baseline at every position.
    if (r_into < new).all():
        print(f"**************Found better script with retention {new}**************")
        break
    else:
        print(f"Bad Retention {new}\n\n")
else:
    # Reached only when no iteration hit the break above.
    print("No candidate beat the baseline retention within 60 iterations.")

Ititration 1

Result=  Get ready to cringe, laugh, and maybe shed a tear for humanity's lack of common sense. Welcome to Part 8 of Embarrassingly Dumb Ways People Died. From epic fails to mind-boggling mishaps, you won't believe how these individuals managed to meet their unlikely fate. So, grab your popcorn and get ready for a wild ride as we dive into some of the most outrageous stories of accidental exits. Let's unravel the mystery of how the not-so-bright meet their untimely end.
Bad Retention [97.67611867 86.71235258 85.1554384  81.68914314]


Ititration 2

Result=  Get ready to cringe, laugh, and shake your head in disbelief as we dive into the world of the most Embarrassingly Dumb Ways People Died [Part 8]. We all strive to leave behind a legacy, but some individuals choose the path of ridiculousness in their final moments. From mind-boggling mishaps to jaw-dropping blunders, get ready to be both entertained and baffled. So grab a seat and prepare to be amazed by the incredibly 

In [None]:
# 98.51392039, 87.4613551,  83.23214272, 82.95397661
# Re-run the Video A prediction on the script that contains the new intro.
video_a_retention_path = 'C:/Users/bibas/Downloads/GenAI/pytorch/bam/t_2/VIDEO A.xlsx'
video_a_script_path = 'C:/Users/bibas/Downloads/GenAI/pytorch/bam/t_2/with new intro.docx'

# Same preprocessing as for the original script: first 100 rows, first two
# columns, script cut into the default number of chunks.
video_a_retention = pd.read_excel(video_a_retention_path).iloc[:100, :2]
video_a_script_text = extract_text_from_docx(video_a_script_path)
video_a_script_parts = split_script_into_parts(video_a_script_text)

video_a_embeddings = np.array(
    [get_embeddings(part) for part in video_a_script_parts]
).squeeze()

video_a_positions = video_a_retention['Video position (%)'].values.reshape(-1, 1)
video_a_features = np.hstack((video_a_embeddings, video_a_positions))

video_a_predictions = model.predict(video_a_features)

# The label now matches the slice: seven values are printed, not ten.
print(f"Predicted retention percentages for Video new intro (first 7):\n{video_a_predictions[:7]}")


Predicted retention percentages for Video new intro (first 10):
[101.76653421  89.45836529  84.08765851  83.94456804  80.08805334
  78.63243904  79.61561589]
