In [1]:
from google.colab import drive

# Attach Google Drive to the Colab filesystem; the dataset and model files
# read later in this notebook are addressed under this mount point.
DRIVE_MOUNT_POINT = '/content/drive'
drive.mount(DRIVE_MOUNT_POINT)

Mounted at /content/drive


In [2]:
import os
from functools import lru_cache

import joblib
import numpy as np
import pandas as pd
from hdbscan import prediction as hdbscan_prediction
from sentence_transformers import SentenceTransformer
from sklearn.decomposition import PCA
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_absolute_error, r2_score
from sklearn.model_selection import train_test_split

In [3]:
# Pickled DataFrame on the mounted Drive (presumably chunk rows with
# precomputed embeddings, going by the file name).
EMBEDDINGS_PICKLE_PATH = "/content/drive/My Drive/speech_ml/data/df_with_embeddings.pkl"
df = pd.read_pickle(EMBEDDINGS_PICKLE_PATH)

def compute_structural_features(text: str) -> pd.Series:
    """Compute simple surface-level statistics for one piece of text.

    Parameters
    ----------
    text : str
        The text to analyse. Words are obtained with ``str.split()``, i.e.
        split on whitespace, so punctuation stays attached to its word and
        comparison between words is case-sensitive.

    Returns
    -------
    pandas.Series
        Indexed by feature name, in this order: ``num_words``, ``num_chars``,
        ``num_commas``, ``num_periods``, ``num_exclaims``, ``num_questions``,
        ``unique_words``, ``fraction_unique_words``.
    """
    words = text.split()
    num_words = len(words)
    unique_words = len(set(words))

    return pd.Series({
        'num_words': num_words,
        'num_chars': len(text),
        'num_commas': text.count(','),
        'num_periods': text.count('.'),
        'num_exclaims': text.count('!'),
        'num_questions': text.count('?'),
        'unique_words': unique_words,
        # The small epsilon keeps the ratio defined (0.0) for text with no
        # words. It is part of the feature definition, so it must not change.
        'fraction_unique_words': unique_words / (num_words + 1e-5),
    })

# Run the extractor on every value of the 'Chunk' column. Because it returns
# a Series per row, apply() expands the results into one column per feature.
structural_df = df['Chunk'].apply(compute_structural_features)

# Append the new feature columns side by side with the existing ones.
df = pd.concat([df, structural_df], axis='columns')



# Directory holding the persisted pipeline artifacts, and the SBERT checkpoint
# used to embed incoming text.
_ARTIFACT_DIR = '/content/drive/My Drive/speech_ml/data'
_SBERT_MODEL_NAME = "all-mpnet-base-v2"


@lru_cache(maxsize=1)
def _load_prediction_artifacts():
    """Load every persisted pipeline component once and reuse it afterwards.

    Returns
    -------
    tuple
        ``(model, pca, umap_model, clusterer, feature_cols, embed_model)``.

    Notes
    -----
    * joblib files are pickles and can execute arbitrary code when loaded;
      only point ``_ARTIFACT_DIR`` at a location you trust.
    * The result is cached for the lifetime of the kernel. After re-saving
      any artifact, call ``_load_prediction_artifacts.cache_clear()``.
    """
    def _load(filename):
        return joblib.load(f'{_ARTIFACT_DIR}/{filename}')

    return (
        _load('lgbm_model.pkl'),
        _load('pca_transformer.pkl'),
        _load('umap_model.pkl'),
        _load('hdbscan_model.pkl'),
        _load('feature_columns.pkl'),
        SentenceTransformer(_SBERT_MODEL_NAME),
    )


def predict_completion(chunk_text):
    """Run the saved feature pipeline on one text chunk and return the model's prediction.

    The chunk is embedded with SBERT, projected with the saved PCA, assigned a
    cluster through the saved UMAP + HDBSCAN models, combined with its
    structural features, and passed to the saved regressor.

    Parameters
    ----------
    chunk_text : str
        The text to score.

    Returns
    -------
    float
        The model output rounded to two decimals (presumably a speech
        completion percentage, going by how the notebook prints it).

    Raises
    ------
    KeyError
        If a column listed in the saved feature-column list is not produced
        by this function.
    """
    # Artifacts are loaded on first use only; reloading them on every call
    # would dominate the cost of a single prediction.
    model, pca, umap_model, clusterer, feature_cols, embed_model = _load_prediction_artifacts()

    # Step 1: SBERT embedding
    embedding = embed_model.encode([chunk_text])[0]

    # Step 2: PCA. Expose every fitted component as pca_1..pca_k rather than
    # assuming exactly five; the column selection below keeps only the ones
    # the model was trained on.
    pca_features = pca.transform([embedding])[0]
    pca_dict = {f'pca_{i}': value for i, value in enumerate(pca_features, start=1)}

    # Step 3: Cluster via UMAP -> HDBSCAN
    umap_embed = umap_model.transform([embedding])
    cluster_label, _ = hdbscan_prediction.approximate_predict(clusterer, umap_embed)
    cluster_label = cluster_label[0]
    is_noise = int(cluster_label == -1)  # the label -1 is flagged as noise

    # Step 4: Structural features
    struct_feats = compute_structural_features(chunk_text)

    # Step 5: Time-aware cluster features (default since no sequence context)
    cluster_seen_before = 0
    fraction_unique_clusters = 1.0

    # Merge all
    feature_dict = {
        **struct_feats,
        'cluster': cluster_label,
        'is_noise': is_noise,
        'cluster_seen_before': cluster_seen_before,
        'fraction_unique_clusters': fraction_unique_clusters,
        **pca_dict
    }

    # Build a single-row frame and put its columns in the saved order.
    X = pd.DataFrame([feature_dict])[feature_cols]

    # Predicting
    prediction = model.predict(X)[0]
    return round(prediction, 2)


In [9]:
# Smoke test: score one hand-written snippet and report the result.
text_snippet = "So be grateful. Be alive and live every moment. Thank you so much, everyone."
predicted_percent = predict_completion(text_snippet)

completion_report = f"Predicted Speech Completion: {predicted_percent} %"
print(completion_report)


Predicted Speech Completion: 93.86 %
