In [None]:
!pip install pandas numpy scikit-learn transformers torch nltk

# Import tools

In [2]:
import re
import math
import numpy as np
import pandas as pd
import torch
from transformers import AutoTokenizer, AutoModelForMaskedLM
import nltk
from nltk.sentiment import SentimentIntensityAnalyzer


df=pd.read_csv('All_capped_keywords.csv')  
df = df.head(1000)

  from .autonotebook import tqdm as notebook_tqdm


In [20]:
nltk.download('vader_lexicon', quiet=True)
VADER = SentimentIntensityAnalyzer()

SCI_MODEL = "allenai/scibert_scivocab_cased"
tok = AutoTokenizer.from_pretrained(SCI_MODEL)
mlm = AutoModelForMaskedLM.from_pretrained(SCI_MODEL).eval()

# Helper Function (Punctuation + Scientific + Sentiment Score)

In [22]:
def punctuation_score(title):
    """Estimate how much punctuation/special characters are in a title"""
    t, L = title, len(title) #L is used for normalization - so longer titles don’t get inflated punctuation scores just because they have more characters.
    excl=t.count("!")
    quest=t.count("?")
    colon=t.count(":")
    semi=t.count(";")
    quote=t.count('"') + t.count("'")
    paren=t.count("(") + t.count(")")
    dash=t.count("-") + t.count("—")
    ellip=t.count("...")
    # unique_types adds up how many different kinds of punctuation appear in the title
    unique_types = sum(int(x > 0) for x in [excl, quest, colon, semi, quote, paren, dash, ellip])
 
    base = (0.35*(excl+quest)/(L/20) +
            0.25*(colon+semi)/(L/30) +
            0.25*(quote+paren+dash)/(L/25)) #20, 30, 25 are heuristic normalizers to keep these features in similar ranges

    base += 0.08*unique_types #adding very small but considerate bonus for diversity of punctuation

    return 1.0 / (1.0 + math.exp(-(base - 1.0))) #logistic sigmoid function, but shifting the curve so that base=1.0 maps to 0.5 on sigmoid (neutral)

def sentiment_pos_prob(text):
    """Estimate how positive a title is"""
    s = VADER.polarity_scores(text)['compound'] # returns score from -1 to 1
    return (s + 1.0) / 2.0 #linearly mapping [-1, 1] to [0, 1]

@torch.no_grad() #decorator to disable extra/complex calculations 
def avg_logprob_scibert(title, max_masks: 5):
    """Estimate how 'scientific' a title looks"""
    inp = tok.encode(title, return_tensors="pt") # tokenize and convert to pyTorch tensor
    L = inp.size(1) # length of tokenized input sequence
    if L <= 2: # if too short to mask anything
        return -1e9 
    inner = list(range(1, L-1)) # indices of tokens that can be masked (excluding [CLS] at position 0 and [SEP] at the end)
    if len(inner) > max_masks: # if too many inner tokens, sample a few evenly spaced ones
        step = max(1, len(inner)//max_masks) # step size to get roughly max_masks tokens
        inner = inner[::step][:max_masks] # take every 'step'-th token, up to max_masks

    logps = [] # list to store log-probabilities
    for i in inner:
        masked = inp.clone() # clone the input tensor
        masked[0, i] = tok.mask_token_id # mask the i-th token
        logits = mlm(masked).logits[0, i] # get logits for the masked position
        true_id = inp[0, i] # the true token id at position i
        logp = torch.log_softmax(logits, dim=-1)[true_id].item() # log-probability of the true token
        logps.append(logp) # append to list
    if logps:
        return float(np.mean(logps)) # return average log-probability
    else:
        return -1e9 #safety fallback

def sigmoid(x):
    return 1.0 / (1.0 + np.exp(-x))

# Build features

In [23]:
rows = [] # list to store feature dicts
for title in df["title"]:
    sci_lp  = avg_logprob_scibert(title, max_masks=5)   #capped at 5 inner tokens for masking to preserve computation time
    punct   = punctuation_score(title) # 0–1 punctuation/specials playfulness score
    sentpos = sentiment_pos_prob(title) # 0–1 positive sentiment estimate
    rows.append({"title": title, "sci_avg_logp": sci_lp, "punct_score": punct, "sent_pos": sentpos}) # append feature dict to list

feat = pd.DataFrame(rows)

# Building Humor score (0–1)

In [24]:
# Humor score (0to 1)
# Lower SciBERT  => more playful
# Higher punctuation => more playful
# Higher positive tone => (slightly) more playful
# Normalize via z-score
for col in ["sci_avg_logp", "punct_score", "sent_pos"]:
    mu, sd = feat[col].mean(), feat[col].std() + 1e-8 # avoid div-by-zero
    feat[f"z_{col}"] = (feat[col] - mu) / sd # z-score normalization

# linear combo 
# negative weight for sci_avg_logp (lower sci_lp => more playful)
W_SCI, W_PUNC, W_SENT = -1.0, 0.8, 0.3  # weights for linear combination
Z = W_SCI*feat["z_sci_avg_logp"] + W_PUNC*feat["z_punct_score"] + W_SENT*feat["z_sent_pos"] 
feat["humor_score"] = sigmoid(Z.values) # map to [0, 1] via sigmoid

# K-means classification vs Threshold Classification
- If TRY_CLUSTER is True, runs a K-Means (k=2) on the three features.
- Labels the cluster with the higher mean humor_score as the “playful” cluster.
- Saves that binary decision to pred_playful (1 = playful, 0 = not).
- If TRY_CLUSTER is False, it instead uses a threshold on humor_score (≥0.5).

In [None]:

TRY_CLUSTER = True
if TRY_CLUSTER:
    from sklearn.cluster import KMeans
    X = np.vstack([feat["z_sci_avg_logp"], feat["z_punct_score"], feat["z_sent_pos"]]).T # shape (N, 3)
    km = KMeans(n_clusters=2, random_state=42, n_init="auto").fit(X) # fit KMeans
    feat["cluster_raw"] = km.labels_ # assign cluster labels

    # pick cluster with higher mean humor_score as "playful"
    means = feat.groupby("cluster_raw")["humor_score"].mean()
    playful_cluster = int(means.idxmax()) # cluster label (0 or 1) with higher mean humor_score
    feat["pred_playful"] = (feat["cluster_raw"] == playful_cluster).astype(int) # binary playful prediction
else:
    # or just use a threshold on humor_score
    feat["pred_playful"] = (feat["humor_score"] >= 0.5).astype(int)



# Results
How to interpret the outputs
- sci_avg_logp: higher means “more academic-sounding”; very low values suggest too playful for scientific text.
- punct_score: more creative punctuation -> closer to 1.
- sent_pos: more positive vibe -> closer to 1
- humor_score: overall 0 – 1 score combining the three.
- pred_playful: quick yes/no label from K-Means or threshold, for convenience.

In [26]:
print(feat[["title","sci_avg_logp","punct_score","sent_pos","humor_score","pred_playful"]]
      .sort_values("humor_score", ascending=False)
      .to_string(index=False))

                                                                                                                                               title  sci_avg_logp  punct_score  sent_pos  humor_score  pred_playful
                                                                                                       Robust Training for AC-OPF (Student Abstract)     -6.417920     0.395719   0.67000     0.979915             0
                                                              More the Merrier: Towards Multi-Emotion and Intensity Controllable Response Generation     -7.920193     0.336235   0.72745     0.967977             1
                                                                                        Text Gestalt: Stroke-Aware Scene Text Image Super-Resolution     -6.914422     0.375975   0.50000     0.963857             0
                                                                                      PHASEN: A Phase-and-Harmonics-Aware Speech Enhancement Network