In [1]:
import pandas as pd

df = pd.read_parquet('movie_scripts.parquet', engine='pyarrow')

In [3]:
df = pd.read_csv('utterances.csv')
print(df.head)

# Apply to each row
df['distinct_1'] = df['utterance'].apply(distinct_1)

# Compute the average Distinct-1 score across all scripts
mean_distinct_1 = df['distinct_1'].mean()

print(f"Average Distinct-1 over {len(df)} scripts: {mean_distinct_1:.4f}")

<bound method NDFrame.head of          episode  episode_order  \
0          57264              9   
1          57264             10   
2          57264             11   
3          57264             12   
4          57264             13   
...          ...            ...   
3199853    67560             11   
3199854    67560             12   
3199855    67560             13   
3199856    67560             14   
3199857    67560             15   

                                                   speaker  \
0        Ms. LOREN MOONEY (Editor-in-Chief, Bicycling M...   
1        Ms. LOREN MOONEY (Editor-in-Chief, Bicycling M...   
2                                         NEAL CONAN, host   
3        Ms. LOREN MOONEY (Editor-in-Chief, Bicycling M...   
4                                         NEAL CONAN, host   
...                                                    ...   
3199853                                        _NO_SPEAKER   
3199854                            MARY LOUISE KELLY,

In [4]:
def distinct_1_episode(texts: pd.Series) -> float:
    """
    Compute Distinct-1 over all utterances in an episode:
      (# unique unigrams across the episode) 
      / (total # unigrams across the episode).
    """
    # concatenate all utterances into one big string
    all_tokens = (
        " ".join(texts.dropna().astype(str))
        .lower()
        .split()
    )
    if not all_tokens:
        return 0.0
    return len(set(all_tokens)) / len(all_tokens)

# assume your episode identifier column is named "episode"
episode_scores = (
    df
    .groupby("episode")["utterance"]
    .apply(distinct_1_episode)
    .reset_index(name="distinct_1_score")
)

# peek at per‐episode scores
print(episode_scores.head())

# compute the average across episodes
mean_ep_score = episode_scores["distinct_1_score"].mean()
print(f"Average episode‐level Distinct-1: {mean_ep_score:.4f}")


   episode  distinct_1_score
0        1          0.521545
1        2          0.484067
2        3          0.544326
3        4          0.506652
4        5          0.407384
Average episode‐level Distinct-1: 0.5024


In [5]:
# 1) Check how many unique episode IDs you really have:
print("Raw unique count:", df['episode'].nunique())
print("Some raw values:", df['episode'].unique()[:20])

# 2) Clean up whitespace/case (very common culprit):
df['episode_clean'] = (
    df['episode']
      .astype(str)        # make sure everything is string
      .str.strip()         # remove leading/trailing spaces
      .str.lower()         # unify case, if e.g. “Ep1” vs “ep1”
)

# 3) Re-inspect:
print("Cleaned unique count:", df['episode_clean'].nunique())
print("Some cleaned values:", df['episode_clean'].unique()[:20])
print(df['episode_clean'].value_counts().head(10))

# 4) Now group on the cleaned IDs:
def distinct_1_episode(texts):
    tokens = " ".join(texts.dropna().astype(str)).lower().split()
    return len(set(tokens)) / len(tokens) if tokens else 0.0

episode_scores = (
    df
      .groupby('episode_clean')['utterance']
      .apply(distinct_1_episode)
      .reset_index(name='distinct_1_score')
)

print(episode_scores)
print("Average over episodes:", episode_scores['distinct_1_score'].mean())


Raw unique count: 104920
Some raw values: [57264 58225 75004 74884 63416 68175 67560 68670 70039 75394 58131 68325
 80651 60226 72046 80598 68420 79679 60873 65791]
Cleaned unique count: 104920
Some cleaned values: ['57264' '58225' '75004' '74884' '63416' '68175' '67560' '68670' '70039'
 '75394' '58131' '68325' '80651' '60226' '72046' '80598' '68420' '79679'
 '60873' '65791']
episode_clean
19633    569
35108    552
57481    539
73336    492
66523    480
65241    461
60824    445
52192    441
81521    431
82700    426
Name: count, dtype: int64
       episode_clean  distinct_1_score
0                  1          0.521545
1                 10          0.436911
2                100          0.511384
3              10000          0.441989
4             100000          0.558205
...              ...               ...
104915         99995          0.660274
104916         99996          0.538937
104917         99997          0.524800
104918         99998          0.409127
104919         99999  

In [6]:
std_ep  = episode_scores['distinct_1_score'].std()
print(f"STD of episode‐level Distinct-1: {std_ep:.4f}")

STD of episode‐level Distinct-1: 0.1052
