In [1]:
# import os
# os.environ["CUDA_DEVICE_ORDER"]="PCI_BUS_ID"
# os.environ["CUDA_VISIBLE_DEVICES"]="0"

In [1]:
import pandas as pd
import torch
import numpy as np
import evaluate
import re
import pycld2 as cld2  # language recognition
import os

from scipy.special import softmax
from datasets import Dataset , load_metric
from transformers import AutoModelForSequenceClassification, AutoTokenizer
from transformers import TrainingArguments, Trainer 
from sentence_transformers import SentenceTransformer
from sklearn.model_selection import train_test_split

# Let Jupyter Notebook show more content under cell
pd.set_option('display.max_rows', None)
pd.set_option('display.max_columns', None)
pd.set_option('display.width', None)
pd.set_option('display.max_colwidth', None)

metric = evaluate.load("f1")

2024-02-07 01:01:23.754743: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  AVX2 AVX512F AVX512_VNNI FMA
To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.


### **1. Obtain the training and non-training data**
- df_before: "path", "content", "is_news" as columns
- Clean the corpus
  - music/audio tags
  - transcribing typos: Replace triple-double quotes with a single double quote. Replace single double quotes with commas
  - remove "content" at the beginning of every transcript
  - Some transcripts are non-sense due to an attempt to transcribe foreign languages. Can be excluded/discarded. **(resolved)**
  - **Remove transcripts with non-English words**
- Export the cleaned dataframe to a .csv file
- Downsample the training corpus
- Chunkify transcripts

In [2]:
df_before = pd.read_json('/shared/3/projects/benlitterer/podcastData/processed/beforeFloydMonth/5_4_5_10.jsonl', orient="records", lines=True)
df_before = df_before.dropna(subset=['category1', 'category2', 'category3', 'category4', 'category5', 'category6', 'category7', 'category8', 'category9', 'category10'], how='all')
df_before = df_before.drop_duplicates()
df_before = df_before.drop_duplicates(subset=['potentialOutPath'])
df_before['potentialOutPath'] = df_before['potentialOutPath'].apply(lambda x: "/shared/3/projects/benlitterer/podcastData/prosodyMerged/5_4_5_10" + x)

In [3]:
df_after = pd.read_json('/shared/3/projects/benlitterer/podcastData/processed/afterFloydMonth/6_9_6_15.jsonl', orient="records", lines=True)
df_after = df_after.dropna(subset=['category1', 'category2', 'category3', 'category4', 'category5', 'category6', 'category7', 'category8', 'category9', 'category10'], how='all')
df_after = df_after.drop_duplicates()
df_after = df_after.drop_duplicates(subset=['potentialOutPath'])
df_after['potentialOutPath'] = df_after['potentialOutPath'].apply(lambda x: "/shared/3/projects/benlitterer/podcastData/prosodyMerged/6_9_6_15" + x)

In [4]:
df_in = pd.read_csv("/shared/3/projects/benlitterer/podcastData/processed/floydMonth/floydMonthEn.csv", lineterminator = '\n', low_memory=False)
df_in = df_in.dropna(subset=['category1', 'category2', 'category3', 'category4', 'category5', 'category6', 'category7', 'category8', 'category9', 'category10'], how='all')
df_in = df_in.drop_duplicates()
df_in = df_in.drop_duplicates(subset=['potentialOutPath'])
df_in['potentialOutPath'] = df_in['potentialOutPath'].apply(lambda x: "/shared/3/projects/benlitterer/podcastData/prosodyMerged/floydMonth" + x)

In [5]:
pd.options.mode.chained_assignment = None
df_before['is_news'] = df_before[['category1', 'category2', 'category3', 'category4', 'category5', 'category6', 'category7', 'category8', 'category9', 'category10']].apply(lambda x: 1 if 'news' in x.values else 0, axis=1)
df_after['is_news'] = df_after[['category1', 'category2', 'category3', 'category4', 'category5', 'category6', 'category7', 'category8', 'category9', 'category10']].apply(lambda x: 1 if 'news' in x.values else 0, axis=1)
df_in['is_news'] = df_in[['category1', 'category2', 'category3', 'category4', 'category5', 'category6', 'category7', 'category8', 'category9', 'category10']].apply(lambda x: 1 if 'news' in x.values else 0, axis=1)

In [7]:
# included pubDate to help us define news
after_df = pd.DataFrame(columns=['text', 'label', 'path', 'pubDate'])
in_df = pd.DataFrame(columns=['text', 'label', 'path', 'pubDate'])

In [118]:
def split_transcript(transcript):
    chunk_size = 100
    transcript['content'] = transcript['content'].fillna('').astype(str)
    text = ''.join(transcript['content'].astype(str))
    text = re.sub(r'\[.*?\]', '', text)
    text = re.sub(r'\(.*?\)', '', text)
    text = re.sub(r'\*.*?\*', '', text)
    
    
    sentences = re.split(r'(?<=[.!?]) ', text)
    sentences = [sentence + ' ' for sentence in sentences]
    chunks = []
    current_chunk = ''

    for ind, sentence in enumerate(sentences):
        if len(current_chunk.split(' ')) + len(sentence.split(' ')) > chunk_size:
            if ind != len(sentences) - 2:
                chunks.append(current_chunk)
                current_chunk = sentence
            else:
                current_chunk += sentences[ind]  
                current_chunk += sentences[ind + 1]  
                chunks.append(current_chunk)  
                break
        else:
            current_chunk += sentence
    else:
        chunks.append(current_chunk)

    return chunks

#### Build dataframes of labeled chunks:
#### **Before:**

In [119]:
before_text = []
before_label = []
before_path = []
before_date = []

for index, row in df_before.iterrows():
    path = row["potentialOutPath"]
    if os.path.isfile(path):
        transcript = pd.read_csv(path, usecols=['content'])
        chunks = split_transcript(transcript)
        if chunks:
            before_text.extend(chunks)
            before_label.extend([row["is_news"]] * len(chunks))
            before_path.extend([path] * len(chunks))
            before_date.extend([row["pubDate"]] * len(chunks))

In [120]:
data = {
    'text': before_text,
    'label': before_label,
    'path': before_path,
    'pubDate': before_date
}

before_df = pd.DataFrame(data)

In [124]:
before_df = before_df[before_df['text'].str.strip() != '']

#### **After:**

In [125]:
after_text = []
after_label = []
after_path = []
after_date = []

for index, row in df_after.iterrows():
    path = row["potentialOutPath"]
    if os.path.isfile(path):
        transcript = pd.read_csv(path, usecols=['content'])
        chunks = split_transcript(transcript)
        if chunks:
            after_text.extend(chunks)
            after_label.extend([row["is_news"]] * len(chunks))
            after_path.extend([path] * len(chunks))
            after_date.extend([row["pubDate"]] * len(chunks))

In [126]:
data = {
    'text': after_text,
    'label': after_label,
    'path': after_path,
    'pubDate': after_date
}

after_df = pd.DataFrame(data)

In [127]:
after_df = after_df[after_df['text'].str.strip() != '']

#### **In:**
### **In the code block below, remove index <= 5000 once data collection is done**

In [128]:
in_text = []
in_label = []
in_path = []
in_date = []

for index, row in df_in.iterrows():
    if index <= 5000:
        path = row["potentialOutPath"]
        if os.path.isfile(path):
            transcript = pd.read_csv(path, usecols=['content'])
            chunks = split_transcript(transcript)
            if chunks:
                in_text.extend(chunks)
                in_label.extend([row["is_news"]] * len(chunks))
                in_path.extend([path] * len(chunks))
                in_date.extend([row["pubDate"]] * len(chunks))

In [129]:
data = {
    'text': in_text,
    'label': in_label,
    'path': in_path,
    'pubDate': in_date
}

df_test = pd.DataFrame(data)

In [130]:
df_test = df_test[df_test['text'].str.strip() != '']

In [131]:
df_train = pd.concat([before_df, after_df], ignore_index=True)

### Remove transcripts that contain non-English words

In [133]:
def is_eng(text):
    isReliable, textBytesFound, details, vectors = cld2.detect(text, returnVectors=True)
    if details[0][0] != "ENGLISH":
        return False
    else:
        return True

df_train['is_English'] = df_train['text'].apply(is_eng)
df_test['is_English'] = df_test['text'].apply(is_eng)

In [134]:
df_train = df_train[df_train['is_English'] == True]
df_test = df_test[df_test['is_English'] == True]

In [210]:
# df_before_labeled = df_before_labeled.rename(columns={"potentialOutPath": "path"})
# df_path_and_label = df_before_labeled[["path", "is_news"]]
# df_path_and_label.drop_duplicates(subset='path', keep='first', inplace=True)
# df_before.drop_duplicates(subset='path', keep='first', inplace=True)
# df_before = df_before.merge(df_path_and_label, on="path", how="left")

In [76]:
# Export the cleaned corpus to a .csv file:
# df_before.to_csv("labeled_before_transcripts.csv")

In [None]:
# selected_rows = df_for_test[df_for_test['is_English'] == 0]

In [None]:
# indices = df_before_cp[df_before_cp['is_mistranscribed']].index.tolist()
# paths = df_before_cp[df_before_cp['is_mistranscribed']]['path'].tolist()
# labels = df_before_cp[df_before_cp['is_mistranscribed']]['label'].tolist()
# texts = df_before_cp[df_before_cp['is_mistranscribed']]['content'].tolist()
# df_mistranscribed = pd.DataFrame({"index":indices, "path":paths, "label":labels, "text":texts})

In [82]:
# df_before_cp.drop(df_before_cp[~df_before_cp['is_English']].index, inplace=True)
# df_before_cp = df_before_cp[df_before_cp['text'].str.strip() != '']

In [132]:
# min_length = df_before_cp['text'].apply(len).min()
# print(min_length)
# index_of_min_length = df_before_cp['text'].apply(len).idxmin()

## 1.3 Downsampling

## **Make sure to do this once have all data collected**

In [55]:
before_df_news = before_daf[before_dfa['label'] == 1]
n_before_news = before_df_news.shape[0]
before_df_no_news = before_df[before_df['is_news'] == 0].sample(n=n_before_news*5, replace=False, random_state=1)
df_before = pd.concat([df_before_news, df_before_no_news], ignore_index=True)
df_before = df_before.sample(frac=1, random_state=387).reset_index(drop=True)

### 1.4 Split dataset into training, dev, and test sets

In [135]:
df_train, df_non_train = train_test_split(df_train, test_size=0.5)

In [137]:
df_train_train, df_train_test = train_test_split(df_train, test_size=0.2)

In [138]:
ds_non_train = Dataset.from_pandas(df_non_train[['text', 'label']])
ds_train_test = Dataset.from_pandas(df_train_test[['text', 'label']])
ds_train_train = Dataset.from_pandas(df_train_train[['text', 'label']])

### **2. Model training**
- MiniLM (microsoft/MiniLM-L12-H384-uncased)

#### **2.1 Prepare and tokenize a dataset**
**Note: max_length = 800 because the average length of transcripts is 798 tokens.**  

In [21]:
tokenizer = AutoTokenizer.from_pretrained("microsoft/MiniLM-L12-H384-uncased")

def tokenize_function(examples):
    return tokenizer(examples["text"], max_length=512, padding="max_length", truncation=True)

ds_train_dev = ds_train_dev.map(tokenize_function, batched=True, batch_size=20)
ds_test = ds_test.map(tokenize_function, batched=True, batch_size=20)

Map:   0%|          | 0/53738 [00:00<?, ? examples/s]

Map:   0%|          | 0/7677 [00:00<?, ? examples/s]

Map:   0%|          | 0/15354 [00:00<?, ? examples/s]

### **Note: May change the following parameters as per David's advice:**
- max_length=512 in tokenizer()
- per_device_train_batch_size=258 in TrainingArguments()
- num_train_epochs=3 in TrainingArguments()

#### **2.2 Model training**

In [48]:
deviceNum = 0
device = torch.device("cuda:" + str(deviceNum) if torch.cuda.is_available() else "cpu")

In [51]:
def model_init():
    return AutoModelForSequenceClassification.from_pretrained("microsoft/MiniLM-L12-H384-uncased", num_labels=2).to(device)
    
output_dir = "/shared/3/projects/bowenyi/Floyd_Month/Content Analysis/podcasts/model_output"   
seed = 1

def compute_metrics(eval_pred):
    logits, labels = eval_pred
    predictions = np.argmax(logits, axis=-1)
    return metric.compute(predictions=predictions, references=labels)

training_args = TrainingArguments(
    output_dir=output_dir,
    overwrite_output_dir=True,
    learning_rate=2e-5,
    per_device_train_batch_size=64,
    per_device_eval_batch_size=128,
    num_train_epochs=10,
    weight_decay=0.01,
    do_eval=True,
    seed=seed,
    save_strategy='steps',
    save_steps=10_000,
    evaluation_strategy='steps',
    eval_steps=10_000,
    logging_dir=output_dir + 'logs/',
    load_best_model_at_end=True,
    metric_for_best_model="f1",
    run_name='podcasts-study' + str(seed)
)

trainer = Trainer(
    model_init=model_init,    
    args=training_args,
    train_dataset=ds_train_dev["train"],
    eval_dataset=ds_train_dev["test"],    
    tokenizer=tokenizer,
    compute_metrics=compute_metrics
)

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at microsoft/MiniLM-L12-H384-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [53]:
trainer.train()

In [44]:
trainer.evaluate(ds_test)

{'eval_loss': 0.4505256712436676,
 'eval_f1': 0.5026443833298075,
 'eval_runtime': 92.1173,
 'eval_samples_per_second': 166.679,
 'eval_steps_per_second': 1.303,
 'epoch': 10.0}

### **3. Model calibration on non-train data**
- Predict probabilities on each row & fill up "prob" column
- Build strata profile
- Annotate 10 random samples from each stratum
- Calculate F-1 for each cutoff (9 in total)
- Find the best cutoff

#### **3.1 Predict probabilities & fill up the "prob" column**

In [None]:
# prediction_inputs = df_non_train['raw_X'].tolist()
# predict_dset = Dataset.from_pandas(pd.DataFrame({"text": prediction_inputs}))

# tokenized_predict_dset = predict_dset.map(tokenize_function, batched=True)
# predictions = trainer.predict(tokenized_predict_dset)

# # Apply softmax to the model's raw predictions to get probabilities
# probs = softmax(predictions.predictions, axis=1)[:, 1]

# df_non_train.loc[:, 'prob'] = probs

#### **3.2 Build strata profile**

In [None]:
# df_non_train['stratum'] = pd.cut(df_non_train['prob'], bins=[0, 0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9, 1.0], 
#                        labels=['0-10%', '10-20%', '20-30%', '30-40%', '40-50%', '50-60%', '60-70%', '70-80%', '80-90%', '90-100%'], 
#                        include_lowest=True)

In [None]:
# # Check the distribution across different strata
# df_non_train['stratum'].value_counts()

#### Original strata distribution:
- 0-10%      20534  
- 10-20%      9102
- 20-30%      3375
- 30-40%      2170
- 40-50%      1757
- 80-90%      1688
- 50-60%      1022
- 60-70%       734
- 70-80%       699
- 90-100%        0

#### **Note: Since 90-100% stratum is empty, I will increase all probabilities by 1.15% (Ask David or Ben)**

In [53]:
# df_non_train['prob'] += 0.0115 

In [54]:
# df_non_train['stratum'] = pd.cut(df_non_train['prob'], bins=[0, 0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9, 1.0], 
#                        labels=['0-10%', '10-20%', '20-30%', '30-40%', '40-50%', '50-60%', '60-70%', '70-80%', '80-90%', '90-100%'], 
#                        include_lowest=True)

# df_non_train['stratum'].value_counts()

0-10%      18864
10-20%     10085
20-30%      3773
30-40%      2226
40-50%      1849
80-90%      1759
50-60%      1078
60-70%       733
70-80%       682
90-100%       32
Name: stratum, dtype: int64

#### **3.3 Annotate 10 random samples from each stratum**

In [None]:
# annotate_df = df_non_train.groupby('stratum').apply(lambda x: x.sample(n=10, random_state=1)).reset_index(drop=True)

#### **- If annotate off-line:**

In [None]:
# annotate_df.to_csv('miniLM_annotate.csv')

# # Annotate on 'miniLM_annotate.csv'
# # Update the annotated file into miniLM_annotated.csv 
# annotate_df = pd.read_csv('miniLM_annotated.csv')

#### **- If annotate on Jupyter Lab/Notebook:**

In [None]:
index = 
print(annotate_df['raw_X'][index])

In [None]:
annotate_df[index, 'real_y'] = 

#### **3.4 Calculate F-1 for each cutoff (9 in total) and find the best cutoff**

In [None]:
from sklearn.metrics import f1_score

cutoffs = [i/10 for i in range(1, 10)]
best_cutoff = None
best_f1_score = -1

for cutoff in cutoffs:
    label_pred = [1 if p >= cutoff else 0 for p in annotated_df['prob']]
    f1 = f1_score(annotated_df['real_y'], preds)
    if f1 > best_f1_score:
        best_cutoff = cutoff
        best_f1_score = f1

print(f'The optimal cutoff is {best_cutoff} which gives an F1 Score of {best_f1_score}')

#### **3.5 Compare inter-model performance and finalize model choice**
- miniLM (this file)
- logistic regression

#### Note: Helpful Code snippets

1. Put mistranscribed transcripts into a dataframe
```
indices = df_before_cp[df_before_cp['is_mistranscribed']].index.tolist()
paths = df_before_cp[df_before_cp['is_mistranscribed']]['path'].tolist()
labels = df_before_cp[df_before_cp['is_mistranscribed']]['label'].tolist()
texts = df_before_cp[df_before_cp['is_mistranscribed']]['content'].tolist()
df_mistranscribed = pd.DataFrame({"index":indices, "path":paths, "label":labels, "text":texts})
```

2. Find non-English text inside dataframe df_before_cp 
```
non_english_entries = df_before_cp.loc[~df_before_cp['is_English']]
non_english_entries['text']
```

3. Find the web URL of mistranscribed transcripts, based on its file path
```
match = df_before_labeled[df_before_labeled['path'].str.contains('/shared/3/projects/benlitterer/podcastData/prosodyMerged/beforeFMonth/sphinx.acast.com/am/httpssphinx.acast.commorgenbladetyahyahassanmedia.mp3MERGED')]
match['enclosure']
```

4. Count mistranscribed entries:
```
counts = (df["is_mistranscribed"]).sum()
```

5. Find the entry and indices under one column with the longest/shortest length
```
max_length = df_before_cp['text'].apply(len).max()
print(max_length)
index_of_max_length = df_before_cp['text'].apply(len).idxmax()
```

6. Find the number of cells under column "text" that contain fewer than 100 words:
```
df_before['content'].apply(lambda x: len(str(x).split()) < 100).sum()
```
Similarly, grab those entries:
```
filtered_df = df_before[df_before['content'].apply(lambda x: len(str(x).split()) < 128)]
```

7. Average number of lines in the directory
```
import os
from pathlib import Path
parent_dir_path = "/shared/3/projects/benlitterer/podcastData/prosodyMerged/beforeFMonth/"
file_count = 0
line_count = 0

for dirpath, dirnames, filenames in os.walk(parent_dir_path):
    for file in filenames:
        if file.endswith('.mp3MERGED'):
            file_path = Path(dirpath) / file
            with open(file_path, 'r') as f:
                file_line_count = sum(1 for line in f)
                if file_line_count > 0: 
                    file_count += 1
                    line_count += file_line_count

average_lines = line_count / file_count if file_count != 0 else 0
print("Average number of lines across all .txt files is: ", average_lines)
```

In [None]:
"/shared/3/projects/benlitterer/podcastData/prosodyMerged/beforeFMonth/chtbl.com/c3/httpschtbl.comtrack5899Epodtrac.comptsredirect.mp3traffic.omny.fmdclipse73c998e6e60432f8610ae210140c5b1dd54ace2063f43cf8e45ae3900375bf011b79adba22f453fb4ccae39013d3cffaudio.mp3utm_sourcePodcastin_playlistc5eecdf1cdcf4e91b49aae3900375bfeMERGED"

In [192]:
df_before_labeled.columns

Index(['Unnamed: 0', 'rssUrl', 'epTitle', 'epDescription', 'duration',
       'pubDate', 'copyright', 'itunes:type', 'itunes:complete', 'guid',
       'itunes:explicit', 'enclosure', 'itunes:image', 'transDict', 'id',
       'title', 'lastUpdate', 'link', 'lastHttpStatus', 'dead', 'contentType',
       'itunesId', 'originalUrl', 'itunesAuthor', 'itunesOwnerName',
       'explicit', 'imageUrl', 'itunesType', 'generator', 'newestItemPubdate',
       'language', 'oldestItemPubdate', 'episodeCount', 'popularityScore',
       'priority', 'createdOn', 'updateFrequency', 'chash', 'host',
       'newestEnclosureUrl', 'podcastGuid', 'podDescription', 'category1',
       'category2', 'category3', 'category4', 'category5', 'category6',
       'category7', 'category8', 'category9', 'category10',
       'newestEnclosureDuration', 'oldestItemDatetime', 'cleanDates', 'path',
       'cleanDatesLoc', 'is_news'],
      dtype='object')

In [37]:
match = df_before[df_before['potentialOutPath'].str.contains("/shared/3/projects/benlitterer/podcastData/prosodyMerged/5_4_5_10/anchor.fm/2t/httpsanchor.fms8dc3d88podcastplay13515377https3A2F2Fd3ctxlq1ktw2nl.cloudfront.net2Fproduction2F2020492F716280684410016102979586261.m4aMERGED")]
match['enclosure']

13    https://anchor.fm/s/8dc3d88/podcast/play/13515377/https%3A%2F%2Fd3ctxlq1ktw2nl.cloudfront.net%2Fproduction%2F2020-4-9%2F71628068-44100-1-6102979586261.m4a
Name: enclosure, dtype: object

In [160]:
news_df = df_before[df_before['is_news'] == 1].copy()

In [161]:
news_df.shape

(16180, 3)

In [248]:
filtered_df = df_before[df_before['content'].apply(lambda x: len(str(x).split()) > 120 and len(str(x).split()) < 400)].copy()

In [249]:
selected_rows = filtered_df[filtered_df['is_news'] == 1]

In [250]:
df_before.shape

(171495, 3)

In [251]:
selected_rows.shape

(11238, 3)