In [1]:
!pip install -U sentence-transformers
import pandas as pd
import torch
import numpy as np
import os
from datasets import Dataset
from transformers import AutoModelForSequenceClassification, AutoTokenizer, TrainingArguments, Trainer
from sentence_transformers import SentenceTransformer
from sklearn.model_selection import train_test_split

Defaulting to user installation because normal site-packages is not writeable


2024-01-21 00:27:16.704285: I tensorflow/core/platform/cpu_feature_guard.cc:181] Beginning TensorFlow 2.15, this package will be updated to install stock TensorFlow 2.15 alongside Intel's TensorFlow CPU extension plugin, which provides all the optimizations available in the package and more. If a compatible version of stock TensorFlow is present, only the extension will get installed. No changes to code or installation setup is needed as a result of this change.
More information on Intel's optimizations for TensorFlow, delivered as TensorFlow extension plugin can be viewed at https://github.com/intel/intel-extension-for-tensorflow.


### **1. Preprocess dataframes**

In [8]:
# Names of the ten category columns: category1 ... category10.
CATEGORY_COLUMNS = [f'category{i}' for i in range(1, 11)]


def _load_metadata(csv_path, transcript_root):
    """Read one metadata file, clean it, and prefix its transcript paths.

    Steps, in order:
      1. drop rows where every category column is missing,
      2. drop fully duplicated rows,
      3. keep a single row per ``potentialOutPath`` value,
      4. prepend ``transcript_root`` to ``potentialOutPath``.

    Args:
        csv_path: path of the metadata file to read.
        transcript_root: string prepended verbatim to each ``potentialOutPath``.

    Returns:
        The cleaned DataFrame.
    """
    frame = pd.read_csv(csv_path, lineterminator='\n', low_memory=False)
    frame = frame.dropna(subset=CATEGORY_COLUMNS, how='all')
    frame = frame.drop_duplicates()
    frame = frame.drop_duplicates(subset=['potentialOutPath'])
    frame['potentialOutPath'] = frame['potentialOutPath'].map(lambda rel_path: transcript_root + rel_path)
    return frame


# NOTE(review): this file carries a .tsv extension but is read with the default
# comma separator -- confirm the delimiter is really a comma.
df_before = _load_metadata(
    "/shared/3/projects/benlitterer/podcastData/processed/beforeFloydMonth/beforeFMonth.tsv",
    "/shared/3/projects/benlitterer/podcastData/prosodyMerged/beforeFMonth",
)
df_in = _load_metadata(
    "/shared/3/projects/benlitterer/podcastData/processed/floydMonth/floydMonthEnSHORT.csv",
    "/shared/3/projects/benlitterer/podcastData/prosodyMerged/floydMonth",
)

#### **1.1 Introduce an is_news column**

In [9]:
# Disable pandas' chained-assignment warning for the rest of the session.
pd.options.mode.chained_assignment = None

# category1 ... category10
category_columns = [f'category{i}' for i in range(1, 11)]

# is_news is 1 when any category column equals the exact string 'news', else 0.
for frame in (df_before, df_in):
    frame['is_news'] = frame[category_columns].eq('news').any(axis=1).astype('int64')

#### **1.2 Downsample the training data to a 4:1 distribution**
- Training data are df_before_news and df_after_news (not yet available)
- non-news : news = 4 : 1

In [10]:
NON_NEWS_PER_NEWS = 4   # target ratio, non-news : news
DOWNSAMPLE_SEED = 387   # shared by the non-news draw and the final shuffle

# Every news row is kept; non-news rows are drawn without replacement.
news_rows = df_before.loc[df_before['is_news'] == 1]
n_before_news = len(news_rows)
df_before_no_news = df_before.loc[df_before['is_news'] == 0].sample(
    n=NON_NEWS_PER_NEWS * n_before_news,
    replace=False,
    random_state=DOWNSAMPLE_SEED,
)

# Combine both classes, shuffle the rows, and renumber the index from 0.
df_before_news = (
    pd.concat([news_rows, df_before_no_news], ignore_index=True)
    .sample(frac=1, random_state=DOWNSAMPLE_SEED)
    .reset_index(drop=True)
)

### **2. Obtain the training and non-training data**
- df_train: dataframe for training data
- df_non_train: dataframe for non-training data

In [11]:
def split_transcript(transcript):
    """Concatenate every row of a transcript's ``content`` column into one string.

    Missing values count as empty strings and non-string values are converted
    with ``str``. Rows are joined in their existing order with no separator.
    The input frame is not modified.

    Args:
        transcript: DataFrame with a ``content`` column.

    Returns:
        str: the concatenated text ('' when the frame has no rows).
    """
    # NOTE(review): segments are joined without a separator; if a segment does
    # not carry its own leading/trailing whitespace, words at segment
    # boundaries fuse together -- confirm against the transcript files.
    segments = transcript['content'].fillna('').astype(str)
    return ''.join(segments)

In [None]:
# Build one record per sampled episode whose transcript file exists on disk.
#   X      - full transcript text (all `content` rows concatenated)
#   y      - is_news label taken from the metadata
#   real_y - placeholder (-1)
#   prob   - placeholder (-1.0); kept as a float so that assigning predicted
#            probabilities later does not change the column's dtype
#   path   - transcript file path
# Records are collected in a list and converted once: enlarging a DataFrame
# with `.loc[index] = ...` copies the frame on every insertion and leaves all
# columns with object dtype.
news_records = []
for path, y in zip(df_before_news['potentialOutPath'], df_before_news['is_news']):
    if not os.path.isfile(path):
        continue  # no transcript for this episode; skip it
    transcript = pd.read_csv(path, usecols=['start', 'end', 'content'])
    news_records.append({
        'X': split_transcript(transcript),
        'y': y,
        'real_y': -1,
        'prob': -1.0,
        'path': path,
    })

# `columns=` fixes the column order and keeps the columns present even when no
# transcript file was found.
df_news = pd.DataFrame(news_records, columns=['X', 'y', 'real_y', 'prob', 'path'])

In [None]:
# Discard episodes whose transcript text is empty.
has_text = df_news['X'].str.len() > 0
df_news = df_news[has_text]

# 50/50 random split with a fixed seed; each half is re-indexed from 0.
df_train, df_non_train = (
    half.reset_index(drop=True)
    for half in train_test_split(df_news, test_size=0.5, random_state=1)
)

### **3. Model training**
- MiniLM (microsoft/MiniLM-L12-H384-uncased)

In [None]:
# Training table with the two columns used below: integer `label` and raw `text`.
# Selecting and renaming columns replaces the row-by-row `.loc` enlargement,
# which copies the frame on every insertion.
df_train_miniLM = (
    df_train[['y', 'X']]
    .rename(columns={'y': 'label', 'X': 'text'})
    .astype({'label': 'int64'})
    .reset_index(drop=True)
)

# preserve_index=False keeps the pandas index out of the Dataset, so there is
# no `__index_level_0__` column to remove afterwards (and no failure when that
# column is absent).
ds_train_miniLM = Dataset.from_pandas(df_train_miniLM, preserve_index=False)

### **Note: May change the following parameters as per David's advice:**
- max_length=512 in tokenizer()
- per_device_train_batch_size=64 in TrainingArguments()
- num_train_epochs=3 in TrainingArguments()

In [None]:
MODEL_CHECKPOINT = "microsoft/MiniLM-L12-H384-uncased"
MAX_TOKENS = 512        # every text is padded / truncated to this many tokens
TRAIN_BATCH_SIZE = 64   # per-device batch size
N_EPOCHS = 3

tokenizer = AutoTokenizer.from_pretrained(MODEL_CHECKPOINT)
# Two output labels, matching the 0/1 `label` column.
miniLM = AutoModelForSequenceClassification.from_pretrained(MODEL_CHECKPOINT, num_labels=2)


def tokenize_function(examples):
    """Tokenize the "text" field of a batch, padding/truncating to MAX_TOKENS tokens."""
    return tokenizer(
        examples["text"],
        padding="max_length",
        truncation=True,
        max_length=MAX_TOKENS,
    )


tokenized_datasets = ds_train_miniLM.map(tokenize_function, batched=True)

training_args = TrainingArguments(
    output_dir="test_trainer",
    per_device_train_batch_size=TRAIN_BATCH_SIZE,
    num_train_epochs=N_EPOCHS,
)
trainer = Trainer(model=miniLM, args=training_args, train_dataset=tokenized_datasets)
trainer.train()

### **4. Model calibration on non-train data**
- Predict probabilities for each row and fill in the "prob" column
- Build strata profile
- Annotate 10 random samples from each stratum
- Calculate the F1 score for each cutoff (9 in total)
- Find the best cutoff

#### **4.1 Predict probabilities & fill in the "prob" column**

In [None]:
from torch.nn.functional import softmax

# Wrap the held-out transcripts in a Dataset with a single "text" column.
# preserve_index=False keeps the pandas index out of the Dataset; a frame with
# a default index produces no `__index_level_0__` column, so trying to remove
# that column would raise.
prediction_inputs = df_non_train['X'].tolist()
predict_dset = Dataset.from_pandas(pd.DataFrame({"text": prediction_inputs}), preserve_index=False)

tokenized_predict_dset = predict_dset.map(tokenize_function, batched=True)
predictions = trainer.predict(tokenized_predict_dset)

# Convert the raw model outputs (logits) into probabilities.
# torch's softmax works on tensors and names the axis `dim` (not `axis`), so
# the array returned by the trainer is converted to a tensor first.
logits = torch.as_tensor(predictions.predictions)
# Column 1 is the probability of label 1 (is_news == 1).
probs = softmax(logits, dim=1)[:, 1].numpy()

# Replace the placeholder column with the predicted probabilities.
df_non_train['prob'] = probs

#### **4.2 Build strata profile**

In [None]:
# Ten equal-width probability bins: [0, 0.1], (0.1, 0.2], ..., (0.9, 1.0].
stratum_edges = [k / 10 for k in range(11)]
stratum_labels = [f'{lo}-{lo + 10}%' for lo in range(0, 100, 10)]

# include_lowest=True makes the first bin contain a probability of exactly 0.
df_non_train['stratum'] = pd.cut(
    df_non_train['prob'],
    bins=stratum_edges,
    labels=stratum_labels,
    include_lowest=True,
)

In [None]:
# Number of held-out transcripts in each probability stratum, most populated first.
stratum_counts = df_non_train['stratum'].value_counts()
stratum_counts

In [None]:
# Reorder the rows by stratum, in ascending order of the stratum column.
df_non_train = df_non_train.sort_values('stratum', ascending=True)

#### **4.3 Annotate 10 random samples from each stratum**

In [None]:
# Draw up to SAMPLES_PER_STRATUM rows from every stratum for manual annotation.
# The sample size is capped at the stratum's size: asking `sample` for more
# rows than a group holds (without replacement) raises a ValueError, which
# would abort the cell as soon as one probability bin has fewer than 10 rows.
# observed=True restricts the grouping to strata that actually contain rows.
SAMPLES_PER_STRATUM = 10
annote_df = (
    df_non_train
    .groupby('stratum', observed=True)
    .apply(lambda stratum_rows: stratum_rows.sample(
        n=min(SAMPLES_PER_STRATUM, len(stratum_rows)),
        random_state=1,
    ))
    .reset_index(drop=True)
)

#### **4.4 Calculate the F1 score for each cutoff (9 in total)**

#### **4.5 Find the best cutoff and compare inter-model performance**
- miniLM (this file)
- logistic regression