In [1]:
import pandas as pd
import sys
import os
import re
import nltk
import contractions
import torch
import numpy as np
from datasets import Dataset
from transformers import AutoModelForSequenceClassification
from transformers import AutoTokenizer
from sentence_transformers import SentenceTransformer
from nltk.corpus import wordnet
from transformers import TrainingArguments, Trainer
from nltk import pos_tag
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import word_tokenize
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer
nltk.download("punkt")
nltk.download('wordnet')
nltk.download('stopwords')
nltk.download('averaged_perceptron_tagger')
stopwords = set(nltk.corpus.stopwords.words('english'))
stopwords.add('u')
stopwords.add('th')
sys.path.append("/home/bowenyi/.local/lib/python3.11/site-packages")
!pip install prettytable
!pip install -U sentence-transformers

2024-01-20 16:25:13.440908: I tensorflow/core/platform/cpu_feature_guard.cc:181] Beginning TensorFlow 2.15, this package will be updated to install stock TensorFlow 2.15 alongside Intel's TensorFlow CPU extension plugin, which provides all the optimizations available in the package and more. If a compatible version of stock TensorFlow is present, only the extension will get installed. No changes to code or installation setup is needed as a result of this change.
More information on Intel's optimizations for TensorFlow, delivered as TensorFlow extension plugin can be viewed at https://github.com/intel/intel-extension-for-tensorflow.
[nltk_data] Downloading package punkt to /home/bowenyi/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to /home/bowenyi/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     /home/bowenyi/nltk_data...
[nltk_data]   Package stopword

Defaulting to user installation because normal site-packages is not writeable
Defaulting to user installation because normal site-packages is not writeable


### **1. Preprocess dataframes**

In [2]:
# Episode-level metadata for the month before and the month of the Floyd event.
BEFORE_FLOYD_METADATA_PATH = "/shared/3/projects/benlitterer/podcastData/processed/beforeFloydMonth/beforeFMonth.tsv"
FLOYD_MONTH_METADATA_PATH = "/shared/3/projects/benlitterer/podcastData/processed/floydMonth/floydMonthEnSHORT.csv"

# NOTE(review): the first file has a .tsv extension but is parsed with the default
# comma separator - confirm the file really is comma-delimited.
df_before = pd.read_csv(BEFORE_FLOYD_METADATA_PATH, low_memory=False, lineterminator='\n')
df_in = pd.read_csv(FLOYD_MONTH_METADATA_PATH, low_memory=False, lineterminator='\n')

In [3]:
def _clean_episode_metadata(frame, transcript_root):
    """Drop uncategorised and duplicate episodes, then make transcript paths absolute.

    Args:
        frame: metadata frame with columns category1..category10 and potentialOutPath.
        transcript_root: directory prefix prepended to every potentialOutPath value.

    Returns:
        A new frame; rows with all ten category columns missing are removed, exact
        duplicate rows are removed, and only the first row per potentialOutPath is kept.
    """
    category_cols = ['category1', 'category2', 'category3', 'category4', 'category5',
                     'category6', 'category7', 'category8', 'category9', 'category10']
    cleaned = frame.dropna(subset=category_cols, how='all')
    cleaned = cleaned.drop_duplicates()
    cleaned = cleaned.drop_duplicates(subset=['potentialOutPath'])
    # Plain string concatenation: the stored path is appended directly to the root.
    cleaned['potentialOutPath'] = cleaned['potentialOutPath'].apply(lambda x: transcript_root + x)
    return cleaned


# Re-running this cell prefixes the paths a second time; reload the data first.
df_before = _clean_episode_metadata(
    df_before, "/shared/3/projects/benlitterer/podcastData/prosodyMerged/beforeFMonth"
)
df_in = _clean_episode_metadata(
    df_in, "/shared/3/projects/benlitterer/podcastData/prosodyMerged/floydMonth"
)

#### **1.1 Introduce an is_news column**

In [4]:
# Silence pandas' chained-assignment warning for the column assignments below.
pd.options.mode.chained_assignment = None

# An episode is labelled news (1) when any of its ten category columns equals 'news', else 0.
category_columns = [f'category{i}' for i in range(1, 11)]
for metadata_frame in (df_before, df_in):
    metadata_frame['is_news'] = (
        metadata_frame[category_columns].eq('news').any(axis=1).astype('int64')
    )

#### **1.2 Downsample the training data to 4:1 distribution**
- Training data are `df_before_news` and `df_after_news` (the latter is not available yet)
- non-news : news = 4 : 1

In [5]:
# Keep every news episode and draw four times as many non-news episodes without
# replacement (fixed seed), then shuffle the combined frame with the same seed.
news_rows = df_before[df_before['is_news'] == 1]
n_before_news = len(news_rows)
df_before_no_news = df_before[df_before['is_news'] == 0].sample(
    n=4 * n_before_news, replace=False, random_state=387
)
# After this point df_before_news holds BOTH classes (news + sampled non-news).
df_before_news = (
    pd.concat([news_rows, df_before_no_news], ignore_index=True)
    .sample(frac=1, random_state=387)
    .reset_index(drop=True)
)

### **2. Obtain the training and non-training data**
- df_train: dataframe for training data
- df_non_train: dataframe for non-training data

In [6]:
def get_wordnet_pos(tag):
    """Translate a POS tag string into the matching WordNet POS constant.

    Only the first character of ``tag`` is inspected: 'J' -> adjective,
    'V' -> verb, 'N' -> noun, 'R' -> adverb. Anything else falls back to noun.
    """
    pos_by_initial = {
        'J': wordnet.ADJ,
        'V': wordnet.VERB,
        'N': wordnet.NOUN,
        'R': wordnet.ADV,
    }
    return pos_by_initial.get(tag[:1], wordnet.NOUN)

def preprocess_text(text):
    """Normalise a piece of transcript text into a space-separated string of lemmas.

    Steps, in order: replace line breaks with spaces, drop bracketed/parenthesised
    annotations, lowercase, strip hyperlinks and ``something.com`` mentions, strip
    dates, digits and the characters ``$``/``+``, replace non-ASCII characters with
    spaces, expand contractions, strip punctuation, tokenize, remove stop words,
    lemmatize with POS information, and remove lemmas that are stop words.

    Args:
        text: raw text (str).

    Returns:
        str: the surviving lemmas joined by single spaces ('' if nothing survives).
    """
    # Line breaks become spaces so the words on either side are not fused together.
    text = re.sub(r"\n", " ", text)
    text = re.sub(r'\[.*?\]', '', text)  # remove [Music], (Audio), etc.
    text = re.sub(r'\(.*?\)', '', text)
    text = text.lower()    # convert to lowercase
    # Hyperlinks: match only up to the next whitespace. A greedy `.*` here would
    # delete the remainder of the text, because no line breaks are left at this point.
    text = re.sub(r'https?://\S+', '', text)
    text = re.sub(r'\b\w+\.com\b', '', text)  # remove something.com
    # Dates must be removed while their digits are still present.
    text = re.sub(r'\d+[\.\/-]\d+[\.\/-]\d+', '', text)
    text = re.sub(r'[$+\d]', '', text)   # remove digits, '$' and '+'
    text = re.sub(r'[^\x00-\x7f]', r' ', text)  # remove non-ascii
    text = contractions.fix(text)
    text = re.sub(r'[^\w\s]', '', text)   # remove punctuation

    filtered_tokens = [word for word in word_tokenize(text) if word not in stopwords]
    lemmatizer = WordNetLemmatizer()
    # Lemmatize each token once, then discard lemmas that are themselves stop words.
    lemmas = (
        lemmatizer.lemmatize(word, get_wordnet_pos(tag))
        for word, tag in pos_tag(filtered_tokens)
    )
    return ' '.join(lemma for lemma in lemmas if lemma not in stopwords)


def split_transcript(transcript):
    """Cut a transcript into preprocessed text chunks of roughly one minute.

    Rows with blank content are skipped. Content is accumulated until a row's
    ``end`` is at least 60 past the current window start AND that row contains one
    of '.', '!', '?', ']', ')'; the accumulated text is then passed through
    ``preprocess_text``. A non-empty result is stored and moves the window start to
    that row's ``end``; an empty result is discarded without moving the window
    start. Whatever is left over after the last row is processed the same way.

    Side effect: ``transcript['content']`` is overwritten in place with NaNs
    replaced by '' and values cast to str.

    Args:
        transcript: DataFrame with 'content' and 'end' columns.

    Returns:
        list[str]: the preprocessed, non-empty chunks in transcript order.
    """
    transcript['content'] = transcript['content'].fillna('').astype(str)
    sentence_enders = ('.', '!', '?', ']', ')')
    window_start = 0
    pieces = []
    buffer = ""

    for raw_text, end_time in zip(transcript['content'], transcript['end']):
        text = str(raw_text)
        if text.strip() == '':
            continue

        buffer += text
        if end_time - window_start < 60:
            continue  # window not full yet
        if not any(marker in text for marker in sentence_enders):
            continue  # wait for a row that closes a sentence

        cleaned = preprocess_text(buffer)
        if cleaned.strip() != '':
            pieces.append(cleaned)
            window_start = end_time
        buffer = ""

    # Flush the trailing partial window.
    if len(buffer) != 0:
        cleaned = preprocess_text(buffer)
        if cleaned.strip() != '':
            pieces.append(cleaned)

    return pieces

In [7]:
def split_transcript_miniLM(transcript):
    """Return the whole transcript as one raw string.

    The 'content' column is overwritten in place (NaN -> '', values cast to str)
    and its entries are concatenated in row order with no separator.
    """
    transcript['content'] = transcript['content'].fillna('').astype(str)
    return ''.join(str(piece) for piece in transcript['content'])

In [8]:
# Build one record per episode whose transcript file exists, then create the frame
# in a single step. Appending rows to a DataFrame one at a time with .loc re-allocates
# on every insert and leaves every column as object dtype.
news_columns = ['raw_X', 'chunked_X', 'y', 'real_y', 'prob', 'path']
news_records = []
for path, is_news in zip(df_before_news["potentialOutPath"], df_before_news['is_news']):
    if not os.path.isfile(path):
        continue  # transcript missing on disk: skip the episode
    transcript = pd.read_csv(path, usecols=['start', 'end', 'content'])
    news_records.append({
        'raw_X': split_transcript_miniLM(transcript),  # full raw text (MiniLM input)
        'chunked_X': split_transcript(transcript),     # list of preprocessed chunks
        'y': is_news,
        'real_y': -1,   # placeholder, filled in later
        'prob': -1,     # placeholder, filled in later
        'path': path,
    })

# Passing `columns` keeps the expected schema even when no transcript was found.
df_news = pd.DataFrame(news_records, columns=news_columns)

In [9]:
# Keep only episodes that produced at least one chunk, then split them 50/50
# (fixed seed) into training and non-training frames with fresh 0..n-1 indices.
has_chunks = df_news['chunked_X'].map(len) > 0
df_news = df_news[has_chunks]
df_train, df_non_train = (
    part.reset_index(drop=True)
    for part in train_test_split(df_news, test_size=0.5, random_state=1)
)

### **3. Train two models**
- MiniLM
- Logistic regression (L2 penalty; `min_df` was planned at around 1000, the code below currently uses `min_df=0.025`)

#### **3.1 MiniLM**

In [12]:
# MiniLM training table: integer label plus the raw (unchunked) transcript text.
# Built column-wise instead of appending one row at a time with .loc.
df_train_miniLM = (
    df_train[['y', 'raw_X']]
    .rename(columns={'y': 'label', 'raw_X': 'text'})
    .astype({'label': 'int64'})
    .reset_index(drop=True)
)

# preserve_index=False keeps the pandas index out of the Dataset, so there is no
# "__index_level_0__" column to remove afterwards (removing it fails when it is absent).
ds_train_miniLM = Dataset.from_pandas(df_train_miniLM, preserve_index=False)

### **Note: Once more memory is available, change `per_device_train_batch_size` and the number of epochs as per David's advice**

In [None]:
MINILM_CHECKPOINT = "microsoft/MiniLM-L12-H384-uncased"

# Pretrained tokenizer and encoder; the 2-way classification head is newly initialised.
tokenizer = AutoTokenizer.from_pretrained(MINILM_CHECKPOINT)
miniLM = AutoModelForSequenceClassification.from_pretrained(MINILM_CHECKPOINT, num_labels=2)


def tokenize_function(examples):
    """Tokenize the "text" field of a batch, truncating/padding every item to 512 tokens."""
    return tokenizer(
        examples["text"],
        truncation=True,
        max_length=512,
        padding="max_length",
    )


tokenized_datasets = ds_train_miniLM.map(tokenize_function, batched=True)

training_args = TrainingArguments(
    output_dir="test_trainer",
    per_device_train_batch_size=128,
    num_train_epochs=3,
)
trainer = Trainer(model=miniLM, args=training_args, train_dataset=tokenized_datasets)
trainer.train()

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at microsoft/MiniLM-L12-H384-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Map:   0%|          | 0/39694 [00:00<?, ? examples/s]

Detected kernel version 5.4.0, which is below the recommended minimum of 5.5.0; this can cause the process to hang. It is recommended to upgrade the kernel to the minimum version or higher.


Step,Training Loss


#### **3.2 Logistic regression**
- Trained on unigrams and bigrams

In [None]:
# Flatten to chunk level: every chunk is one training example and inherits the
# label of the episode it came from.
X_train = [chunk for chunks in df_train['chunked_X'] for chunk in chunks]
y_train = [
    label
    for label, chunks in zip(df_train['y'], df_train['chunked_X'])
    for _ in chunks
]

In [None]:
from sklearn.linear_model import LogisticRegression

# Unigram + bigram counts; terms appearing in fewer than 2.5% of the chunks are dropped.
vectorizer = CountVectorizer(min_df=0.025, ngram_range=(1, 2))
# X_train is rebound from a list of strings to the count matrix, so this cell
# cannot be re-run without first re-running the cell that builds the chunk list.
X_train = vectorizer.fit_transform(X_train)

log_reg = LogisticRegression(random_state=1)
log_reg.fit(X_train, y_train)

### **4. Model calibration on non-train data**

#### **4.1 MiniLM**
- Predict probabilities on each row
- Build strata profile
- Annotate 10 random samples from each stratum
- Calculate F-1 for each cutoff (9 in total)
- Find the best cutoff

In [None]:
# One probability column per model: 'prob' becomes the MiniLM column and a
# placeholder (-1) column is added for the logistic regression.
df_non_train = (
    df_non_train
    .rename(columns={'prob': 'prob_miniLM'})
    .assign(prob_log_reg=-1)
)

#### **4.1.1 Predict probabilities on each row & fill in the probability score column**

In [None]:
from torch.nn.functional import softmax

# Raw transcript text of every non-training episode, in row order.
prediction_inputs = df_non_train['raw_X'].tolist()
# preserve_index=False: the frame has a default index, so no "__index_level_0__"
# column is produced and nothing has to be removed afterwards.
predict_dset = Dataset.from_pandas(
    pd.DataFrame({"text": prediction_inputs}), preserve_index=False
)

tokenized_predict_dset = predict_dset.map(tokenize_function, batched=True)
predictions = trainer.predict(tokenized_predict_dset)

# trainer.predict returns the logits as a NumPy array, while torch's softmax needs a
# tensor and takes `dim` (not `axis`). Column 1 is the probability of label 1 (news).
logits = torch.as_tensor(predictions.predictions)
probs = softmax(logits, dim=1)[:, 1].numpy()

df_non_train['prob_miniLM'] = probs

#### **4.1.2 Build strata profile**

#### **4.2 Logistic Regression**
- Predict probabilities on each row, store output probabilities into "prob_log_reg"
- Build strata profile
- Annotate 10 random samples from each stratum
- Calculate F-1 for each cutoff (9 in total)
- Find the best cutoff

#### **4.2.1 Predict probabilities on each row & fill in the probability score column**

In [None]:
# The logistic regression was fit on individual preprocessed chunks, so each episode
# is scored chunk by chunk and the chunk probabilities are averaged per episode.
# NOTE(review): the mean is one reasonable aggregation - confirm it is the intended one.
chunk_lists = df_non_train['chunked_X'].tolist()
flat_chunks = [chunk for chunks in chunk_lists for chunk in chunks]
# chunk_owner[i] is the row position of the episode that flat_chunks[i] came from.
chunk_owner = np.repeat(
    np.arange(len(chunk_lists)),
    np.array([len(chunks) for chunks in chunk_lists], dtype=int),
)

X = vectorizer.transform(flat_chunks)
y = log_reg.predict_proba(X)  # column 1 corresponds to log_reg.classes_[1] (label 1 = news)

episode_probs = (
    pd.Series(y[:, 1])
    .groupby(chunk_owner)
    .mean()
    .reindex(range(len(chunk_lists)))  # keep row order; an episode without chunks would be NaN
)
df_non_train['prob_log_reg'] = episode_probs.to_numpy()