In [2]:
import pandas as pd
import os
import sys
import numpy as np
import re
import nltk
import contractions
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer
from nltk.corpus import wordnet
from nltk import pos_tag
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import word_tokenize
# Download the NLTK resources used below for tokenising, POS-tagging,
# lemmatising and stop-word filtering.
for _resource in ("punkt", 'wordnet', 'stopwords', 'averaged_perceptron_tagger'):
    nltk.download(_resource)

# English stop words from NLTK, extended with the extra tokens 'u' and 'th'.
stopwords = set(nltk.corpus.stopwords.words('english')) | {'u', 'th'}

# NOTE(review): this runs after the imports above, so it cannot affect how
# they were resolved; it is also a user-specific absolute path.
sys.path.append("/home/bowenyi/.local/lib/python3.11/site-packages")

2024-01-21 03:41:21.325585: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  AVX2 AVX512F AVX512_VNNI FMA
To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.


Defaulting to user installation because normal site-packages is not writeable
[0m

### **1. Preprocess dataframes**

In [3]:
# Parsing options shared by both metadata files.
_read_kwargs = {"lineterminator": '\n', "low_memory": False}

# Episode metadata for the month before the Floyd month.
# NOTE(review): read with the default comma separator although the file is
# named .tsv — confirm the file really is comma-delimited.
df_before = pd.read_csv(
    "/shared/3/projects/benlitterer/podcastData/processed/beforeFloydMonth/beforeFMonth.tsv",
    **_read_kwargs,
)

# Episode metadata for the Floyd month itself.
df_in = pd.read_csv(
    "/shared/3/projects/benlitterer/podcastData/processed/floydMonth/floydMonthEnSHORT.csv",
    **_read_kwargs,
)

In [4]:
def _dedupe_and_prefix(frame, path_prefix):
    """Drop uncategorised and duplicate episodes, then absolutise transcript paths.

    Rows whose ten category columns are all missing are removed, followed by
    exact duplicate rows and rows sharing the same ``potentialOutPath``.
    ``path_prefix`` is then prepended to every ``potentialOutPath`` by plain
    string concatenation (no separator is inserted).
    """
    category_cols = [f'category{i}' for i in range(1, 11)]
    frame = frame.dropna(subset=category_cols, how='all')
    frame = frame.drop_duplicates()
    frame = frame.drop_duplicates(subset=['potentialOutPath'])
    frame['potentialOutPath'] = frame['potentialOutPath'].apply(lambda rel: path_prefix + rel)
    return frame


df_before = _dedupe_and_prefix(
    df_before, "/shared/3/projects/benlitterer/podcastData/prosodyMerged/beforeFMonth"
)
df_in = _dedupe_and_prefix(
    df_in, "/shared/3/projects/benlitterer/podcastData/prosodyMerged/floydMonth"
)

#### **1.1 Introduce an is_news column**

In [5]:
# Silence pandas' chained-assignment warning for the column assignments below.
pd.options.mode.chained_assignment = None

# The ten category columns that may carry the 'news' label.
category_cols = [f'category{i}' for i in range(1, 11)]

# is_news is 1 when any category column equals exactly 'news', else 0.
# The comparison is vectorised instead of applying a Python lambda per row;
# missing values compare as False, so the result matches the row-wise check.
df_before['is_news'] = df_before[category_cols].eq('news').any(axis=1).astype('int64')
df_in['is_news'] = df_in[category_cols].eq('news').any(axis=1).astype('int64')

#### **1.2 Downsample the training data to a 4:1 distribution**
- The training data are df_before_news and df_after_news (the latter is not available yet)
- non-news : news = 4 : 1

In [6]:
# Keep every news episode and draw four times as many non-news episodes
# without replacement (sample() raises if there are not enough of them).
news_rows = df_before.loc[df_before['is_news'] == 1]
n_before_news = len(news_rows)
df_before_no_news = df_before.loc[df_before['is_news'] == 0].sample(
    n=4 * n_before_news, replace=False, random_state=387
)

# Despite its name, df_before_news holds the combined news + non-news sample,
# shuffled with a fixed seed and re-indexed from 0.
df_before_news = (
    pd.concat([news_rows, df_before_no_news], ignore_index=True)
    .sample(frac=1, random_state=387)
    .reset_index(drop=True)
)

### **2. Obtain the training and non-training data**
- df_train: dataframe for training data
- df_non_train: dataframe for non-training data

In [6]:
def get_wordnet_pos(tag):
    """Translate a POS tag (as returned by ``pos_tag``) into a WordNet POS constant.

    The first character of ``tag`` selects the result: 'J' -> adjective,
    'V' -> verb, 'N' -> noun, 'R' -> adverb. Any other tag falls back to noun.
    """
    prefix_to_pos = {
        'J': wordnet.ADJ,
        'V': wordnet.VERB,
        'N': wordnet.NOUN,
        'R': wordnet.ADV,
    }
    return prefix_to_pos.get(tag[:1], wordnet.NOUN)

def preprocess_text(text):
    """Clean a raw transcript string and return its lemmas joined by spaces.

    Steps, in order: line breaks become spaces; bracketed/parenthesised asides
    are dropped; text is lower-cased; URLs and ``something.com`` mentions are
    removed; dates, digits, ``$`` and ``+`` are stripped; non-ASCII characters
    become spaces; contractions are expanded; punctuation is removed. The
    result is tokenised, stop words are dropped, the remaining tokens are
    POS-tagged and lemmatised, and lemmas that are stop words are dropped too.

    Parameters
    ----------
    text : str
        Raw transcript text.

    Returns
    -------
    str
        Space-separated lemmas; may be empty.
    """
    # Replace line breaks with a space (not '') so the words on either side
    # of a break are not fused into a single token.
    text = re.sub(r"\n", " ", text)
    text = re.sub(r'\[.*?\]', '', text)  # remove [Music], (Audio), etc.
    text = re.sub(r'\(.*?\)', '', text)
    text = text.lower()    # convert to lowercase
    # Remove only the URL itself. A greedy '.*' here would delete everything
    # that follows the first URL, because the text no longer has line breaks.
    text = re.sub(r'https?://\S+', '', text)   # remove hyperlinks
    text = re.sub(r'\b\w+\.com\b', '', text)  # remove something.com
    # Dates must be matched while their digits are still present.
    text = re.sub(r'\d+[\.\/-]\d+[\.\/-]\d+', '', text)   # remove dates
    text = re.sub(r'[$+\d]', '', text)   # remove digits, '$' and '+'
    text = re.sub(r'[^\x00-\x7f]', r' ', text)  # remove non-ascii
    text = contractions.fix(text)
    text = re.sub(r'[^\w\s]', '', text)   # remove punctuation

    filtered_tokens = [word for word in word_tokenize(text) if word not in stopwords]
    lemmatizer = WordNetLemmatizer()
    # Lemmatise each token once, then drop lemmas that are stop words.
    lemmas = (
        lemmatizer.lemmatize(word, get_wordnet_pos(tag))
        for word, tag in pos_tag(filtered_tokens)
    )
    return ' '.join(lemma for lemma in lemmas if lemma not in stopwords)


def split_transcript(transcript):
    """Split a transcript into preprocessed text chunks of roughly 60 time units.

    Rows with blank ``content`` are skipped. Content is accumulated until a
    row's ``end`` is at least 60 past the end of the previously flushed chunk
    AND that row contains an end-of-sentence marker; the accumulated text is
    then passed through ``preprocess_text`` and kept if non-empty. Whatever is
    left after the last row is flushed the same way.

    Parameters
    ----------
    transcript : pandas.DataFrame
        Must provide ``content`` and ``end`` columns.
        NOTE(review): ``end`` is presumably in seconds — confirm.

    Returns
    -------
    list of str
        Preprocessed chunks, in transcript order.
    """
    # Normalise into a local Series so the caller's DataFrame is not mutated.
    contents = transcript['content'].fillna('').astype(str)
    end_of_sentence = ('.', '!', '?', ']', ')')
    start_time = 0
    chunks = []
    chunk = ""

    for content, end in zip(contents, transcript['end']):
        if content.strip() == '':
            continue
        # NOTE(review): rows are concatenated without a separator; this
        # assumes each row's content carries its own surrounding whitespace.
        chunk += content
        if end - start_time < 60:
            continue
        if any(mark in content for mark in end_of_sentence):
            chunk = preprocess_text(chunk)
            if chunk.strip() != '':
                chunks.append(chunk)
            # Advance the window on every flush, including when the chunk
            # preprocessed to nothing; otherwise each following row would
            # immediately be cut into its own tiny chunk.
            start_time = end
            chunk = ""

    # Flush the trailing partial chunk.
    if len(chunk) != 0:
        chunk = preprocess_text(chunk)
        if chunk.strip() != '':
            chunks.append(chunk)

    return chunks

In [None]:
# Build one record per episode whose transcript file exists, then create the
# DataFrame in a single step (growing a frame with .loc row by row is slow).
news_records = []
for _, row in df_before_news.iterrows():
    path = row["potentialOutPath"]
    if not os.path.isfile(path):
        continue  # episodes without a transcript file are skipped
    transcript = pd.read_csv(path, usecols=['start', 'end', 'content'])
    news_records.append({
        'X': split_transcript(transcript),  # list of preprocessed chunks
        'y': row['is_news'],                # label derived from the categories
        'real_y': -1,                       # placeholder, filled in later
        'prob': -1.0,                       # placeholder; float so that model
                                            # probabilities fit without upcasting
        'path': path,
    })

df_news = pd.DataFrame(news_records, columns=['X', 'y', 'real_y', 'prob', 'path'])

In [None]:
# Discard episodes that produced no chunks, then split the rest 50/50 into
# training and non-training frames, each re-indexed from 0.
has_chunks = df_news['X'].map(len) > 0
df_news = df_news[has_chunks]
df_train, df_non_train = (
    part.reset_index(drop=True)
    for part in train_test_split(df_news, test_size=0.5, random_state=1)
)

### **3. Model training**
- Logistic regression (L2 penalty; `CountVectorizer` with `min_df = 0.025`)
- Unigrams and bigrams

In [None]:
# Flatten the per-episode chunk lists into one training example per chunk;
# every chunk inherits the label of the episode it came from.
X_train = [chunk for chunks in df_train['X'] for chunk in chunks]
y_train = [
    label
    for label, chunks in zip(df_train['y'], df_train['X'])
    for _ in chunks
]

In [None]:
from sklearn.linear_model import LogisticRegression

# Unigram + bigram counts; terms must occur in at least 2.5% of the chunks.
vectorizer = CountVectorizer(min_df=0.025, ngram_range=(1, 2))

# NOTE(review): X_train is rebound from a list of strings to the count matrix,
# so this cell cannot be re-run without re-running the previous one.
X_train = vectorizer.fit_transform(X_train)

log_reg = LogisticRegression(random_state=1)
log_reg.fit(X_train, y_train)

### **4. Model calibration on non-train data**
- Predict probabilities on each row, store output probabilities into "prob"
- Build strata profile
- Annotate 10 random samples from each stratum
- Calculate F-1 for each cutoff (9 in total)
- Find the best cutoff

#### **4.1 Predict probabilities & fill up the "prob" column**

In [None]:
# Each row of df_non_train['X'] is a list of chunk strings, whereas
# vectorizer.transform expects one string per document, so the column cannot
# be passed in directly. The model was trained on individual chunks, so each
# episode is scored chunk by chunk and its "prob" is the mean predicted
# probability of class 1 over its chunks. Every row has at least one chunk
# because empty episodes were filtered out before the split.
df_non_train['prob'] = [
    log_reg.predict_proba(vectorizer.transform(chunks))[:, 1].mean()
    for chunks in df_non_train['X']
]

#### **4.2 Build strata profile**

In [None]:
# Ten equal-width probability strata: edges 0.0, 0.1, ..., 1.0 and labels
# '0-10%', '10-20%', ..., '90-100%'. include_lowest keeps prob == 0 in the
# first stratum.
stratum_edges = [i / 10 for i in range(11)]
stratum_labels = [f'{low}-{low + 10}%' for low in range(0, 100, 10)]
df_non_train['stratum'] = pd.cut(
    df_non_train['prob'],
    bins=stratum_edges,
    labels=stratum_labels,
    include_lowest=True,
)

In [None]:
# Number of non-training episodes that fall into each probability stratum.
stratum_counts = df_non_train['stratum'].value_counts()
stratum_counts

In [None]:
# Order the rows by stratum (lowest stratum first).
df_non_train = df_non_train.sort_values(by=['stratum'], ascending=True)

#### **4.3 Annotate 10 random samples from each stratum**

In [None]:
# Draw up to 10 episodes per stratum for manual annotation. A stratum with
# fewer than 10 episodes contributes all of them instead of making sample()
# raise, and strata with no episodes are skipped (observed=True). Groups are
# visited in stratum order and concatenated, so the 'stratum' column is kept.
annote_df = pd.concat(
    [
        group.sample(n=min(len(group), 10), random_state=1)
        for _, group in df_non_train.groupby('stratum', observed=True)
    ],
    ignore_index=True,
)

#### **4.4 Calculate F-1 for each cutoff (9 in total)**

#### **4.5 Find the best cutoff and compare the inter-model performance**
- logistic regression (this file)
- miniLM