#### In this notebook, I prepare the train, validation, and test sets for logistic regression and naive Bayes models. The models are trained on (unigram, bigram) and (bigram, trigram) features, and their performance is measured by accuracy and F1 score.

#### The classes are deliberately balanced: for each label, positive and negative examples are kept at a 1:1 ratio.

In [36]:
import pandas as pd
import sys
sys.path.append("/home/bowenyi/.local/lib/python3.11/site-packages")
!pip install prettytable

[nltk_data] Downloading package punkt to /home/bowenyi/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to /home/bowenyi/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     /home/bowenyi/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [189]:
# Episode metadata for the month before and the month of the George Floyd protests.
# Both files are parsed with the same reader options.
# NOTE(review): the first file has a .tsv extension but is read with read_csv's
# default separator -- confirm the file really is comma-delimited.
csv_options = {"lineterminator": '\n', "low_memory": False}

df_before = pd.read_csv(
    "/shared/3/projects/benlitterer/podcastData/processed/beforeFloydMonth/beforeFMonth.tsv",
    **csv_options,
)
df_in = pd.read_csv(
    "/shared/3/projects/benlitterer/podcastData/processed/floydMonth/floydMonthEnSHORT.csv",
    **csv_options,
)

##### *Preprocess the dataframes: drop rows that have no category at all, then drop duplicate rows*:

In [190]:
# The ten category columns; a row is dropped only when ALL of them are missing.
_category_subset = [
    'category1', 'category2', 'category3', 'category4', 'category5',
    'category6', 'category7', 'category8', 'category9', 'category10',
]


def _drop_uncategorized_and_duplicates(frame):
    """Drop rows without any category, exact duplicate rows, and repeated 'potentialOutPath' values."""
    categorized = frame.dropna(subset=_category_subset, how='all')
    unique_rows = categorized.drop_duplicates()
    return unique_rows.drop_duplicates(subset=['potentialOutPath'])


df_before = _drop_uncategorized_and_duplicates(df_before)

df_in = _drop_uncategorized_and_duplicates(df_in)

##### Introduce two columns that represent data labels: is_news and is_politics. 

In [203]:
'''
Add two columns to the existing dataframes: "is_news", "is_politics". 

is_news and is_politics take the value of 1 or 0. 

is_politics is 1 if "politics" or "government" is one of the categories. According to Ashwin's 2021 paper 
Political Discussion is Abundant..., a podcast is political if it is about political institutions (definition modified). 
Thus, if a podcast is categorized as "government", it is political.
'''
pd.options.mode.chained_assignment = None  # default='warn'

CATEGORY_COLS = [f'category{i}' for i in range(1, 11)]


def add_label_columns(df):
    """Add the 0/1 label columns 'is_news' and 'is_politics' to ``df`` and return it.

    A row is labelled 1 when any of its ten category columns exactly equals the
    target value ('news' for is_news; 'politics' or 'government' for is_politics).
    The columns are added to ``df`` in place, as the original per-frame
    assignments did.
    """
    categories = df[CATEGORY_COLS]
    # isin(...).any(axis=1) is the vectorized form of "value in row.values".
    df['is_news'] = categories.isin(['news']).any(axis=1).astype(int)
    df['is_politics'] = categories.isin(['politics', 'government']).any(axis=1).astype(int)
    return df


df_before = add_label_columns(df_before)
df_in = add_label_columns(df_in)

# TODO: label df_after the same way once the "after Floyd month" metadata is loaded.
# df_after is not defined anywhere in this notebook yet, so calling this now
# would raise NameError on a fresh kernel:
# df_after = add_label_columns(df_after)

'\nAdd two columns to df_after\n'

In [207]:
'''
2. Set the ratio for politics/news vs. non-politics/news to 1:1. 
'''
def balanced_halves(frame, label_col, seed=387):
    """Split ``frame`` into (positives, negatives) with the same number of rows.

    All rows with ``label_col == 1`` are kept and an equally sized random sample
    of the ``label_col == 0`` rows is drawn without replacement. If there are
    fewer negatives than positives, the positives are downsampled instead, so
    the result is always 1:1 and sampling never asks for more rows than exist.
    """
    positives = frame[frame[label_col] == 1].copy()
    negatives = frame[frame[label_col] == 0]
    n_per_class = min(len(positives), len(negatives))
    if len(positives) > n_per_class:
        positives = positives.sample(n=n_per_class, replace=False, random_state=seed)
    negatives = negatives.sample(n=n_per_class, replace=False, random_state=seed).copy()
    return positives, negatives


def concat_and_shuffle(frames, seed=387):
    """Concatenate ``frames`` and return the rows in a reproducible random order."""
    combined = pd.concat(frames, ignore_index=True)
    return combined.sample(frac=1, random_state=seed).reset_index(drop=True)


# Before Floyd Month:
df_before_pol, df_before_no_pol = balanced_halves(df_before, 'is_politics')
n_before_pol = df_before_pol.shape[0]
df_before_news, df_before_no_news = balanced_halves(df_before, 'is_news')
n_before_news = df_before_news.shape[0]

# In Floyd Month:
df_in_pol, df_in_no_pol = balanced_halves(df_in, 'is_politics')
n_in_pol = df_in_pol.shape[0]
df_in_news, df_in_no_news = balanced_halves(df_in, 'is_news')
n_in_news = df_in_news.shape[0]

# Two test sets, for each of politics and news.
# Politics:
df_train_before_pol = concat_and_shuffle([df_before_pol, df_before_no_pol])
# The second test set inFMonth
df_test_in_pol = concat_and_shuffle([df_in_pol, df_in_no_pol])

# News:
df_train_before_news = concat_and_shuffle([df_before_news, df_before_no_news])
# The second test set inFMonth
df_test_in_news = concat_and_shuffle([df_in_news, df_in_no_news])

# TODO: After Floyd Month.
# df_after is not loaded in this notebook yet, so the "after" training frames
# are empty placeholders with the same columns as the "before" frames. They keep
# the downstream cells runnable without contributing any rows. Once df_after is
# available, replace them with:
#   df_after_pol, df_after_no_pol = balanced_halves(df_after, 'is_politics')
#   df_train_after_pol = concat_and_shuffle([df_after_pol, df_after_no_pol])
#   df_after_news, df_after_no_news = balanced_halves(df_after, 'is_news')
#   df_train_after_news = concat_and_shuffle([df_after_news, df_after_no_news])
df_train_after_pol = df_train_before_pol.iloc[0:0].copy()
df_train_after_news = df_train_before_news.iloc[0:0].copy()


# train_pol, dev_test_pol = train_test_split(df_train_pol, test_size=0.3, random_state=387)
# dev_pol, test_before_after_pol = train_test_split(dev_test_pol, test_size=2/3, random_state=387)
# test_in_pol = pd.concat([df_in_pol, df_in_no_pol], ignore_index=True).sample(frac=1, random_state=387).reset_index(drop=True)

# train_news, dev_test_news = train_test_split(df_train_news, test_size=0.3, random_state=387)
# dev_news, test_before_after_news = train_test_split(dev_test_news, test_size=2/3, random_state=387)
# test_in_news = pd.concat([df_in_news, df_in_no_news], ignore_index=True).sample(frac=1, random_state=387).reset_index(drop=True)

'''
Method 2 (undecided): Cross-validation
Maintain 15% data as heldout sets for is_news and is_politics. The rest 85% are for cross-validation. 
Each classifier has two test/heldout sets: test_cv_pol, test_inFMonth_pol | test_cv_news, test_inFMonth_news
'''
# # politics:
# df_train_pol = pd.concat([df_before_pol, df_before_no_pol, df_after_pol, df_after_no_pol], ignore_index=True)
# df_train_pol = df_train_pol.sample(frac=1, random_state=387).reset_index(drop=True)
# train_pol, test_cv_pol = train_test_split(df_train_pol, test_size=0.15, random_state=387)
# test_inFMonth_pol = pd.concat([df_in_pol, df_in_no_pol], ignore_index=True).sample(frac=1, random_state=387).reset_index(drop=True)

# # news:
# df_train_news = pd.concat([df_before_news, df_before_no_news, df_after_news, df_after_no_news], ignore_index=True)
# df_train_news = df_train_news.sample(frac=1, random_state=387).reset_index(drop=True)
# train_news, test_cv_news = train_test_split(df_train_news, test_size=0.15, random_state=387)
# test_inFMonth_news = pd.concat([df_in_news, df_in_no_news], ignore_index=True).sample(frac=1, random_state=387).reset_index(drop=True)



'\nMethod 2 (undecided): Cross-validation\nMaintain 15% data as heldout sets for is_news and is_politics. The rest 85% are for cross-validation. \nEach classifier has two test/heldout sets: test_cv_pol, test_inFMonth_pol | test_cv_news, test_inFMonth_news\n'

In [416]:
'''
Cross-Validation pipeline for is_news and is_politics: 
1. Split the dataset (non-inFMonth) into 85% for cross-validation (cv) and 15% for test. 
2. Initialize three models (Logistic Regression, Multinomial Naive Bayes, SVM) and a range of parameters for each (RandomSearch)
3. Pick the model and parameters with the best performance on cv  
4. Train the chosen model on the entire 85% dataset that was used for cv
5. Evaluate the model performance on two test sets: test_cv (15% of the dataset) and test_inFMonth (inFMonth data)
'''
# Helper 1: get the file path for beforeFloydMonth data
def get_path_before(potentialOutPath):
    """Return the transcript path for a beforeFloydMonth episode.

    The result is the beforeFMonth prosodyMerged root with ``potentialOutPath``
    appended verbatim (plain string concatenation, no separator is inserted).
    """
    return "/shared/3/projects/benlitterer/podcastData/prosodyMerged/beforeFMonth" + potentialOutPath

# Helper 2: get the file path for FloydMonth data
def get_path_in(potentialOutPath):
    """Return the transcript path for a FloydMonth episode.

    The result is the floydMonth prosodyMerged root with ``potentialOutPath``
    appended verbatim (plain string concatenation, no separator is inserted).
    """
    return "/shared/3/projects/benlitterer/podcastData/prosodyMerged/floydMonth" + potentialOutPath

# Helper 3: get the file path for afterFloydMonth data
def get_path_after(potentialOutPath):
    """Return the transcript path for an afterFloydMonth episode.

    Mirrors get_path_before / get_path_in: the root directory with
    ``potentialOutPath`` appended verbatim. Previously this was a stub that
    returned None, which makes ``os.path.isfile(...)`` in callers raise
    TypeError.

    NOTE(review): the root directory name comes from the author's
    commented-out draft -- confirm it once the afterFMonth data exists.
    """
    rootPath = "/shared/3/projects/benlitterer/podcastData/prosodyMerged/afterFMonth"
    return rootPath + potentialOutPath

#Helper 4: Preprocess text 
import re
import os
import nltk
import contractions
from urllib.parse import urlparse
from nltk.corpus import wordnet
from nltk import word_tokenize, pos_tag
from nltk.stem import WordNetLemmatizer
from nltk.stem import PorterStemmer
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords, wordnet
# Fetch the NLTK resources requested by this notebook (tokenizer, WordNet,
# stopword list, POS tagger).
for resource in ("punkt", 'wordnet', 'stopwords', 'averaged_perceptron_tagger'):
    nltk.download(resource)

# English stopwords plus the stray token 'u'.
# From here on the name `stopwords` refers to this set, not to the
# nltk.corpus.stopwords module imported above.
stopwords = set(nltk.corpus.stopwords.words('english')) | {'u'}

def get_wordnet_pos(tag):
    """Translate a POS tag string into a WordNet part-of-speech constant.

    The decision is made on the tag's first letter: 'J' -> adjective,
    'V' -> verb, 'N' -> noun, 'R' -> adverb. Any other tag falls back to noun.
    """
    prefix_to_pos = (
        ('J', wordnet.ADJ),
        ('V', wordnet.VERB),
        ('N', wordnet.NOUN),
        ('R', wordnet.ADV),
    )
    for prefix, pos in prefix_to_pos:
        if tag.startswith(prefix):
            return pos
    return wordnet.NOUN

def preprocess_text(text):
    """Clean a piece of transcript text and return it as a string of lemmas.

    Steps, in order: replace line breaks with spaces, drop bracketed and
    parenthesized annotations, lowercase, remove URLs and ``something.com``
    mentions, remove dates, digits and currency signs, replace non-ASCII
    characters with spaces, expand contractions, strip punctuation, remove
    stopwords, lemmatize with POS information, and remove lemmas that are
    stopwords.

    Args:
        text: raw text (str).

    Returns:
        The surviving lemmas joined by single spaces; '' if nothing survives.
    """
    # Replace line breaks with a space: deleting them would glue the last word
    # of one line to the first word of the next.
    text = re.sub(r'[\r\n]+', ' ', text)
    text = re.sub(r'\[.*?\]', '', text)  # remove [Music], (Audio), etc.
    text = re.sub(r'\(.*?\)', '', text)
    text = text.lower()  # convert to lowercase
    # Remove hyperlinks up to the next whitespace only. Line breaks are already
    # gone at this point, so a greedy ".*" would delete the whole rest of the
    # text after the first URL. This runs before the ".com" rule so the URL is
    # still intact when it is matched.
    text = re.sub(r'https?://\S+', '', text)
    text = re.sub(r'\b\w+\.com\b', '', text)  # remove something.com
    # Dates must be removed while their digits still exist.
    text = re.sub(r'\d+[\.\/-]\d+[\.\/-]\d+', '', text)
    text = re.sub(r'[$+\d]', '', text)  # remove digits and currency signs
    # Turn typographic apostrophes into ASCII ones before non-ASCII characters
    # are blanked out; otherwise "don’t" becomes "don t" and cannot be expanded.
    text = text.replace('\u2019', "'").replace('\u2018', "'")
    text = re.sub(r'[^\x00-\x7f]', r' ', text)  # remove non-ascii
    text = contractions.fix(text)
    text = re.sub(r'[^\w\s]', '', text)  # remove punctuation

    filtered_tokens = [word for word in word_tokenize(text) if word not in stopwords]

    # Lemmatization: compute each lemma once, then filter stopword lemmas.
    lemmatizer = WordNetLemmatizer()
    lemmatized_words = []
    for word, tag in pos_tag(filtered_tokens):
        lemma = lemmatizer.lemmatize(word, get_wordnet_pos(tag))
        if lemma not in stopwords:
            lemmatized_words.append(lemma)

    return ' '.join(lemmatized_words)
    
def split_transcript(transcript, chunk_seconds=60):
    """Split a transcript into preprocessed chunks of roughly ``chunk_seconds`` each.

    Rows are accumulated until at least ``chunk_seconds`` have elapsed since the
    start of the current chunk (measured with the rows' 'end' values) AND the
    current row's content contains a sentence-ending character. The accumulated
    text is then cleaned with preprocess_text and emitted. Leftover text after
    the last row is emitted as a final chunk.

    Args:
        transcript: DataFrame with 'content' and 'end' columns. It is not modified.
        chunk_seconds: minimum chunk length, in the units of the 'end' column
            (default 60, the value that was previously hard-coded).

    Returns:
        A list of single-element lists, each holding one cleaned chunk string.
        Chunks that are empty after cleaning are skipped.
    """
    end_of_sentence = ('.', '!', '?', ']', ')')
    # Derive a cleaned column instead of writing it back into the caller's frame.
    contents = transcript['content'].fillna('').astype(str)
    end_times = transcript['end']

    chunks = []  # list of lists; each sub-list holds one chunk string
    chunk = ""
    start_time = 0

    for content, end_time in zip(contents, end_times):
        if content.strip() == '':
            continue
        chunk += content
        if end_time - start_time < chunk_seconds:
            continue
        if not any(mark in content for mark in end_of_sentence):
            continue  # keep accumulating until a sentence boundary shows up
        cleaned = preprocess_text(chunk)
        if cleaned.strip() != '':  # ignore chunks that are only white space or empty
            chunks.append([cleaned])
        # The buffer is reset here, so the window start advances too, even when
        # the cleaned chunk was empty; otherwise the next chunk would be cut
        # short against a stale start time.
        start_time = end_time
        chunk = ""

    if chunk:
        cleaned = preprocess_text(chunk)
        if cleaned.strip() != '':
            chunks.append([cleaned])

    return chunks


[nltk_data] Downloading package punkt to /home/bowenyi/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to /home/bowenyi/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     /home/bowenyi/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /home/bowenyi/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!


In [None]:
'''
4. Prepare train, validate, test sets: is_pol and is_news
'''
import os


def load_labeled_chunks(frame, path_fn, label_col):
    """Read every available transcript listed in ``frame`` and return (texts, labels).

    Args:
        frame: DataFrame with a 'potentialOutPath' column and the ``label_col`` column.
        path_fn: function that maps a 'potentialOutPath' value to a transcript file path.
        label_col: name of the label column ('is_politics' or 'is_news').

    Returns:
        texts: the chunks returned by split_transcript for all transcripts, in row order.
        labels: one label per chunk, copied from the episode's ``label_col`` value.

    Rows whose transcript file does not exist (or whose path is None) are skipped.
    """
    texts, labels = [], []
    for _, row in frame.iterrows():
        transcript_path = path_fn(row["potentialOutPath"])
        # A missing path would make os.path.isfile raise; treat it like a missing file.
        if not transcript_path or not os.path.isfile(transcript_path):
            continue
        transcript = pd.read_csv(transcript_path, usecols=['start', 'end', 'content'])
        chunks = split_transcript(transcript)
        texts.extend(chunks)
        labels.extend([row[label_col]] * len(chunks))
    return texts, labels


# Read training data ("before" episodes first, then "after" episodes).
# is_pol
x_before_after_pol, y_before_after_pol = load_labeled_chunks(
    df_train_before_pol, get_path_before, "is_politics")
x_after_pol, y_after_pol = load_labeled_chunks(
    df_train_after_pol, get_path_after, "is_politics")
x_before_after_pol.extend(x_after_pol)
y_before_after_pol.extend(y_after_pol)

# is_news:
x_before_after_news, y_before_after_news = load_labeled_chunks(
    df_train_before_news, get_path_before, "is_news")
x_after_news, y_after_news = load_labeled_chunks(
    df_train_after_news, get_path_after, "is_news")
x_before_after_news.extend(x_after_news)
y_before_after_news.extend(y_after_news)

# Read second test set (in FMonth)
x_in_pol, y_in_pol = load_labeled_chunks(df_test_in_pol, get_path_in, "is_politics")
x_in_news, y_in_news = load_labeled_chunks(df_test_in_news, get_path_in, "is_news")

In [None]:
'''
5. Split data into train, validation, and test sets for is_pol and is_news.

Note: we have two test sets. x_in_news/pol and x_test_news/pol. 
'''
# train_test_split is not imported anywhere else in this notebook; import it
# here so the cell runs on a fresh kernel.
from sklearn.model_selection import train_test_split

# 70% train; the remaining 30% is split 1/3 dev (10% overall) and 2/3 test (20% overall).
# NOTE(review): the split is made on chunks, so chunks of one episode can end up
# in both train and test -- confirm this is intended.

#is_pol:
x_train_pol, x_other_pol, y_train_pol, y_other_pol = train_test_split(
    x_before_after_pol, y_before_after_pol, test_size=0.3, random_state=387)

x_dev_pol, x_test_pol, y_dev_pol, y_test_pol = train_test_split(
    x_other_pol, y_other_pol, test_size=(2/3), random_state=387)

#is_news:
x_train_news, x_other_news, y_train_news, y_other_news = train_test_split(
    x_before_after_news, y_before_after_news, test_size=0.3, random_state=387)

x_dev_news, x_test_news, y_dev_news, y_test_news = train_test_split(
    x_other_news, y_other_news, test_size=(2/3), random_state=387)

# is_pol:
# - train: x_train_pol, y_train_pol
# - dev: x_dev_pol, y_dev_pol
# - test: x_test_pol, y_test_pol; x_in_pol, y_in_pol
#
# is_news:
# - train: x_train_news, y_train_news
# - dev: x_dev_news, y_dev_news
# - test: x_test_news, y_test_news; x_in_news, y_in_news

In [None]:
# Transform the data for training
# Ngrams = (1, 2) and (2, 3)
from sklearn.feature_extraction.text import CountVectorizer


def as_documents(samples):
    """Return ``samples`` as a flat list of strings.

    split_transcript wraps every chunk in a single-element list, but
    CountVectorizer needs an iterable of plain strings (it calls ``.lower()``
    on each item). Items that are already strings are passed through.
    """
    return [sample if isinstance(sample, str) else ' '.join(sample) for sample in samples]


def vectorize_splits(train, dev, test, in_month, ngram_range):
    """Fit a CountVectorizer on ``train`` and transform all four splits.

    Returns (vectorizer, x_train, x_dev, x_test, x_in). The vocabulary is learned
    from the training split only; the other splits are transformed with it.
    """
    vectorizer = CountVectorizer(ngram_range=ngram_range)
    x_train = vectorizer.fit_transform(as_documents(train))
    x_dev = vectorizer.transform(as_documents(dev))
    x_test = vectorizer.transform(as_documents(test))
    x_in = vectorizer.transform(as_documents(in_month))
    return vectorizer, x_train, x_dev, x_test, x_in


# One vectorizer per (task, n-gram range): re-fitting a shared vectorizer on the
# news data would discard the vocabulary that the politics matrices were built with.

#is_pol:
#ngrams=(1,2):
count_vect_12_pol, x_train_12_pol, x_dev_12_pol, x_test_12_pol, x_in_12_pol = vectorize_splits(
    x_train_pol, x_dev_pol, x_test_pol, x_in_pol, (1, 2))

#ngrams=(2,3):
count_vect_23_pol, x_train_23_pol, x_dev_23_pol, x_test_23_pol, x_in_23_pol = vectorize_splits(
    x_train_pol, x_dev_pol, x_test_pol, x_in_pol, (2, 3))

#is_news:
#ngrams=(1,2):
count_vect_12_news, x_train_12_news, x_dev_12_news, x_test_12_news, x_in_12_news = vectorize_splits(
    x_train_news, x_dev_news, x_test_news, x_in_news, (1, 2))

#ngrams=(2,3):
count_vect_23_news, x_train_23_news, x_dev_23_news, x_test_23_news, x_in_23_news = vectorize_splits(
    x_train_news, x_dev_news, x_test_news, x_in_news, (2, 3))

# Backward-compatible names: previously these ended the cell fitted on the news
# training data.
count_vect_12 = count_vect_12_news
count_vect_23 = count_vect_23_news

In [None]:
# Store variables so that they can be accessed in other documents
# Each `%store name` line persists one variable through IPython's %store magic;
# another notebook can bring it back with `%store -r name`.
# NOTE(review): only the data splits and n-gram matrices are stored here, not the
# fitted CountVectorizer objects -- confirm downstream notebooks do not need them.
'''
is_pol: 
- train: x_train_pol, y_train_pol
- dev: x_dev_pol, y_dev_pol
- test: x_test_pol, y_test_pol; x_in_pol, y_in_pol 


is_news: 
- train: x_train_news, y_train_news
- dev: x_dev_news, y_dev_news
- test: x_test_news, y_test_news; x_in_news, y_in_news
'''
# Chunk texts and labels for the politics classifier.
%store x_train_pol
%store y_train_pol
%store x_dev_pol
%store y_dev_pol
%store x_test_pol
%store y_test_pol
%store x_in_pol
%store y_in_pol
    
# Chunk texts and labels for the news classifier.
%store x_train_news
%store y_train_news
%store x_dev_news
%store y_dev_news
%store x_test_news
%store y_test_news
%store x_in_news
%store y_in_news

# Count-vectorized feature matrices; the labels are the y_* lists stored above.
#is_pol
#ngrams=(1,2)
%store x_train_12_pol 
%store x_dev_12_pol  
%store x_test_12_pol 
%store x_in_12_pol

#ngrams=(2,3):
%store x_train_23_pol 
%store x_dev_23_pol  
%store x_test_23_pol  
%store x_in_23_pol 

#is_news:
#ngrams=(1,2):
%store x_train_12_news 
%store x_dev_12_news  
%store x_test_12_news  
%store x_in_12_news 

#ngrams=(2,3):
%store x_train_23_news 
%store x_dev_23_news 
%store x_test_23_news  
%store x_in_23_news 