# Current Embeddings of Arguments and Counterarguments

In [9]:
# General imports
import os
import re
from enum import Enum
from typing import Optional
from ctypes import Union
import numpy as np
import pandas as pd

### OpenAI Setup

In [10]:
pip install openai


[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip is available: [0m[31;49m23.0.1[0m[39;49m -> [0m[32;49m23.3.2[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49mpython3.11 -m pip install --upgrade pip[0m
Note: you may need to restart the kernel to use updated packages.


In [11]:
import openai
from openai import OpenAI

In [12]:
client = OpenAI()

### Class Declarations

In [13]:
# Enum for categories

class Category(Enum):
    CULTURE = "culture"
    DIGITAL_FREEDOMS = "digital-freedoms"
    ECONOMY = "economy"
    EDUCATION = "education"
    ENVIRONMENT = "environment"
    FREE_SPEECH_DEBATE = "free-speech-debate"
    HEALTH = "health"
    INTERNATIONAL = "international"
    LAW = "law"
    PHILOSOPHY = "philosophy"
    POLITICS = "politics"
    RELIGION = "religion"
    SCIENCE = "science"
    SOCIETY = "society"
    SPORT = "sport" 

In [14]:
# Enum for analysis types

class AnalysisType(Enum):
    TSNE = "tsne"
    PCA = "pca"

In [15]:
# Enum for processing unit

class ProcessingUnit(Enum):
    GLOBAL = "global"
    CATEGORY = "category"
    DEBATE = "debate"

## Extract Arguments from File

#### [Debate] Arguments dict

In [16]:

def debate_extract_arguments(
    """ Extract arguments from category file: debate_topic.txt -> full.txt
    """
    category: Category,
    file_path: str,
    start_re: str = "# PRO",
    end_re: str = "# LITERATURE",
    pro_point_re: str = "# PRO\w+-POINT",
    pro_counter_re: str = "# PRO\w+-COUNTER",
    con_point_re: str = "# CON\w+-POINT",
    con_counter_re: str = "# CON\w+-COUNTER"
    ) -> {}:
    
    # try to open file from path
    try:
        with open(f'../arguana-counterargs-corpus/02-extracted-arguments/training/{category.value}/{file_path}/full.txt', 'r') as file:
            file_contents = file.read()
    except FileNotFoundError:
        print(f"File not found: {file_path + '.txt'}")
        return None
        
    # parse file contents
    lines: [] = re.split(r'\n', file_contents)

    # Enum for argument section
    class ArgumentSection(Enum):
        PRO = "pro"
        CON = "con"
        
    # Enum for argument type
    class ArgumentType(Enum):
        POINT = "point"
        COUNTER = "counter"
        
    # holds the extracted arguments for the debate topic
    debate_arguments = {}
    
    # holds the argument pairs data for the debate topic
    arguments = {
        'pro': [],
        'con': []
    }
    
    # Start looping through lines
    current_argument: str = ""
    start: bool = False
    current_argument_section = ArgumentSection.PRO
    current_argument_type = ArgumentType.POINT
    cur_pair = {}

    for line in lines:
        # skip to start line
        if (not start):
            if re.match(r'\s*' + start_re, line):
                start = True
                continue
            continue
        
        # special case when we reach # LITERATURE we append the last argument and return
        if re.match(r'\s*' + end_re, line):
            _append_argument_to_cur_pair(current_argument, current_argument_type, cur_pair)
            _append_cur_pair_to_arguments(current_argument_section, arguments, cur_pair)
            if len(arguments['pro']) and len(arguments['con']):
                debate_arguments[file_path] = arguments
            return debate_arguments

        # skip citations
        if re.match(r'\s*\[', line):
            continue 

        # Append an argument to current pair
        def _append_argument_to_cur_pair(current_argument: str, current_argument_type: ArgumentType, cur_pair: {}):
            if len(current_argument):
                if current_argument_type == ArgumentType.POINT:
                    cur_pair['point'] = current_argument
                else:
                    cur_pair['counter'] = current_argument

        # Append current argument pair to arguments
        def _append_cur_pair_to_arguments(current_argument_section: ArgumentSection, arguments: {}, cur_pair: {}):
            if len(cur_pair):
                if current_argument_section == ArgumentSection.PRO:
                    arguments["pro"].append(cur_pair)
                else:
                    arguments["con"].append(cur_pair)

        # case where we meet a pro point
        if re.match(r'\s*' + pro_point_re, line):
            _append_argument_to_cur_pair(current_argument, current_argument_type, cur_pair)
            _append_cur_pair_to_arguments(current_argument_section, arguments, cur_pair)
            current_argument_section = ArgumentSection.PRO
            current_argument_type = ArgumentType.POINT
            current_argument = ""
            cur_pair = {}
            continue

        # case where we meet a pro counter
        elif re.match(r'\s*' + pro_counter_re, line):
            _append_argument_to_cur_pair(current_argument, current_argument_type, cur_pair)
            current_argument_section = ArgumentSection.PRO
            current_argument_type = ArgumentType.COUNTER
            current_argument = ""
            continue

         # case where we meet a con point
        elif re.match(r'\s*' + con_point_re, line):
            _append_argument_to_cur_pair(current_argument, current_argument_type, cur_pair)
            _append_cur_pair_to_arguments(current_argument_section, arguments, cur_pair)
            current_argument_section = ArgumentSection.CON
            current_argument_type = ArgumentType.POINT
            current_argument = ""
            cur_pair = {}
            continue
            
        # case where we meet a con counter
        elif re.match(r'\s*' + con_counter_re, line):
            _append_argument_to_cur_pair(current_argument, current_argument_type, cur_pair)
            current_argument_section = ArgumentSection.CON
            current_argument_type = ArgumentType.COUNTER
            current_argument = ""
            continue
        
        # remove in-text citations
        line = re.sub(r'\[\w+\]', '', line)
        line = re.sub(r'\s\s+', '', line)
        current_argument += line.strip()
        
    # this should never actually be reached
    debate_arguments[file_path] = arguments
    return debate_arguments

SyntaxError: invalid syntax (1984694253.py, line 2)

#### [Category] Arguments dict

In [None]:
""" Extract all debates from a category: list_of_<category_path>_debates.txt -> <debate_topic>.txt """

def category_extract_arguments(category: Category) -> {}:
    # convert category.value to path syntax
    category_path = category.value.replace('-', '_')
    
    # try to open file from path
    try:
        with open(f'./file_paths/list_of_{category_path}_debates.txt', 'r') as file:
            file_contents = file.read()
    except FileNotFoundError:
        print(f"File not found: {f'list_of_{category_path}_debates.txt'}")
        return None
        
    # parse file contents
    debates: [] = re.split(r'\n', file_contents)
    
    # grab arguments for each debate in the category
    category_arguments = {}
    for i, debate in enumerate(debates):
        # add topic and arguments to category_arguments
        debate_arguments = debate_extract_arguments(category, debate)
        if debate_arguments:
            category_arguments.update(debate_extract_arguments(category, debate))
        else:
            _write_invalid_debate_to_file(category, debate)
    category_arguments = {f'{category.value}': category_arguments}
    return category_arguments

In [None]:
def _write_invalid_debate_to_file(category: Category, file_path: str):
    output_folder = f'../data_dump/data_valid_tally/'
    output_file_path = f'{output_folder}{category.value}.txt'
    if not os.path.exists(output_folder):
        os.makedirs(output_folder)
    file = open(output_file_path, "a")
    file.write(file_path)

#### [Global] Arguments dict

In [None]:
""" Extract all debates across all categories: all_categories.txt -> list_of_<category>_debates.txt """

def global_extract_arguments() -> {}:
    # open global file from path
    with open('./file_paths/all_categories.txt', 'r') as global_file:
        global_file_contents = global_file.read()
        
    # parse file contents
    category_pattern = re.compile(r'list_of_(\w+)_debates')
    lines: [] = re.split(r'\n', global_file_contents)
    category_paths = [line for line in lines if category_pattern.search(line)]
    category_names = [category_pattern.search(category).group(1).upper() for category in category_paths]
    
    # key: category: Category.value
    # value: dictionary of dictionaries where key = topic and value is {'pro: [{'point':, 'counter':}, ...], 'con': []}
    global_arguments = {} 
    # add valid topics as keys to extracted_categories and grab their arguments
    for index, category_str in enumerate(zip(category_paths, category_names)):
        try:
            category = Category[category_str[1]]
            global_arguments.update(category_extract_arguments(category))
        except KeyError as e:
            print(f"Category: {category_str[1]}, Category not found in Category enum and is removed.")
            category_paths.pop(index)
            category_names.pop(index)
    return global_arguments

## Convert to df

#### [Write to File] Arguments df

In [None]:
""" Write arguments df to pickle file """

def _arguments_df_write_to_file(
        arguments_data: pd.DataFrame,
        category: Optional[str] = None,
        topic: Optional[str] = None
    ):

    # Debate case
    if topic and category:
        topic_path = topic.replace('-', '_')
        output_folder = f'../data_dump/arguments_dump/{category}/'
        output_file_path = f'{output_folder}{topic_path}_arguments.pkl'
    
    # Category case
    elif category:
        output_folder = f'../data_dump/arguments_dump/{category}/'
        output_file_path = f'{output_folder}{category}_arguments.pkl'
    
    # Global case
    else:
        output_folder = f'../data_dump/arguments_dump/'
        output_file_path = f'{output_folder}global_arguments.pkl'
    if not os.path.exists(output_folder):
        os.makedirs(output_folder)
    arguments_data.to_pickle(output_file_path)

#### [Debate] Arguments df

In [None]:
""" Convert arguments dict into df """

def debate_convert_to_df(debate_arguments: {}, category: str) -> pd.DataFrame:
    debate_arguments_df = pd.DataFrame()
    debate_topic = next(iter(debate_arguments))

    # loop through all argument pairs in the # PRO section
    for i, pro_argument in enumerate(debate_arguments[debate_topic]["pro"]):
        point_argument = {
            'argument': pro_argument['point'],
            'pair_id': str(i),
            'type': 'point',
            'stance': 'PRO'
        }
        debate_arguments_df = pd.concat([debate_arguments_df, pd.DataFrame([point_argument])], axis=0)
        debate_arguments_df = debate_arguments_df.reset_index(drop=True)

        if 'counter' in pro_argument.keys():
            counter_argument = {
                'argument': pro_argument['counter'],
                'pair_id': str(i),
                'type': 'counter',
                'stance': 'CON'
            }
            debate_arguments_df = pd.concat([debate_arguments_df, pd.DataFrame([counter_argument])], axis=0)
            debate_arguments_df = debate_arguments_df.reset_index(drop=True)
    
    offset = len(debate_arguments[debate_topic]["pro"])
    
    # loop through all argument pairs in the # CON section
    for j, con_argument in enumerate(debate_arguments[debate_topic]["con"]):
        point_argument = {
            'argument': con_argument['point'],
            'pair_id': str(j+offset),
            'type': 'point',
            'stance': 'CON'
        }
        debate_arguments_df = pd.concat([debate_arguments_df, pd.DataFrame([point_argument])], axis=0)
        debate_arguments_df = debate_arguments_df.reset_index(drop=True)

        if 'counter' in con_argument.keys():
            counter_argument = {
                'argument': con_argument['counter'],
                'pair_id': str(j+offset),
                'type': 'counter',
                'stance': 'PRO'
            }
            debate_arguments_df = pd.concat([debate_arguments_df, pd.DataFrame([counter_argument])], axis=0)
            debate_arguments_df = debate_arguments_df.reset_index(drop=True)
            
    debate_arguments_df['topic'] = debate_topic
    debate_arguments_df = debate_arguments_df.dropna()
    _arguments_df_write_to_file(debate_arguments_df, category, debate_topic)
    return debate_arguments_df

#### [Category] Arguments df

In [None]:
""" Convert category arguments dict into df """

def category_convert_to_df(category_arguments: {}) -> pd.DataFrame:
    category_arguments_df = pd.DataFrame()
    category = next(iter(category_arguments))

    # Loop through debates in category
    debates = category_arguments[category]
    for debate in debates:
        debate_dict = category_arguments[category][debate]
        debate_df = debate_convert_to_df({debate: debate_dict}, category)
        category_arguments_df = pd.concat([category_arguments_df, debate_df], axis = 0)
        category_arguments_df = category_arguments_df.reset_index(drop=True)

    category_arguments_df['category'] = category
    category_arguments_df = category_arguments_df.dropna()
    _arguments_df_write_to_file(category_arguments_df, category)
    return category_arguments_df

#### [Global] Arguments df

In [None]:
""" Convert global arguments dict into df """

def global_convert_to_df(global_arguments: {}) -> pd.DataFrame:
    global_arguments_df = pd.DataFrame()

    # Loop through categories in global arguments
    for category in global_arguments.keys():
        global_arguments_df = pd.concat([global_arguments_df, category_convert_to_df({category: global_arguments[category]})], axis=0)
        global_arguments_df = global_arguments_df.reset_index(drop=True)

    global_arguments_df = global_arguments_df.dropna()
    _arguments_df_write_to_file(global_arguments_df)
    return global_arguments_df

## Get Embeddings

#### [Imports] Get Embeddings

In [None]:
pip install tenacity

In [None]:
import time
from tenacity import (
    retry,
    stop_after_attempt,
    wait_random_exponential
)

#### [Write] Embeddings df

In [None]:
""" Write extracted embeddings to pickle file """

def _embeddings_write_to_file(
    embeddings_data: pd.DataFrame,
    category: Optional[str] = None,
    topic: Optional[str] = None
    ):

    # Debate case
    if topic and category:
        topic_path = topic.replace('-', '_')
        output_folder = f'../data_dump/embeddings_dump/{category}/'
        output_file_path = f'{output_folder}{topic_path}_embeddings.pkl'
    
    # Category case
    elif category:
        output_folder = f'../data_dump/embeddings_dump/{category}/'
        output_file_path = f'{output_folder}{category}_embeddings.pkl'
    
    # Global case
    else:
        output_folder = f'../data_dump/embeddings_dump/'
        output_file_path = f'{output_folder}global_embeddings.pkl'
    if not os.path.exists(output_folder):
        os.makedirs(output_folder)
    embeddings_data.to_pickle(output_file_path)

#### [All] Embeddings df

In [None]:
""" Convert an argument into a (1 x 1536) embedding df """

DIM_EMBEDDING = 1536

@retry(wait=wait_random_exponential(min=60, max=500), stop=stop_after_attempt(10))
def _get_embeddings(arguments: []) -> []:
    embeddings = client.embeddings.create(input=arguments, model="text-embedding-ada-002")
    embeddings_data = [embedding_data.embedding for embedding_data in embeddings.data]
    embeddings_df = pd.DataFrame(embeddings_data, columns=[f"{str(i)}" for i in range(DIM_EMBEDDING)])
    return embeddings_df.reset_index(drop=True)

In [None]:
""" Add embeddings column to a df """

API_LIMIT = 1000

def get_embeddings_df(arguments_df: pd.DataFrame, processing_unit: ProcessingUnit, debate_category: Optional[Category] = None) -> pd.DataFrame:
    embeddings_df = pd.DataFrame()
    arguments_list = list(arguments_df['argument'])
    total_len = len(arguments_list)
    i = 0

    # Grab embeddings from arguments column in chunks
    while i < total_len:
        embeddings = _get_embeddings(arguments_list[i:min(total_len, i+API_LIMIT)])
        embeddings_df = pd.concat([embeddings_df, embeddings], axis=0, ignore_index=True)
        i = i + API_LIMIT
    arguments_embeddings_df = pd.concat([arguments_df, embeddings_df], axis=1)
    
    # Write embeddings df to file
    if processing_unit == ProcessingUnit.GLOBAL:
        _embeddings_write_to_file(arguments_embeddings_df, None, None)
    elif processing_unit == ProcessingUnit.CATEGORY:
        _embeddings_write_to_file(arguments_embeddings_df, arguments_embeddings_df['category'].iloc[0], None)
    elif processing_unit == ProcessingUnit.DEBATE:
        _embeddings_write_to_file(arguments_embeddings_df, debate_category.value, arguments_embeddings_df['topic'].iloc[0])
    return arguments_embeddings_df

## Analyze Embeddings

#### [Imports] Analysis df

In [None]:
from sklearn.manifold import TSNE
from sklearn.decomposition import PCA
from sklearn.discriminant_analysis import StandardScaler

#### [Write] Analysis df

In [None]:
""" Write analysis results to pickle file """

def _analysis_write_to_file(
    analysis_type: AnalysisType,
    processing_unit: ProcessingUnit, # Does the df contain 1 debate / 1 category / global
    analysis_data: pd.DataFrame,
    category: Optional[str] = None,
    topic: Optional[str] = None
    ):
    processing_level = processing_unit.value
    
    # Debate facet
    if topic and category:
        topic_path = topic.replace('-', '_')
        if processing_unit == ProcessingUnit.DEBATE:
            output_folder = f'../data_dump/{analysis_type.value}_dump/{category}/debates/'
        elif processing_unit == ProcessingUnit.CATEGORY:
            output_folder = f'../data_dump/{analysis_type.value}_dump/{category}/category-facet-debates/'
        elif processing_unit == ProcessingUnit.GLOBAL:
            output_folder = f'../data_dump/{analysis_type.value}_dump/{category}/global-facet-debates/'
        output_file_path = f'{output_folder}{topic_path}_{analysis_type.value}.pkl'
    
    # Category facet
    elif category:
        output_folder = f'../data_dump/{analysis_type.value}_dump/{category}/'
        if processing_unit == ProcessingUnit.CATEGORY:
            output_file_path = f'{output_folder}{category}_{analysis_type.value}.pkl'
        elif processing_unit == ProcessingUnit.GLOBAL:
            output_file_path = f'{output_folder}global_category_facet_{category}_{analysis_type.value}.pkl'
            
    # Global facet
    else:
        if processing_unit == ProcessingUnit.GLOBAL:
            output_folder = f'../data_dump/{analysis_type.value}_dump/'
            output_file_path = f'{output_folder}global_{analysis_type.value}.pkl'
        else:
            print(f"Invalid processing unit: {processing_unit}.")
   
    if not os.path.exists(output_folder):
        os.makedirs(output_folder)
    analysis_data.to_pickle(output_file_path)

#### [Debate] Analysis df

In [None]:
def pca_normalization(pair_df: pd.DataFrame) -> pd.DataFrame:
    """ Normalize PCA argument embeddings from a df
    """
    ret_df = pd.DataFrame()
    
    point_row = pair_df[pair_df['type'] == 'point']
    counter_row = pair_df[pair_df['type'] == 'counter']
    
    point_nonnum_row = point_row.select_dtypes(exclude=[np.number])
    counter_nonnum_row = counter_row.select_dtypes(exclude=[np.number])
    new_nonnum_rows = pd.concat([point_nonnum_row, counter_nonnum_row])
    new_nonnum_rows = new_nonnum_rows.reset_index(drop=True)
    
    point_vec = point_row.select_dtypes(include=[np.number]).values.flatten()
    counter_vec = counter_row.select_dtypes(include=[np.number]).values.flatten()

    center = (point_vec.copy() + counter_vec.copy()) / 2
    point_vec -= center
    counter_vec -= center
    point_vec = point_vec.flatten()
    counter_vec = counter_vec.flatten()

    point_new_num_row = pd.DataFrame(point_vec.reshape(1,-1), columns=['{}'.format(i) for i in range(len(point_vec))])
    counter_new_num_row = pd.DataFrame(counter_vec.reshape(1,-1), columns=['{}'.format(i) for i in range(len(counter_vec))])
    new_num_rows = pd.concat([point_new_num_row, counter_new_num_row])
    new_num_rows = new_num_rows.reset_index(drop=True)
    return new_nonnum_rows.join(new_num_rows)

In [None]:
def pca_preprocessing(embeddings_df: pd.DataFrame) -> pd.DataFrame:
    if len(embeddings_df) % 2 != 0:
        print("Warning: embeddings_df not in pairs")
    embeddings_ret_df = pd.DataFrame()
    topics = list(embeddings_df['topic'].unique())

    for topic in topics:
        topic_rows = embeddings_df[embeddings_df['topic'] == topic]
        if len(topic_rows) % 2 != 0:
            print(f"Warning: Topic '{topic}' has {len(topic_rows)} rows.")
        pair_ids = list(topic_rows['pair_id'].unique())
        
        for pair_id in pair_ids:
            pair_df = topic_rows[topic_rows['pair_id'] == pair_id]
            if len(pair_df) != 2:
                print(f"Warning: Pair {pair_id} at topic '{topic}' has {len(pair_df)} rows.")
                continue
            embeddings_ret_df = pd.concat([embeddings_ret_df, pca_normalization(pair_df)])
            embeddings_ret_df = embeddings_ret_df.reset_index(drop=True)
    return embeddings_ret_df

In [None]:
def pca_embeddings(
        embeddings_df: pd.DataFrame,
        num_components: int,
        processing_unit: ProcessingUnit=None,
        facet: ProcessingUnit=None,
        debate_category: Optional[str] = None
    ):
    embeddings_processed = pca_preprocessing(embeddings_df)
    numeric_columns = embeddings_processed.select_dtypes(include=[np.number]).columns
    non_numeric_columns = embeddings_processed.select_dtypes(exclude=[np.number]).columns
    embeddings_data = embeddings_processed[numeric_columns].values
    
    # scaler = StandardScaler()
    # embedding_vectors_scaled = scaler.fit_transform(embeddings_data)
    num_components = min(num_components, embeddings_data.shape[0], embeddings_data.shape[1])
    pca = PCA(n_components=num_components)
    embeddings_pca = pca.fit_transform(embeddings_data)
    embeddings_pca_data = (
        pd.DataFrame(embeddings_pca, columns=['pca_{}'.format(i) for i in range(num_components)])
        .join(embeddings_processed[non_numeric_columns].reset_index(drop=True))
    )

    # Printing PCA attributes
    components = pca.components_
    explained_variance = pca.explained_variance_
    explained_variance_ratio = pca.explained_variance_ratio_
    singular_values = pca.singular_values_
    mean_value = pca.mean_
    n_components = pca.n_components_
    n_features = pca.n_features_in_
    n_samples = pca.n_samples_
    
    # Displaying the attributes
    print("Components:")
    print(components)
    print("\nExplained Variance:")
    print(explained_variance)
    print("\nExplained Variance Ratio:")
    print(explained_variance_ratio)
    print("\nSingular Values:")
    print(singular_values)
    print("\nMean:")
    print(mean_value)
    print("\nNumber of Components:")
    print(n_components)
    print("\nNumber of Features:")
    print(n_features)
    print("\nNumber of Samples:")
    print(n_samples)

    # Write to file
    if facet == ProcessingUnit.DEBATE:
        _analysis_write_to_file(AnalysisType.PCA, processing_unit, embeddings_pca_data, debate_category, embeddings_pca_data['topic'].iloc[0])
    elif facet == ProcessingUnit.CATEGORY:
        _analysis_write_to_file(AnalysisType.PCA, processing_unit, embeddings_pca_data, embeddings_pca_data['category'].iloc[0])
    elif facet == ProcessingUnit.GLOBAL:
        _analysis_write_to_file(AnalysisType.PCA, processing_unit, embeddings_pca_data)
    else:
        pass
    return embeddings_pca_data

In [None]:
def debate_analyze_embeddings(
        analysis_type: AnalysisType,
        num_components,
        debate_embeddings_df: pd.DataFrame,
        debate_category: str
    ):
    """ Analyze argument embeddings from a debate df
    """
    if analysis_type == AnalysisType.TSNE:
        debate_embeddings_analysis = tsne_embeddings(debate_embeddings_df, ProcessingUnit.DEBATE, ProcessingUnit.DEBATE, debate_category)
    elif analysis_type == AnalysisType.PCA:
        debate_embeddings_analysis = pca_embeddings(debate_embeddings_df, num_components, ProcessingUnit.DEBATE, ProcessingUnit.DEBATE, debate_category)
    return debate_embeddings_analysis

#### [Category] Analysis df

In [None]:
def category_analyze_embeddings(
        analysis_type: AnalysisType,
        num_components,
        category_embeddings_df: pd.DataFrame,
        facet: ProcessingUnit
    ):
    """ Analyze argument embeddings from a category df
    """
    # Analyze Embeddings
    category = category_embeddings_df['category'].iloc[0]
    if analysis_type == AnalysisType.TSNE:
        if facet == ProcessingUnit.CATEGORY:
            category_embeddings_analysis = tsne_embeddings(category_embeddings_df, ProcessingUnit.CATEGORY, facet)
        elif facet == ProcessingUnit.DEBATE:
            category_embeddings_analysis = category_embeddings_df.groupby('topic').apply(lambda group: tsne_embeddings(group, ProcessingUnit.CATEGORY, facet, category))
            category_embeddings_analysis = category_embeddings_analysis.reset_index(drop=True)
        else:
            print(f"Inappropriate facet level: {facet}.")
    elif analysis_type == AnalysisType.PCA:
        if facet == ProcessingUnit.CATEGORY:
            category_embeddings_analysis = pca_embeddings(category_embeddings_df, num_components, ProcessingUnit.CATEGORY, facet)
        elif facet == ProcessingUnit.DEBATE:
            category_embeddings_analysis = category_embeddings_df.groupby('topic').apply(lambda group: pca_embeddings(group, num_components, ProcessingUnit.CATEGORY, facet, category))
            category_embeddings_analysis = category_embeddings_analysis.reset_index(drop=True)
        else:
            print(f"Inappropriate facet level: {facet}.")
    
    # Write to file for facet
    if facet == ProcessingUnit.DEBATE:
        output_folder = f'../data_dump/{analysis_type.value}_dump/{category}/'
        output_file_path = f'{output_folder}category_debate_facet_{category}_{analysis_type.value}.pkl'
        if not os.path.exists(output_folder):
            os.makedirs(output_folder)
        category_embeddings_analysis.to_pickle(output_file_path)
    return category_embeddings_analysis

#### [Global] Analysis df

In [None]:
def global_analyze_embeddings(
    analysis_type: AnalysisType,
    num_components: int,
    global_embeddings_df: pd.DataFrame,
    facet: ProcessingUnit
    ):
    """ Analyze argument embeddings from a global df
    """
    if analysis_type == AnalysisType.TSNE:
        if facet == ProcessingUnit.GLOBAL:
            global_embeddings_analysis = tsne_embeddings(global_embeddings_df, ProcessingUnit.GLOBAL, facet)
        elif facet == ProcessingUnit.CATEGORY:
            global_embeddings_analysis = global_embeddings_df.groupby('category').apply(lambda group: tsne_embeddings(group, ProcessingUnit.GLOBAL, facet))
        elif facet == ProcessingUnit.DEBATE:
            global_embeddings_analysis = global_embeddings_df.groupby('topic').apply(lambda group: tsne_embeddings(group, ProcessingUnit.GLOBAL, facet, group['category'].iloc[0]))
    elif analysis_type == AnalysisType.PCA:
        if facet == ProcessingUnit.GLOBAL:
            global_embeddings_analysis = pca_embeddings(global_embeddings_df, num_components, ProcessingUnit.GLOBAL, facet)
        elif facet == ProcessingUnit.CATEGORY:
            global_embeddings_analysis = global_embeddings_df.groupby('category').apply(lambda group: pca_embeddings(group, num_components, ProcessingUnit.GLOBAL, facet))
        elif facet == ProcessingUnit.DEBATE:
            global_embeddings_analysis = global_embeddings_df.groupby('topic').apply(lambda group: pca_embeddings(group, num_components, ProcessingUnit.GLOBAL, facet, group['category'].iloc[0]))
    
    # Write to file for facet
    if facet != ProcessingUnit.GLOBAL:
        output_folder = f'../data_dump/{analysis_type.value}_dump/'
        output_file_path = f'{output_folder}global_{facet.value}_facet_{analysis_type.value}.pkl'
        if not os.path.exists(output_folder):
            os.makedirs(output_folder)
        global_embeddings_analysis.to_pickle(output_file_path)
    return global_embeddings_analysis

## Plot Embeddings

#### [Imports] Analysis Plot

In [None]:
from plotnine import ggplot, geom_point, geom_text, geom_line, aes, theme, theme_void, labs, element_text, facet_wrap, ggsave

#### [Debate] Analysis Plot

In [None]:
def _insert_line_breaks(text, max_width=50):
    words = text.split(' ')
    lines = []
    current_line = ''

    for word in words:
        if len(current_line) + len(word) <= max_width:
            current_line += word + ' '
        else:
            lines.append(current_line.strip())
            current_line = word + ' '

    lines.append(current_line.strip())
    return '\n'.join(lines)

In [None]:
def debate_plot(
        analysis_type: AnalysisType,
        debate_category: str,
        embeddings_analysis_data: pd.DataFrame,
        processing_unit: ProcessingUnit=ProcessingUnit.DEBATE
    ):
    """ Plot embeddings for a single debate
    """
    # Plot
    stance_markers = {'PRO': '+', 'CON': '*'}
    debate_topic = embeddings_analysis_data['topic'].iloc[0]
    plot_topic = _insert_line_breaks(debate_topic.replace('-', ' '))
    plot_analysis_type = analysis_type.value.upper()
    gg = (
        ggplot(embeddings_analysis_data, aes(x='x', y='y', color='stance', shape='stance', group='pair_id')) +
        geom_point(size=2) +
        geom_line(color='black', size=0.5) +
        labs(
            title=f'{plot_analysis_type} Plot for Debate:\n{plot_topic}',
            x=f'{plot_analysis_type}_x',
            y=f'{plot_analysis_type}_y'
        ) +
        theme(
            axis_title=element_text(margin={'t': 20}),
            figure_size=(8, 8),
        )
    )
    
    # Save to file
    if processing_unit == ProcessingUnit.DEBATE:
        output_folder = f'../data_dump/{analysis_type.value}_plots_dump/{debate_category}/debate-plots/'
    elif processing_unit == ProcessingUnit.CATEGORY:
        output_folder = f'../data_dump/{analysis_type.value}_plots_dump/{debate_category}/debate-plots/category-facet-debate-plots/'
    elif processing_unit == ProcessingUnit.GLOBAL:
        output_folder = f'../data_dump/{analysis_type.value}_plots_dump/{debate_category}/debate-plots/global-facet-debate-plots/'
    output_file_path = f'{output_folder}{debate_topic}_{analysis_type.value}_plot.png'
    if not os.path.exists(output_folder):
        os.makedirs(output_folder)
    ggsave(gg, output_file_path)
    print(gg)

#### [Category] Analysis Plot

In [None]:
""" Plot embeddings for debates in a category """

def category_plot(
        analysis_type: AnalysisType,
        category_plot_data: pd.DataFrame,
        processing_unit: ProcessingUnit=ProcessingUnit.CATEGORY,
        facet: ProcessingUnit=ProcessingUnit.CATEGORY,
        view: ProcessingUnit=ProcessingUnit.CATEGORY
    ):
    
    # Plot
    plot_category = category_plot_data['category'].iloc[0]
    plot_analysis_type = analysis_type.value.upper()
    category_plot_data['interaction'] = category_plot_data['pair_id'] + '_' + category_plot_data['topic']
    if view == ProcessingUnit.CATEGORY:
        gg = (
            ggplot(category_plot_data, aes(x='x', y='y', color='topic', shape='stance', group='interaction')) +
            geom_point(size=2) +
            geom_line(color='black', size=0.5) +
            labs(
                title=f'{plot_analysis_type} Plot for Category:\n{plot_category}',
                x=f'{plot_analysis_type}_x',
                y=f'{plot_analysis_type}_y'
            ) +
            theme(
                legend_position="none",
                plot_title=element_text(size=24),
                strip_text=element_text(angle=0, hjust=0.5, vjust=1, wrap=True),
                figure_size=(16, 16)
            )
        )
    elif view == ProcessingUnit.DEBATE:
        gg = (
            ggplot(category_plot_data, aes(x='x', y='y', group='interaction')) +
            facet_wrap('~topic', ncol=5, scales='free') +
            geom_point(aes(color='stance'), size=1) +
            geom_line(color='black', size=0.5) +
            labs(
                title=f'{plot_analysis_type} Plot for Category:\n{plot_category}',
                x=f'{plot_analysis_type}_x',
                y=f'{plot_analysis_type}_y'
            ) +
            theme(
                axis_title=element_text(size=16),
                plot_title=element_text(size=32),
                strip_text=element_text(angle=0, hjust=0.5, vjust=1, wrap=True),
                figure_size=(24, 24)
            )
        )
    else:
        print(f'Inappropriate view level: {facet}')
        
    # Save to file
    output_folder = f'../data_dump/{analysis_type.value}_plots_dump/{plot_category}/{view.value}-view/'
    if processing_unit == ProcessingUnit.CATEGORY:
        if facet == ProcessingUnit.CATEGORY:
            output_file_path = f'{output_folder}{plot_category}_{analysis_type.value}_plot.png'
        elif facet == ProcessingUnit.DEBATE:
            output_file_path = f'{output_folder}category_debate_facet_{plot_category}_{analysis_type.value}_plot.png'
    elif processing_unit == ProcessingUnit.GLOBAL:
        output_file_path = f'{output_folder}global_category_facet_{plot_category}_{analysis_type.value}_plot.png'
    if not os.path.exists(output_folder):
        os.makedirs(output_folder)
    ggsave(gg, output_file_path)
    print(gg)

#### [Global] Analysis Plot

In [None]:
# Plot embeddings for all debates
def global_plot(
        analysis_type: AnalysisType,
        global_plot_data: pd.DataFrame,
        facet: ProcessingUnit=ProcessingUnit.GLOBAL,
        view: ProcessingUnit=ProcessingUnit.GLOBAL
    ):

    # Plot
    plot_analysis_type = analysis_type.value.upper()
    global_plot_data['interaction'] = global_plot_data['pair_id'] + '_' + global_plot_data['topic']
    if view == ProcessingUnit.GLOBAL:
        gg = (
            ggplot(global_plot_data, aes(x='x', y='y', color='category', shape='stance', group='interaction')) +
            geom_point(size=2) +
            geom_line(color='black', size=0.5) +
            labs(
                title=f'{plot_analysis_type} Plot for All Debates',
                x=f'{plot_analysis_type}_x',
                y=f'{plot_analysis_type}_y'
            ) +
            theme(
                legend_position="none",
                plot_title=element_text(size=32),
                strip_text=element_text(angle=0, hjust=0.5, vjust=1, wrap=True),
                figure_size=(24, 24)
            )
        )
    elif view == ProcessingUnit.CATEGORY:
        gg = (
            ggplot(global_plot_data.reset_index(drop=True), aes(x='x', y='y', group='interaction')) +
            facet_wrap('~category', ncol=2, scales='free') +
            geom_point(aes(color='topic'), size=1) +
            geom_line(color='black', size=0.5) +
            labs(
                title=f'{plot_analysis_type} Plot for All Categories',
                x=f'{plot_analysis_type}_x',
                y=f'{plot_analysis_type}_y'
            ) +
            theme(
                axis_title=element_text(size=16),
                plot_title=element_text(size=32),
                strip_text=element_text(angle=0, hjust=0.5, vjust=1, wrap=True),
                legend_position='none',
                figure_size=(24, 24)
            )
        )
    elif view == ProcessingUnit.DEBATE:
        print('But why? This is very very not recommended.')
        gg = (
            ggplot(global_plot_data.reset_index(drop=True), aes(x='x', y='y', group='interaction')) +
            facet_wrap('~category', scales='free') +
            facet_wrap('~topic', ncol=5, scales='free') +
            geom_point(aes(color='stance'), size=1) +
            geom_line(color='black', size=0.5) +
            labs(
                title=f'{plot_analysis_type} Plot for All Debates Across All Categories',
                x=f'{plot_analysis_type}_x',
                y=f'{plot_analysis_type}_y'
            ) +
            theme(
                axis_title=element_text(size=16),
                plot_title=element_text(size=32),
                strip_text=element_text(angle=0, hjust=0.5, vjust=1, wrap=True),
                legend_position='none',
                figure_size=(24, 24)
            )
        )

    # Save to file
    output_folder = f'../data_dump/{analysis_type.value}_plots_dump/{view.value}-view/'
    if facet == ProcessingUnit.GLOBAL:
        output_file_path = f'{output_folder}global_{analysis_type.value}_plot.png'
    elif facet == ProcessingUnit.CATEGORY:
        output_file_path = f'{output_folder}global_category_facet_{analysis_type.value}_plot.png'
    elif facet == ProcessingUnit.DEBATE:
        output_file_path = f'{output_folder}global_debate_facet_{analysis_type.value}_plot.png'
    if not os.path.exists(output_folder):
        os.makedirs(output_folder)
    ggsave(gg, output_file_path)
    print(gg)

## Run

#### [Run] Extract Arguments

In [None]:
# Run debate level extract arguments
economy_debate_arguments = debate_extract_arguments(Category.ECONOMY, "business-economy-general-house-would-prohibit-retailers-selling-certain-items")
economy_debate_arguments

In [None]:
# Run category level extract arguments
economy_category_arguments = category_extract_arguments(Category.ECONOMY)
economy_category_arguments

In [None]:
# Run global level extract arguments
global_arguments = global_extract_arguments()
global_arguments

#### [Run] Convert to DataFrame

In [None]:
# Run debate level convert to dataframe
economy_debate_arguments_df = debate_convert_to_df(economy_debate_arguments, Category.ECONOMY.value)
economy_debate_arguments_df

In [None]:
# Run category level convert to dataframe
economy_category_arguments_df = category_convert_to_df(economy_category_arguments)
economy_category_arguments_df

In [None]:
# Run global level convert to dataframe
global_arguments_df = global_convert_to_df(global_arguments)
global_arguments_df

#### [Load] Arguments df

In [None]:
arguments_data_path = "../data_dump/arguments_dump/"
loaded_economy_debate_arguments_df = pd.read_pickle(f"{arguments_data_path}economy/business_economy_general_house_would_prohibit_retailers_selling_certain_items_arguments.pkl")
loaded_economy_category_arguments_df = pd.read_pickle(f"{arguments_data_path}economy/economy_arguments.pkl")
loaded_global_arguments_df = pd.read_pickle(f"{arguments_data_path}global_arguments.pkl")

#### [Run] Get Embeddings

In [None]:
from IPython.display import display 

In [None]:
# Run debate level get embeddings
economy_debate_embeddings_df = get_embeddings_df(loaded_economy_debate_arguments_df, ProcessingUnit.DEBATE, Category.ECONOMY)
economy_debate_embeddings_df

In [None]:
# Run category level get embeddings
loaded_economy_category_arguments_df = pd.read_pickle(f"{arguments_data_path}economy/economy_arguments.pkl")
economy_category_embeddings_df = get_embeddings_df(loaded_economy_category_arguments_df, ProcessingUnit.CATEGORY)
economy_category_embeddings_df

In [None]:
# Run global level get embeddings
global_embeddings_df = get_embeddings_df(loaded_global_arguments_df, ProcessingUnit.GLOBAL)
global_embeddings_df

#### [Load] Embeddings df

In [None]:
embeddings_data_path = "../data_dump/embeddings_dump/"
loaded_economy_debate_embeddings_df = pd.read_pickle(f"{embeddings_data_path}economy/business_economy_general_house_would_prohibit_retailers_selling_certain_items_embeddings.pkl")
loaded_economy_category_embeddings_df = pd.read_pickle(f"{embeddings_data_path}economy/economy_embeddings.pkl")
loaded_global_embeddings_df = pd.read_pickle(f"{embeddings_data_path}global_embeddings.pkl")

In [None]:
loaded_global_embeddings_df2 = loaded_global_embeddings_df.copy()

In [None]:
pair_rows

In [None]:
matrix = pd.DataFrame()
for group_key, rows in loaded_global_embeddings_df2:
    matrix = pd.concat([matrix, rows])

In [None]:
matrix

In [None]:
processed = pca_preprocessing(loaded_global_embeddings_df)

In [None]:
loaded_global_embeddings_df2.groups.items()

#### [Run]  Analyze Embeddings

In [None]:
# Run debate level pca embeddings
economy_debate_pca_embeddings = debate_analyze_embeddings(AnalysisType.PCA, 10, loaded_economy_debate_embeddings_df, "economy")
economy_debate_pca_embeddings

In [None]:
# Run category level category facet pca embeddings
economy_category_pca_category_facet_embeddings = category_analyze_embeddings(AnalysisType.PCA, 10, loaded_economy_category_embeddings_df, ProcessingUnit.CATEGORY)
economy_category_pca_category_facet_embeddings

In [None]:
# Run category level debate facet pca embeddings
economy_category_pca_debate_facet_embeddings = category_analyze_embeddings(AnalysisType.PCA, 10, loaded_economy_category_embeddings_df, ProcessingUnit.DEBATE)
economy_category_pca_debate_facet_embeddings

In [None]:
# Run global level global facet pca embeddings
global_pca_global_facet_embeddings = global_analyze_embeddings(AnalysisType.PCA, 10, loaded_global_embeddings_df, ProcessingUnit.GLOBAL)
global_pca_global_facet_embeddings

In [None]:
# Run global level category facet pca embeddings
global_pca_category_facet_embeddings = global_analyze_embeddings(AnalysisType.PCA, 10, loaded_global_embeddings_df, ProcessingUnit.CATEGORY)
global_pca_category_facet_embeddings

In [None]:
# Run global level debate facet pca embeddings
global_pca_debate_facet_embeddings = global_analyze_embeddings(AnalysisType.PCA, 10, loaded_global_embeddings_df, ProcessingUnit.DEBATE)
global_pca_debate_facet_embeddings

#### [Load] Analysis df

In [None]:
""" pca """

pca_data_path = "../data_dump/pca_dump/"

# Debate level
loaded_economy_debate_pca_df = pd.read_pickle(f"{pca_data_path}economy/debates/business_economy_general_house_would_prohibit_retailers_selling_certain_items_pca.pkl")
loaded_category_debate_facet_economy_debate_pca_df = pd.read_pickle(f"{pca_data_path}economy/category-facet-debates/business_economy_general_house_would_prohibit_retailers_selling_certain_items_pca.pkl")
loaded_global_debate_facet_economy_debate_pca_df = pd.read_pickle(f"{pca_data_path}economy/global-facet-debates/business_economy_general_house_would_prohibit_retailers_selling_certain_items_pca.pkl")

# Category level
loaded_economy_category_pca_df = pd.read_pickle(f"{pca_data_path}economy/economy_pca.pkl")
loaded_category_debate_facet_economy_category_pca_df = pd.read_pickle(f"{pca_data_path}economy/category_debate_facet_economy_pca.pkl")
loaded_global_category_facet_economy_category_pca_df = pd.read_pickle(f"{pca_data_path}economy/global_category_facet_economy_pca.pkl")

# Global level
loaded_global_pca_df = pd.read_pickle(f"{pca_data_path}global_pca.pkl")
loaded_global_category_facet_pca_df = pd.read_pickle(f"{pca_data_path}global_category_facet_pca.pkl")
loaded_global_debate_facet_pca_df = pd.read_pickle(f"{pca_data_path}global_debate_facet_pca.pkl")

In [None]:
loaded_global_debate_facet_pca_df

#### [Run] Debate Level PCA Plots

In [None]:
# Run debate level pca plot
debate_plot(AnalysisType.PCA, "economy", loaded_economy_debate_pca_df, ProcessingUnit.DEBATE)

In [None]:
# Run debate level category facet pca plot
debate_plot(AnalysisType.PCA, "economy", loaded_category_debate_facet_economy_debate_pca_df, ProcessingUnit.CATEGORY)

In [None]:
# Run debate level global facet pca plot
debate_plot(AnalysisType.PCA, "economy", loaded_global_debate_facet_economy_debate_pca_df, ProcessingUnit.GLOBAL)

#### [Run] Category Level PCA Plots -> Category View

In [None]:
# Run category level category view pca plot
category_plot(AnalysisType.PCA, loaded_economy_category_pca_df, ProcessingUnit.CATEGORY, ProcessingUnit.CATEGORY, ProcessingUnit.CATEGORY)

In [None]:
# Run category level category debate facet category view pca plot
category_plot(AnalysisType.PCA, loaded_category_debate_facet_economy_category_pca_df, ProcessingUnit.CATEGORY, ProcessingUnit.DEBATE, ProcessingUnit.CATEGORY)

In [None]:
# Run category level global category facet category view pca plot
category_plot(AnalysisType.PCA, loaded_global_category_facet_economy_category_pca_df, ProcessingUnit.GLOBAL, ProcessingUnit.CATEGORY, ProcessingUnit.CATEGORY)

#### [Run] Category Level PCA Plots -> Debate View

In [None]:
# Run category level debates view pca plot
category_plot(AnalysisType.PCA, loaded_economy_category_pca_df, ProcessingUnit.CATEGORY, ProcessingUnit.CATEGORY, ProcessingUnit.DEBATE)

In [None]:
# Run category level category debate facet debates view pca plot
category_plot(AnalysisType.PCA, loaded_category_debate_facet_economy_category_pca_df, ProcessingUnit.CATEGORY, ProcessingUnit.DEBATE, ProcessingUnit.DEBATE)

In [None]:
# Run category level global category facet debates view pca plot
category_plot(AnalysisType.PCA, loaded_global_category_facet_economy_category_pca_df, ProcessingUnit.GLOBAL, ProcessingUnit.CATEGORY, ProcessingUnit.DEBATE)

#### [Run] Global Level PCA Plots -> Global View

In [None]:
# Run global level global view pca plot
global_plot(AnalysisType.PCA, loaded_global_pca_df, ProcessingUnit.GLOBAL, ProcessingUnit.GLOBAL)

In [None]:
# Run global level global category facet global view pca plot
global_plot(AnalysisType.PCA, loaded_global_category_facet_pca_df, ProcessingUnit.CATEGORY, ProcessingUnit.GLOBAL)

In [None]:
# Run global level global debate facet global view pca plot
global_plot(AnalysisType.PCA, loaded_global_debate_facet_pca_df, ProcessingUnit.DEBATE, ProcessingUnit.GLOBAL)

#### [Run] Global Level PCA Plots -> Category View

In [None]:
# Run global level category view pca plot
global_plot(AnalysisType.PCA, loaded_global_pca_df, ProcessingUnit.GLOBAL, ProcessingUnit.CATEGORY)

In [None]:
# Run global level global category facet category view pca plot
global_plot(AnalysisType.PCA, loaded_global_category_facet_pca_df, ProcessingUnit.CATEGORY, ProcessingUnit.CATEGORY)

In [None]:
# Run global level global debate facet category view pca plot
global_plot(AnalysisType.PCA, loaded_global_debate_facet_pca_df, ProcessingUnit.DEBATE, ProcessingUnit.CATEGORY)

## Load Plots

In [None]:
# Insert file path 