In [None]:
import pandas as pd
import matplotlib.pyplot as plt
import os
import numpy as np

# Analyze Corpus

Let's first look at the number of words spoken by male and female characters in Calderón's comedias.

In [None]:
# Read the per-character table, loading only the columns this notebook uses.
character_file = 'calderon-gender-prediction/all_characters.csv'
wanted_columns = ['id', 'genre', 'character_gender', 'character_id',
                  'scenes', 'utterances', 'tokens', 'words_spoken']
character_df = pd.read_csv(character_file, usecols=wanted_columns)

# Restrict the corpus to comedias by dropping the rows of the genres listed here.
non_comedia_genres = ['auto sacramental', 'loa', 'zarzuela', 'mojiganga']
comedias_df = character_df[~character_df['genre'].isin(non_comedia_genres)]

# Remove rows with any missing value, then keep rows with more than 30 words spoken.
comedias_df = comedias_df.dropna()
comedias_df = comedias_df[comedias_df['words_spoken'] > 30]

print(comedias_df.shape)
print(comedias_df['genre'].value_counts())

In [None]:
# Shape of the array of distinct 'id' values left after filtering
# (presumably one id per play; verify against the CSV schema).
distinct_ids = comedias_df['id'].unique()
print(distinct_ids.shape)

In [None]:
# Chart the number of words spoken by the characters of each gender.
# All genders share one set of bin edges and are drawn semi-transparently, so
# bar heights are directly comparable and no histogram hides another.
fig, ax = plt.subplots()
shared_bins = np.histogram_bin_edges(comedias_df['words_spoken'], bins=10)
for gender in comedias_df['character_gender'].unique():
    words = comedias_df.loc[comedias_df['character_gender'] == gender, 'words_spoken']
    ax.hist(words, bins=shared_bins, alpha=0.6, label=gender)

# Title, axis labels and legend are set once, after every series has been drawn.
# The x axis shows the 'words_spoken' column, so it is labelled as words.
ax.set_title('Number of Words Spoken by Character Gender')
ax.set_xlabel('Number of Words Spoken')
ax.set_ylabel('Number of Characters')
ax.legend()
plt.show()

Print the total number of words spoken by male and by female characters, followed by the number of characters of each gender.

In [None]:
# Total of the words_spoken column for each gender.
print(comedias_df.groupby('character_gender')['words_spoken'].sum())

# Number of characters of each gender. The same character_id may occur in more
# than one play, so a character is identified by its (id, character_id) pair
# rather than by character_id alone; counting character_id on its own would
# merge same-named characters from different plays.
distinct_characters = comedias_df[['id', 'character_id', 'character_gender']].drop_duplicates()
print(distinct_characters.groupby('character_gender').size())

In [None]:
# Rows whose character speaks more than 30 words: first the (rows, columns)
# shape of that subset ...
over_30_words = comedias_df.loc[comedias_df['words_spoken'] > 30]
print(over_30_words.shape)

# ... then how many of those rows belong to each gender.
print(over_30_words['character_gender'].value_counts())

Let's get the mean, median, standard deviation, min, and max of the number of words spoken by gender.

In [None]:
# For each gender print its label followed by the mean, median, standard
# deviation, minimum and maximum of words_spoken, one value per line.
for gender in comedias_df['character_gender'].unique():
    words = comedias_df.loc[comedias_df['character_gender'] == gender, 'words_spoken']
    print(gender)
    for statistic in (words.mean, words.median, words.std, words.min, words.max):
        print(statistic())

# Model Results

## No Aggregation

In [None]:
def proportion_correct(tokens_df, row_of_interest = 'predictions'):
    """Print per-gender F1 scores and return macro-averaged precision, recall and F1.

    Parameters
    ----------
    tokens_df : pandas.DataFrame
        Must contain an ``is_male`` column (1 marks a male row, any other value
        is treated as female) and the column named by ``row_of_interest``.
    row_of_interest : str, default 'predictions'
        Column holding the predictions. A value above 0.5 counts as a "male"
        prediction and anything else as "female". Hard 0/1 labels are scored
        exactly as before; fractional scores such as the mean of several 0/1
        predictions are scored as a majority decision instead of being counted
        as wrong whenever they are not exactly 0 or 1.

    Returns
    -------
    tuple
        ``(average_precision, average_recall, average_f1)``: the unweighted
        means of the male and female scores. All three are 0 when either class
        has no correct prediction at all.
    """
    actual_male = tokens_df['is_male'] == 1
    predicted_male = tokens_df[row_of_interest] > 0.5

    # Confusion-matrix cells, each named after the TRUE class of its rows:
    # "male_incorrect" are male rows predicted female, and vice versa.
    male_correct = int((actual_male & predicted_male).sum())
    male_incorrect = int((actual_male & ~predicted_male).sum())
    female_correct = int((~actual_male & ~predicted_male).sum())
    female_incorrect = int((~actual_male & predicted_male).sum())

    # With at least one correct prediction per class, every denominator below
    # is non-zero; otherwise report the degenerate case and return zeros.
    if male_correct == 0 or female_correct == 0:
        print('Model made no correct predicitons for one class')
        return 0, 0, 0

    m_precision = male_correct / (male_correct + female_incorrect)
    m_recall = male_correct / (male_correct + male_incorrect)
    m_f1 = 2 * (m_precision * m_recall) / (m_precision + m_recall)
    print('Male F1: ',m_f1)

    f_precision = female_correct / (female_correct + male_incorrect)
    f_recall = female_correct / (female_correct + female_incorrect)
    f_f1 = 2 * (f_precision * f_recall) / (f_precision + f_recall)
    print('Female F1: ',f_f1)

    average_precision = (m_precision + f_precision) / 2
    average_recall = (m_recall + f_recall) / 2
    average_f1 = (m_f1 + f_f1) / 2
    print('Average Precision: ', average_precision)
    print('Average Recall: ', average_recall)
    print('Average F1: ', average_f1)
    return average_precision, average_recall, average_f1

## Aggregation

In [None]:
def convert_string_to_numbers(input_string):
    """Parse a bracketed list of numbers such as ``'[0.1 0.9]'`` into floats.

    The numbers may be separated by whitespace, commas, or both, so both
    ``'[0.1 0.9]'`` and ``'[0.1, 0.9]'`` are accepted.
    """
    return [float(num) for num in input_string.strip('[]').replace(',', ' ').split()]

def geometric_mean_probability(df):
    """Aggregate row-level predictions into one row per (character_id, id) group.

    Parameters
    ----------
    df : pandas.DataFrame
        Needs the columns ``character_id``, ``id``, ``is_male``,
        ``predictions`` and ``probabilities``. Each ``probabilities`` value is
        a string holding at least two numbers; the first is used as the
        probability of class 0 (``is_male == 0``) and the second as the
        probability of class 1 (``is_male == 1``).

    Returns
    -------
    pandas.DataFrame
        One row per group with the columns

        * ``id``, ``character_id`` - the group keys;
        * ``geometric_mean_probability`` - geometric mean of the probability
          assigned to the group's true class (taken from its first row);
        * ``is_male`` - mean of the group's ``is_male`` values;
        * ``geo_predictions`` - aggregated class prediction: 1 when the
          geometric mean of the class-1 probabilities exceeds that of the
          class-0 probabilities, otherwise 0. This is a class label, so it can
          be compared with ``is_male`` directly, and it does not depend on the
          true label;
        * ``average_prediction`` - mean of the group's ``predictions``.
    """
    result_list = []

    for (character_id, play_id), group in df.groupby(['character_id', 'id']):
        # Parse every probability string once and keep its first two numbers.
        parsed = [convert_string_to_numbers(text) for text in group['probabilities']]
        class_probabilities = np.array([(pair[0], pair[1]) for pair in parsed], dtype=float)

        # Geometric mean per class as exp(mean(log p)): multiplying many
        # probabilities directly underflows to 0 for long groups. A zero
        # probability yields log = -inf and therefore a geometric mean of 0.
        with np.errstate(divide='ignore', invalid='ignore'):
            class0_geo_mean, class1_geo_mean = np.exp(np.log(class_probabilities).mean(axis=0))

        mean_actual = group['is_male'].mean()
        if group['is_male'].iloc[0] == 1:
            geometric_mean_prob = float(class1_geo_mean)
        else:
            geometric_mean_prob = float(class0_geo_mean)

        # Predict the class whose probabilities have the larger geometric mean.
        mean_predict = int(class1_geo_mean > class0_geo_mean)

        result_list.append({
            'id': play_id,
            'character_id': character_id,
            'geometric_mean_probability': geometric_mean_prob,
            'is_male': mean_actual,
            'geo_predictions': mean_predict,
            'average_prediction' : group['predictions'].mean()
        })

    result_df = pd.DataFrame(result_list)
    return result_df



## Precision, Recall, and F1-score for each level of text input

### Character Level
All lines a character speaks in a play

In [None]:
# Load the character-level result file and score its 'predictions' column.
# NOTE(review): absolute, machine-specific path - the notebook only runs where it exists.
character_results_csv = '/projekte/tcl/users/keithan/projectcalderon/wp1-semantic-analysis/gender-predict-pkg/results/tokens_bert-base-spanish-wwm-cased_1e-05_24_5.csv'
tokens_df = pd.read_csv(character_results_csv)
print("Character Level Predictions")
proportion_correct(tokens_df, row_of_interest='predictions')

### Scene Level
All lines spoken by a character in a scene

In [None]:
# Load the scene-level result file.
# NOTE(review): absolute, machine-specific path - the notebook only runs where it exists.
scenes_results_csv = '/projekte/tcl/users/keithan/projectcalderon/wp1-semantic-analysis/gender-predict-pkg/results/scenes_bert-base-spanish-wwm-cased_1e-05_32_12.csv'
scenes_df = pd.read_csv(scenes_results_csv)

# Score the row-level predictions as they are.
print("Scenes Predictions")
proportion_correct(scenes_df, row_of_interest='predictions')

# Aggregate per (character_id, id) group, then score the averaged predictions ...
print("Scenes Mean Predictions")
geo_mean = geometric_mean_probability(scenes_df)
proportion_correct(geo_mean, row_of_interest='average_prediction')

# ... and the geometric-mean based predictions.
print("Scenes Geometric Mean Predictions")
proportion_correct(geo_mean, row_of_interest='geo_predictions')


### Utterances
Each line spoken by a character

In [None]:
# Load the utterance-level result file.
# NOTE(review): absolute, machine-specific path - the notebook only runs where it exists.
utterances_results_csv = '/projekte/tcl/users/keithan/projectcalderon/wp1-semantic-analysis/gender-predict-pkg/results/utterances_bert-base-spanish_1e-05_32_14.csv'
utterances_df = pd.read_csv(utterances_results_csv)

# Score the row-level predictions as they are.
print("Utterances Predictions")
proportion_correct(utterances_df, row_of_interest='predictions')

# Aggregate per (character_id, id) group, then score the averaged predictions ...
print("Utterances Mean Predictions")
geo_mean = geometric_mean_probability(utterances_df)
proportion_correct(geo_mean, row_of_interest='average_prediction')

# ... and the geometric-mean based predictions.
print("Utterances Geometric Mean Predictions")
proportion_correct(geo_mean, row_of_interest='geo_predictions')


# Analyze Predictions

Convert probabilities to single values rather than tuples

In [None]:
def convert_probabilities(df):
    """Replace ``df['probabilities']`` in place with one number per row.

    Each value is reduced to the first number of its probability string ``p``
    and then folded to ``max(p, 1 - p)``: values below 0.5 become ``1 - p`` and
    all others are kept.

    The fold is applied to the whole column at once, so rows that share an
    index label are each converted from their own value. Values that are no
    longer strings are taken as already-extracted numbers, which makes a
    repeated call on the same frame a no-op instead of an error.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with a ``probabilities`` column; modified in place.

    Returns
    -------
    None
    """
    first_probability = df['probabilities'].apply(
        lambda value: convert_string_to_numbers(value)[0] if isinstance(value, str) else value
    )
    values = first_probability.astype(float).to_numpy()
    # Positional assignment: independent of the frame's index labels.
    df['probabilities'] = np.where(values >= .5, values, 1 - values)

In [None]:
# Rewrites the 'probabilities' column of masked_tokens_df in place.
# NOTE(review): masked_tokens_df is not assigned in any cell visible before this
# one - confirm it is loaded earlier so this cell runs on a fresh kernel.
convert_probabilities(masked_tokens_df)