In [None]:
import pandas as pd
import matplotlib.pyplot as plt
import os
import numpy as np

# Analyze Corpus

Let's first look at the number of words spoken by male and female characters in Calderón's comedias.

In [None]:
# Load the per-character table, reading only the columns used in this notebook.
character_file = 'calderon-gender-prediction/all_characters.csv'
character_df = pd.read_csv(
    character_file,
    usecols=['id', 'genre', 'character_gender', 'character_id',
             'scenes', 'utterances', 'tokens', 'words_spoken'],
)

# Keep comedias only: drop autos sacramentales, loas, zarzuelas and mojigangas,
# then drop rows with any missing value and characters speaking 30 words or fewer.
comedias_df = (
    character_df[~character_df['genre'].isin(['auto sacramental', 'loa', 'zarzuela', 'mojiganga'])]
    .dropna()
    .loc[lambda frame: frame['words_spoken'] > 30]
)

print(comedias_df.shape)
print(comedias_df['genre'].value_counts())

In [None]:
# Number of distinct play ids left after filtering (printed as a 1-tuple shape).
print(pd.unique(comedias_df['id']).shape)

In [None]:
# Chart the number of words spoken by male and female characters.
# All genders share the same bin edges so the overlaid bars are comparable, and
# the bars are semi-transparent so one gender does not hide the other.
bin_edges = np.histogram_bin_edges(comedias_df['words_spoken'], bins=10)

fig, ax = plt.subplots()
for gender, gender_df in comedias_df.groupby('character_gender', sort=False):
    ax.hist(gender_df['words_spoken'], bins=bin_edges, alpha=0.5, label=gender)

# Labels are set once; the x axis shows `words_spoken`, so it is labelled as words.
ax.set_title('Number of Words Spoken by Character Gender')
ax.set_xlabel('Number of Words Spoken')
ax.set_ylabel('Number of Characters')
ax.legend()
plt.show()

Print the total number of words spoken by men and by women

In [None]:
# Total number of words spoken per gender.
print(comedias_df.groupby('character_gender')['words_spoken'].sum())

# Number of distinct character names per gender. Names are repeated across
# plays, so this undercounts the number of characters.
print(comedias_df.groupby('character_gender')['character_id'].nunique())

# Number of distinct (play, character) pairs per gender: a character name that
# appears in several plays is counted once per play.
print(
    comedias_df.drop_duplicates(subset=['id', 'character_id'])
    .groupby('character_gender')
    .size()
)

In [None]:
# How many characters in the corpus speak more than 30 words, overall ...
over_30_df = comedias_df.loc[comedias_df['words_spoken'].gt(30)]
print(over_30_df.shape)

# ... and broken down by gender.
print(over_30_df['character_gender'].value_counts())

Let's get the mean, median, standard deviation, min, and max of the number of words spoken by gender

In [None]:
# For each gender (in order of first appearance) print its label followed by the
# mean, median, standard deviation, minimum and maximum of `words_spoken`.
for gender, gender_df in comedias_df.groupby('character_gender', sort=False):
    words = gender_df['words_spoken']
    print(gender)
    for stat_name in ('mean', 'median', 'std', 'min', 'max'):
        print(getattr(words, stat_name)())

# Model Results

## No Aggregation

In [None]:
def proportion_correct(tokens_df, row_of_interest = 'predictions'):
    """Print per-class F1 and return macro-averaged precision, recall and F1.

    Parameters
    ----------
    tokens_df : pandas.DataFrame
        Must contain an ``is_male`` column (1 for male; any other value is
        treated as female) and the column named by ``row_of_interest``.
    row_of_interest : str, default 'predictions'
        Column holding the predicted label. Values may be hard 0/1 labels or
        fractional scores such as a mean of several 0/1 predictions; a value
        strictly greater than 0.5 counts as a "male" prediction. Thresholding
        (rather than testing equality with ``is_male``) keeps fractional
        aggregated predictions from always being scored as wrong.

    Returns
    -------
    tuple
        ``(average_precision, average_recall, average_f1)``, each the unweighted
        mean over the two classes. ``(0, 0, 0)`` is returned, and a message is
        printed, when either class has no correct prediction.
    """
    if len(tokens_df) == 0:
        male_correct = male_incorrect = female_correct = female_incorrect = 0
    else:
        actual_male = tokens_df['is_male'] == 1
        predicted_male = tokens_df[row_of_interest] > 0.5

        # Confusion-matrix cells; "incorrect" is named after the TRUE class.
        male_correct = int((actual_male & predicted_male).sum())
        male_incorrect = int((actual_male & ~predicted_male).sum())
        female_correct = int((~actual_male & ~predicted_male).sum())
        female_incorrect = int((~actual_male & predicted_male).sum())

    # With at least one correct prediction per class no denominator below is zero.
    if female_correct != 0 and male_correct != 0:
        m_precision = male_correct / (male_correct + female_incorrect)
        m_recall = male_correct / (male_correct + male_incorrect)
        m_f1 = 2 * (m_precision * m_recall) / (m_precision + m_recall)
        print('Male F1: ',m_f1)

        f_precision = female_correct / (female_correct + male_incorrect)
        f_recall = female_correct / (female_correct + female_incorrect)
        f_f1 = 2 * (f_precision * f_recall) / (f_precision + f_recall)
        print('Female F1: ',f_f1)

        average_precision = (m_precision + f_precision) / 2
        average_recall = (m_recall + f_recall) / 2
        average_f1 = (m_f1 + f_f1) / 2
        print('Average Precision: ', average_precision)
        print('Average Recall: ', average_recall)
        print('Average F1: ', average_f1)
    else:
        print('Model made no correct predicitons for one class')
        average_precision = 0
        average_recall = 0
        average_f1 = 0
    return average_precision, average_recall, average_f1

## Aggregation

In [None]:
def convert_string_to_numbers(input_string):
    """Parse a bracketed list of numbers stored as text into a list of floats.

    Accepts whitespace-separated values (``'[0.1 0.9]'``) as well as
    comma-separated values (``'[0.1, 0.9]'``); whitespace around the brackets
    is ignored. An empty list (``'[]'``) yields ``[]``.

    Raises ``ValueError`` if any item cannot be converted by ``float``.
    """
    cleaned = input_string.strip().strip('[]').replace(',', ' ')
    return [float(num) for num in cleaned.split()]

def geometric_mean_probability(df):
    """Aggregate row-level predictions into one row per character per play.

    ``df`` needs the columns ``character_id``, ``id`` (the play), ``is_male``,
    ``predictions`` and ``probabilities``. Each ``probabilities`` entry is a
    string holding two numbers; the first is used as the class-0 probability
    and the second as the class-1 (``is_male == 1``) probability.

    Returns a DataFrame with one row per ``(character_id, id)`` group:

    * ``geometric_mean_probability`` - geometric mean of the probability the
      model gave to the group's labelled class.
    * ``is_male`` - mean of the group's ``is_male`` values.
    * ``geo_predictions`` - 1 if the geometric mean of the class-1
      probabilities exceeds that of the class-0 probabilities, else 0. It is
      expressed in the same encoding as ``is_male`` and does not look at the
      label, so it can be compared with ``is_male`` directly.
    * ``average_prediction`` - mean of the group's ``predictions``.
    """
    result_list = []

    for (character_id, play_id), group in df.groupby(['character_id', 'id']):
        # Parse each probability string once: column 0 = class 0, column 1 = class 1.
        class_probs = np.array(
            [convert_string_to_numbers(text)[:2] for text in group['probabilities']],
            dtype=float,
        )

        # exp(mean(log p)) equals prod(p) ** (1 / n) but does not underflow to 0
        # for long groups. A probability of exactly 0 still gives a mean of 0.
        with np.errstate(divide='ignore'):
            geo_female, geo_male = np.exp(np.log(class_probs).mean(axis=0))

        mean_actual = group['is_male'].mean()

        # Reported probability: the one belonging to the labelled class.
        labelled_male = group['is_male'].iloc[0] == 1
        geometric_mean_prob = float(geo_male if labelled_male else geo_female)

        # Predicted class, chosen from the probabilities alone.
        mean_predict = int(geo_male > geo_female)

        result_list.append({
            'id': play_id,
            'character_id': character_id,
            'geometric_mean_probability': geometric_mean_prob,
            'is_male': mean_actual,
            'geo_predictions': mean_predict,
            'average_prediction' : group['predictions'].mean()
        })

    result_df = pd.DataFrame(result_list)
    return result_df



## Precision, Recall, and F1-score for each level of text input

### Character Level
All lines a character speaks in a play

In [None]:
# Load the character-level results and report precision / recall / F1.
# NOTE(review): this path is absolute (leading '/'), whereas all_characters.csv
# is loaded from a relative 'calderon-gender-prediction/...' path - confirm
# which location is intended.
tokens_df = pd.read_csv('/calderon-gender-prediction/results/tokens_bert-base-spanish-wwm-cased_1e-05_24_5.csv')
print("Character Level Predictions")
proportion_correct(tokens_df)

### Scene Level
All lines spoken by a character in a scene

In [None]:
# Scene-level results: first score every scene row on its own.
scenes_df = pd.read_csv('/calderon-gender-prediction/results/scenes_bert-base-spanish-wwm-cased_1e-05_32_12.csv')
print("Scenes Predictions")
proportion_correct(scenes_df)

# Then aggregate the scene rows to one row per character per play ...
print("Scenes Mean Predictions")
geo_mean = geometric_mean_probability(scenes_df)

# ... and score the aggregate using the mean of the per-scene predictions,
proportion_correct(geo_mean, 'average_prediction')

# and using the prediction derived from the geometric mean of the probabilities.
print("Scenes Geometric Mean Predictions")
proportion_correct(geo_mean, 'geo_predictions')


### Utterances
Each line spoken by a character

In [None]:
# Utterance-level results.
# NOTE(review): this file name says 'bert-base-spanish' while the other result
# files say 'bert-base-spanish-wwm-cased' - confirm it is the intended run.
utterances_df = pd.read_csv('/calderon-gender-prediction/results/utterances_bert-base-spanish_1e-05_32_14.csv')

# Score every utterance row on its own.
print("Utterances Predictions")
proportion_correct(utterances_df)

# Aggregate to one row per character per play and score the mean prediction.
print("Utterances Mean Predictions")
geo_mean = geometric_mean_probability(utterances_df)
proportion_correct(geo_mean, 'average_prediction')

# Score the prediction derived from the geometric mean of the probabilities.
print("Utterances Geometric Mean Predictions")
proportion_correct(geo_mean, 'geo_predictions')


# Analyze Predictions

Convert probabilities to single values rather than tuples

In [None]:
def convert_probabilities(df):
    """Replace ``df['probabilities']`` in place by a single confidence value.

    Each entry is expected to be a string such as ``'[0.9 0.1]'``. Its first
    number ``p`` is kept when ``p >= 0.5`` and replaced by ``1 - p`` otherwise,
    so the stored value is never below 0.5.

    Entries that are already numeric are left as they are (apart from the same
    ``1 - p`` folding), which makes the function safe to run more than once on
    the same frame - e.g. when a notebook cell is re-executed.

    The frame is modified in place; nothing is returned.
    """
    def _first_probability(value):
        # Only text still needs parsing; numbers come from an earlier run.
        if isinstance(value, str):
            return convert_string_to_numbers(value)[0]
        return float(value)

    first = df['probabilities'].apply(_first_probability)
    df['probabilities'] = first.where(first >= .5, 1 - first)

In [None]:
# Collapse the probability strings of the masked character-level results (in place).
# NOTE(review): `masked_tokens_df` is not assigned anywhere in the visible
# notebook - it has to be loaded before this cell for Restart & Run All to work.
convert_probabilities(masked_tokens_df)

Check to see the most confident wrong predictions from the model

In [None]:
# find columns where is_male != predictions and probabilities corresponding to predicitons is extreme 

def find_misclassified(df):
    """Return the rows of ``df`` whose ``predictions`` differ from ``is_male``.

    Before returning, prints the ``character_gender`` counts of those rows and
    the mean and standard deviation of their ``words_spoken``.
    """
    wrong_rows = df.loc[df['is_male'].ne(df['predictions'])]
    wrong_words = wrong_rows['words_spoken']

    print(wrong_rows['character_gender'].value_counts())
    print(wrong_words.mean())
    print(wrong_words.std())

    return wrong_rows
    

In [None]:
# Misclassified characters, restricted to those speaking more than 512 words.
misclassified_df = find_misclassified(masked_tokens_df).loc[
    lambda frame: frame['words_spoken'] > 512
]

I want to see if there's a relationship between the number of words spoken by a character and the model's confidence in its prediction

In [None]:
# Mean words spoken over all rows vs. over the misclassified rows.
# NOTE(review): `misclassified_df` was filtered to words_spoken > 512 in the
# cell that builds it, while `masked_tokens_df` is unfiltered here, so the two
# means are not computed on comparable populations - confirm this is intended.
print(masked_tokens_df['words_spoken'].mean())
print(misclassified_df['words_spoken'].mean())

In [None]:
# Number of rows per character gender in the masked character-level results.
print(masked_tokens_df['character_gender'].value_counts())

## Find Misclassified Characters


In [None]:
# Misclassified MALE characters, reduced to the columns worth inspecting.
misclassified_male = misclassified_df.loc[
    misclassified_df['character_gender'].eq('MALE'),
    ["id","genre","character_id","character_gender","words_spoken","tokens","probabilities"],
]
misclassified_male

In [None]:
# Misclassified FEMALE characters, reduced to the columns worth inspecting.
misclassified_female = misclassified_df.loc[
    misclassified_df['character_gender'].eq('FEMALE'),
    ["id","genre","character_id","character_gender","words_spoken","tokens","probabilities"],
]
misclassified_female

# Plot Model Confidence vs. Number of Words Spoken

In [None]:
# create a scatter plot of the number of words and the probability

def probabilites_by_words_spoken(df):
    """Scatter ``probabilities`` against ``words_spoken`` on the current pyplot axes.

    Points are green where ``predictions`` equals ``is_male`` and red otherwise.
    """
    matches = df['predictions'] == df['is_male']
    colors = ['green' if hit else 'red' for hit in matches]

    plt.scatter(df['words_spoken'], df['probabilities'], c=colors)
    plt.xlabel('Number of Words Spoken')
    plt.ylabel('Probability')
    plt.title('Probability of Model Prediction by Number of Words Spoken')

In [None]:
# Confidence vs. words spoken for every row of the masked character-level results.
probabilites_by_words_spoken(masked_tokens_df)

In [None]:
Add character names of main characters to the plot

In [None]:
# create a scatter plot of the number of words and the probability

def probabilites_by_words_spoken_main(df, min_words=2000):
    """Scatter confidence against words spoken for main characters, with name labels.

    Parameters
    ----------
    df : pandas.DataFrame
        Needs ``words_spoken``, ``probabilities``, ``predictions``, ``is_male``
        and ``character_id`` columns.
    min_words : int, default 2000
        Only rows with ``words_spoken`` strictly greater than this are plotted.

    A single 10x10 figure is drawn: points are green where ``predictions``
    equals ``is_male`` and red otherwise, and each point is annotated with its
    ``character_id``. The title and axis labels sit on that same figure.
    """
    main_df = df[df['words_spoken'] > min_words]

    matches = main_df['predictions'] == main_df['is_male']
    colors = ['green' if hit else 'red' for hit in matches]

    # One explicit figure/axes pair, so the labels and the name annotations end
    # up on the same plot instead of on two separate figures.
    fig, ax = plt.subplots(figsize=(10, 10))
    ax.scatter(main_df['words_spoken'], main_df['probabilities'], c=colors)
    ax.set_xlabel('Number of Words Spoken')
    ax.set_ylabel('Probability')
    ax.set_title('Probability of Model Prediction by Number of Words Spoken')

    # Write each character's name at its point.
    labelled = main_df[['words_spoken', 'probabilities', 'character_id']]
    for words, probability, name in labelled.itertuples(index=False, name=None):
        ax.text(words, probability, name)

High confidence is due to the nature of cross-entropy loss.
So, basically, the model is usually confident, but it is less confident when fewer words are spoken, and accuracy increases as more words are spoken.

In [None]:
# Labelled confidence plot for the characters with the most words.
probabilites_by_words_spoken_main(masked_tokens_df)

In [None]:
# create a scatter plot of the number of words and the probability

def probabilites_by_words_spoken_female(df):
    """Labelled confidence scatter for female main characters.

    Keeps rows with ``words_spoken`` > 2000, ``probabilities`` > 0.5 and
    ``is_male`` == 0, prints ``describe()`` of their probabilities, then draws a
    10x10 scatter (green = prediction equals label, red = it does not) with
    every point annotated by its ``character_id`` in small, rotated text.
    """
    keep = (df['words_spoken'] > 2000) & (df['probabilities'] > .50) & (df['is_male'] == 0)
    main_df = df[keep]
    print(main_df['probabilities'].describe())

    matches = main_df['predictions'] == main_df['is_male']
    colors = ['green' if hit else 'red' for hit in matches]

    ax = main_df.plot(x='words_spoken', y='probabilities', kind='scatter', figsize=(10, 10), c=colors)

    # Annotate each point with the character name, rotated by 40 degrees.
    for _, point in main_df[['words_spoken', 'probabilities', 'character_id']].iterrows():
        ax.text(*point, size='small', rotation=40)

In [None]:
# Labelled confidence plot restricted to female main characters.
probabilites_by_words_spoken_female(masked_tokens_df) 

### Model's most confident predictions for male and female characters

In [None]:
# Rows the model classified correctly ...
masked_correct_df = masked_tokens_df.loc[
    masked_tokens_df['is_male'].eq(masked_tokens_df['predictions'])
]

# ... split by the true label (1 = male, 0 = female).
male_correct = masked_correct_df.loc[masked_correct_df['is_male'].eq(1)]
female_correct = masked_correct_df.loc[masked_correct_df['is_male'].eq(0)]

In [None]:
### Most "Male" characters of the test set

In [None]:
# Correctly classified male characters with more than 512 words, reduced to the
# columns of interest; show the 10 rows with the highest probability.
male_correct = male_correct.loc[
    male_correct['words_spoken'] > 512,
    ["id","character_id","character_gender","words_spoken","tokens","probabilities"],
]
male_correct.nlargest(10, 'probabilities')

In [None]:
# Correctly classified female characters with more than 512 words, reduced to
# the columns of interest; show the 10 rows with the highest probability.
female_correct = female_correct.loc[
    female_correct['words_spoken'] > 512,
    ["id","character_id","character_gender","words_spoken","tokens","probabilities"],
]
female_correct.nlargest(10, 'probabilities')

### Most Male & Female Characters after masking

In [None]:
# Print (as plain text) the 10 rows of `male_correct` with the highest probability.
print(male_correct.nlargest(10, 'probabilities'))

In [None]:
# Print (as plain text) the 10 rows of `female_correct` with the highest probability.
print(female_correct.nlargest(10, 'probabilities'))

### Scatter plot of scenes probabilities

In [None]:
# Count the whitespace-separated words in each scene's text.
# NOTE(review): `masked_scenes_df` is not assigned anywhere in the visible
# notebook - it has to be loaded before this cell for Restart & Run All to work.
masked_scenes_df['words_spoken'] = masked_scenes_df['scenes'].apply(lambda x: len(x.split()))

In [None]:
# Collapse the probability strings of the scene-level results (modifies the frame in place).
convert_probabilities(masked_scenes_df)

In [None]:
# Preview the first rows only; the `scenes` column holds full scene texts, so
# displaying the whole frame produces a very large output.
masked_scenes_df.head()

In [None]:
# Let's look at this scatter plot for an individual character: every scene row
# whose character_id is 'rosaura'.
rosaura = masked_scenes_df.loc[masked_scenes_df['character_id'].eq('rosaura')]

probabilites_by_words_spoken(rosaura)

# Results by gender

In [None]:
# Split the scene-level results by true label: 0 -> female, 1 -> male.
female, male = (
    masked_scenes_df[masked_scenes_df["is_male"] == label] for label in (0, 1)
)

In [None]:
def precision(true_positive, false_positive):
    """Return ``true_positive / (true_positive + false_positive)``.

    ``true_positive`` is the number of correctly predicted positives and
    ``false_positive`` the number of wrongly predicted positives. When both
    are zero the ratio is undefined and 0.0 is returned instead of raising
    ``ZeroDivisionError``.
    """
    predicted_positive = true_positive + false_positive
    if predicted_positive == 0:
        return 0.0
    return true_positive / predicted_positive

In [None]:
def recall(true_positive, false_negative):
    """Return ``true_positive / (true_positive + false_negative)``.

    When both counts are zero the ratio is undefined and 0.0 is returned
    instead of raising ``ZeroDivisionError``.
    """
    actual_positive = true_positive + false_negative
    if actual_positive == 0:
        return 0.0
    return true_positive / actual_positive

In [None]:
def f1_score(precision, recall):
    """Return the harmonic mean ``2 * precision * recall / (precision + recall)``.

    When ``precision + recall`` is zero the score is undefined and 0.0 is
    returned instead of raising ``ZeroDivisionError``.
    """
    denominator = precision + recall
    if denominator == 0:
        return 0.0
    return 2 * (precision * recall) / denominator

In [None]:
def accuracy(true_positive, total):
    """Return the ratio ``true_positive / total``.

    The first argument is the numerator of the ratio (the count treated as
    correct by the caller); ``total`` is the denominator.
    """
    return true_positive / total

In [None]:
# Share of the female scene rows whose prediction equals the label.
true_positive = len(female.loc[female['is_male'].eq(female['predictions'])])
total = len(female)
accuracy_female = accuracy(true_positive, total)
accuracy_female

In [None]:
# Share of the male scene rows whose prediction equals the label.
true_positive = len(male.loc[male['is_male'].eq(male['predictions'])])
total = len(male)
accuracy_male = accuracy(true_positive, total)
accuracy_male

# Input size as a confounding variable

In [None]:
# Add a `length` variable: the number of whitespace-separated words in each scene.
scene_word_counts = masked_scenes_df['scenes'].apply(lambda text: len(text.split()))

# Inputs longer than 512 cannot be processed by the model, so every length above
# 512 is capped at 512.
# NOTE(review): the cap is applied to a word count, not a tokenizer token count - confirm.
masked_scenes_df['length'] = scene_word_counts
masked_scenes_df['length'] = masked_scenes_df['length'].clip(upper=512)

In [None]:
# Assign each scene to one of four equal-frequency bins of `length`; with
# labels=False the bins are identified by the integers 0 (shortest) to 3 (longest).
masked_scenes_df['quartiles'] = pd.qcut(masked_scenes_df['length'], q=4, labels=False)

In [None]:
# Group the scenes by length quartile ...
quartiles = masked_scenes_df.groupby('quartiles')

# ... and show summary statistics of the numeric columns for each group.
quartiles.describe().head()


In [None]:
# One frame per length quartile: sq1 holds quartile label 0, ..., sq4 holds label 3.
sq1, sq2, sq3, sq4 = (
    masked_scenes_df[masked_scenes_df['quartiles'] == quartile_label]
    for quartile_label in range(4)
)

In [None]:
# Precision / recall / F1 for the scenes in length quartile 0 (the shortest inputs).
proportion_correct(sq1)

In [None]:
# Precision / recall / F1 for the scenes in length quartile 1.
proportion_correct(sq2)

In [None]:
# Precision / recall / F1 for the scenes in length quartile 2.
proportion_correct(sq3)

In [None]:
# Precision / recall / F1 for the scenes in length quartile 3 (the longest inputs).
proportion_correct(sq4)

## Average probability Cross-dressing vs non cross-dressing

In [None]:
# NOTE(review): the bare `masked_tokens_df` expression below has no effect
# (only the last expression of a cell is displayed).
masked_tokens_df
# cross_dressed = ['lindabridis', 'claridiana', 'rosaura', 'eugenia', 'semíramis']
# NOTE(review): `cross_dressed` is simply the first five rows of
# `masked_tokens_df`; nothing here selects the characters named in the
# commented-out list above. This looks like a placeholder - confirm whether a
# filter on `character_id` was intended.
cross_dressed = masked_tokens_df.head(5)
cross_dressed

In [None]:
# First five rows of the summary statistics (count, mean, std, min, 25%) of the numeric columns.
cross_dressed.describe().head()