In [8]:
import spacy
import pandas as pd

# Load the spaCy pipeline named "en_core_web_sm" once and keep it in `nlp`;
# transform_with_spacy() calls it to tokenize text and read each token's tag.
# NOTE: the model package must already be installed in the kernel's environment
# (e.g. `python -m spacy download en_core_web_sm`), otherwise spacy.load raises.
nlp = spacy.load("en_core_web_sm")

In [9]:
# Load the pronoun-swap test cases. Later cells read the columns
# 'input_text', 'target_gender' and 'expected_output' from this frame.
df = pd.read_csv("pronoun_testcases.csv")

# Fail fast with a clear message if the CSV schema is not what the notebook
# expects, instead of a KeyError deep inside a row-wise lambda later on.
_required_columns = {"input_text", "target_gender", "expected_output"}
_missing_columns = _required_columns - set(df.columns)
if _missing_columns:
    raise ValueError(
        f"pronoun_testcases.csv is missing required columns: {sorted(_missing_columns)}"
    )

In [10]:
# Swap third-person singular pronouns in a sentence to match a target gender
def transform_with_spacy(text, target_gender):
    """Rewrite gendered third-person pronouns in ``text`` for ``target_gender``.

    The text is tokenized and tagged with the module-level spaCy pipeline
    ``nlp``. Only pronoun tokens are rewritten; every other token and all
    original whitespace are emitted unchanged.

    Args:
        text: The sentence(s) to transform.
        target_gender: ``"female"`` (he/him/his/himself -> she/her/her(s)/herself)
            or ``"male"`` (she/her/hers/herself -> he/him|his/his/himself).
            Matching is case-insensitive and ignores surrounding whitespace.
            Any other value leaves the text unchanged.

    Returns:
        The transformed text as a single string.
    """
    # Normalise the label so "Female" / " male " behave like "female" / "male".
    # str() keeps non-string values (e.g. NaN from a CSV) on the "no change" path.
    gender = str(target_gender).strip().lower()

    # Swaps that do not depend on the part-of-speech tag.
    simple_swaps = {
        "female": {"he": "she", "him": "her", "himself": "herself"},
        "male": {"she": "he", "hers": "his", "herself": "himself"},
    }.get(gender, {})

    transformed = []
    for token in nlp(text):
        orig = token.text
        lower = orig.lower()
        # PRP$ marks the determiner use ("his book" / "her book"); any other
        # tag is treated as the standalone/object use ("is his" / "saw her").
        is_possessive = token.tag_ == "PRP$"

        replacement = simple_swaps.get(lower)
        if gender == "female" and lower == "his":
            # "his book" -> "her book", but "the book is his" -> "... is hers".
            replacement = "her" if is_possessive else "hers"
        elif gender == "male" and lower == "her":
            # "her book" -> "his book", but "saw her" -> "saw him".
            replacement = "his" if is_possessive else "him"

        if replacement is None:
            # Not a pronoun we rewrite: keep the token byte-for-byte. Re-casing
            # it with str.capitalize() would mangle words like "NASA".
            new_word = orig
        elif len(orig) > 1 and orig.isupper():
            # Preserve shouting case: "HE" -> "SHE".
            new_word = replacement.upper()
        elif orig[0].isupper():
            # Preserve sentence-initial capitalisation: "He" -> "She".
            new_word = replacement.capitalize()
        else:
            new_word = replacement

        # Keep the token's trailing whitespace so spacing is reproduced exactly.
        transformed.append(new_word + token.whitespace_)

    return ''.join(transformed)

# Run transform_with_spacy over every test case and store the result in a new
# 'predicted_output' column. Iterating the two needed columns directly avoids
# building a Series per row (DataFrame.apply with axis=1) and also yields a
# well-formed empty column when the frame has no rows.
df['predicted_output'] = [
    transform_with_spacy(text, gender)
    for text, gender in zip(df['input_text'], df['target_gender'])
]

In [11]:
# Flag each test case as passing when the prediction matches the reference exactly
df['correct'] = df['predicted_output'].eq(df['expected_output'])

# Tally the passes against the number of test cases and express it as a percentage
total = df.shape[0]
correct = df['correct'].sum()
accuracy = correct / total * 100

# Report the score
print(f"\n Accuracy: {correct}/{total} correct ({accuracy:.2f}%)")


 Accuracy: 26/26 correct (100.00%)
