In [2]:
import pandas as pd
import nltk
from nltk.translate.bleu_score import sentence_bleu
from nltk.tokenize import word_tokenize




df = pd.read_csv('testset.csv')


def calculate_sentence_bleu(reference, hypothesis):
    
    ref_tokens = word_tokenize(reference)
    hyp_tokens = word_tokenize(hypothesis)
    return sentence_bleu([ref_tokens], hyp_tokens, weights=(0.5, 0.5))

df['BLEU_translation_correct'] = df.apply(lambda row: calculate_sentence_bleu(row['translation_correct'], row['Transformer_pred']), axis=1)
df['BLEU_translation_wrong'] = df.apply(lambda row: calculate_sentence_bleu(row['translation_wrong'], row['Transformer_pred']), axis=1)
df['BLEU_English_correct'] = df.apply(lambda row: calculate_sentence_bleu(row['English_correct'], row['Transformer_pred']), axis=1)
df['BLEU_English_wrong'] = df.apply(lambda row: calculate_sentence_bleu(row['English_wrong'], row['Transformer_pred']), axis=1)

df.to_csv('output_with_bleu_scores.csv', index=False)


[nltk_data] Downloading package punkt to
[nltk_data]     /Users/guzhengwei/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
The hypothesis contains 0 counts of 2-gram overlaps.
Therefore the BLEU score evaluates to 0, independently of
how many N-gram overlaps of lower order it contains.
Consider using lower n-gram order or use SmoothingFunction()
The hypothesis contains 0 counts of 2-gram overlaps.
Therefore the BLEU score evaluates to 0, independently of
how many N-gram overlaps of lower order it contains.
Consider using lower n-gram order or use SmoothingFunction()
The hypothesis contains 0 counts of 2-gram overlaps.
Therefore the BLEU score evaluates to 0, independently of
how many N-gram overlaps of lower order it contains.
Consider using lower n-gram order or use SmoothingFunction()
The hypothesis contains 0 counts of 2-gram overlaps.
Therefore the BLEU score evaluates to 0, independently of
how many N-gram overlaps of lower order it contains.
Consider using lower n

In [3]:
import pandas as pd

# Load the data
df = pd.read_csv('output_with_bleu_scores.csv')

# Compare BLEU scores and mark the presence of priming effect
df['Priming_effect_translation'] = df['BLEU_translation_correct'] > df['BLEU_translation_wrong']
df['Priming_effect_non_translation'] = df['BLEU_English_correct'] > df['BLEU_English_wrong']

# Add columns for the priming effect result
df['Translation_Priming_Result'] = df.apply(lambda x: 'Yes' if x['Priming_effect_translation'] else 'No', axis=1)
df['Non_translation_Priming_Result'] = df.apply(lambda x: 'Yes' if x['Priming_effect_non_translation'] else 'No', axis=1)

# Save the updated DataFrame to a new CSV file
output_file_path = 'output_with_priming_results.csv'
df.to_csv(output_file_path, index=False)


In [5]:
# First, we'll need to read the CSV file into a pandas DataFrame.
df = pd.read_csv('output_with_priming_results.csv')

# Define the chunk size
chunk_size = 30

# Function to calculate the proportion of 'True' values within each chunk for a given column
def calculate_chunk_proportion(column):
    # Create an empty series to store the proportions
    proportions = pd.Series(index=df.index, dtype=float)
    
    # Iterate over the DataFrame in steps of `chunk_size`
    for start in range(0, len(df), chunk_size):
        end = start + chunk_size
        chunk = df[column][start:end]
        proportion = chunk.sum() / chunk_size
        proportions[start:end] = proportion
    
    return proportions

# Calculate proportions for 'Priming_effect_translation' and 'Priming_effect_non_translation'
df['Prime_Type_Translation'] = calculate_chunk_proportion('Priming_effect_translation')
df['Prime_Type_Non_Translation'] = calculate_chunk_proportion('Priming_effect_non_translation')

# Drop the original priming effect columns as they are not needed for the output file
df.drop(columns=['Priming_effect_translation', 'Priming_effect_non_translation'], inplace=True)

# Save the new DataFrame to a CSV file
new_csv_path = 'priming_proportion_per_chunk.csv'
df.to_csv(new_csv_path, index=False)



In [9]:
import pandas as pd
import nltk
from nltk.translate.bleu_score import sentence_bleu
from nltk.tokenize import word_tokenize
from nltk.translate.bleu_score import SmoothingFunction


# Load the dataset
df = pd.read_csv('testset.csv')

# Function to calculate average BLEU scores for given n-grams
def calculate_average_bleu(df, ref_column, hyp_column):
    scores = {n: [] for n in range(1, 5)}
    
    for _, row in df.iterrows():
        reference = [word_tokenize(row[ref_column].lower())]
        hypothesis = word_tokenize(row[hyp_column].lower())
        
        for n in range(1, 5):
            weight = tuple((1/n,) * n + (0,) * (4-n))
            score = sentence_bleu(reference, hypothesis, weights=weight, smoothing_function=SmoothingFunction().method1)
            scores[n].append(score)
    
    # Calculate the average score for each n-gram level
    average_scores = {f'Average BLEU-{n}': sum(scores[n])/len(scores[n]) for n in scores}
    return average_scores

# Calculate average BLEU scores for each column pair
avg_bleu_translation_correct = calculate_average_bleu(df, 'translation_correct', 'Transformer_pred')
avg_bleu_translation_wrong = calculate_average_bleu(df, 'translation_wrong', 'Transformer_pred')
avg_bleu_English_correct = calculate_average_bleu(df, 'English_correct', 'Transformer_pred')
avg_bleu_English_wrong = calculate_average_bleu(df, 'English_wrong', 'Transformer_pred')

# Print or store the results as needed
print("Average BLEU Scores for Translation Correct vs. Transformer_pred:", avg_bleu_translation_correct)
print("Average BLEU Scores for Translation Wrong vs. Transformer_pred:", avg_bleu_translation_wrong)
print("Average BLEU Scores for English Correct vs. Transformer_pred:", avg_bleu_English_correct)
print("Average BLEU Scores for English Wrong vs. Transformer_pred:", avg_bleu_English_wrong)


Average BLEU Scores for Translation Correct vs. Transformer_pred: {'Average BLEU-1': 0.8043645998576265, 'Average BLEU-2': 0.7030482166851667, 'Average BLEU-3': 0.5977987647484042, 'Average BLEU-4': 0.5277194839810125}
Average BLEU Scores for Translation Wrong vs. Transformer_pred: {'Average BLEU-1': 0.6523497536538, 'Average BLEU-2': 0.42130098667302585, 'Average BLEU-3': 0.24803455393499932, 'Average BLEU-4': 0.1637474205303424}
Average BLEU Scores for English Correct vs. Transformer_pred: {'Average BLEU-1': 0.43888146927882005, 'Average BLEU-2': 0.1897479025528721, 'Average BLEU-3': 0.0910041568887471, 'Average BLEU-4': 0.06338391289380382}
Average BLEU Scores for English Wrong vs. Transformer_pred: {'Average BLEU-1': 0.3426083975340467, 'Average BLEU-2': 0.10702960145755797, 'Average BLEU-3': 0.05454627015362661, 'Average BLEU-4': 0.04061731343071909}
