In [9]:
import gensim
import numpy as np
import os
import seaborn as sns
import matplotlib.pyplot as plt
import pandas as pd

# Base directories for input and output files
input_base_dir = '/Users/aaryansingh/Non-Native Influences Investigation/Languages/Transcripts'
output_base_dir = '/Users/aaryansingh/Non-Native Influences Investigation/Results/Plots'

# List of languages
languages = ['telugu', 'tamil', 'hindi']

# Define the input and output file paths dynamically
input_files = {
    lang: {
        'native': os.path.join(input_base_dir, lang.capitalize(), f'{lang}_arpa.txt'),
        'non_native': os.path.join(input_base_dir, lang.capitalize(), f'{lang}_spk_arpa.txt')
    } for lang in languages
}

output_files = {
    lang: {
        'native': os.path.join(output_base_dir, f'{lang}_native_vs_non_native_phone_pairs.txt')
    } for lang in languages
}

def read_transcripts(native_file, non_native_file):
    word_pronunciation_pairs = []
    with open(native_file, 'r') as native, open(non_native_file, 'r') as non_native:
        native_lines = native.readlines()
        non_native_lines = non_native.readlines()
        if len(native_lines) != len(non_native_lines):
            raise ValueError('Number of sentences in native and non-native transcripts do not match')
        for native_line, non_native_line in zip(native_lines, non_native_lines):
            native_words = native_line.strip().split("  ")
            non_native_words = non_native_line.strip().split("  ")
            if len(native_words) != len(non_native_words):
                print('Number of words in native and non-native sentences do not match')
                continue
            for native_word, non_native_word in zip(native_words, non_native_words):
                word_pronunciation_pairs.append((native_word, non_native_word))
    return word_pronunciation_pairs

def align_pronunciation(native_pronunciation, non_native_pronunciation):
    distance_matrix = np.zeros((len(native_pronunciation) + 1, len(non_native_pronunciation) + 1))
    distance_matrix[0, :] = np.arange(len(non_native_pronunciation) + 1)
    distance_matrix[:, 0] = np.arange(len(native_pronunciation) + 1)

    for i in range(1, len(native_pronunciation) + 1):
        for j in range(1, len(non_native_pronunciation) + 1):
            if native_pronunciation[i - 1] == non_native_pronunciation[j - 1]:
                distance_matrix[i, j] = distance_matrix[i - 1, j - 1]
            else:
                distance_matrix[i, j] = min(distance_matrix[i - 1, j - 1], distance_matrix[i - 1, j], distance_matrix[i, j - 1]) + 1

    alignment = []
    i, j = len(native_pronunciation), len(non_native_pronunciation)
    while i > 0 and j > 0:
        if native_pronunciation[i - 1] == non_native_pronunciation[j - 1]:
            alignment.append((native_pronunciation[i - 1], non_native_pronunciation[j - 1]))
            i -= 1
            j -= 1
        elif distance_matrix[i - 1, j - 1] <= distance_matrix[i - 1, j] and distance_matrix[i - 1, j - 1] <= distance_matrix[i, j - 1]:
            alignment.append((native_pronunciation[i - 1], non_native_pronunciation[j - 1]))
            i -= 1
            j -= 1
        elif distance_matrix[i - 1, j] <= distance_matrix[i - 1, j - 1] and distance_matrix[i - 1, j] <= distance_matrix[i, j - 1]:
            alignment.append((native_pronunciation[i - 1], '-'))
            i -= 1
        else:
            alignment.append(('-', non_native_pronunciation[j - 1]))
            j -= 1
    while i > 0:
        alignment.append((native_pronunciation[i - 1], '-'))
        i -= 1
    while j > 0:
        alignment.append(('-', non_native_pronunciation[j - 1]))
        j -= 1
    alignment.reverse()
    return alignment

def process_language(language, input_files, output_files):
    word_pronunciation_pairs = read_transcripts(input_files[language]['native'], input_files[language]['non_native'])
    aligned_pronunciations = [align_pronunciation(native_pron.split(), non_native_pron.split()) for native_pron, non_native_pron in word_pronunciation_pairs]

    phoneme_pairs_count = {}
    for alignment in aligned_pronunciations:
        for native_phone, non_native_phone in alignment:
            if native_phone != '-' and non_native_phone != '-' and native_phone != non_native_phone:
                pair = (native_phone, non_native_phone)
                if pair not in phoneme_pairs_count:
                    phoneme_pairs_count[pair] = 0
                phoneme_pairs_count[pair] += 1

    # Create DataFrame for heatmap
    native_phones = sorted(set(pair[0] for pair in phoneme_pairs_count.keys()))
    non_native_phones = sorted(set(pair[1] for pair in phoneme_pairs_count.keys()))
    heatmap_data = pd.DataFrame(0, index=native_phones, columns=non_native_phones)

    for (native_phone, non_native_phone), count in phoneme_pairs_count.items():
        heatmap_data.loc[native_phone, non_native_phone] = count

    # Plot heatmap
    plt.figure(figsize=(10, 8))
    sns.heatmap(heatmap_data, annot=True, fmt='d', cmap='YlGnBu', vmin=0, vmax=10)  # Adjust vmin and vmax for better appearance
    plt.title(f'Phoneme Exchange Heatmap for {language.capitalize()}')
    plt.savefig(f'{output_base_dir}/{language}_phoneme_heatmap.png')
    plt.close()

for language in input_files.keys():
    process_language(language, input_files, output_files)
    print(f'Processed {language} and wrote aligned pronunciations to {output_files[language]["native"]}')
    print(f'Heatmap for {language} saved as {output_base_dir}/{language}_phoneme_heatmap.png')


Number of words in native and non-native sentences do not match
Number of words in native and non-native sentences do not match
Number of words in native and non-native sentences do not match
Number of words in native and non-native sentences do not match
Number of words in native and non-native sentences do not match
Number of words in native and non-native sentences do not match
Number of words in native and non-native sentences do not match
Number of words in native and non-native sentences do not match
Number of words in native and non-native sentences do not match
Number of words in native and non-native sentences do not match
Number of words in native and non-native sentences do not match
Number of words in native and non-native sentences do not match
Number of words in native and non-native sentences do not match
Processed telugu and wrote aligned pronunciations to /Users/aaryansingh/Non-Native Influences Investigation/Output/telugu_native_vs_non_native_phone_pairs.txt
Heatmap f