In [27]:
import gensim
import numpy as np
import pandas as pd
import os
from collections import defaultdict, Counter

# Folder that holds both the phone-pair text files and the trained models
base_dir = '/Users/aaryansingh/Non-Native Influences Investigation/Output'

# Languages covered by the investigation
languages = ['tamil', 'telugu', 'hindi']

# language -> path of its native vs. non-native phone-pair file
input_files = dict(
    (lang, os.path.join(base_dir, f'{lang}_native_vs_non_native_phone_pairs.txt'))
    for lang in languages
)

# language -> model files to score against: one per language, then the CMU model.
# Each language gets its own list object, although the contents are the same.
phone2vec_models = {
    lang: [
        *(os.path.join(base_dir, f'{other_lang}_word2vec_.model') for other_lang in languages),
        os.path.join(base_dir, 'cmu_word2vec_.model'),
    ]
    for lang in languages
}


def load_phone2vec_model(model_path):
    """Read a saved phone2vec model from disk.

    Args:
        model_path: Location of a model file written by gensim's Word2Vec.

    Returns:
        Whatever ``gensim.models.Word2Vec.load`` returns for that path.
    """
    word2vec_cls = gensim.models.Word2Vec
    return word2vec_cls.load(model_path)

def get_phone_vector(model, phone):
    """Look up the embedding of ``phone`` in ``model.wv``.

    Args:
        model: Object exposing a ``wv`` mapping from phone to vector.
        phone: Key to look up.

    Returns:
        The stored vector, or ``None`` when the lookup raises ``KeyError``.
    """
    try:
        vector = model.wv[phone]
    except KeyError:
        vector = None
    return vector

def calculate_cosine_similarity(vec1, vec2):
    """Calculate the cosine similarity between two vectors, rounded to two decimals.

    Args:
        vec1: First vector, or ``None`` when the phone has no embedding.
        vec2: Second vector, or ``None`` when the phone has no embedding.

    Returns:
        ``0.0`` when either vector is ``None`` or has zero length (the cosine is
        undefined there); otherwise ``dot(vec1, vec2) / (|vec1| * |vec2|)``
        rounded to two decimal places.
    """
    if vec1 is None or vec2 is None:
        return 0.0
    norm_product = np.linalg.norm(vec1) * np.linalg.norm(vec2)
    # A zero-length vector would otherwise divide by zero and yield NaN
    # (plus a RuntimeWarning); treat it like a missing vector instead.
    if norm_product == 0:
        return 0.0
    # Return rounded to two decimal places
    return round(np.dot(vec1, vec2) / norm_product, 2)

def _find_closest_phone(model, phone, phone_vec):
    """Return ``(closest_phone, rounded_similarity)`` for ``phone`` within ``model``'s vocabulary."""
    if phone_vec is None:
        return '-', 0.0
    phone_norm = np.linalg.norm(phone_vec)
    if phone_norm == 0:
        return '-', 0.0

    best_phone, best_similarity = '-', 0.0
    for other_phone in model.wv.index_to_key:
        if other_phone == phone:
            continue
        other_vec = get_phone_vector(model, other_phone)
        if other_vec is None:
            continue
        other_norm = np.linalg.norm(other_vec)
        if other_norm == 0:
            continue
        # Compare UNROUNDED similarities: after rounding to two decimals many
        # candidates tie, and the first in vocabulary order would win instead
        # of the true nearest neighbour.
        similarity = np.dot(phone_vec, other_vec) / (phone_norm * other_norm)
        if similarity > best_similarity:
            best_phone, best_similarity = other_phone, similarity

    rounded_best = round(best_similarity, 2)
    # Keep the reporting rule "no closest phone unless the rounded similarity is positive".
    if not rounded_best > 0.0:
        return '-', 0.0
    return best_phone, rounded_best


def process_phone_pairs(input_file, models):
    """Process phone pairs from the input file and calculate cosine similarities for each model.

    Each non-blank line of ``input_file`` must hold exactly two
    whitespace-separated phones. Pairs whose two phones are equal are ignored.

    Args:
        input_file: Path of the text file with one phone pair per line.
        models: Mapping from a model key (the model path in this notebook) to a
            model exposing ``wv[phone]`` and ``wv.index_to_key``.

    Returns:
        A 4-tuple ``(phone_pairs, similarities, closest_phones, max_similarities)``:
        ``phone_pairs`` is a ``Counter`` of ``(phone1, phone2)`` occurrences; the
        other three are nested dicts indexed ``[model_key][(phone1, phone2)]``
        holding the rounded pair similarity, the vocabulary phone closest to
        ``phone1`` (``'-'`` if none), and that phone's rounded similarity.
        Pairs containing ``'-'`` get ``0.0``, ``'-'`` and ``0.0``.

    Raises:
        ValueError: If a non-blank line does not contain exactly two phones.
    """
    phone_pairs = Counter()
    similarities = defaultdict(lambda: defaultdict(float))
    closest_phones = defaultdict(lambda: defaultdict(str))
    max_similarities = defaultdict(lambda: defaultdict(float))
    # (model_key, phone1) -> (closest_phone, max_similarity); the vocabulary scan
    # depends only on phone1, so do it once per phone and model.
    closest_cache = {}

    with open(input_file, 'r') as f:
        for line_number, line in enumerate(f, start=1):
            fields = line.split()
            if not fields:
                # Blank lines (e.g. a trailing newline) carry no pair.
                continue
            if len(fields) != 2:
                raise ValueError(
                    f'{input_file}:{line_number}: expected 2 phones, got {len(fields)}: {line.strip()!r}'
                )
            phone1, phone2 = fields
            if phone1 == phone2:
                continue

            pair = (phone1, phone2)
            phone_pairs[pair] += 1
            if phone_pairs[pair] > 1:
                # Results are deterministic per pair; only the count changes on repeats.
                continue

            for model_path, model in models.items():
                if phone1 == '-' or phone2 == '-':
                    similarities[model_path][pair] = 0.0
                    closest_phones[model_path][pair] = '-'
                    max_similarities[model_path][pair] = 0.0
                    continue

                vec1 = get_phone_vector(model, phone1)
                vec2 = get_phone_vector(model, phone2)
                similarities[model_path][pair] = calculate_cosine_similarity(vec1, vec2)

                cache_key = (model_path, phone1)
                if cache_key not in closest_cache:
                    closest_cache[cache_key] = _find_closest_phone(model, phone1, vec1)
                closest_phone, max_similarity = closest_cache[cache_key]
                closest_phones[model_path][pair] = closest_phone
                max_similarities[model_path][pair] = max_similarity

    return phone_pairs, similarities, closest_phones, max_similarities

def save_cosine_similarities_to_csv(
    input_file,
    phone_pairs,
    similarities,
    closest_phones,
    max_similarities,
    output_dir='/Users/aaryansingh/Non-Native Influences Investigation/Results',
):
    """Save the phone pairs and their cosine similarities to a CSV file.

    The file is named ``<input file stem>_cosine_similarities.csv`` and holds one
    row per phone pair with the columns ``Phone1``, ``Phone2``, ``Count`` and, for
    every model key in ``similarities``, ``CosineSimilarity_<name>``,
    ``ClosestPhone_<name>`` and ``MaxSimilarity_<name>``, where ``<name>`` is the
    model key's base name without extension.

    Args:
        input_file: Path of the phone-pair file; only its base name is used.
        phone_pairs: Mapping from ``(phone1, phone2)`` to its occurrence count.
        similarities: ``[model_key][pair]`` -> pair cosine similarity.
        closest_phones: ``[model_key][pair]`` -> closest phone.
        max_similarities: ``[model_key][pair]`` -> similarity of the closest phone.
        output_dir: Directory to write into; created if it does not exist.
            Defaults to the notebook's original results folder.
    """
    # to_csv does not create missing directories, so make sure the target exists.
    os.makedirs(output_dir, exist_ok=True)
    input_stem = os.path.basename(os.path.splitext(input_file)[0])
    output_file = os.path.join(output_dir, input_stem + '_cosine_similarities.csv')

    pairs = list(phone_pairs)
    data = {
        'Phone1': [pair[0] for pair in pairs],
        'Phone2': [pair[1] for pair in pairs],
        'Count': list(phone_pairs.values()),
    }

    for model_path in similarities.keys():
        model_name = os.path.splitext(os.path.basename(model_path))[0]
        data[f'CosineSimilarity_{model_name}'] = [similarities[model_path][pair] for pair in pairs]
        data[f'ClosestPhone_{model_name}'] = [closest_phones[model_path][pair] for pair in pairs]
        data[f'MaxSimilarity_{model_name}'] = [max_similarities[model_path][pair] for pair in pairs]

    df = pd.DataFrame(data)
    df.to_csv(output_file, index=False)
    print(f'Cosine similarities written to {output_file}')

# Model paths can repeat across languages, so load each distinct file only once
# and reuse the loaded model instead of re-reading it for every language.
_loaded_models = {}
for language, input_file in input_files.items():
    models = {}
    for model_path in phone2vec_models[language]:
        if model_path not in _loaded_models:
            _loaded_models[model_path] = load_phone2vec_model(model_path)
        models[model_path] = _loaded_models[model_path]
    phone_pairs, similarities, closest_phones, max_similarities = process_phone_pairs(input_file, models)
    save_cosine_similarities_to_csv(input_file, phone_pairs, similarities, closest_phones, max_similarities)




Cosine similarities written to /Users/aaryansingh/Non-Native Influences Investigation/Results/tamil_native_vs_non_native_phone_pairs_cosine_similarities.csv
Cosine similarities written to /Users/aaryansingh/Non-Native Influences Investigation/Results/telugu_native_vs_non_native_phone_pairs_cosine_similarities.csv
Cosine similarities written to /Users/aaryansingh/Non-Native Influences Investigation/Results/hindi_native_vs_non_native_phone_pairs_cosine_similarities.csv


In [26]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import os

# Define the list of languages and their corresponding file paths
languages = {
    'hindi': '/Users/aaryansingh/Non-Native Influences Investigation/Results/hindi_native_vs_non_native_phone_pairs_cosine_similarities.csv',
    'telugu': '/Users/aaryansingh/Non-Native Influences Investigation/Results/telugu_native_vs_non_native_phone_pairs_cosine_similarities.csv',
    'tamil': '/Users/aaryansingh/Non-Native Influences Investigation/Results/tamil_native_vs_non_native_phone_pairs_cosine_similarities.csv'
}

output_dir = '/Users/aaryansingh/Non-Native Influences Investigation/Results/Plots'
os.makedirs(output_dir, exist_ok=True)

for language, file_path in languages.items():
    # Load the per-language similarity table
    data = pd.read_csv(file_path)

    # Keep only the MaxSimilarity_<model> columns (one column per model)
    max_similarities = data[[col for col in data.columns if 'MaxSimilarity' in col]]

    # Wide-form input: seaborn draws one horizontal box per column, i.e. per
    # model, showing the spread of max similarities over all phone pairs.
    fig, ax = plt.subplots(figsize=(10, 6))
    sns.boxplot(data=max_similarities, orient='h', ax=ax)
    ax.set_xlabel('Max Cosine Similarity')
    # The y-axis lists the model columns, not individual phone pairs.
    ax.set_ylabel('Model')
    ax.set_title(f'Max Cosine Similarity for Phone Pairs in {language.capitalize()} 3min_3w')

    # Save the plot; a tight bounding box keeps the long column-name tick labels from being cut off
    fig.savefig(os.path.join(output_dir, f'{language}_phone_pairs_max_similarity.png'), bbox_inches='tight')
    plt.close(fig)


