In [None]:
import json
import unicodedata

import nltk
import pandas as pd
from nltk.tokenize import word_tokenize

# Fetch the 'punkt' NLTK resource at import time.
# NOTE(review): presumably this is the tokenizer data word_tokenize relies on;
# confirm the resource name for the installed NLTK version.
nltk.download('punkt')

def load_words_from_csv(filepath):
    """Read a CSV and return its lowercased 'eword' and 'hword' values.

    Args:
        filepath: Path (or anything ``pd.read_csv`` accepts) of a CSV file
            that has ``eword`` and ``hword`` columns.

    Returns:
        A ``(ewords, hwords)`` tuple of sets. Missing cells are dropped
        before lowercasing.
    """
    table = pd.read_csv(filepath)

    def lowered_column(name):
        # Drop empty cells first so they never reach the string accessor.
        present = table[name].dropna()
        return set(present.str.lower())

    return lowered_column('eword'), lowered_column('hword')

def _is_word_token(token):
    """Return True if `token` is made only of letters and combining marks.

    ``str.isalpha()`` accepts only Unicode letter categories (L*), so any
    token containing a combining mark (M*) is rejected. Devanagari vowel
    signs and the virama are marks, which made ``isalpha()`` discard most
    Devanagari words. At least one real letter is still required, so
    punctuation, digits and the empty string are rejected as before.
    """
    has_letter = False
    for char in token:
        major_class = unicodedata.category(char)[0]
        if major_class == 'L':
            has_letter = True
        elif major_class != 'M':
            return False
    return has_letter


def preprocess_text(text):
    """Lowercase and tokenize `text`, keeping only word tokens.

    Args:
        text: The string to tokenize.

    Returns:
        List of lowercased tokens from ``word_tokenize`` that consist solely
        of letters and combining marks; punctuation, numbers and mixed
        tokens are dropped.
    """
    tokens = word_tokenize(text.lower())
    return [token for token in tokens if _is_word_token(token)]

def is_oov(word, vocab):
    """Return True when `word` is out of vocabulary, i.e. absent from `vocab`."""
    in_vocabulary = word in vocab
    return not in_vocabulary

def analyze_transcripts(transcripts_filepath, vocab, output_path=None):
    """Count total, out-of-vocabulary and known words across transcripts.

    Loads a JSON file, tokenizes the ``'transcription'`` text of every entry
    that has that key, and writes the three counts to a text file.

    Args:
        transcripts_filepath: Path to a UTF-8 encoded JSON file. Iterating
            the parsed value must yield entries that support
            ``'transcription' in entry`` and ``entry['transcription']``
            (presumably a list of dicts; confirm against the data).
        vocab: Container of known words, queried with ``in``.
        output_path: File the counts are written to. When omitted, the
            module-level ``oov_file_path`` variable is used, as this
            function has always done. That name is not defined in the
            visible code and must be set elsewhere before the call.

    Returns:
        A ``(total_words, oov_count, known_count)`` tuple.
    """
    # Resolve the destination first so a missing path fails before any work.
    if output_path is None:
        output_path = oov_file_path

    with open(transcripts_filepath, 'r', encoding='utf-8') as file:
        transcripts = json.load(file)

    total_words = 0
    oov_count = 0

    for transcript in transcripts:
        # Entries without a 'transcription' key contribute nothing.
        if 'transcription' not in transcript:
            continue
        words_list = preprocess_text(transcript['transcription'])
        total_words += len(words_list)
        oov_count += sum(1 for word in words_list if is_oov(word, vocab))

    known_count = total_words - oov_count

    # Write results to file
    with open(output_path, 'w', encoding='utf-8') as file:
        file.write(
            f'Total Words: {total_words}\n'
            f'OOV Words: {oov_count}\n'
            f'Known Words: {known_count}\n'
        )

    # Report the path actually written rather than a hard-coded file name.
    print(f'Counts written to {output_path}')

    return total_words, oov_count, known_count

if __name__ == "__main__":
    # Read the 'eword' and 'hword' word sets from the dictionary CSV.
    ewords, hwords = load_words_from_csv(hindi_english_dictionary_file_path)

    # Merge both sets into a single vocabulary.
    vocab = ewords | hwords

    # Count known vs. out-of-vocabulary words in the transcripts.
    analyze_transcripts(transcript_path, vocab)


In [None]:
import matplotlib.pyplot as plt

# Counts to plot. Replace the placeholders with your actual data before
# running this cell; they are None (not blank) so the cell stays valid Python.
total_words = None  # TODO: total number of words in your transcripts
oov_count = None  # TODO: number of OOV words
if total_words is None or oov_count is None:
    raise ValueError('Set total_words and oov_count before plotting.')
known_count = total_words - oov_count  # Number of known words

# Data for the pie chart
labels = 'OOV Words', 'Known Words'
sizes = [oov_count, known_count]
colors = ['skyblue', 'salmon']
explode = (0.1, 0)  # explode the 1st slice (OOV Words)

# Create pie chart
plt.figure(figsize=(8, 6))
plt.pie(sizes, explode=explode, labels=labels, colors=colors, autopct='%1.1f%%', shadow=True, startangle=140)
plt.axis('equal')  # Equal aspect ratio ensures that pie is drawn as a circle.
plt.title('Hate')

# Save the pie chart as an image file
plt.savefig('oov_pie_chart.png')

# Display the pie chart
plt.show()