In [1]:
import os
import re
import statistics
import unicodedata
from collections import Counter

from nltk.tokenize import sent_tokenize, word_tokenize

In [2]:
def extract_srt_text(srt_content):
    """Extract the subtitle text of every entry in an SRT document.

    Each SRT entry is expected to consist of an index line, a timestamp line
    and one or more text lines; entries are separated by blank lines.

    Args:
        srt_content: Full contents of an ``.srt`` file as a string.

    Returns:
        A list with one string per subtitle entry. Multi-line entries are
        joined with single spaces and ``<font ...>`` / ``</font>`` tags are
        removed. Entries with no text lines are skipped.
    """
    # Normalise CRLF / CR line endings so the separator detection below also
    # works for content that was not read with universal-newline translation.
    normalized = srt_content.replace('\r\n', '\n').replace('\r', '\n').strip()

    # Entries are separated by one *or more* blank (or whitespace-only) lines.
    # Splitting on the literal '\n\n' would leave a leading empty line in the
    # next entry when there are extra blank lines, shifting the index and
    # timestamp down so the timestamp ends up in the extracted text.
    blocks = re.split(r'\n\s*\n', normalized)

    text_only = []

    for block in blocks:
        lines = block.split('\n')
        # Skip first two lines (index and timestamp)
        if len(lines) > 2:
            # Join remaining lines which contain the actual text
            text = ' '.join(lines[2:])
            # Remove HTML font tags
            text = re.sub(r'<font[^>]*>', '', text)
            text = re.sub(r'</font>', '', text)
            text_only.append(text)

    # One list element per subtitle entry (the entries are NOT joined here).
    return text_only

In [3]:
class TextAnalyzer:
    def __init__(self, text_list: list[str]):
        # Join the list elements into a single text
        self.text = ' '.join(text_list)
        self.words = word_tokenize(self.text)
        self.sentences = sent_tokenize(self.text)
        
    def word_frequency_analysis(self):
        """Analyze word frequencies"""
        # Remove punctuation and convert to lowercase
        words = [word.lower() for word in self.words if word.isalnum()]
        
        # Get word frequencies
        word_freq = Counter(words)
        
        # Get top 20 most common words
        most_common = word_freq.most_common(100)
        
        return {
            'total_words': len(words),
            'unique_words': len(word_freq),
            'most_common': most_common
        }

    def sentence_analysis(self):
        """Analyze sentence statistics"""
        # Calculate sentence lengths
        sent_lengths = [len(word_tokenize(sent)) for sent in self.sentences]
        
        return {
            'total_sentences': len(self.sentences),
            'avg_sentence_length': statistics.mean(sent_lengths),
            'max_sentence_length': max(sent_lengths),
            'min_sentence_length': min(sent_lengths)
        }

    def language_mix_analysis(self):
        """Analyze Hindi-English language mixing"""
        # Simple detection based on character sets
        devanagari = len(re.findall(r'[\u0900-\u097F]', self.text))
        latin = len(re.findall(r'[a-zA-Z]', self.text))
        
        total_chars = devanagari + latin
        
        return {
            'hindi_ratio': devanagari/total_chars if total_chars > 0 else 0,
            'english_ratio': latin/total_chars if total_chars > 0 else 0,
            'total_characters': total_chars
        }

    def readability_analysis(self):
        """Calculate basic readability metrics"""
        # Count syllables (simplified)
        def count_syllables(word):
            return len(re.findall('[aeiouAEIOU]+', word))
        
        total_syllables = sum(count_syllables(word) for word in self.words)
        
        return {
            'avg_word_length': statistics.mean([len(word) for word in self.words]),
            'syllables_per_word': total_syllables / len(self.words) if self.words else 0
        }

In [4]:
# Example usage: load one Hindi subtitle file and pull out its text entries.
sample_srt_path = 'data/youtube/youtube_downloads/Carry/_2EgadGNebo/CANT LIVE LONG.hi.srt'
with open(sample_srt_path, 'r', encoding='utf-8') as file:
    srt_content = file.read()

# `text` is a list with one string per subtitle entry.
text = extract_srt_text(srt_content)

In [5]:
text

['है तो कैसे हैं आप लोग पता जब मैं अपने',
 'मंडे टेस्ट पेपर को देखता हूं यार सही',
 'मुझे ऐसा लगता मैं बिल्कुल अनपढ़ हूं और',
 'कि कुछ कर नहीं पाऊंगा इतनी बुरी फीलिंग',
 'आती है न बिल्कुल खंजर दे दो यार अपने दिल',
 'में और ऊपर से व प्रिंसिपल और बोलता रहता',
 'है जूते पहन किया और स्कूल वाले अरे यार',
 'तुम्हारे स्कूल वालों के जूते देखें',
 'ब***** सस्ती क्वालिटी इतनी गत्ते पर',
 'ब***** कांटे चिपका दीजिए हम इतनी बुरी',
 'क्वालिटी होती अकड़न हो जाती है यार पैरों',
 'में नहीं पहन लूंगा यार मैं तुम्हारे जूते',
 'हैं और यह बात अगर कहना वाइस प्रिंसिपल को',
 'तो भाई शुरू बंदे किया ब***** सारी की',
 'सारी सब लोग पहन रहे हैं ना तुम भी पहन',
 'लाल क्या प्रॉब्लम मतलब सब लोग पागल है तो',
 'हम भी पागल हैं ना और अगर ज्यादा प्रॉब्लम',
 'हो रही है तो टीसीएल एलोवेरा भोसड़ी के',
 'मेरे मतलब क्या और तू कहां बात ले जा रहा',
 'है छोड़ो यार इन बात मैं शरीर नहीं बात',
 'मेन मुद्दे पर बात आज हमारे पास एक और',
 'रहता यार रात की जो बीमारी है ना इंडिया',
 'में इतने फैंस यह दबा के गैलरी घंटे में',
 'मतलब हर 15 मि

In [6]:
def main():
    """Run every TextAnalyzer analysis on the global ``text`` and print a report.

    The word-frequency section lists the 20 most common words; the sentence
    and readability sections print every metric with two decimals.
    """
    analyzer = TextAnalyzer(text)

    # Compute all four reports up front, then print them section by section.
    frequency_report = analyzer.word_frequency_analysis()
    sentence_report = analyzer.sentence_analysis()
    mix_report = analyzer.language_mix_analysis()
    readability_report = analyzer.readability_analysis()

    print("\n=== Word Frequency Analysis ===")
    print(f"Total words: {frequency_report['total_words']}")
    print(f"Unique words: {frequency_report['unique_words']}")
    print("\nMost common words:")
    top_twenty = frequency_report['most_common'][:20]
    for entry in top_twenty:
        word, freq = entry
        print(f"{word}: {freq}")

    print("\n=== Sentence Analysis ===")
    for metric in sentence_report:
        print(f"{metric}: {sentence_report[metric]:.2f}")

    print("\n=== Language Mix Analysis ===")
    hindi_share = mix_report['hindi_ratio']
    english_share = mix_report['english_ratio']
    print(f"Hindi ratio: {hindi_share:.2%}")
    print(f"English ratio: {english_share:.2%}")

    print("\n=== Readability Analysis ===")
    for metric in readability_report:
        print(f"{metric}: {readability_report[metric]:.2f}")


if __name__ == "__main__":
    main()


=== Word Frequency Analysis ===
Total words: 198
Unique words: 66

Most common words:
और: 25
यह: 17
पर: 13
कर: 11
आप: 10
एक: 9
इस: 7
वह: 6
अगर: 5
पहन: 4
ब: 4
हम: 4
मतलब: 4
जब: 3
न: 3
सब: 3
हर: 3
अ: 3
लग: 3
गई: 3

=== Sentence Analysis ===
total_sentences: 1.00
avg_sentence_length: 1221.00
max_sentence_length: 1221.00
min_sentence_length: 1221.00

=== Language Mix Analysis ===
Hindi ratio: 99.74%
English ratio: 0.26%

=== Readability Analysis ===
avg_word_length: 3.50
syllables_per_word: 0.00


In [7]:
# Root directories that are searched recursively for Hindi subtitle files
base_paths = [
    'data/youtube/youtube_downloads/Carry/',
    'data/youtube/youtube_downloads/BeerBicep/',
]

# Module-level accumulator for the per-file subtitle text lists
extracted_texts = list()

def contains_devanagari(text):
    """Return True if any character of ``text`` lies in U+0900-U+097F (Devanagari)."""
    return any('\u0900' <= symbol <= '\u097F' for symbol in text)

def read_srt_files(base_paths):
    """Collect subtitle text from every ``*.hi.srt`` file under the given roots.

    Each root in ``base_paths`` is walked recursively. Files whose name ends
    in ``.hi.srt`` are read as UTF-8; those containing at least one Devanagari
    character are parsed with ``extract_srt_text``.

    Args:
        base_paths: Iterable of directory paths to search.

    Returns:
        A new list with one element per accepted file, each element being the
        list of subtitle strings returned by ``extract_srt_text``.
    """
    # Collect into a fresh local list: appending straight to the module-level
    # `extracted_texts` made every re-run of this function duplicate all
    # previously collected files.
    collected = []

    for base_path in base_paths:
        for root, dirs, files in os.walk(base_path):
            # os.walk honours in-place edits of `dirs` (top-down mode); sorting
            # both lists makes the traversal order reproducible across runs.
            dirs.sort()
            for file in sorted(files):
                if not file.endswith('.hi.srt'):
                    continue
                with open(os.path.join(root, file), 'r', encoding='utf-8') as srt_file:
                    srt_content = srt_file.read()
                if contains_devanagari(srt_content):
                    collected.append(extract_srt_text(srt_content))

    # Mirror the latest results into the module-level list so any cell that
    # still reads `extracted_texts` keeps working, without accumulating.
    extracted_texts[:] = collected
    return collected

# Scan every configured base path; `result` ends up with one list of subtitle
# strings per accepted .hi.srt file (a list of lists).
result = read_srt_files(base_paths)

In [8]:
# Flatten the per-file lists into one flat list of subtitle strings.
flat_result = [
    sentence
    for file_sentences in result
    for sentence in file_sentences
]

In [11]:
# Persist the flattened corpus, one subtitle string per line.
with open('hindi_sentences.txt', 'w', encoding='utf-8') as file:
    file.writelines(sentence + '\n' for sentence in flat_result)

In [4]:
# Reload the saved corpus; strip surrounding whitespace (including the
# trailing newline) from every line.
with open('hindi_sentences.txt', 'r', encoding='utf-8') as file:
    lines = [raw_line.strip() for raw_line in file]

In [5]:
def main():
    """Run every TextAnalyzer analysis on the global ``lines`` and print a report.

    NOTE(review): this redefines the ``main`` declared in an earlier cell; the
    later definition silently replaces the earlier one in the kernel.

    The word-frequency section lists up to the 100 most common words; the
    sentence and readability sections print every metric with two decimals.
    """
    analyzer = TextAnalyzer(lines)

    # Perform analyses
    word_freq = analyzer.word_frequency_analysis()
    sent_stats = analyzer.sentence_analysis()
    lang_mix = analyzer.language_mix_analysis()
    readability = analyzer.readability_analysis()

    # Print results
    print("\n=== Word Frequency Analysis ===")
    print(f"Total words: {word_freq['total_words']}")
    print(f"Unique words: {word_freq['unique_words']}")

    # The former `total_common` running sum was never read, so it is removed.
    print("\nMost common words:")
    for word, freq in word_freq['most_common'][:100]:
        print(f"{word}: {freq}")

    print("\n=== Sentence Analysis ===")
    for key, value in sent_stats.items():
        print(f"{key}: {value:.2f}")

    print("\n=== Language Mix Analysis ===")
    print(f"Hindi ratio: {lang_mix['hindi_ratio']:.2%}")
    print(f"English ratio: {lang_mix['english_ratio']:.2%}")

    print("\n=== Readability Analysis ===")
    for key, value in readability.items():
        print(f"{key}: {value:.2f}")

if __name__ == "__main__":
    main()


=== Word Frequency Analysis ===
Total words: 480684
Unique words: 3189

Most common words:
और: 36461
एक: 28127
आप: 24860
आई: 21702
कर: 17258
द: 14506
हम: 14134
पर: 14005
अ: 13340
यह: 12875
अगर: 11424
ऑफ: 10513
मतलब: 10413
जब: 9123
बट: 8645
इज: 8612
इन: 7699
आ: 6388
इट: 6305
उस: 6279
अब: 5998
सब: 5582
इस: 5409
आर: 4743
आज: 4407
हर: 4322
तक: 3727
गए: 3496
वह: 3351
कम: 3084
एम: 3072
सर: 2957
य: 2912
एज: 2692
वन: 2656
गई: 2638
व: 2637
प: 2623
ऑन: 2530
तरह: 2394
तब: 2303
वजह: 2204
बन: 2155
चल: 2126
इ: 2049
ऊपर: 2022
अलग: 1983
बस: 1974
10: 1894
घर: 1862
लग: 1729
एट: 1725
कह: 1682
आउट: 1625
इफ: 1539
आए: 1536
ऑल: 1518
सम: 1451
समझ: 1422
जगह: 1402
ट: 1400
अप: 1347
उन: 1291
कई: 1285
र: 1244
न: 1194
ए: 1155
एन: 1153
ब: 1129
100: 1097
20: 1058
मत: 1013
डर: 986
लव: 955
एस: 945
ल: 927
एकदम: 923
यस: 903
सच: 865
गलत: 859
मच: 845
इवन: 803
30: 776
समय: 766
ओवर: 716
आजकल: 711
कल: 703
50: 671
अदर: 669
बचपन: 654
ओ: 652
तरफ: 620
भर: 613
स: 606
रह: 596
15: 586
बदल: 580
वर: 564
क: 560
एआई: 553

=== Sentence 