In [1]:
import nltk
import pandas as pd
from nltk.probability import FreqDist
from collections import defaultdict


In [12]:
def generate_ngrams(text, n):
    """Generate n-grams from a given text."""
    return [text[i:i+n] for i in range(len(text) - n + 1)]

def analyze_ngrams_per_language(tsv_file, n):
    """Compute unique n-grams and most frequent ones per language from a TSV file."""
    df = pd.read_csv(tsv_file, sep='\t', names=['language', 'text'])
    language_ngrams = defaultdict(FreqDist)
    
    for  _,row in df.iterrows():
        ngrams = generate_ngrams(row['text'], n)
        language_ngrams[row['language']].update(ngrams)
    
    results = {}
    for language, fdist in language_ngrams.items():
        unique_ngrams = len(fdist)
        most_common_ngrams = fdist.most_common(20)
        results[language] = {
            "unique_ngrams": unique_ngrams,
            "most_common": most_common_ngrams,
            "len": len(ngrams)
        }
    
    return results

# Example usage
n = 3  
tsv_file = "easy-train.tsv"  
results = analyze_ngrams_per_language(tsv_file, n)


for language, data in results.items():
    print(f"Language: {language}")
    print(f"Unique {n}-grams: {data['unique_ngrams']}")
    print(f"Top 20 {n}-grams: {data['most_common']}")
    print(data['len'])
    print("-" * 40)
    



Language: spa
Unique 3-grams: 8209
Top 20 3-grams: [(' de', 3707), (' es', 3270), ('de ', 2902), ('os ', 2631), (' qu', 2628), ('es ', 2381), ('que', 2334), ('la ', 2321), (' la', 2310), ('ue ', 2202), (' co', 2038), ('as ', 1882), ('est', 1840), ('do ', 1817), ('en ', 1750), (' un', 1732), (' en', 1728), ('el ', 1679), (' a ', 1581), ('ent', 1544)]
38
----------------------------------------
Language: deu
Unique 3-grams: 10094
Top 20 3-grams: [('en ', 6424), ('ch ', 5118), ('ich', 5058), ('er ', 4865), ('ein', 4512), ('ie ', 3602), ('cht', 3151), ('sch', 2695), ('st ', 2678), (' de', 2664), ('en.', 2625), ('ine', 2555), ('in ', 2415), (' ei', 2293), ('che', 2072), (' di', 2057), ('ist', 1980), ('der', 1978), (' da', 1917), (' ge', 1893)]
38
----------------------------------------
Language: tur
Unique 3-grams: 8151
Top 20 3-grams: [('Tom', 2640), (' bi', 2485), ('yor', 2260), ('en ', 1914), ('bir', 1876), ('ir ', 1858), ('om ', 1769), (' ya', 1715), ('in ', 1521), (' ol', 1269), ('lar

In [23]:
import nltk
import pandas as pd
import numpy as np
from nltk.probability import FreqDist
from collections import defaultdict

def generate_ngrams(text, n):
    """Generate n-grams from a given text."""
    return [text[i:i+n] for i in range(len(text) - n + 1)]

def train_naive_bayes(tsv_file, n):
    """Train a Naïve Bayes classifier for language detection using n-grams."""
    df = pd.read_csv(tsv_file, sep='\t', names=['language', 'text'])
    language_ngrams = defaultdict(FreqDist)
    total_ngrams = 0
    language_counts = defaultdict(int)
    
    # Process each row in the dataset
    for _, row in df.iterrows():
        ngrams = generate_ngrams(row['text'], n)
        language_ngrams[row['language']].update(ngrams)
        language_counts[row['language']] += len(ngrams)
        total_ngrams += len(ngrams)
    
    # Compute log probabilities
    log_prior = {lang: np.log(count / total_ngrams) for lang, count in language_counts.items()}
    log_likelihood = {}
    
    for lang, fdist in language_ngrams.items():
        total_lang_ngrams = sum(fdist.values())
        log_likelihood[lang] = {ngram: np.log((count + 1) / (total_lang_ngrams + len(fdist))) for ngram, count in fdist.items()}
    
    return log_prior, log_likelihood

def predict_language(text, n, log_prior, log_likelihood):
    """Predict the language of a given text using the trained Naïve Bayes model."""
    ngrams = generate_ngrams(text, n)
    scores = {}
    
    for lang in log_prior:
        scores[lang] = log_prior[lang]
        for ngram in ngrams:
            if ngram in log_likelihood[lang]:
                scores[lang] += log_likelihood[lang][ngram]
            else:
                scores[lang] += np.log(1 / (sum(log_likelihood[lang].values()) + len(log_likelihood[lang])))
    
    return max(scores, key=scores.get)

# Example usage
n = 10  # Change this for different n-gram sizes
tsv_file = "easy-train.tsv"  # Update with actual file path
log_prior, log_likelihood = train_naive_bayes(tsv_file, n)

# Test the model
test_sentence = "This is a test sentence."
predicted_language = predict_language(test_sentence, n, log_prior, log_likelihood)
print(f"Predicted language: {predicted_language}")


  scores[lang] += np.log(1 / (sum(log_likelihood[lang].values()) + len(log_likelihood[lang])))


Predicted language: spa
