In [1]:
import os
import pandas as pd
from tqdm import tqdm
import re
import openpyxl
from collections import Counter

In [2]:
import fasttext

# Location of the fastText model file (presumably the pre-trained lid.176
# language-identification model, judging by the file name).
# Set the FASTTEXT_LID_MODEL environment variable to point at the file on
# another machine; when it is unset the original local path is used.
model_path_fasttext = os.environ.get('FASTTEXT_LID_MODEL', 'C:/Python311/fastText/lid.176.bin')
model = fasttext.load_model(model_path_fasttext)



In [5]:
# Number of reviews handed to the detection loop per iteration.
BATCH_SIZE = 16

# Read the "baseline" sheet of the experiment workbook into a DataFrame.
file_name = "SubjectAppsDataset_exp03.xlsx"
df = pd.read_excel(file_name, sheet_name="baseline")

# Per-review accumulators filled by the detection loop:
#   detect_language_sentences - list of (word, language) pairs
#   count_language_sentences  - Counter of languages
#   language_labels           - final label for the review
detect_language_sentences, count_language_sentences, language_labels = [], [], []

# Heuristic patterns, compiled once. A word matching any of them is labelled
# 'ms' (presumably Malay -- confirm against the dataset description).
_MS_WORD_PATTERNS = (
    re.compile(r'([aiou])\1{1,}$'),  # ends with a doubled (or longer) a/i/o/u
    re.compile(r'(.)\1{2,}$'),       # ends with the same character three or more times
    # The whole word is exactly one of these short tokens. The original
    # alternation listed several of them twice; duplicates are removed here.
    re.compile(r'^(di|ber|ke|se|me|pe|ter|per|mem|men|meng|meny|menge|kan|lah|kah|nya|pun|tah)$'),
    re.compile(r'[aeiou]{3,}'),      # three or more vowels in a row anywhere
)


def classify_word(word, lang):
    """Post-correct a per-word language prediction.

    Parameters
    ----------
    word : str
        A single token.
    lang : str
        Language code predicted for ``word`` (for example ``'en'``).

    Returns
    -------
    str
        ``'ms'`` when the word starts with ``'x'``, when ``lang`` is anything
        other than ``'en'``, or when the word matches one of
        ``_MS_WORD_PATTERNS``; otherwise ``lang`` unchanged (which, given the
        checks above, can only be ``'en'``).
    """
    if word.startswith('x') or lang != 'en':
        return 'ms'
    if any(pattern.search(word) for pattern in _MS_WORD_PATTERNS):
        return 'ms'
    return lang

# Process reviews in batches: strip punctuation, split on whitespace, get one
# language prediction per word from the model, post-correct each prediction
# with classify_word, then derive one label ('en' / 'ms' / 'mix') per review.
for i in tqdm(range(0, len(df), BATCH_SIZE), desc="Batch Language Detection"):
    batch_values = df.iloc[i:i + BATCH_SIZE]['Normalization'].tolist()
    # Missing cells become empty reviews. Converting them with str() would
    # yield the literal token 'nan', which would then be classified as a word.
    batch_reviews = ['' if pd.isna(value) else str(value) for value in batch_values]
    batch_reviews = [re.sub(r'[^\w\s]', '', review) for review in batch_reviews]
    # Split words in each review
    batch_splitted = [review.split() for review in batch_reviews]

    # Flatten so the whole batch is predicted with a single model call
    flat_words = [word for sublist in batch_splitted for word in sublist]

    if flat_words:
        predictions, _ = model.predict(flat_words)
        predicted_languages = [pred[0].replace('__label__', '') for pred in predictions]
    else:
        # Nothing to predict (every review in the batch is empty).
        predicted_languages = []

    # Reconstruct predictions per review by walking the flat list in order
    idx = 0
    for splitted in batch_splitted:
        word_count = len(splitted)
        lang_batch = predicted_languages[idx:idx + word_count]
        idx += word_count

        classified_languages = [classify_word(word, lang) for word, lang in zip(splitted, lang_batch)]
        detect_language_sentences.append(list(zip(splitted, classified_languages)))

        language_counts = Counter(classified_languages)
        # Make sure both keys are present in the stored Counter, even at zero.
        language_counts.setdefault('en', 0)
        language_counts.setdefault('ms', 0)
        count_language_sentences.append(language_counts)

        count_en, count_ms = language_counts['en'], language_counts['ms']
        larger_count = max(count_en, count_ms)
        if larger_count == 0:
            # No words at all: there is no majority language.
            label = 'mix'
        else:
            # Gap between the two counts as a percentage of the larger one;
            # a gap of at most 50% is 'mix', otherwise the majority wins.
            percentage_difference = abs(count_en - count_ms) / larger_count * 100
            label = 'mix' if percentage_difference <= 50 else ('en' if count_en > count_ms else 'ms')
        language_labels.append(label)

# Add results to DataFrame
df['Exp_Language_Detection[Proposed1]'] = language_labels


Batch Language Detection: 100%|█████████████████████████████████████████████████████| 164/164 [00:00<00:00, 354.77it/s]


In [6]:
# --- Save to Excel ---
def save_to_excel(df, path, sheet_name):
    """Write ``df`` (without its index) to sheet ``sheet_name`` of the workbook at ``path``.

    If the workbook already exists it is opened in append mode and a sheet
    with the same name is replaced. If it does not exist yet, a new workbook
    is created instead of failing, since append mode requires an existing file.
    """
    if os.path.exists(path):
        # if_sheet_exists is only accepted by ExcelWriter together with mode='a'.
        writer_options = {'mode': 'a', 'if_sheet_exists': 'replace'}
    else:
        writer_options = {'mode': 'w'}
    with pd.ExcelWriter(path, engine='openpyxl', **writer_options) as writer:
        df.to_excel(writer, sheet_name=sheet_name, index=False)

# Write the labelled frame to the "proposed" sheet of the workbook it was read from.
sheet_name = "proposed"
save_to_excel(df=df, path=file_name, sheet_name=sheet_name)
# Report where the results were written.
print(f"\n✅ File saved to '{file_name}' in sheet '{sheet_name}'.")


✅ File saved to 'SubjectAppsDataset_exp03.xlsx' in sheet 'proposed'.
