In [11]:
import os
import pandas as pd
import spacy
import openpyxl


#Google
from langdetect import detect, detect_langs,DetectorFactory
# DetectorFactory.seed = 1  # Optional, for consistency
#Spacy
import spacy
from spacy_langdetect import LanguageDetector
from spacy.language import Language
#fasttext
import fasttext
#Roberta
import torch
from transformers import AutoModelForSequenceClassification, AutoTokenizer
import nltk
from nltk.tokenize import sent_tokenize



In [2]:
# --- Detector configuration and model loading ---
# langdetect is not deterministic by default; fixing the seed makes
# detect_langs() return the same result for the same text on every run.
DetectorFactory.seed = 0

# FastText pre-trained language identification model (lid.176.bin).
# The location can be overridden with the FASTTEXT_LID_MODEL environment
# variable so the notebook is not tied to one machine; the default is the
# original hard-coded path.
model_path_fasttext = os.environ.get('FASTTEXT_LID_MODEL', 'C:/Python311/fastText/lid.176.bin')
model_fasttext = fasttext.load_model(model_path_fasttext)

# SpaCy English pipeline; the language-detector component is attached to it
# in a later cell.
nlp = spacy.load('en_core_web_sm')

# XLM-RoBERTa language-detection checkpoint: `model_roberta` is the hub id,
# `tokenizer_roberta` / `modelRoberta` are the loaded tokenizer and classifier.
model_roberta = "papluca/xlm-roberta-base-language-detection"
tokenizer_roberta = AutoTokenizer.from_pretrained(model_roberta)
modelRoberta = AutoModelForSequenceClassification.from_pretrained(model_roberta)




In [18]:
def save_to_excel(df, path, sheet_name):
    """Write `df` to the sheet `sheet_name` of the workbook at `path`.

    An existing workbook is opened in append mode, and a sheet that already
    has this name is replaced. When the workbook does not exist yet it is
    created instead: append mode would raise FileNotFoundError, and
    `if_sheet_exists` is only accepted in append mode.

    Args:
        df: DataFrame to write (its index is not written).
        path: Path of the .xlsx workbook.
        sheet_name: Name of the sheet to create or replace.
    """
    if os.path.exists(path):
        writer_options = {'mode': 'a', 'if_sheet_exists': 'replace'}
    else:
        writer_options = {'mode': 'w'}

    with pd.ExcelWriter(path, engine='openpyxl', **writer_options) as writer:
        df.to_excel(writer, sheet_name=sheet_name, index=False)

def classify_language_type_roberta(text):
    """Classify the language of `text` with the XLM-RoBERTa detector.

    The text is split on '.', every non-empty fragment is classified on its
    own, and the per-fragment labels are compared.

    Args:
        text: The text to classify. Anything that is not a string (None,
            NaN, numbers) is treated as having no detectable language.

    Returns:
        The model's label when all fragments agree, 'mix' when at least two
        fragments get different labels, and 'unknown' when there is no
        non-empty fragment to classify.
    """
    if not isinstance(text, str):
        return 'unknown'

    fragments = [part.strip() for part in text.split('.')]
    fragments = [part for part in fragments if part]
    if not fragments:
        # Nothing to classify, so the model is never touched.
        return 'unknown'

    # Loop-invariant: the id -> label mapping never changes between fragments.
    id2lang = modelRoberta.config.id2label
    detected_languages = set()

    for fragment in fragments:
        inputs = tokenizer_roberta(fragment, padding=True, truncation=True, return_tensors="pt")

        with torch.no_grad():
            logits = modelRoberta(**inputs).logits

        # Softmax is monotonic, so the argmax of the logits is the same class
        # as the argmax of the probabilities.
        for class_id in logits.argmax(dim=-1):
            detected_languages.add(id2lang[class_id.item()])

        # A second distinct label already decides the answer; skip the
        # remaining (expensive) forward passes.
        if len(detected_languages) > 1:
            return 'mix'

    return detected_languages.pop()
    
def detect_language_nltk(text):
    """Detect the language of `text` with langdetect's `detect_langs`.

    Despite the function name, the detection is done by `detect_langs`;
    NLTK is not called here.

    Args:
        text: The text to analyse.

    Returns:
        'unknown' for empty input or when detection fails, 'mix' when
        `detect_langs` reports more than one candidate language, otherwise
        the code of the single reported language.
    """
    if not text:
        return "unknown"

    try:
        candidates = detect_langs(text)
    except Exception:
        # Best-effort detection: any detector failure maps to "unknown", but
        # KeyboardInterrupt / SystemExit are no longer swallowed as they were
        # by the previous bare `except:`.
        return "unknown"

    if not candidates:
        return "unknown"
    if len(candidates) > 1:
        # Consider the text mixed if multiple languages are reported.
        return "mix"
    return candidates[0].lang
    

    
# Factory function SpaCy calls to build the language-detector component.
def create_language_detector(nlp, name):
    """Return a new `LanguageDetector`; `nlp` and `name` are not used."""
    return LanguageDetector()


# Register the factory only when it is not registered yet, so re-running this
# cell does not hit the "factory already exists" ValueError.
if not Language.has_factory("language_detector"):
    Language.factory("language_detector", func=create_language_detector)

# Attach the component independently of the registration step. Previously a
# ValueError from the registration skipped this as well, so a freshly
# reloaded `nlp` was left without the detector.
if "language_detector" not in nlp.pipe_names:
    nlp.add_pipe('language_detector', last=True)


def detect_language_spacy(text):
    """Detect the language of `text` with the SpaCy language-detector pipe.

    The text is split on '. ', each non-blank fragment is run through `nlp`,
    and the per-fragment languages are compared.

    Args:
        text: The text to analyse. Anything that is not a string (None,
            NaN, numbers) is treated as having no detectable language.

    Returns:
        The detected language when all fragments agree, 'mix' when at least
        two fragments differ, and 'unknown' when there is nothing to analyse.
    """
    if not isinstance(text, str):
        return "unknown"

    detected_languages = set()

    for fragment in text.split('. '):
        fragment = fragment.strip()
        if not fragment:
            # Blank fragments (e.g. after a trailing '. ') carry no language;
            # sending them to the detector could turn a single-language text
            # into 'mix'.
            continue

        doc = nlp(fragment)
        detected_languages.add(doc._.language['language'])

        # A second distinct language already decides the answer.
        if len(detected_languages) > 1:
            return 'mix'

    if detected_languages:
        return detected_languages.pop()  # The single language in the set
    return 'unknown'


def detect_language_fasttext(text, min_confidence=0.95):
    """Detect the language of `text` with the FastText identification model.

    Args:
        text: The text to analyse. Anything that is not a string (None,
            NaN, numbers) is treated as having no detectable language.
        min_confidence: Probability the top prediction must exceed to be
            returned as the language (defaults to the original 0.95).

    Returns:
        'unknown' for empty or whitespace-only input, the top predicted
        language code when its probability exceeds `min_confidence`, and
        'mix' otherwise.
    """
    if not isinstance(text, str):
        return "unknown"

    # fastText predicts one line at a time and raises ValueError when the
    # text contains '\n', so collapse all whitespace runs into single spaces.
    single_line = " ".join(text.split())
    if not single_line:
        return "unknown"

    # A list input yields one list of labels / probabilities per text, so the
    # results for our single text sit at index 0. Only the top prediction is
    # needed: the old `len(predictions[0]) == 1` test counted input texts,
    # not predictions, and was therefore always true.
    labels, probabilities = model_fasttext.predict([single_line], k=1)
    if not labels[0]:
        return "unknown"

    top_label = labels[0][0].replace("__label__", "")
    top_probability = probabilities[0][0]

    if top_probability > min_confidence:
        return top_label
    return "mix"
    
# Load the Excel file
file_name = "SubjectAppsDataset_exp03.xlsx"
file_path = file_name
# Open the workbook; the handle stays open here and `xls.close()` is called
# in a later cell.
xls = pd.ExcelFile(file_path)
# Read the specific sheet to analyze
sheet_name = "Sheet1"
filtered_df = pd.read_excel(xls, sheet_name=sheet_name)
# Count the number of records in the DataFrame
record_count = len(filtered_df)
print(f"Number of records in the filtered DataFrame: {record_count}")

# Run every detector on each review and store the four labels on the row.
for index, row in filtered_df.iterrows():
    review = row['Normalization']

    # Missing cells are treated as empty text; any other non-string cell
    # (e.g. a number) is stringified so the detectors always receive a str.
    if pd.isna(review):
        review = ""
    elif not isinstance(review, str):
        review = str(review)

    if review.strip() == "":
        # Every label must be assigned on this path too: otherwise the first
        # empty row raises NameError and later empty rows silently inherit
        # the previous row's labels.
        nltk_lang = spacy_lang = fasttext_lang = roberta_lang = "unknown"
    else:
        nltk_lang = detect_language_nltk(review)
        spacy_lang = detect_language_spacy(review)
        fasttext_lang = detect_language_fasttext(review)
        roberta_lang = classify_language_type_roberta(review)

    # Update the DataFrame
    filtered_df.at[index, 'Exp_Language_Detection[NLTK]'] = nltk_lang
    filtered_df.at[index, 'Exp_Language_Detection[Spacy]'] = spacy_lang
    filtered_df.at[index, 'Exp_Language_Detection[FastText]'] = fasttext_lang
    filtered_df.at[index, 'Exp_Language_Detection[Roberta]'] = roberta_lang

Number of records in the filtered DataFrame: 2621


In [19]:
sheet_name="baseline"
# Release the read handle on the workbook before the same file is rewritten;
# `filtered_df` is already in memory, and closing first also means the handle
# is not leaked if the save fails.
xls.close()
save_to_excel(filtered_df, file_path, sheet_name)
# --- Final Status ---
print(f"\n✅ File saved to '{file_path}' in sheet '{sheet_name}'.")


✅ File saved to 'SubjectAppsDataset_exp03.xlsx' in sheet 'baseline'.


In [99]:
# Spot check of detect_language_nltk on a single hand-picked review.
# NOTE(review): the sample appears to switch from English to Malay part-way
# through, yet the recorded output of this cell is "en" rather than "mix" —
# confirm whether mixed-language reviews are expected to be caught here.
sentence="""
this app contain bug it makes my fullscreen navigation system unusable whenever i use this app i cannot exit or use back gesture and i will stick in the app or sometimes if the app runs on background will also disable back and exit gesture it almost like the system detect myjpj app as a launcher please fix la bebal menyusahkan orang je la dah la lesen physical tak bagi bengap betul tak pasal2 aku kena restart fon bagai sebab bug bodo ni tiba2 act sebagai launcher pulak tak nyaman la macam ni.
"""

# Run the detector on the sample and show the label it returns.
nltkPreditc=detect_language_nltk(sentence)
print(nltkPreditc)

en
