# Code-Mixed Language Detection using LLMs

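This notebook demonstrates token-level language identification for code-mixed text
(e.g., Hindi–English) using a pretrained multilingual language model (`xlm-roberta-base`)
applied zero-shot, i.e. without any fine-tuning, and compares it with a simple rule-based script baseline.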


## Observations

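- Without fine-tuning, the token-classification head of the pretrained model is randomly initialized, so its predictions are not reliable: in the run below it tags every token as `EN` (30% token accuracy).
- Romanized Hindi words (e.g., `kal`, `hai`) are not recognized without explicit training; the rule-based Devanagari baseline also labels them `EN`, reaching 75% token accuracy.
- Zero-shot use of the pretrained model is therefore not sufficient for language identification here; fine-tuning on labeled code-mixed data is needed.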


#new

In [51]:
!pip install openai pandas tqdm scikit-learn




In [52]:
!pip install transformers datasets torch scikit-learn pandas tqdm




In [53]:
# import pandas as pd
# from tqdm import tqdm
# from openai import OpenAI
import pandas as pd
import torch
from transformers import AutoTokenizer, AutoModelForTokenClassification
from torch.utils.data import DataLoader
from sklearn.metrics import classification_report


In [54]:
# Toy evaluation set: every sentence comes with one gold language tag
# ("HI" or "EN") per whitespace-separated token, in token order.
data = [
    ("मैं school जा रहा हूँ", "HI EN HI HI HI".split()),
    ("आज meeting थी boss के साथ", "HI EN HI EN HI HI".split()),
    ("kal exam hai", "HI EN HI".split()),
    ("party amazing थी", "EN EN HI".split()),
    ("tum kaha ho", "HI HI HI".split()),
]

# One row per sentence; the "labels" cell holds that sentence's list of tags.
df = pd.DataFrame(data, columns=["sentence", "labels"])
df


Unnamed: 0,sentence,labels
0,मैं school जा रहा हूँ,"[HI, EN, HI, HI, HI]"
1,आज meeting थी boss के साथ,"[HI, EN, HI, EN, HI, HI]"
2,kal exam hai,"[HI, EN, HI]"
3,party amazing थी,"[EN, EN, HI]"
4,tum kaha ho,"[HI, HI, HI]"


In [55]:
def tokenize(sentence):
    """Split a sentence into tokens on runs of whitespace.

    Args:
        sentence: the text to split.

    Returns:
        The list produced by ``sentence.split()``; an empty or
        whitespace-only sentence gives an empty list.
    """
    words = sentence.split()
    return words

# Add a column holding the whitespace tokens of each sentence (one list per row).
df["tokens"] = df["sentence"].map(tokenize)
df


Unnamed: 0,sentence,labels,tokens
0,मैं school जा रहा हूँ,"[HI, EN, HI, HI, HI]","[मैं, school, जा, रहा, हूँ]"
1,आज meeting थी boss के साथ,"[HI, EN, HI, EN, HI, HI]","[आज, meeting, थी, boss, के, साथ]"
2,kal exam hai,"[HI, EN, HI]","[kal, exam, hai]"
3,party amazing थी,"[EN, EN, HI]","[party, amazing, थी]"
4,tum kaha ho,"[HI, HI, HI]","[tum, kaha, ho]"


In [56]:
# Pretrained multilingual checkpoint used for both the tokenizer and the encoder.
model_name = "xlm-roberta-base"
tokenizer = AutoTokenizer.from_pretrained(model_name)

# Two output classes (one per language tag). The classification head is newly
# initialized (see the warning printed below), so predictions are not
# meaningful until the model is trained on a labeled task.
model = AutoModelForTokenClassification.from_pretrained(model_name, num_labels=2)


Some weights of XLMRobertaForTokenClassification were not initialized from the model checkpoint at xlm-roberta-base and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [57]:
# Language tag -> class index, and the inverse lookup derived from it.
label2id = {"HI": 0, "EN": 1}
id2label = {index: tag for tag, index in label2id.items()}


In [58]:
def encode_data(tokens, labels):
    """Tokenize one pre-split sentence and attach a label id to every sub-token.

    Args:
        tokens: list of word strings for a single sentence.
        labels: list of tag strings (keys of ``label2id``), one per entry
            of ``tokens`` and in the same order.

    Returns:
        The tokenizer output (PyTorch tensors) with an extra ``"labels"``
        tensor of shape ``(1, sequence_length)``. Positions whose word id is
        ``None`` get ``-100``; every sub-token of a word gets that word's
        label id.

    Raises:
        ValueError: if ``tokens`` and ``labels`` have different lengths.
        KeyError: if a label is not a key of ``label2id``.
    """
    # Fail early with a clear message: otherwise a short label list raises an
    # opaque IndexError below and a long one is silently accepted.
    if len(tokens) != len(labels):
        raise ValueError(
            f"tokens and labels must have the same length "
            f"(got {len(tokens)} tokens and {len(labels)} labels)"
        )

    encodings = tokenizer(
        tokens,
        is_split_into_words=True,
        truncation=True,
        padding=True,
        return_tensors="pt"
    )

    # word_ids() maps each sub-token position to the index of its source word,
    # or None where there is no source word.
    # NOTE(review): -100 is presumably chosen so the loss ignores these positions — confirm.
    encoded_labels = [
        -100 if word_id is None else label2id[labels[word_id]]
        for word_id in encodings.word_ids()
    ]

    # Wrap in a list to add the batch dimension expected alongside the inputs.
    encodings["labels"] = torch.tensor([encoded_labels])
    return encodings


In [59]:
def detect_language(sentence):
    """Predict a language tag for each whitespace-separated token of a sentence.

    Uses the module-level ``tokenizer``, ``model`` and ``id2label``.

    Args:
        sentence: the text to tag.

    Returns:
        A list of ``(token, tag)`` tuples in sentence order, where ``tag`` is
        looked up in ``id2label`` from the prediction at the token's first
        sub-token. An empty or whitespace-only sentence returns ``[]``.
    """
    tokens = sentence.split()
    if not tokens:
        # Nothing to classify; avoid sending an empty word list to the tokenizer.
        return []

    inputs = tokenizer(
        tokens,
        is_split_into_words=True,
        return_tensors="pt"
    )

    with torch.no_grad():
        outputs = model(**inputs)

    # The batch holds exactly one sentence: index it explicitly instead of
    # relying on squeeze() to drop the right dimension.
    predictions = torch.argmax(outputs.logits, dim=2)[0].tolist()
    word_ids = inputs.word_ids()

    result = []
    prev = None
    for idx, word_id in enumerate(word_ids):
        # Skip positions with no source word and the continuation sub-tokens
        # of a word that has already been emitted.
        if word_id is None or word_id == prev:
            continue
        # Pair through word_id (not by position) so tags cannot shift onto the
        # wrong token if the tokenizer yields no sub-token for some word.
        result.append((tokens[word_id], id2label[predictions[idx]]))
        prev = word_id

    return result


In [60]:
# Smoke test: tag one mixed Devanagari/English sentence with the model-based detector.
print(detect_language("आज meeting थी boss के साथ"))


[('आज', 'EN'), ('meeting', 'EN'), ('थी', 'EN'), ('boss', 'EN'), ('के', 'EN'), ('साथ', 'EN')]


In [61]:
# Token-level evaluation of the model-based detector on the toy set.
y_true = []
y_pred = []

for _, row in df.iterrows():
    prediction = detect_language(row["sentence"])
    # zip() pairs each predicted tag with the gold tag at the same position.
    for (_, pred), true in zip(prediction, row["labels"]):
        y_true.append(true)
        y_pred.append(pred)

# zero_division=0 reports precision/recall as 0 for a class that is never
# predicted, without emitting an undefined-metric warning for each average.
print(classification_report(y_true, y_pred, zero_division=0))


              precision    recall  f1-score   support

          EN       0.30      1.00      0.46         6
          HI       0.00      0.00      0.00        14

    accuracy                           0.30        20
   macro avg       0.15      0.50      0.23        20
weighted avg       0.09      0.30      0.14        20



  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


In [62]:
def run_user_input():
    """Read sentences from standard input and print their token-level tags.

    Loops until the user types ``exit`` (case-insensitive, surrounding
    whitespace ignored) or the input stream ends or is interrupted. Blank
    lines are skipped. Each non-blank sentence is passed to
    ``detect_language`` and the result is printed, followed by a separator.

    Returns:
        None.
    """
    print("Enter a code-mixed sentence (or type 'exit'):")
    while True:
        try:
            sentence = input(">> ").strip()
        except (EOFError, KeyboardInterrupt):
            # Input closed or the user interrupted: stop instead of crashing.
            break
        if sentence.lower() == "exit":
            break
        if not sentence:
            # Nothing to classify on a blank line; prompt again.
            continue
        output = detect_language(sentence)
        print(output)
        print("-" * 50)

# Interactive cell: waits for keyboard input until 'exit' is entered.
run_user_input()


Enter a code-mixed sentence (or type 'exit'):
>> hwllo im astha
[('hwllo', 'EN'), ('im', 'EN'), ('astha', 'EN')]
--------------------------------------------------
>> exit


In [63]:
import re

def rule_based_detect(sentence):
    """Tag each whitespace-separated token by script.

    A token containing at least one character in the range U+0900–U+097F is
    tagged ``"HI"``; every other token is tagged ``"EN"``.

    Args:
        sentence: the text to tag.

    Returns:
        A list of ``(token, tag)`` tuples in sentence order.
    """
    def script_tag(word):
        # One character in the range is enough to tag the whole token "HI".
        return "HI" if re.search(r'[\u0900-\u097F]', word) else "EN"

    return [(word, script_tag(word)) for word in sentence.split()]


In [64]:
# Same sentence as the model smoke test, tagged by the script-based rule.
print(rule_based_detect("आज meeting थी boss के साथ"))


[('आज', 'HI'), ('meeting', 'EN'), ('थी', 'HI'), ('boss', 'EN'), ('के', 'HI'), ('साथ', 'HI')]


In [65]:
# Token-level evaluation of the script-based baseline on the toy set.
y_true_b, y_pred_b = [], []

for _, row in df.iterrows():
    # Pair each (token, predicted tag) with the gold tag at the same position.
    pairs = list(zip(rule_based_detect(row["sentence"]), row["labels"]))
    y_true_b.extend(gold for _, gold in pairs)
    y_pred_b.extend(tag for (_, tag), _ in pairs)

print("RULE-BASED BASELINE")
print(classification_report(y_true_b, y_pred_b))


RULE-BASED BASELINE
              precision    recall  f1-score   support

          EN       0.55      1.00      0.71         6
          HI       1.00      0.64      0.78        14

    accuracy                           0.75        20
   macro avg       0.77      0.82      0.74        20
weighted avg       0.86      0.75      0.76        20



## Error analysis


In [66]:
def error_analysis(frame=None, detect_fn=None, reason="Romanized ambiguity / no fine-tuning"):
    """Print every sentence whose predicted tags differ from the gold tags.

    Args:
        frame: DataFrame with a ``"sentence"`` column and a ``"labels"``
            column holding the gold tag sequence of each sentence. Defaults
            to the notebook-level ``df``.
        detect_fn: callable taking a sentence and returning a list of
            ``(token, tag)`` pairs. Defaults to ``detect_language``.
        reason: text printed on the ``Reason:`` line of each reported error.

    Returns:
        None. The report is written to standard output.
    """
    # Resolve defaults at call time so the notebook globals are only needed
    # when the caller does not supply their own frame or detector.
    if frame is None:
        frame = df
    if detect_fn is None:
        detect_fn = detect_language

    print("ERROR ANALYSIS\n")
    for _, row in frame.iterrows():
        sentence = row["sentence"]
        # Normalize to a list so a tuple of gold tags compares equal to the
        # predicted list when the tags match.
        gold = list(row["labels"])
        pred = [p for _, p in detect_fn(sentence)]

        if gold != pred:
            print("Sentence:", sentence)
            print("Gold:", gold)
            print("Pred:", pred)
            print("Reason:", reason)
            print("-" * 60)

# Report the sentences the model-based detector gets wrong on the toy set.
error_analysis()


ERROR ANALYSIS

Sentence: मैं school जा रहा हूँ
Gold: ['HI', 'EN', 'HI', 'HI', 'HI']
Pred: ['EN', 'EN', 'EN', 'EN', 'EN']
Reason: Romanized ambiguity / no fine-tuning
------------------------------------------------------------
Sentence: आज meeting थी boss के साथ
Gold: ['HI', 'EN', 'HI', 'EN', 'HI', 'HI']
Pred: ['EN', 'EN', 'EN', 'EN', 'EN', 'EN']
Reason: Romanized ambiguity / no fine-tuning
------------------------------------------------------------
Sentence: kal exam hai
Gold: ['HI', 'EN', 'HI']
Pred: ['EN', 'EN', 'EN']
Reason: Romanized ambiguity / no fine-tuning
------------------------------------------------------------
Sentence: party amazing थी
Gold: ['EN', 'EN', 'HI']
Pred: ['EN', 'EN', 'EN']
Reason: Romanized ambiguity / no fine-tuning
------------------------------------------------------------
Sentence: tum kaha ho
Gold: ['HI', 'HI', 'HI']
Pred: ['EN', 'EN', 'EN']
Reason: Romanized ambiguity / no fine-tuning
------------------------------------------------------------


In [67]:
%%writefile app.py
import streamlit as st
import torch
from transformers import AutoTokenizer, AutoModelForTokenClassification

@st.cache_resource
def load_model():
    """Load the tokenizer and a 2-label token-classification model.

    Both are created from the ``xlm-roberta-base`` checkpoint; the function
    is wrapped in ``st.cache_resource``.

    Returns:
        A ``(tokenizer, model)`` tuple.
    """
    checkpoint = "xlm-roberta-base"
    loaded_tokenizer = AutoTokenizer.from_pretrained(checkpoint)
    loaded_model = AutoModelForTokenClassification.from_pretrained(checkpoint, num_labels=2)
    return loaded_tokenizer, loaded_model

# Tokenizer and model shared by every call to detect_language in this script.
tokenizer, model = load_model()

# Class index -> language tag, used to decode the argmax predictions.
id2label = {0: "HI", 1: "EN"}

def detect_language(sentence):
    """Predict a language tag for each whitespace-separated token of a sentence.

    Uses the module-level ``tokenizer``, ``model`` and ``id2label``.

    Args:
        sentence: the text to tag.

    Returns:
        A list of ``(token, tag)`` tuples in sentence order, where ``tag`` is
        looked up in ``id2label`` from the prediction at the token's first
        sub-token. An empty or whitespace-only sentence returns ``[]``.
    """
    tokens = sentence.split()
    if not tokens:
        # A whitespace-only string is truthy, so it can reach this function
        # from the text box; there is nothing to classify in that case.
        return []

    inputs = tokenizer(
        tokens,
        is_split_into_words=True,
        return_tensors="pt"
    )

    with torch.no_grad():
        outputs = model(**inputs)

    # The batch holds exactly one sentence: index it explicitly instead of
    # relying on squeeze() to drop the right dimension.
    predictions = torch.argmax(outputs.logits, dim=2)[0].tolist()
    word_ids = inputs.word_ids()

    result = []
    prev = None
    for idx, word_id in enumerate(word_ids):
        # Skip positions with no source word and the continuation sub-tokens
        # of a word that has already been emitted.
        if word_id is None or word_id == prev:
            continue
        result.append((tokens[word_id], id2label[predictions[idx]]))
        prev = word_id

    return result

# Page header.
st.title("Code-Mixed Language Detection (HI–EN)")
st.write("Token-level language identification using a pretrained Transformer")

# Single text box; the rest of the page is rendered only once it is non-empty.
sentence = st.text_input("Enter a code-mixed sentence:")

if sentence:
    # One line per token: the token in bold, then its predicted tag.
    for word, lang in detect_language(sentence):
        st.write(f"**{word}** → {lang}")

    st.info("Model is not fine-tuned on Romanized Hindi. Errors are expected.")


Overwriting app.py


In [68]:
# List the working directory to confirm that app.py was written.
!ls


app.py	logs.txt  sample_data


In [69]:
!pip install streamlit transformers torch pyngrok




In [72]:
# Start the Streamlit app in the background; its stdout and stderr are redirected to /content/logs.txt.
!streamlit run app.py &>/content/logs.txt &


In [None]:
# Open an ngrok tunnel to local port 8501 and print the resulting public URL.
# NOTE(review): 8501 is presumably the port the `streamlit run` cell above serves on — confirm.
from pyngrok import ngrok
public_url = ngrok.connect(8501)
print("Open this URL:", public_url)


#end
