# Emotion Recognition Prototype — Final (Colab-ready)
This final notebook adds **Back Translation** (EN→FR→EN) and **Paraphrasing** (T5) to the previous prototype.

**How to use:** Run cells top-to-bottom. When you reach the upload cell, upload `Dataset.csv`.
Note: The transformer models will be downloaded into Colab on first run (may take a few minutes).

In [None]:
# Install packages (runs in Colab)
# The leading `!` hands the line to the shell (IPython/Colab syntax, not plain Python).
# `-q` keeps pip's output short; `--upgrade` moves every listed package to its newest release.
!pip install -q nltk scikit-learn matplotlib seaborn ipywidgets xgboost transformers datasets evaluate torch sentencepiece --upgrade
print('Packages installed')


In [None]:
import pandas as pd, numpy as np, nltk, string, random, time
import matplotlib.pyplot as plt, seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import MultinomialNB
from sklearn.svm import LinearSVC
import ipywidgets as widgets
from IPython.display import display, clear_output
# Fetch the NLTK data used by the tokenizers, the stop-word filter and the WordNet lookups.
# 'punkt_tab' is the tokenizer resource that recent NLTK releases load for
# word_tokenize/sent_tokenize; 'punkt' is kept so older releases keep working.
# On a release that does not know a resource, nltk.download only reports it and moves on.
for resource in ('punkt', 'punkt_tab', 'stopwords', 'wordnet', 'omw-1.4'):
    nltk.download(resource)
print('NLTK ready')


In [None]:
# Upload your dataset (CSV/XLSX). Run this cell and choose file when 'Choose Files' appears.
from google.colab import files
uploaded = files.upload()
if not uploaded:
    # Nothing was selected: stop with a clear message instead of failing later
    # with a NameError on `dataset_path`.
    raise ValueError('No file uploaded. Re-run this cell and choose a CSV or XLSX file.')
# When several files are selected, the last one wins.
dataset_path = list(uploaded.keys())[-1]
print('Uploaded file:', dataset_path)
# Pick the reader from the file extension so Excel uploads are parsed correctly.
if dataset_path.lower().endswith(('.xlsx', '.xls')):
    df = pd.read_excel(dataset_path)
else:
    df = pd.read_csv(dataset_path)
print('Preview:')
display(df.head())
print('\nColumns:', list(df.columns))


---
## Automatic suggestions for preprocessing
Run the next cell to see quick recommendations based on your dataset content.

In [None]:
def suggest_preprocessing(df):
    """Recommend preprocessing steps from a quick scan of the first text column.

    Args:
        df: DataFrame to inspect. The first column holding text (object or
            pandas string dtype) is scanned; all other columns are ignored.

    Returns:
        An alphabetically sorted list of step names chosen from
        'remove_punctuation', 'remove_numbers', 'remove_stopwords' and
        'remove_extra_whitespace', or ['No text columns found'] when the
        frame has no text column.
    """
    # Accept both classic object columns and the dedicated pandas string dtype.
    text_cols = [
        c for c in df.columns
        if pd.api.types.is_object_dtype(df[c]) or pd.api.types.is_string_dtype(df[c])
    ]
    if not text_cols:
        return ['No text columns found']
    txt = df[text_cols[0]].astype(str).str.cat(sep=' ')
    suggestions = set()
    punctuation = set(string.punctuation)
    if any(ch in punctuation for ch in txt):
        suggestions.add('remove_punctuation')
    if any(ch.isdigit() for ch in txt):
        suggestions.add('remove_numbers')
    from nltk.corpus import stopwords
    sw = set(stopwords.words('english'))
    tokens = nltk.word_tokenize(txt.lower())
    # Share of tokens that are English stop words; max() guards the empty case.
    stop_ratio = sum(1 for t in tokens if t in sw) / max(1, len(tokens))
    if stop_ratio > 0.2:
        suggestions.add('remove_stopwords')
    if '  ' in txt:
        suggestions.add('remove_extra_whitespace')
    # Sort so the same data always yields the same list, run after run.
    return sorted(suggestions)
# Run the heuristic once on the uploaded frame so the recommendations appear under this cell.
print('Auto-suggestions based on dataset:')
print(suggest_preprocessing(df))


---
## Preprocessing functions (lowercase, punctuation, numbers, stopwords, whitespace)

In [None]:
from nltk.corpus import stopwords
stop_words = set(stopwords.words('english'))
from nltk.tokenize import word_tokenize
import re
def lowercase(text):
    """Return ``text`` in lower case; non-string values pass through untouched."""
    if isinstance(text, str):
        return text.lower()
    return text
def remove_punctuation(text):
    """Delete every ``string.punctuation`` character from ``text``.

    Non-string values are returned unchanged.
    """
    if isinstance(text, str):
        # A code point mapped to None is dropped by str.translate.
        drop_table = dict.fromkeys(map(ord, string.punctuation))
        return text.translate(drop_table)
    return text
def remove_numbers(text):
    """Replace each run of digits in ``text`` with a single space.

    Non-string values are returned unchanged.
    """
    if isinstance(text, str):
        digit_run = re.compile(r'\d+')
        return digit_run.sub(' ', text)
    return text
def remove_extra_whitespace(text):
    """Collapse whitespace runs to one space and trim both ends.

    Non-string values are returned unchanged.
    """
    if isinstance(text, str):
        pieces = text.split()
        return ' '.join(pieces)
    return text
def remove_stopwords(text):
    """Drop tokens whose lower-cased form is in ``stop_words``.

    The text is split with ``word_tokenize`` and the surviving tokens are
    joined with single spaces. Non-string values are returned unchanged.
    """
    if isinstance(text, str):
        kept = (tok for tok in word_tokenize(text) if tok.lower() not in stop_words)
        return ' '.join(kept)
    return text
def apply_preprocessing(text, steps):
    """Run the named preprocessing steps over ``text`` in the given order.

    Args:
        text: Value to clean.
        steps: Iterable of step names; names without a registered function
            are skipped.

    Returns:
        The value produced after the last recognised step.
    """
    registry = dict([
        ('lowercase', lowercase),
        ('remove_punctuation', remove_punctuation),
        ('remove_numbers', remove_numbers),
        ('remove_extra_whitespace', remove_extra_whitespace),
        ('remove_stopwords', remove_stopwords),
    ])
    result = text
    for step in steps:
        handler = registry.get(step)
        if not handler:
            continue
        result = handler(result)
    return result
# Visible confirmation that the cell ran and the helpers above are defined.
print('Preprocessing functions ready')


---
## Augmentation functions (synonym replacement, random insertion/swap/deletion, sentence shuffle/truncate, noise injection)

In [None]:
from nltk.corpus import wordnet
def get_synonyms(word):
    """Collect WordNet lemma names for ``word``, excluding the word itself.

    Underscores in lemma names are turned into spaces and the comparison
    with ``word`` is case-insensitive. The order of the returned list is
    whatever the underlying set yields.
    """
    needle = word.lower()
    found = set()
    for synset in wordnet.synsets(word):
        names = (lemma.name().replace('_',' ') for lemma in synset.lemmas())
        found.update(name for name in names if name.lower() != needle)
    return list(found)
from nltk.tokenize import word_tokenize
def synonym_replacement(sentence, n=1):
    """Replace up to ``n`` alphabetic words of ``sentence`` with a WordNet synonym.

    Args:
        sentence: Text to augment.
        n: Maximum number of words to replace; values <= 0 leave the
            sentence unchanged.

    Returns:
        The sentence with at most ``n`` whole words swapped for a randomly
        chosen synonym. Words without synonyms are skipped.
    """
    import re  # local import so the function does not rely on another cell

    if n <= 0:
        return sentence
    words = word_tokenize(sentence)
    eligible = [w for w in words if w.isalpha()]
    random.shuffle(eligible)
    num_replaced = 0
    for w in eligible:
        syns = get_synonyms(w)
        if not syns:
            continue
        replacement = random.choice(syns)
        # Match the whole word only, so "is" is never rewritten inside "This".
        pattern = r'(?<!\w)' + re.escape(w) + r'(?!\w)'
        # A callable replacement keeps backslashes in the synonym literal.
        sentence, hits = re.subn(pattern, lambda _m: replacement, sentence, count=1)
        # Count only substitutions that actually happened.
        num_replaced += hits
        if num_replaced >= n:
            break
    return sentence
def random_deletion(sentence, p=0.1):
    """Drop each token of ``sentence`` independently with probability ``p``.

    Args:
        sentence: Text to augment.
        p: Per-token deletion probability.

    Returns:
        The surviving tokens joined by spaces. Sentences with zero or one
        token are returned unchanged; if every token is deleted, one
        randomly chosen original token is returned.
    """
    words = word_tokenize(sentence)
    # `<= 1` also covers empty input, which would otherwise reach
    # random.choice([]) below and raise IndexError.
    if len(words) <= 1:
        return sentence
    remaining = [w for w in words if random.random() > p]
    if not remaining:
        return random.choice(words)
    return ' '.join(remaining)
def random_swap(sentence, n=1):
    """Swap two randomly chosen tokens of ``sentence``, ``n`` times over.

    Sentences with fewer than two tokens are returned unchanged; otherwise
    the tokens are joined back with single spaces.
    """
    tokens = word_tokenize(sentence)
    if len(tokens) >= 2:
        positions = range(len(tokens))
        for _ in range(n):
            first, second = random.sample(positions, 2)
            tokens[first], tokens[second] = tokens[second], tokens[first]
        return ' '.join(tokens)
    return sentence
def random_insertion(sentence, n=1):
    """Insert up to ``n`` synonyms of random tokens at random positions.

    Args:
        sentence: Text to augment.
        n: Number of insertion attempts.

    Returns:
        The tokens, including inserted synonyms, joined by spaces. Input
        that yields no tokens is returned unchanged.
    """
    words = word_tokenize(sentence)
    if not words:
        # random.choice on an empty list raises IndexError; nothing to augment.
        return sentence
    for _ in range(n):
        new_word = None
        counter = 0
        # Up to 10 attempts to draw a token that has at least one synonym.
        while new_word is None and counter < 10:
            word = random.choice(words)
            syns = get_synonyms(word)
            if syns:
                new_word = random.choice(syns)
            counter += 1
        if new_word is not None:
            # randint is inclusive, so the synonym may also be appended at the end.
            pos = random.randint(0, len(words))
            words.insert(pos, new_word)
    return ' '.join(words)
def sentence_shuffle(text):
    """Split ``text`` into sentences, shuffle them in place and re-join with spaces."""
    sentences = nltk.sent_tokenize(text)
    random.shuffle(sentences)
    shuffled = ' '.join(sentences)
    return shuffled
def sentence_truncate(text, keep_ratio=0.7):
    """Keep the leading ``keep_ratio`` share of the sentences in ``text``.

    The count is rounded down, but at least one sentence is requested.
    """
    sentences = nltk.sent_tokenize(text)
    keep = int(len(sentences) * keep_ratio)
    if keep < 1:
        keep = 1
    return ' '.join(sentences[:keep])
def noise_injection(text, p=0.05):
    """Overwrite each character of ``text`` with a random ASCII letter with probability ``p``.

    Every position is eligible, including spaces and punctuation; the
    length of the text never changes.
    """
    alphabet = string.ascii_letters
    # The probability test runs first for every character; a replacement
    # letter is drawn only when that test succeeds.
    noisy = [
        random.choice(alphabet) if random.random() < p else ch
        for ch in text
    ]
    return ''.join(noisy)
# Visible confirmation that the cell ran and the augmentation helpers are defined.
print('Augmentation functions ready')


---
## Back-translation (English -> French -> English) and Paraphrasing (T5)
These will download transformer models on first run. They may take 1–3 minutes to download depending on connection.

In [None]:
# Load the three seq2seq checkpoints used by the back-translation and paraphrase helpers.
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM
print('Loading MarianMT models for back-translation...')
start_time = time.time()  # wall-clock start, reported once all models are loaded
# English -> French direction.
tokenizer_en_fr = AutoTokenizer.from_pretrained('Helsinki-NLP/opus-mt-en-fr')
model_en_fr = AutoModelForSeq2SeqLM.from_pretrained('Helsinki-NLP/opus-mt-en-fr')
# French -> English direction.
tokenizer_fr_en = AutoTokenizer.from_pretrained('Helsinki-NLP/opus-mt-fr-en')
model_fr_en = AutoModelForSeq2SeqLM.from_pretrained('Helsinki-NLP/opus-mt-fr-en')
print('Loading T5 model for paraphrasing...')
# NOTE(review): 't5-small' looks like the generic base checkpoint — confirm it
# actually responds to the 'paraphrase: ' prompt used by paraphrase_t5.
t5_tokenizer = AutoTokenizer.from_pretrained('t5-small')
t5_model = AutoModelForSeq2SeqLM.from_pretrained('t5-small')
print('Models loaded in', round(time.time()-start_time,1), 's')


In [None]:
def back_translate(text, src_to_tgt_tokenizer, src_to_tgt_model, tgt_to_src_tokenizer, tgt_to_src_model, max_length=512):
    """Translate ``text`` to a pivot language and back to produce a variant.

    Args:
        text: Source-language text.
        src_to_tgt_tokenizer: Tokenizer for the source -> pivot model.
        src_to_tgt_model: Seq2seq model translating source -> pivot.
        tgt_to_src_tokenizer: Tokenizer for the pivot -> source model.
        tgt_to_src_model: Seq2seq model translating pivot -> source.
        max_length: Token limit applied to both the truncated input and the
            generated output of each hop.

    Returns:
        The round-tripped text.
    """
    def _translate(passage, tokenizer, model):
        # One hop: encode, beam-search decode, strip special tokens.
        input_ids = tokenizer.encode(passage, return_tensors='pt', truncation=True, max_length=max_length)
        generated = model.generate(input_ids, max_length=max_length, num_beams=4, early_stopping=True)
        return tokenizer.decode(generated[0], skip_special_tokens=True)

    pivot = _translate(text, src_to_tgt_tokenizer, src_to_tgt_model)
    return _translate(pivot, tgt_to_src_tokenizer, tgt_to_src_model)

def paraphrase_t5(text, num_return_sequences=1, max_length=256):
    """Generate a paraphrase of ``text`` with the module-level T5 model.

    Args:
        text: Text to paraphrase.
        num_return_sequences: Number of candidates to generate; only the
            first one is returned.
        max_length: Maximum length of the generated sequence, in tokens.

    Returns:
        The first generated candidate, or ``text`` itself when generation
        yields nothing.
    """
    # T5 paraphrase prefix. The tokenizer appends the end-of-sequence token
    # itself, so it is not spelled out in the prompt.
    # NOTE(review): confirm the loaded checkpoint was trained on this prefix.
    input_text = 'paraphrase: ' + text
    encoding = t5_tokenizer(input_text, return_tensors='pt', max_length=512, truncation=True)
    # Beam search cannot return more sequences than it has beams.
    num_beams = max(4, num_return_sequences)
    outputs = t5_model.generate(
        encoding['input_ids'],
        attention_mask=encoding['attention_mask'],
        max_length=max_length,
        num_beams=num_beams,
        num_return_sequences=num_return_sequences,
        early_stopping=True,
    )
    paraphrases = [t5_tokenizer.decode(o, skip_special_tokens=True, clean_up_tokenization_spaces=True) for o in outputs]
    return paraphrases[0] if paraphrases else text

# Visible confirmation that the cell ran and both helpers are defined.
print('Back-translation & paraphrase functions ready')


---
## UI: choose text & label columns, suggest preprocessing, pick preprocessing/augmentation including back-translation/paraphrasing

In [None]:
# Column pickers: both list every column of the uploaded frame.
text_col_dropdown = widgets.Dropdown(options=list(df.columns), description='Text col:')
label_col_dropdown = widgets.Dropdown(options=list(df.columns), description='Label col:')
# Action buttons; their click handlers are attached further down in this cell.
suggest_btn = widgets.Button(description='Suggest Preprocessing')
apply_prep_btn = widgets.Button(description='Apply Preprocessing')
# Multi-select of augmentation names; on_augment compares against these exact strings.
aug_choice = widgets.SelectMultiple(options=['synonym_replacement','random_insertion','random_swap','random_deletion','sentence_shuffle','sentence_truncate','noise_injection','back_translation','paraphrase_t5'],
                                     description='Augment:')
augment_btn = widgets.Button(description='Apply Augmentation (example)')
# Shared output area that all three handlers print into.
out_ui = widgets.Output()
# One checkbox per preprocessing step, all enabled by default.
lowercase_chk = widgets.Checkbox(value=True, description='Lowercase')
punct_chk = widgets.Checkbox(value=True, description='Remove Punctuation')
num_chk = widgets.Checkbox(value=True, description='Remove Numbers')
stop_chk = widgets.Checkbox(value=True, description='Remove Stopwords')
ws_chk = widgets.Checkbox(value=True, description='Extra Whitespace')

def on_suggest(b):
    """Button handler: print preprocessing suggestions for the selected text column.

    Args:
        b: The clicked button (unused).
    """
    with out_ui:
        clear_output()
        tcol = text_col_dropdown.value
        # Pass only the chosen column, so the suggestions describe the text
        # the user selected rather than whichever text column comes first.
        s = suggest_preprocessing(df[[tcol]])
        print('Suggested steps:', s)
suggest_btn.on_click(on_suggest)

def on_apply_prep(b):
    """Button handler: write the cleaned text into ``df['clean_text']``.

    The ticked checkboxes decide which steps run; the step order is fixed
    by the table below. A preview of the original and cleaned columns is
    shown in the shared output area.
    """
    toggles = (
        (lowercase_chk, 'lowercase'),
        (punct_chk, 'remove_punctuation'),
        (num_chk, 'remove_numbers'),
        (stop_chk, 'remove_stopwords'),
        (ws_chk, 'remove_extra_whitespace'),
    )
    with out_ui:
        clear_output()
        tcol = text_col_dropdown.value
        steps = [step for checkbox, step in toggles if checkbox.value]

        def clean(value):
            return apply_preprocessing(value, steps)

        df['clean_text'] = df[tcol].astype(str).apply(clean)
        print('Preprocessing applied to column', tcol)
        display(df[[tcol,'clean_text']].head())
apply_prep_btn.on_click(on_apply_prep)

def on_augment(b):
    """Button handler: add one example column per selected augmentation.

    Each entry of the table below names the option, the column it fills,
    an optional cap on how many leading rows are processed, and the
    function applied to each text. Entries run in table order.
    """
    recipes = (
        ('synonym_replacement', 'aug_syn', None, lambda x: synonym_replacement(x, n=1)),
        ('random_insertion', 'aug_ins', None, lambda x: random_insertion(x, n=1)),
        ('random_swap', 'aug_swap', None, lambda x: random_swap(x, n=1)),
        ('random_deletion', 'aug_del', None, lambda x: random_deletion(x, p=0.1)),
        ('sentence_shuffle', 'aug_shuf', None, lambda x: sentence_shuffle(x)),
        ('sentence_truncate', 'aug_trunc', None, lambda x: sentence_truncate(x, keep_ratio=0.7)),
        ('noise_injection', 'aug_noise', None, lambda x: noise_injection(x, p=0.03)),
        # The model-based options are capped to keep the demo runtime short.
        ('back_translation', 'aug_bt', 50,
         lambda x: back_translate(x, tokenizer_en_fr, model_en_fr, tokenizer_fr_en, model_fr_en)),
        ('paraphrase_t5', 'aug_para', 200, lambda x: paraphrase_t5(x, num_return_sequences=1)),
    )
    with out_ui:
        clear_output()
        tcol = text_col_dropdown.value
        selected = list(aug_choice.value)
        for option, column, row_cap, transform in recipes:
            if option not in selected:
                continue
            texts = df[tcol].astype(str)
            if row_cap is not None:
                texts = texts.head(row_cap)
            df[column] = texts.apply(transform)
        print('Augmentation columns added (examples).')
        display(df.head())
augment_btn.on_click(on_augment)

# Render the controls top to bottom: column pickers, step checkboxes (in one row),
# the preprocessing buttons, then the augmentation picker with its button and the output area.
display(text_col_dropdown, label_col_dropdown)
display(widgets.HBox([lowercase_chk, punct_chk, num_chk, stop_chk, ws_chk]))
display(suggest_btn, apply_prep_btn)
display(widgets.HTML('<b>Choose augmentations (these create example augmented columns):</b>'))
display(aug_choice, augment_btn, out_ui)


---
## Training and Evaluation
Choose a model from the dropdown and click **Train Model**. The DistilBERT option fine-tunes a transformer and can take considerably longer than the classic models.

In [None]:
def get_X_y():
    """Return the feature texts and labels, both cast to ``str``.

    Features come from ``df['clean_text']`` when that column exists,
    otherwise from the column picked in the text dropdown. Labels come
    from the column picked in the label dropdown.
    """
    tcol = text_col_dropdown.value
    lcol = label_col_dropdown.value
    feature_col = 'clean_text' if 'clean_text' in df.columns else tcol
    return df[feature_col].astype(str), df[lcol].astype(str)

# Names shown in the model dropdown; the training code compares against these exact strings.
model_options = ['Logistic Regression','Naive Bayes','SVM','XGBoost','DistilBERT (transformer)']
model_dropdown = widgets.Dropdown(options=model_options, description='Model:')
train_btn2 = widgets.Button(description='Train Model')
# Separate output area for training logs and plots.
out_train = widgets.Output()

def train_classic_model(model_name, X_train, X_test, y_train, y_test):
    """Fit a TF-IDF + classic classifier pipeline and report test metrics.

    Args:
        model_name: 'Logistic Regression', 'Naive Bayes' or 'SVM'; any other
            value selects XGBoost.
        X_train, X_test: Training and test texts.
        y_train, y_test: Matching labels.

    Returns:
        ``(model, vect)``: the fitted classifier and the fitted TF-IDF
        vectorizer. For XGBoost the model was trained on integer-encoded
        labels, so its own ``predict`` returns class indices (positions in
        the sorted training labels) rather than the original label values.
    """
    from sklearn.preprocessing import LabelEncoder
    from sklearn.utils.multiclass import unique_labels

    vect = TfidfVectorizer(max_features=20000, ngram_range=(1,2))
    Xtr = vect.fit_transform(X_train)
    Xte = vect.transform(X_test)
    if model_name == 'Logistic Regression':
        model = LogisticRegression(max_iter=1000)
    elif model_name == 'Naive Bayes':
        model = MultinomialNB()
    elif model_name == 'SVM':
        model = LinearSVC()
    else:
        from xgboost import XGBClassifier
        model = XGBClassifier(eval_metric='mlogloss')
    if isinstance(model, (LogisticRegression, MultinomialNB, LinearSVC)):
        model.fit(Xtr, y_train)
        y_pred = model.predict(Xte)
    else:
        # XGBoost expects class labels 0..n-1, so encode the labels for
        # training and decode the predictions before scoring.
        encoder = LabelEncoder()
        model.fit(Xtr, encoder.fit_transform(y_train))
        y_pred = encoder.inverse_transform(model.predict(Xte))
    print('Accuracy:', accuracy_score(y_test, y_pred))
    print(classification_report(y_test, y_pred))
    # Fix the label order explicitly so the heatmap axes can be annotated.
    labels = unique_labels(y_test, y_pred)
    cm = confusion_matrix(y_test, y_pred, labels=labels)
    sns.heatmap(cm, annot=True, fmt='d', xticklabels=labels, yticklabels=labels)
    plt.xlabel('Predicted')
    plt.ylabel('True')
    plt.show()
    return model, vect

def train_transformer(X_train, X_test, y_train, y_test, epochs=2):
    """Fine-tune DistilBERT for sequence classification and print test metrics.

    Args:
        X_train, X_test: pandas Series of training and test texts.
        y_train, y_test: pandas Series of matching labels.
        epochs: Number of training epochs.

    Returns:
        The metrics dict produced by the final ``trainer.evaluate()`` call
        (it is also printed).
    """
    import inspect
    from datasets import Dataset
    from sklearn.metrics import accuracy_score, precision_recall_fscore_support
    from transformers import AutoTokenizer, AutoModelForSequenceClassification, TrainingArguments, Trainer

    checkpoint = 'distilbert-base-uncased'
    # Build the label vocabulary from both splits so every test label has an id.
    labels = sorted(set(y_train.tolist()) | set(y_test.tolist()))
    label2id = {label: idx for idx, label in enumerate(labels)}
    id2label = {idx: label for label, idx in label2id.items()}

    tokenizer = AutoTokenizer.from_pretrained(checkpoint)

    def tokenize(batch):
        return tokenizer(batch['text'], truncation=True, padding=True, max_length=128)

    def build_dataset(texts, targets):
        # Wrap one split as a tokenized Dataset with integer labels.
        ds = Dataset.from_dict({'text': texts.tolist(), 'label': [label2id[t] for t in targets.tolist()]})
        return ds.map(tokenize, batched=True)

    train_ds = build_dataset(X_train, y_train)
    test_ds = build_dataset(X_test, y_test)
    # Storing the label names in the config makes predictions readable.
    model = AutoModelForSequenceClassification.from_pretrained(
        checkpoint, num_labels=len(labels), id2label=id2label, label2id=label2id)

    # Newer transformers releases renamed `evaluation_strategy` to
    # `eval_strategy`; use whichever name the installed version accepts.
    arg_names = inspect.signature(TrainingArguments.__init__).parameters
    strategy_key = 'eval_strategy' if 'eval_strategy' in arg_names else 'evaluation_strategy'
    training_args = TrainingArguments(
        output_dir='./results',
        per_device_train_batch_size=16,
        per_device_eval_batch_size=32,
        num_train_epochs=epochs,
        logging_steps=50,
        # Avoid experiment-tracker logins interrupting a notebook run.
        report_to='none',
        **{strategy_key: 'epoch'},
    )

    def compute_metrics(p):
        preds = p.predictions.argmax(-1)
        acc = accuracy_score(p.label_ids, preds)
        prf = precision_recall_fscore_support(p.label_ids, preds, average='weighted')
        return {'accuracy': acc, 'precision': prf[0], 'recall': prf[1], 'f1': prf[2]}

    # Same story for Trainer: `tokenizer` became `processing_class`.
    trainer_params = inspect.signature(Trainer.__init__).parameters
    tokenizer_key = 'processing_class' if 'processing_class' in trainer_params else 'tokenizer'
    trainer = Trainer(
        model=model,
        args=training_args,
        train_dataset=train_ds,
        eval_dataset=test_ds,
        compute_metrics=compute_metrics,
        **{tokenizer_key: tokenizer},
    )
    trainer.train()
    eval_res = trainer.evaluate()
    print(eval_res)
    return eval_res

def on_train_clicked2(b):
    """Button handler: split the data 70/30 and train the selected model.

    The split is stratified by label when possible. If scikit-learn rejects
    the stratified split (for example a class with a single sample), the
    handler reports why and falls back to a plain random split.

    Args:
        b: The clicked button (unused).
    """
    with out_train:
        clear_output()
        X, y = get_X_y()
        try:
            split = train_test_split(X, y, test_size=0.3, random_state=42, stratify=y)
        except ValueError as err:
            print('Stratified split not possible (' + str(err) + '); using a random split instead.')
            split = train_test_split(X, y, test_size=0.3, random_state=42)
        X_train, X_test, y_train, y_test = split
        choice = model_dropdown.value
        print('Training model:', choice)
        if choice == 'DistilBERT (transformer)':
            print('This will download transformer weights and may take time. Use small dataset for quick run.')
            train_transformer(X_train, X_test, y_train, y_test, epochs=2)
        else:
            model, vect = train_classic_model(choice, X_train, X_test, y_train, y_test)
            print('Done.')

train_btn2.on_click(on_train_clicked2)
display(model_dropdown, train_btn2, out_train)


---
**Final notes:** Transformer-based back-translation and paraphrasing download their models on first run — please be patient. If the dataset is large, run `df = df.sample(2000)` right after the upload cell to work on a 2,000-row random subset and test faster.