#Preparing data

In [13]:
pip install datasets



In [14]:
import pandas as pd
import requests
from io import StringIO
from datasets import load_dataset

In [15]:
def download_csv(url, sep=",", header=0):
    """
    Downloads a CSV (or TSV) from a given URL and returns a pandas DataFrame.
    """
    response = requests.get(url)
    response.raise_for_status()
    csv_data = StringIO(response.text)
    return pd.read_csv(csv_data, sep=sep, header=header)


In [16]:
# @inproceedings{chen-etal-2020-low-resource,
#     title={Low-Resource Domain Adaptation for Compositional Task-Oriented
#         Semantic Parsing},
#     author={Xilun Chen and Asish Ghoshal and Yashar Mehdad and Luke Zettlemoyer
#         and Sonal Gupta},
#     booktitle={Proceedings of the 2020 Conference on Empirical Methods in
#         Natural Language Processing (EMNLP)},
#     year={2020},
#     publisher = "Association for Computational Linguistics"
# }
def load_topv2(url):
    """
    'напомняне' (reminder); 'събитие' (event)
    """
    splits = {
        'train': 'data/train-00000-of-00001-4f5cf905029cbf9d.parquet',
        'test': 'data/test-00000-of-00001-deac2888ce8ad39d.parquet',
        'eval': 'data/eval-00000-of-00001-3ffa52405fac46ab.parquet'
    }
    full_path = 'hf://datasets/WillHeld/top_v2/' + splits['train']
    df = pd.read_parquet(full_path)

    def map_domain(domain):
        domain = str(domain).strip().lower()
        if domain == "reminder":
            return "напомняне"
        elif domain == "event":
            return "събитие"
        else:
            return None

    df['label'] = df['domain'].apply(map_domain)
    df = df.dropna(subset=['label'])
    df['text'] = df['utterance']
    return df[['text', 'label']]

In [17]:
# @article{jauhar2021mslatte,
#       title={MS-LaTTE: A Dataset of Where and When To-do Tasks are Completed},
#       author={Jauhar, Sujay Kumar and Chandrasekaran, Nirupama and Gamon, Michael and White, Ryen W.},
#       journal={arXiv preprint 2111.06902},
#       year={2021}
# }
def load_mslatte(json):
    """
    task to "задача".
    """
    df = pd.read_json(json)
    df['label'] = "задача"
    df = df.rename(columns={'TaskTitle': 'text'})
    return df[['text', 'label']]


In [18]:
def load_event_detection(csv):
    """
    news -> събитие.
    """
    df = pd.read_csv(csv)
    df['label'] = "събитие"
    df = df.rename(columns={'Event Name': 'text'})
    return df[['text', 'label']]

In [19]:
# @inproceedings{schler2006effects,
#     title={Effects of age and gender on blogging.},
#     author={Schler, Jonathan and Koppel, Moshe and Argamon, Shlomo and Pennebaker, James W},
#     booktitle={AAAI spring symposium: Computational approaches to analyzing weblogs},
#     volume={6},
#     pages={199--205},
#     year={2006}
# }
def load_notes(url):
    """
    note_text -> бележка.
    """
    dataset = load_dataset("barilan/blog_authorship_corpus")
    df = dataset["train"].to_pandas()
    df['label'] = "бележка"
    df = df.rename(columns={'content': 'text'})
    return df[['text', 'label']]

In [20]:
topv2_url = "https://huggingface.co/datasets/WillHeld/top_v2/resolve/main/reminder_split.tsv"
mslatte_url = '/content/drive/MyDrive/Colab Notebooks/data/MS-LaTTE.json'
event_detection_url = '/content/drive/MyDrive/Colab Notebooks/data/whats-happening-la-calendar-dataset.csv'
notes_url = "https://huggingface.co/datasets/barilan/blog_authorship_corpus/resolve/main/blog_authorship_corpus.csv"

df_topv2 = load_topv2(topv2_url)
df_mslatte = load_mslatte(mslatte_url)
df_event = load_event_detection(event_detection_url)
df_notes = load_notes(notes_url)

df = pd.concat([df_topv2, df_mslatte, df_event, df_notes], ignore_index=True)
df = df.sample(frac=1, random_state=42).reset_index(drop=True)

print(df.head())

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['text'] = df['utterance']


                                                text    label
0  That is kind of important. I've thought of tim...  бележка
1  The Japanese beetle seems to be the topic of c...  бележка
2     Bilingual Storytime/Hora de cuentos bilingï¿½e  събитие
3  I do need to fix my schedule, this whole "free...  бележка
4  Extreme timing how it all works out.  My mum a...  бележка


In [21]:
df.drop_duplicates(inplace=True)
df.dropna(subset=['text'], inplace=True)

In [22]:
print(df['label'].value_counts())

label
бележка      616062
събитие       22289
напомняне     17285
задача         9997
Name: count, dtype: int64


In [23]:
min_count = df['label'].value_counts().min()

df = df.groupby('label', group_keys=False).apply(lambda x: x.sample(n=min_count, random_state=42))

df = df.reset_index(drop=True)

print(df['label'].value_counts())

label
бележка      9997
задача       9997
напомняне    9997
събитие      9997
Name: count, dtype: int64


  df = df.groupby('label', group_keys=False).apply(lambda x: x.sample(n=min_count, random_state=42))


#Prepare data for pytorch


In [24]:
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import TensorDataset, DataLoader
import torch.nn.functional as F
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split

vectorizer = TfidfVectorizer(max_features=10000)
X_tfidf = vectorizer.fit_transform(df['text']).toarray()

df['label_encoded'] = pd.Categorical(df['label']).codes
y = df['label_encoded'].values
label_mapping = dict(enumerate(pd.Categorical(df['label']).categories))

X_train_val, X_test, y_train_val, y_test = train_test_split(
    X_tfidf, y, test_size=0.15, random_state=42, stratify=y
)
X_train, X_val, y_train, y_val = train_test_split(
    X_train_val, y_train_val, test_size=0.15, random_state=42, stratify=y_train_val
)

X_train_tensor = torch.tensor(X_train, dtype=torch.float32)
y_train_tensor = torch.tensor(y_train, dtype=torch.long)
X_val_tensor = torch.tensor(X_val, dtype=torch.float32)
y_val_tensor = torch.tensor(y_val, dtype=torch.long)
X_test_tensor = torch.tensor(X_test, dtype=torch.float32)
y_test_tensor = torch.tensor(y_test, dtype=torch.long)

train_dataset = TensorDataset(X_train_tensor, y_train_tensor)
train_loader = DataLoader(train_dataset, batch_size=64, shuffle=True)

#Classifier

In [25]:
class MultinomialLogisticRegression(nn.Module):
    def __init__(self, input_dim, num_classes):
        super(MultinomialLogisticRegression, self).__init__()
        self.linear = nn.Linear(input_dim, num_classes)

    def forward(self, x):
        logits = self.linear(x)
        return logits

In [26]:
import numpy as np

input_dim = X_train_tensor.shape[1]
num_classes = len(np.unique(y))
model = MultinomialLogisticRegression(input_dim, num_classes)
criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(model.parameters(), lr=0.001)

In [27]:
def train_model(model, loader, val_X, val_y, criterion, optimizer, n_epochs=50, patience=5):
    best_val_loss = float('inf')
    epochs_no_improve = 0
    best_model_state = None


    model.train()
    for epoch in range(n_epochs):
        total_loss = 0
        for batch_X, batch_y in loader:
            optimizer.zero_grad()
            outputs = model(batch_X)
            loss = criterion(outputs, batch_y)
            loss.backward()
            optimizer.step()
            total_loss += loss.item() * batch_X.size(0)
        train_loss = total_loss / len(loader.dataset)

        model.eval()
        with torch.no_grad():
            val_logits = model(val_X)
            val_loss = criterion(val_logits, val_y).item()

        print(f"Epoch {epoch+1}/{n_epochs}, Train Loss: {train_loss:.4f}, Val Loss: {val_loss:.4f}")

        if val_loss < best_val_loss:
          best_val_loss = val_loss
          best_mode_state = model.state_dict()
          epochs_no_improve = 0
        else:
          epochs_no_improve += 1

          if epochs_no_improve >= patience:
            print(f"Early stopping at epoch {epoch+1}")
            break

    if best_model_state is not None:
        model.load_state_dict(best_model_state)
    return model


In [28]:
train_model(model, train_loader, X_val_tensor, y_val_tensor, criterion, optimizer, n_epochs=30)

Epoch 1/30, Train Loss: 1.0905, Val Loss: 0.8686
Epoch 2/30, Train Loss: 0.7379, Val Loss: 0.6279
Epoch 3/30, Train Loss: 0.5509, Val Loss: 0.4881
Epoch 4/30, Train Loss: 0.4359, Val Loss: 0.3984
Epoch 5/30, Train Loss: 0.3590, Val Loss: 0.3364
Epoch 6/30, Train Loss: 0.3042, Val Loss: 0.2912
Epoch 7/30, Train Loss: 0.2632, Val Loss: 0.2570
Epoch 8/30, Train Loss: 0.2316, Val Loss: 0.2303
Epoch 9/30, Train Loss: 0.2064, Val Loss: 0.2090
Epoch 10/30, Train Loss: 0.1859, Val Loss: 0.1915
Epoch 11/30, Train Loss: 0.1690, Val Loss: 0.1772
Epoch 12/30, Train Loss: 0.1547, Val Loss: 0.1651
Epoch 13/30, Train Loss: 0.1426, Val Loss: 0.1549
Epoch 14/30, Train Loss: 0.1322, Val Loss: 0.1461
Epoch 15/30, Train Loss: 0.1232, Val Loss: 0.1387
Epoch 16/30, Train Loss: 0.1153, Val Loss: 0.1321
Epoch 17/30, Train Loss: 0.1084, Val Loss: 0.1264
Epoch 18/30, Train Loss: 0.1023, Val Loss: 0.1215
Epoch 19/30, Train Loss: 0.0968, Val Loss: 0.1171
Epoch 20/30, Train Loss: 0.0919, Val Loss: 0.1132
Epoch 21/

MultinomialLogisticRegression(
  (linear): Linear(in_features=10000, out_features=4, bias=True)
)

In [45]:
model.eval()
with torch.no_grad():
    test_logits = model(X_test_tensor)
    predicted_classes = torch.argmax(test_logits, dim=1)
    accuracy = (predicted_classes == y_test_tensor).float().mean().item()
    print("Test accuracy:", accuracy)

Test accuracy: 0.9764961004257202


torch.Size([5999])

In [47]:
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix

y_test_np = y_test_tensor.cpu().numpy()
predicted_classes_np = predicted_classes.cpu().numpy()

accuracy = accuracy_score(y_test_np, predicted_classes_np)
print("Accuracy:", accuracy)

report = classification_report(y_test_np, predicted_classes_np)
print("\nClassification Report:\n", report)

conf_matrix = confusion_matrix(y_test_np, predicted_classes_np)
print("\nConfusion Matrix:\n", conf_matrix)

Accuracy: 0.9764960826804467

Classification Report:
               precision    recall  f1-score   support

           0       0.99      0.97      0.98      1500
           1       0.95      0.98      0.96      1499
           2       0.99      1.00      1.00      1500
           3       0.98      0.96      0.97      1500

    accuracy                           0.98      5999
   macro avg       0.98      0.98      0.98      5999
weighted avg       0.98      0.98      0.98      5999


Confusion Matrix:
 [[1453   34    3   10]
 [   3 1472    2   22]
 [   0    3 1495    2]
 [  12   47    3 1438]]


In [30]:
new_texts = [
    "Today was a great day, the birds were chirping",
    "Buy groceries",
    "remind me to wish Sam a happy birthday this Thursday",
    "Meeting with the team next week",
    "I need to finish my homework tomorrow"
]

x_new = vectorizer.transform(new_texts).toarray()
x_new_tensor = torch.tensor(x_new, dtype=torch.float32)

with torch.no_grad():
  logits = model(x_new_tensor)
  probabilities = F.softmax(logits, dim=1)
  predicted_classes = torch.argmax(logits, dim=1)

for text, predicted_class, prob in zip(new_texts, predicted_classes, probabilities):
    confidence = prob[predicted_class.item()].item()
    print(f"Text: {text}")
    print(f"Predicted Class: {label_mapping[predicted_class.item()]}")
    print(f"Confidence: {confidence}")
    print("\n")

Text: Today was a great day, the birds were chirping
Predicted Class: бележка
Confidence: 0.9337354898452759


Text: Buy groceries
Predicted Class: задача
Confidence: 0.9831216335296631


Text: remind me to wish Sam a happy birthday this Thursday
Predicted Class: напомняне
Confidence: 0.986918032169342


Text: Meeting with the team next week
Predicted Class: събитие
Confidence: 0.42145293951034546


Text: I need to finish my homework tomorrow
Predicted Class: напомняне
Confidence: 0.8711926937103271




#Translator


In [31]:
pip install openai-whisper



In [32]:
import whisper

audio_model = whisper.load_model("base")

  checkpoint = torch.load(fp, map_location=device)


In [33]:
def transcribe_and_classify(audio_path, whisper_model, vectorizer, model, language="en"):
    result = whisper_model.transcribe(audio_path, language=language)
    transcribed_text = result["text"]
    print("Transcribed text:")
    print(transcribed_text)

    X_new = vectorizer.transform([transcribed_text]).toarray()
    X_new_tensor = torch.tensor(X_new, dtype=torch.float32)

    model.eval()
    with torch.no_grad():
        logits = model(X_new_tensor)
        predicted_class = torch.argmax(logits, dim=1).item()

    predicted_label = label_mapping[predicted_class]
    print("Classification:", predicted_label)
    return transcribed_text, predicted_label

In [34]:
audio_file = '/content/drive/MyDrive/Colab Notebooks/data/recording.m4a'
transcribe_and_classify(audio_file, audio_model, vectorizer, model)

Transcribed text:
 Mind me to clean the dishes tomorrow.
Classification: напомняне


(' Mind me to clean the dishes tomorrow.', 'напомняне')

In [35]:
audio_file = '/content/drive/MyDrive/Colab Notebooks/data/AIRecording2.m4a'
transcribe_and_classify(audio_file, audio_model, vectorizer, model)

Transcribed text:
 Hangout with friends this Saturday.
Classification: събитие


(' Hangout with friends this Saturday.', 'събитие')

In [36]:
audio_file = '/content/drive/MyDrive/Colab Notebooks/data/AIRecording3.m4a'
transcribe_and_classify(audio_file, audio_model, vectorizer, model)

Transcribed text:
 Today I made with friends and we had a great time. It was a pleasant experience. I would do it again.
Classification: бележка


(' Today I made with friends and we had a great time. It was a pleasant experience. I would do it again.',
 'бележка')

In [37]:
import joblib
import torch

# --- Save the TF-IDF vectorizer and label mapping using joblib ---
joblib.dump(vectorizer, "vectorizer.joblib")
joblib.dump(label_mapping, "label_mapping.joblib")
print("Vectorizer and label mapping saved.")

# --- Save the PyTorch model's state dictionary ---
torch.save(model.state_dict(), "classifier_model.pt")
print("Classifier model saved.")

Vectorizer and label mapping saved.
Classifier model saved.


#Server