In [2]:
import pandas as pd
import torch
from transformers import AutoModelForSequenceClassification, AutoTokenizer
import spacy
import numpy as np

  from .autonotebook import tqdm as notebook_tqdm


In [3]:
# Load spaCy model for sentence segmentation
nlp = spacy.load("en_core_web_sm")

# Load Emotion Analysis Model
emotion_model_name = "j-hartmann/emotion-english-distilroberta-base"
emotion_tokenizer = AutoTokenizer.from_pretrained(emotion_model_name)
emotion_model = AutoModelForSequenceClassification.from_pretrained(emotion_model_name)
emotion_model.eval()

# Load Sentiment Analysis Model
sentiment_model_name = "siebert/sentiment-roberta-large-english"
sentiment_tokenizer = AutoTokenizer.from_pretrained(sentiment_model_name)
sentiment_model = AutoModelForSequenceClassification.from_pretrained(sentiment_model_name)
sentiment_model.eval()

RobertaForSequenceClassification(
  (roberta): RobertaModel(
    (embeddings): RobertaEmbeddings(
      (word_embeddings): Embedding(50265, 1024, padding_idx=1)
      (position_embeddings): Embedding(514, 1024, padding_idx=1)
      (token_type_embeddings): Embedding(1, 1024)
      (LayerNorm): LayerNorm((1024,), eps=1e-05, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): RobertaEncoder(
      (layer): ModuleList(
        (0-23): 24 x RobertaLayer(
          (attention): RobertaAttention(
            (self): RobertaSdpaSelfAttention(
              (query): Linear(in_features=1024, out_features=1024, bias=True)
              (key): Linear(in_features=1024, out_features=1024, bias=True)
              (value): Linear(in_features=1024, out_features=1024, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): RobertaSelfOutput(
              (dense): Linear(in_features=1024, out_features=1024, bias=Tru

In [23]:
# Define labels
EMOTION_LABELS = ["anger", "disgust", "fear", "joy", "sadness", "surprise", "neutral"]
SENTIMENT_LABELS = ["Negative", "Positive"]

In [24]:
def chunk_text(text):
    """Split text into meaningful sentence chunks."""
    doc = nlp(text)
    return [sent.text.strip() for sent in doc.sents]

In [25]:
def get_emotion_scores(text):
    """Get emotion distribution for a given text chunk."""
    if not text.strip():
        return {label: 0.0 for label in EMOTION_LABELS}
    
    inputs = emotion_tokenizer(text, return_tensors="pt", truncation=True, padding=True, max_length=512)
    with torch.no_grad():
        outputs = emotion_model(**inputs)
    probs = torch.nn.functional.softmax(outputs.logits, dim=-1).squeeze().tolist()
    
    return {label: probs[i] for i, label in enumerate(EMOTION_LABELS)}

In [20]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
sentiment_model.to(device)

RobertaForSequenceClassification(
  (roberta): RobertaModel(
    (embeddings): RobertaEmbeddings(
      (word_embeddings): Embedding(50265, 1024, padding_idx=1)
      (position_embeddings): Embedding(514, 1024, padding_idx=1)
      (token_type_embeddings): Embedding(1, 1024)
      (LayerNorm): LayerNorm((1024,), eps=1e-05, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): RobertaEncoder(
      (layer): ModuleList(
        (0-23): 24 x RobertaLayer(
          (attention): RobertaAttention(
            (self): RobertaSdpaSelfAttention(
              (query): Linear(in_features=1024, out_features=1024, bias=True)
              (key): Linear(in_features=1024, out_features=1024, bias=True)
              (value): Linear(in_features=1024, out_features=1024, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): RobertaSelfOutput(
              (dense): Linear(in_features=1024, out_features=1024, bias=Tru

In [26]:
def get_sentiment_scores(text):
    if not isinstance(text, str) or text.strip() == "":
        return {label: 0.0 for label in SENTIMENT_LABELS}

    inputs = sentiment_tokenizer(text, return_tensors="pt", truncation=True, padding=True)
    with torch.no_grad():
        outputs = sentiment_model(**inputs)

    probs = torch.nn.functional.softmax(outputs.logits, dim=-1).squeeze().tolist()
    return {label: probs[i] for i, label in enumerate(SENTIMENT_LABELS)}

In [30]:
def process_diary_entry(entry):
    """Process a diary entry: chunk text, analyze sentiment & emotion."""
    chunks = chunk_text(entry)
    processed_data = []
    
    for chunk in chunks:
        emotion_scores = get_emotion_scores(chunk)
        sentiment_scores = get_sentiment_scores(chunk)
        
        processed_data.append({
            "text": chunk,
            **emotion_scores,
            **sentiment_scores
        })
    
    return processed_data

In [31]:
def process_csv(file_path, output_path):
    """Process diary CSV and save structured data."""
    df = pd.read_csv(file_path)
    processed_entries = []
    
    for _, row in df.iterrows():
        date = row["Date"]
        entry = row["Entry"]
        processed_chunks = process_diary_entry(entry)
        
        for chunk_data in processed_chunks:
            chunk_data["date"] = date
            processed_entries.append(chunk_data)
    
    output_df = pd.DataFrame(processed_entries)
    output_df.to_csv(output_path, index=False)
    return output_df

In [32]:
# Example Usage
input_csv = "/Users/pandhari/ai-diary-project/Data/diary_dataset.csv"
output_csv = "diary_with_emotions_and_sentiments.csv"
result_df = process_csv(input_csv, output_csv)

# Display sample output
print(result_df.head(0))

Empty DataFrame
Columns: [text, anger, disgust, fear, joy, sadness, surprise, neutral, Negative, Positive, date]
Index: []


In [33]:
result_df.head(5)

Unnamed: 0,text,anger,disgust,fear,joy,sadness,surprise,neutral,Negative,Positive,date
0,The day started like any other.,0.025992,0.017581,0.798574,0.017926,0.079598,0.008216,0.052114,0.002337,0.997663,2024-04-01 00:00:00
1,"Woke up, had my usual coffee, and went about m...",0.011332,0.033233,0.008014,0.133408,0.771033,0.02592,0.01706,0.001774,0.998226,2024-04-01 00:00:00
2,Work was neither too stressful nor too excitin...,0.012116,0.005369,0.018797,0.485094,0.152819,0.223532,0.102273,0.003375,0.996625,2024-04-01 00:00:00
3,"Even the evening felt routine, with dinner, a ...",0.00488,0.037629,0.002696,0.143821,0.780782,0.025199,0.004993,0.998072,0.001928,2024-04-01 00:00:00
4,Met an old friend unexpectedly at a cafe.,0.008115,0.005831,0.079877,0.496256,0.006157,0.006609,0.397155,0.001199,0.998801,2024-04-02 00:00:00
