##**1. Install Required Libraries**

In [60]:
!pip install spacy
!pip install transformers
!pip install PyPDF2
!pip install pdfplumber
!pip install torch
!pip install nltk
!pip install transformers torch scikit-learn
!pip install emoji
!pip install transformers torch scikit-learn optuna

# Download SpaCy model
!python -m spacy download en_core_web_sm

# Download NLTK data
import nltk
nltk.download('opinion_lexicon')


Collecting optuna
  Downloading optuna-3.6.1-py3-none-any.whl (380 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m380.1/380.1 kB[0m [31m6.2 MB/s[0m eta [36m0:00:00[0m
Collecting alembic>=1.5.0 (from optuna)
  Downloading alembic-1.13.1-py3-none-any.whl (233 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m233.4/233.4 kB[0m [31m8.4 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting colorlog (from optuna)
  Downloading colorlog-6.8.2-py3-none-any.whl (11 kB)
Collecting Mako (from alembic>=1.5.0->optuna)
  Downloading Mako-1.3.5-py3-none-any.whl (78 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m78.6/78.6 kB[0m [31m7.6 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: Mako, colorlog, alembic, optuna
Successfully installed Mako-1.3.5 alembic-1.13.1 colorlog-6.8.2 optuna-3.6.1
Collecting en-core-web-sm==3.7.1
  Downloading https://github.com/explosion/spacy-models/releases/download/en_core_web_sm-3.7.1/en_core_web_

[nltk_data] Downloading package opinion_lexicon to /root/nltk_data...
[nltk_data]   Package opinion_lexicon is already up-to-date!


True

## **2. Import Required Libraries**

In [61]:
import os
import re
import pdfplumber
import PyPDF2
import spacy
import emoji
import nltk
import torch
import optuna
import pandas as pd
from collections import Counter
from nltk.corpus import opinion_lexicon, stopwords
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import word_tokenize
from nltk.sentiment import SentimentIntensityAnalyzer
from transformers import pipeline, BertTokenizer, BertForSequenceClassification, Trainer, TrainingArguments
from sklearn.model_selection import KFold, train_test_split
from sklearn.metrics import accuracy_score
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from flask import Flask, render_template, request, redirect, url_for
from werkzeug.utils import secure_filename

# Ensure NLTK data is downloaded
nltk.download('vader_lexicon')
nltk.download('opinion_lexicon')
nltk.download('stopwords')
nltk.download('punkt')
nltk.download('wordnet')


[nltk_data] Downloading package vader_lexicon to /root/nltk_data...
[nltk_data]   Package vader_lexicon is already up-to-date!
[nltk_data] Downloading package opinion_lexicon to /root/nltk_data...
[nltk_data]   Package opinion_lexicon is already up-to-date!
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

## **3. Data Cleaning**
### Preprocesses text for emotion detection by performing:
*  Lowercasing
*  Removing punctuation
*  Lemmatization (converting words to their base form)
*  Removing stop words (optional, may affect emotion detection)

### Args:
      * text (str): The text to be cleaned.
      * remove_stop_words (bool, optional): Whether to remove stop words. Defaults to True.

### Returns:
       str: The cleaned text.

In [33]:
def extract_text_from_pdf(pdf_path):
  text = ""
  # Check if the file exists (optional, but recommended for robustness)
  if not os.path.exists(pdf_path):
    raise FileNotFoundError(f"PDF file not found: {pdf_path}")

  with open(pdf_path, 'rb') as f:
    reader = PyPDF2.PdfReader(f)
    for page in range(len(reader.pages)):
      text += reader.pages[page].extract_text()  # Use reader.pages[page] instead of reader.getPage(page)
  return text

def clean_text(text, remove_stop_words=True):

  text = text.lower()
  text = re.sub(r'[^\w\s]', '', text)  # Remove punctuation

  # Download nltk resources if not already installed (prevent errors)
  nltk.download('punkt', quiet=True)
  nltk.download('wordnet', quiet=True)
  nltk.download('stopwords', quiet=True)

  # Lemmatization
  lemmatizer = WordNetLemmatizer()
  tokens = [lemmatizer.lemmatize(token) for token in text.split()]

  if remove_stop_words:
    # Remove stop words (optional)
    stop_words = set(stopwords.words('english'))
    tokens = [token for token in tokens if token not in stop_words]

  return " ".join(tokens)

# Example usage (replace with your actual PDF path)
pdf_path = '/content/drive/MyDrive/Dataset/test.pdf'

try:
  text = extract_text_from_pdf(pdf_path)
except FileNotFoundError as e:
  print("Error: ", e)
  exit()

cleaned_text = clean_text(text)

# Print the extracted and cleaned text
print("Extracted text:\n", text)
print("\nCleaned text:\n", cleaned_text)


Extracted text:
 The Park Bench   
The park was a haven of tranquillity  amidst the bustling city. Sunlight filtered through the 
leaves of the towering oak trees, casting dappled patterns on the grassy ground. A 
gentle breeze rustled the leaves, creating a soothing melody. On a nearby bench, an 
elderly woman sat reading a book , her face etched with a serene smile. Her wrinkled 
hands gently turned the pages, her eyes absorbing the words with quiet focus.  
A young girl skipped past, her laughter echoing through the park. Her bright yellow dress 
fluttered in the wind as she chased a playful butterfly. The woman looked up for a 
moment, a warm smile gracing her lips as she watched the girl's carefree spirit. A flock 
of birds chirped merrily, flitting from branch to branch. The air was filled with the sweet 
fragrance of blooming wildflowers.  
A sense of peace and tranquillity  settled over the scene. The worries and anxieties of the 
world seemed to fade away amidst the park's ser

### Preprocesses text for emotion detection by performing:
 - Lowercasing
 - Removing punctuation
 - Lemmatization (converting words to their base form)
 - Removing stop words (optional)
 - Handling emojis (optional) - convert to text description or remove
 - Handling informal language (optional) - replace slang, expand abbreviations

### Args:
     text (str): The text to be cleaned.
     remove_stop_words (bool, optional): Whether to remove stop words. Defaults to True.
     handle_emojis (bool, optional): Whether to handle emojis (convert or remove). Defaults to True.
     handle_informal (bool, optional): Whether to handle informal language (replace slang, expand abbreviations). Defaults to False.

### Returns:
     str: The cleaned text.

In [35]:
def extract_text_from_pdf(pdf_path):

  text = ""
  # Check if the file exists (optional, but recommended for robustness)
  if not os.path.exists(pdf_path):
    raise FileNotFoundError(f"PDF file not found: {pdf_path}")

  with open(pdf_path, 'rb') as f:
    reader = PyPDF2.PdfReader(f)
    for page in range(len(reader.pages)):
      text += reader.pages[page].extract_text()  # Use reader.pages[page] instead of reader.getPage(page)
  return text

def clean_text(text, remove_stop_words=True, handle_emojis=True, handle_informal=False):

  text = text.lower()
  text = re.sub(r'[^\w\s]', '', text)  # Remove punctuation

  # Download nltk resources if not already installed (prevent errors)
  nltk.download('punkt', quiet=True)
  nltk.download('wordnet', quiet=True)
  nltk.download('stopwords', quiet=True)

  # Lemmatization
  lemmatizer = WordNetLemmatizer()
  tokens = word_tokenize(text)  # Use word_tokenize for emoji detection

  # Handle emojis (optional)
  if handle_emojis:
    for i in range(len(tokens)):
      if emoji.demojize(tokens[i]) != tokens[i]:  # Check if token is an emoji
        # Replace emoji with text description (optional)
        # tokens[i] = emoji.demojize(tokens[i])  # Uncomment to replace with text description
        tokens[i] = "<EMOJI>"  # Replace with a placeholder

  # Lemmatize and handle stop words
  clean_tokens = []
  for token in tokens:
    clean_token = lemmatizer.lemmatize(token)
    if remove_stop_words and clean_token not in stopwords.words('english'):
      clean_tokens.append(clean_token)
    else:
      clean_tokens.append(clean_token)

  # Handle informal language (optional)
  if handle_informal:
    # Replace slang with formal equivalents (e.g., "gonna" -> "going to")
    # Expand abbreviations (e.g., "LOL" -> "Laughing Out Loud")
    # You can implement specific replacements here based on your data
    informal_to_formal = {
      "gonna": "going to",
      "lmao": "laughing my head off",
      # Add more replacements as needed
    }
    for i in range(len(clean_tokens)):
      if clean_tokens[i] in informal_to_formal:
        clean_tokens[i] = informal_to_formal[clean_tokens[i]]

  return " ".join(clean_tokens)

# Example usage (replace with your actual PDF path)



## **4. Feature Engineering**
1. ***Bag-of-Words (BoW):***
 This is a classic technique that represents text as a "bag" of words, ignoring the order of words but capturing their frequency.

In [36]:
from collections import Counter

def bag_of_words(text):
  """
  Extracts bag-of-words features from text.

  Args:
      text (str): The cleaned text.

  Returns:
      dict: A dictionary where keys are words and values are their frequencies.
  """
  words = text.split()
  return Counter(words)


***2. TF-IDF (Term Frequency-Inverse Document Frequency):*** This technique considers both the word frequency (TF) within a document and its rarity across documents (IDF). It assigns higher weights to words that are frequent within a document but less frequent overall.

In [37]:
from sklearn.feature_extraction.text import TfidfVectorizer

def tfidf_features(cleaned_texts):
  """
  Extracts TF-IDF features from a list of cleaned texts.

  Args:
      cleaned_texts (list): A list of cleaned text strings.

  Returns:
      scipy.sparse.csr_matrix: A sparse TF-IDF matrix.
  """
  vectorizer = TfidfVectorizer()
  tfidf_matrix = vectorizer.fit_transform(cleaned_texts)
  return tfidf_matrix


***3. N-Grams:*** This technique captures sequences of words (n-grams) like bigrams (2-word sequences) or trigrams (3-word sequences). These can be helpful for capturing phrases that might be indicative of emotions.

In [38]:
from nltk import ngrams

def ngram_features(text, n=2):
  """
  Extracts n-gram features from text.

  Args:
      text (str): The cleaned text.
      n (int, optional): The n-gram value (e.g., 2 for bigrams). Defaults to 2.

  Returns:
      list: A list of n-gram tuples.
  """
  tokens = text.split()
  return list(ngrams(tokens, n))


***4. Sentiment Lexicons:*** You can leverage existing sentiment lexicons that map words to their sentiment scores (positive, negative, neutral). This can provide additional features for emotion detection.

In [39]:
# Example using a sentiment lexicon (replace with your chosen lexicon)
def sentiment_lexicon_features(text, lexicon):
  """
  Extracts sentiment features from text using a lexicon.

  Args:
      text (str): The cleaned text.
      lexicon (dict): A dictionary mapping words to sentiment scores.

  Returns:
      tuple: A tuple containing positive and negative sentiment scores.
  """
  positive_score = 0
  negative_score = 0
  for word in text.split():
    if word in lexicon:
      score = lexicon[word]
      if score > 0:
        positive_score += score
      else:
        negative_score += score
  return positive_score, negative_score


***5. Text Length Features:*** The length of the text (number of words or characters) might be informative for some emotion detection tasks.

In [40]:
def text_length_features(text):
  """
  Extracts text length features.

  Args:
      text (str): The cleaned text.

  Returns:
      tuple: A tuple containing number of words and characters.
  """
  num_words = len(text.split())
  num_chars = len(text)
  return num_words, num_chars


## **Data Analysis**
Analyzes sentiment of cleaned text using VADER and predicts emotion as Positive, Negative, or Neutral.

In [42]:
# Create a SentimentIntensityAnalyzer object
sia = SentimentIntensityAnalyzer()

# Analyze the sentiment of the cleaned text
sentiment_scores = sia.polarity_scores(cleaned_text)

# Determine the emotion based on the sentiment scores
if sentiment_scores['compound'] >= 0.5:
    predicted_emotion = 'Positive'
elif sentiment_scores['compound'] <= -0.5:
    predicted_emotion = 'Negative'
else:
    predicted_emotion = 'Neutral'

print("Predicted Emotion:", predicted_emotion)

Predicted Emotion: Positive


## **5. Model Selection**
###  Logistic Regression Model:
* Trains a Logistic Regression model on the training embeddings and labels.
* Evaluates the model on the validation set using accuracy, precision, recall, and F1 score.

### Output:

* Prints the evaluation metrics for the Logistic Regression model.

In [57]:
# Define the text and labels (larger dataset)
texts = [
    "The park bench was a place of tranquillity amidst the bustling city.",
    "Suddenly, the sky darkened, and an ominous cloud gathered overhead.",
    "She loved the way the sunlight filtered through the trees.",
    "A gentle breeze rustled the leaves, creating a soothing melody.",
    "The bustling city life was far from her mind as she sat on the bench.",
    "Chaos erupted as the storm hit, with thunder and lightning.",
    "The peaceful park was now a scene of chaos.",
    "Birds chirped happily in the calm, sunny park.",
    "The once serene sky was now a foreboding grey.",
    "The sound of children playing added to the park's liveliness."
]
labels = [0, 1, 0, 0, 0, 1, 1, 0, 1, 0]  # Corresponding labels

# Ensure texts and labels have consistent lengths
assert len(texts) == len(labels), "The number of texts and labels must be the same"

# Load pre-trained BERT tokenizer and model
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
model = BertModel.from_pretrained('bert-base-uncased')

# Tokenize the texts
encodings = tokenizer(texts, truncation=True, padding=True, max_length=512, return_tensors='pt')

# Get BERT embeddings
with torch.no_grad():
    outputs = model(**encodings)
    embeddings = outputs.last_hidden_state.mean(dim=1).numpy()

# Split data into training and validation sets without stratification
train_embeddings, val_embeddings, train_labels, val_labels = train_test_split(
    embeddings, labels, test_size=0.1, random_state=42)

# Logistic Regression
log_reg = LogisticRegression()
log_reg.fit(train_embeddings, train_labels)
log_reg_preds = log_reg.predict(val_embeddings)

log_reg_acc = accuracy_score(val_labels, log_reg_preds)
log_reg_precision, log_reg_recall, log_reg_f1, _ = precision_recall_fscore_support(val_labels, log_reg_preds, average='binary')

print(f"Logistic Regression - Accuracy: {log_reg_acc}, Precision: {log_reg_precision}, Recall: {log_reg_recall}, F1 Score: {log_reg_f1}")

Logistic Regression - Accuracy: 1.0, Precision: 1.0, Recall: 1.0, F1 Score: 1.0


### **Hyperparameter Tuning with Optuna:**

* We define an objective function for Optuna to optimize the hyperparameters of the Logistic Regression model.
* The objective function tries different values of C (regularization strength) and solver (optimization algorithm).
* We run 50 trials to find the best hyperparameters.


In [62]:
def objective(trial):
    C = trial.suggest_float('C', 1e-5, 1e2, log=True)
    solver = trial.suggest_categorical('solver', ['newton-cg', 'lbfgs', 'liblinear', 'sag', 'saga'])

    model = LogisticRegression(C=C, solver=solver, max_iter=1000)
    model.fit(train_embeddings, train_labels)
    preds = model.predict(val_embeddings)

    return accuracy_score(val_labels, preds)

study = optuna.create_study(direction='maximize')
study.optimize(objective, n_trials=50)

best_params = study.best_params
print(f"Best hyperparameters: {best_params}")

[I 2024-06-23 20:49:06,165] A new study created in memory with name: no-name-c18801c8-1a78-4cae-9aa9-438e5a2295b1
[I 2024-06-23 20:49:06,248] Trial 0 finished with value: 0.0 and parameters: {'C': 0.0012061147118259553, 'solver': 'sag'}. Best is trial 0 with value: 0.0.
[I 2024-06-23 20:49:06,282] Trial 1 finished with value: 1.0 and parameters: {'C': 0.6404103867149268, 'solver': 'newton-cg'}. Best is trial 1 with value: 1.0.
[I 2024-06-23 20:49:06,304] Trial 2 finished with value: 0.0 and parameters: {'C': 0.0032858452210571673, 'solver': 'lbfgs'}. Best is trial 1 with value: 1.0.
[I 2024-06-23 20:49:06,328] Trial 3 finished with value: 0.0 and parameters: {'C': 0.04632762043744213, 'solver': 'newton-cg'}. Best is trial 1 with value: 1.0.
[I 2024-06-23 20:49:06,352] Trial 4 finished with value: 0.0 and parameters: {'C': 0.09112334754777286, 'solver': 'lbfgs'}. Best is trial 1 with value: 1.0.
[I 2024-06-23 20:49:06,369] Trial 5 finished with value: 0.0 and parameters: {'C': 8.3271923

Best hyperparameters: {'C': 0.6404103867149268, 'solver': 'newton-cg'}


In [63]:
final_model = LogisticRegression(**best_params, max_iter=1000)
final_model.fit(train_embeddings, train_labels)
final_preds = final_model.predict(val_embeddings)

final_acc = accuracy_score(val_labels, final_preds)
final_precision, final_recall, final_f1, _ = precision_recall_fscore_support(val_labels, final_preds, average='binary')

print(f"Final Logistic Regression - Accuracy: {final_acc}, Precision: {final_precision}, Recall: {final_recall}, F1 Score: {final_f1}")

Final Logistic Regression - Accuracy: 1.0, Precision: 1.0, Recall: 1.0, F1 Score: 1.0


### **Cross-Validation:**

* We perform 5-fold cross-validation using KFold.
* For each fold, we split the data into training and testing sets, train a Logistic Regression model, and compute performance metrics.
* We average the metrics across all folds to get a final evaluation.

In [64]:
kf = KFold(n_splits=5, shuffle=True, random_state=42)
cv_acc, cv_precision, cv_recall, cv_f1 = [], [], [], []

for train_index, test_index in kf.split(embeddings):
    X_train, X_test = embeddings[train_index], embeddings[test_index]
    y_train, y_test = [labels[i] for i in train_index], [labels[i] for i in test_index]

    model = LogisticRegression(**best_params, max_iter=1000)
    model.fit(X_train, y_train)
    preds = model.predict(X_test)

    acc = accuracy_score(y_test, preds)
    precision, recall, f1, _ = precision_recall_fscore_support(y_test, preds, average='binary')

    cv_acc.append(acc)
    cv_precision.append(precision)
    cv_recall.append(recall)
    cv_f1.append(f1)
    print(f"Cross-Validation - Accuracy: {sum(cv_acc)/len(cv_acc)}, Precision: {sum(cv_precision)/len(cv_precision)}, Recall: {sum(cv_recall)/len(cv_recall)}, F1 Score: {sum(cv_f1)/len(cv_f1)}")

Cross-Validation - Accuracy: 0.5, Precision: 1.0, Recall: 0.5, F1 Score: 0.6666666666666666
Cross-Validation - Accuracy: 0.75, Precision: 1.0, Recall: 0.75, F1 Score: 0.8333333333333333
Cross-Validation - Accuracy: 0.8333333333333334, Precision: 0.6666666666666666, Recall: 0.5, F1 Score: 0.5555555555555555
Cross-Validation - Accuracy: 0.875, Precision: 0.5, Recall: 0.375, F1 Score: 0.41666666666666663
Cross-Validation - Accuracy: 0.9, Precision: 0.6, Recall: 0.5, F1 Score: 0.5333333333333333


  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


In [65]:
import optuna
from sklearn.model_selection import KFold
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, precision_recall_fscore_support

def objective(trial):
    C = trial.suggest_float('C', 1e-5, 1e2, log=True)
    solver = trial.suggest_categorical('solver', ['newton-cg', 'lbfgs', 'liblinear', 'sag', 'saga'])

    model = LogisticRegression(C=C, solver=solver, max_iter=1000)
    model.fit(train_embeddings, train_labels)
    preds = model.predict(val_embeddings)

    return accuracy_score(val_labels, preds)

study = optuna.create_study(direction='maximize')
study.optimize(objective, n_trials=50)

best_params = study.best_params
print(f"Best hyperparameters: {best_params}")

# Check if the output is best
if study.best_trial.value == 1.0:
    final_model = LogisticRegression(**best_params, max_iter=1000)
    final_model.fit(train_embeddings, train_labels)
    final_preds = final_model.predict(val_embeddings)

    final_acc = accuracy_score(val_labels, final_preds)
    final_precision, final_recall, final_f1, _ = precision_recall_fscore_support(val_labels, final_preds, average='binary')

    print(f"Final Logistic Regression - Accuracy: {final_acc}, Precision: {final_precision}, Recall: {final_recall}, F1 Score: {final_f1}")

    kf = KFold(n_splits=5, shuffle=True, random_state=42)
    cv_acc, cv_precision, cv_recall, cv_f1 = [], [], [], []

    for train_index, test_index in kf.split(embeddings):
        X_train, X_test = embeddings[train_index], embeddings[test_index]
        y_train, y_test = [labels[i] for i in train_index], [labels[i] for i in test_index]

        model = LogisticRegression(**best_params, max_iter=1000)
        model.fit(X_train, y_train)
        preds = model.predict(X_test)

        acc = accuracy_score(y_test, preds)
        precision, recall, f1, _ = precision_recall_fscore_support(y_test, preds, average='binary')

        cv_acc.append(acc)
        cv_precision.append(precision)
        cv_recall.append(recall)
        cv_f1.append(f1)

    print(f"Cross-Validation - Accuracy: {sum(cv_acc)/len(cv_acc)}, Precision: {sum(cv_precision)/len(cv_precision)}, Recall: {sum(cv_recall)/len(cv_recall)}, F1 Score: {sum(cv_f1)/len(cv_f1)}")
else:
    print("The best hyperparameters did not yield the highest accuracy. Further investigation may be needed.")


[I 2024-06-23 20:53:09,052] A new study created in memory with name: no-name-e22e5147-d985-4de6-b83b-55eb7caf5106
[I 2024-06-23 20:53:09,078] Trial 0 finished with value: 0.0 and parameters: {'C': 0.008179448425842796, 'solver': 'liblinear'}. Best is trial 0 with value: 0.0.
[I 2024-06-23 20:53:09,107] Trial 1 finished with value: 0.0 and parameters: {'C': 0.0014748493239140046, 'solver': 'newton-cg'}. Best is trial 0 with value: 0.0.
[I 2024-06-23 20:53:09,138] Trial 2 finished with value: 0.0 and parameters: {'C': 0.00011728839403621752, 'solver': 'lbfgs'}. Best is trial 0 with value: 0.0.
[I 2024-06-23 20:53:09,159] Trial 3 finished with value: 0.0 and parameters: {'C': 0.012672755416855608, 'solver': 'lbfgs'}. Best is trial 0 with value: 0.0.
[I 2024-06-23 20:53:09,197] Trial 4 finished with value: 1.0 and parameters: {'C': 2.632739882493432, 'solver': 'newton-cg'}. Best is trial 4 with value: 1.0.
[I 2024-06-23 20:53:09,233] Trial 5 finished with value: 1.0 and parameters: {'C': 7

Best hyperparameters: {'C': 2.632739882493432, 'solver': 'newton-cg'}
Final Logistic Regression - Accuracy: 1.0, Precision: 1.0, Recall: 1.0, F1 Score: 1.0
Cross-Validation - Accuracy: 1.0, Precision: 0.6, Recall: 0.6, F1 Score: 0.6


  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
