## Sentiment Analytics - Common Approaches

### 1. Dictionary-Based Approach (Lexicon-Based)
NLTK (Natural Language Toolkit): Utilize NLTK's VADER (Valence Aware Dictionary and sEntiment Reasoner) tool, a pre-built rule- and lexicon-based sentiment analyzer that is tuned for social media text and also works well for general-purpose sentiment analysis.
TextBlob: Leverage TextBlob's `sentiment` property, which provides a simple interface for lexicon-based sentiment analysis and returns a polarity and a subjectivity score.

In [None]:
from nltk.sentiment import SentimentIntensityAnalyzer
from textblob import TextBlob

# NLTK VADER
# NOTE: VADER needs its lexicon resource to be available locally
# (typically fetched once with nltk.download("vader_lexicon")).
sid = SentimentIntensityAnalyzer()
vader_sentiment = sid.polarity_scores(article_text)

# TextBlob
blob = TextBlob(article_text)
textblob_sentiment = blob.sentiment

# Both results are kept under their own names so the two approaches can be
# compared. `sentiment` used to be assigned twice, which silently discarded
# the VADER scores; it still ends up bound to the TextBlob result, exactly as
# before, so existing references keep working.
sentiment = textblob_sentiment

### 2. Leverage Pre-Trained LLMs

https://medium.com/geekculture/how-to-label-text-data-using-llms-f0ffc3fcd168

- OpenAI:

In [None]:
import openai

# Get the OpenAI API key by signing up on OpenAI.
# The key is read from the OPENAI_API_KEY environment variable so the secret
# never has to be pasted into (and committed with) the notebook. When the
# variable is unset this falls back to the empty string.
import os

openai.api_key = os.environ.get("OPENAI_API_KEY", "")

# Zero-Shot Approach
# Directly asking the model to label the text.
def generate_text_labels(text, categories):
    """Label each sentence with one of ``categories`` via a zero-shot prompt.

    One chat-completion request is sent per sentence, asking the model to
    classify it into one of the given categories in a single word.

    Args:
        text: Iterable of sentences to classify. Pass a list rather than a
            bare string -- a string would be classified character by
            character.
        categories: Iterable of candidate labels; each one is converted with
            ``str`` and joined into the prompt.

    Returns:
        A ``(labels, text_label_mapping)`` tuple. ``labels`` holds one label
        per sentence in input order; ``text_label_mapping`` maps each
        sentence to its label (a repeated sentence keeps its last label).
    """
    labels = []
    text_label_mapping = {}

    # String of categories in which you want to classify the text.
    category_str = ", ".join(map(str, categories))

    # Sample Prompt
    # Example: I am happy today; Classify this sentence as Positive, Negative or Neutral in one word.
    for sentence in text:
        response = openai.ChatCompletion.create(
            model="gpt-3.5-turbo",
            messages=[
                {"role": "user", "content": f"{sentence}; Classify this sentence as {category_str} in one word."},
                # OR - For simple sentiment classification:
                # {"role": "user", "content": f"Text: {sentence} \nSentiment in one word:"},
            ],
        )
        # Strip surrounding whitespace as well as periods: a reply such as
        # "Positive.\n" must be normalized to "Positive".
        label = response.choices[0]["message"]["content"].strip(" \t\r\n.")
        labels.append(label)
        text_label_mapping[sentence] = label

    return labels, text_label_mapping

# Few-Shot Learning
# context = [("Sentence", "Category/Sentiment")]
def generate_text_labels_context(text, context):
    """Label each sentence using few-shot examples supplied in ``context``.

    Every ``(sentence, label)`` pair in ``context`` is rendered as a worked
    example and prepended to the prompt; one chat-completion request is then
    sent per sentence in ``text``.

    Args:
        text: Iterable of sentences to classify (a list, not a bare string).
        context: Iterable of ``(sentence, label)`` example pairs. Its length
            is independent of the number of sentences in ``text``.

    Returns:
        A ``(labels, text_label_mapping)`` tuple. ``labels`` holds one label
        per sentence in input order; ``text_label_mapping`` maps each
        sentence to its label (a repeated sentence keeps its last label).
    """
    labels = []
    text_label_mapping = {}

    # Examples to help model understand the task and context.
    # Iterate over the examples themselves: looping over range(len(text))
    # raised IndexError when there were more texts than examples and dropped
    # examples when there were fewer.
    context_string = "".join(
        f"Text: {example[0]}\nSentiment: {example[1]}\n" for example in context
    )
    # OR
    # f"Text: {example[0]}\nCategory: {example[1]}\n"

    # Sample Prompt
    #   Text: A
    #   Category/Sentiment: X
    #   Text: B
    #   Category/Sentiment: Y
    #   Text: C
    #   Category/Sentiment:
    for sentence in text:
        response = openai.ChatCompletion.create(
            model="gpt-3.5-turbo",
            messages=[
                {"role": "user", "content": f"{context_string} \nText: {sentence} \nSentiment:"},
                # OR
                # {"role": "user", "content": f"{context_string} \nText: {sentence} \nCategory:"},
            ],
        )
        # Strip surrounding whitespace as well as periods so that a reply
        # such as " Positive.\n" is normalized to "Positive".
        label = response.choices[0]["message"]["content"].strip(" \t\r\n.")
        labels.append(label)
        text_label_mapping[sentence] = label

    return labels, text_label_mapping

- Hugging Face - Google T5:

In [None]:
from transformers import T5Tokenizer, T5ForConditionalGeneration

# The tokenizer and the model are both loaded from the same pretrained
# checkpoint, so the checkpoint name is spelled out only once.
_T5_CHECKPOINT = "google/flan-t5-base"

tokenizer = T5Tokenizer.from_pretrained(_T5_CHECKPOINT)
model = T5ForConditionalGeneration.from_pretrained(_T5_CHECKPOINT)

# Zero-Shot Approach
# Directly asking the model to label the text.
def generate_text_labels(text, categories):
    """Label each sentence with one of ``categories`` via a zero-shot prompt.

    Each sentence is turned into a classification prompt, tokenized with the
    module-level ``tokenizer`` and passed to the module-level ``model`` for
    generation; the decoded output is used as the label.

    Args:
        text: Iterable of sentences to classify. Pass a list rather than a
            bare string -- a string would be classified character by
            character.
        categories: Iterable of candidate labels; each one is converted with
            ``str`` and joined into the prompt.

    Returns:
        A ``(labels, text_label_mapping)`` tuple. ``labels`` holds one label
        per sentence in input order; ``text_label_mapping`` maps each
        sentence to its label (a repeated sentence keeps its last label).
    """
    labels = []
    text_label_mapping = {}

    # String of categories in which you want to classify the text.
    category_str = ", ".join(map(str, categories))

    # Sample Prompt
    # Example: I am happy today; Classify this sentence as Positive, Negative or Neutral in one word.
    for sentence in text:
        input_text = f"{sentence}; Classify this sentence as {category_str} in one word."
        input_ids = tokenizer(input_text, return_tensors="pt").input_ids

        outputs = model.generate(input_ids)
        # Skip special tokens so the label is plain text ("positive") rather
        # than the raw decoder output ("<pad> positive</s>").
        label = tokenizer.decode(outputs[0], skip_special_tokens=True).strip()
        labels.append(label)
        text_label_mapping[sentence] = label

    return labels, text_label_mapping

# Few-Shot Learning
# context = [("Sentence", "Category/Sentiment")]
def generate_text_labels_context(text, context):
    """Label each sentence using few-shot examples supplied in ``context``.

    Every ``(sentence, label)`` pair in ``context`` is rendered as a worked
    example and prepended to the prompt. Each sentence in ``text`` is then
    tokenized with the module-level ``tokenizer`` and passed to the
    module-level ``model`` for generation.

    Args:
        text: Iterable of sentences to classify (a list, not a bare string).
        context: Iterable of ``(sentence, label)`` example pairs. Its length
            is independent of the number of sentences in ``text``.

    Returns:
        A ``(labels, text_label_mapping)`` tuple. ``labels`` holds one label
        per sentence in input order; ``text_label_mapping`` maps each
        sentence to its label (a repeated sentence keeps its last label).
    """
    labels = []
    text_label_mapping = {}

    # Examples to help model understand the task and context.
    # Iterate over the examples themselves: looping over range(len(text))
    # raised IndexError when there were more texts than examples and dropped
    # examples when there were fewer.
    context_string = "".join(
        f"Text: {example[0]}\nSentiment: {example[1]}\n" for example in context
    )
    # OR
    # f"Text: {example[0]}\nCategory: {example[1]}\n"

    # Sample Prompt
    #   Text: A
    #   Category/Sentiment: X
    #   Text: B
    #   Category/Sentiment: Y
    #   Text: C
    #   Category/Sentiment:
    for sentence in text:
        input_text = f"{context_string} \nBased on the above examples determine the sentiment of the following sentence. \nText: {sentence} \nSentiment:"
        # OR
        # input_text = f"{context_string} \nBased on the above examples determine the category of the following sentence. \nText: {sentence} \nCategory:"
        input_ids = tokenizer(input_text, return_tensors="pt").input_ids

        outputs = model.generate(input_ids)
        # Skip special tokens so the label is plain text ("positive") rather
        # than the raw decoder output ("<pad> positive</s>").
        label = tokenizer.decode(outputs[0], skip_special_tokens=True).strip()
        labels.append(label)
        text_label_mapping[sentence] = label

    return labels, text_label_mapping

### 3. Train New Models

Either create the training sets manually or use the labels produced by the LLMs (approach 2) as training data.

- SVM:

In [None]:
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score

# Train and evaluate a linear-kernel SVM sentiment classifier on TF-IDF features.
# Assuming 'X' contains preprocessed article text and 'y' contains corresponding labels
# (neither is defined in this cell; both must exist before it runs).

# Splitting dataset into train and test sets:
# 20% of the samples are held out for testing; the fixed random_state makes
# the split reproducible between runs.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Feature extraction using TF-IDF.
# The vectorizer is fitted on the training texts only; the test texts are
# merely transformed, so nothing from the test set leaks into the features.
tfidf_vectorizer = TfidfVectorizer()
X_train_tfidf = tfidf_vectorizer.fit_transform(X_train)
X_test_tfidf = tfidf_vectorizer.transform(X_test)

# Initializing SVM (linear kernel) and training the model
svm_model = SVC(kernel='linear')
svm_model.fit(X_train_tfidf, y_train)

# Predicting on test set
y_pred = svm_model.predict(X_test_tfidf)

# Evaluating model performance: accuracy of the predictions on the held-out test set.
# The value is only stored in 'accuracy'; nothing is printed by this cell.
accuracy = accuracy_score(y_test, y_pred)


- Logistic Regression:

In [None]:
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score

# Train and evaluate a logistic-regression sentiment classifier on TF-IDF features.
# Assuming 'X' contains preprocessed article text and 'y' contains corresponding labels
# (neither is defined in this cell; both must exist before it runs).

# Splitting dataset into train and test sets:
# 20% of the samples are held out for testing; the fixed random_state makes
# the split reproducible between runs.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Feature extraction using TF-IDF.
# The vectorizer is fitted on the training texts only; the test texts are
# merely transformed, so nothing from the test set leaks into the features.
tfidf_vectorizer = TfidfVectorizer()
X_train_tfidf = tfidf_vectorizer.fit_transform(X_train)
X_test_tfidf = tfidf_vectorizer.transform(X_test)

# Initializing Logistic Regression model and training.
# max_iter is set explicitly to 1000 iterations for the solver.
log_reg_model = LogisticRegression(max_iter=1000)
log_reg_model.fit(X_train_tfidf, y_train)

# Predicting on test set
y_pred = log_reg_model.predict(X_test_tfidf)

# Evaluating model performance: accuracy of the predictions on the held-out test set.
# The value is only stored in 'accuracy'; nothing is printed by this cell.
accuracy = accuracy_score(y_test, y_pred)


- Naive Bayes Classifier:

In [None]:
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import accuracy_score

# Train and evaluate a multinomial Naive Bayes sentiment classifier on TF-IDF features.
# Assuming 'X' contains preprocessed article text and 'y' contains corresponding labels
# (neither is defined in this cell; both must exist before it runs).

# Splitting dataset into train and test sets:
# 20% of the samples are held out for testing; the fixed random_state makes
# the split reproducible between runs.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Feature extraction using TF-IDF.
# The vectorizer is fitted on the training texts only; the test texts are
# merely transformed, so nothing from the test set leaks into the features.
tfidf_vectorizer = TfidfVectorizer()
X_train_tfidf = tfidf_vectorizer.fit_transform(X_train)
X_test_tfidf = tfidf_vectorizer.transform(X_test)

# Initializing Naive Bayes Multinomial model (default settings) and training
nb_model = MultinomialNB()
nb_model.fit(X_train_tfidf, y_train)

# Predicting on test set
y_pred = nb_model.predict(X_test_tfidf)

# Evaluating model performance: accuracy of the predictions on the held-out test set.
# The value is only stored in 'accuracy'; nothing is printed by this cell.
accuracy = accuracy_score(y_test, y_pred)


- Neural Network (Keras):

In [None]:
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense

# Train and evaluate a small feed-forward neural network on TF-IDF features.
# Assuming 'X' contains preprocessed article text and 'y' contains corresponding labels
# (neither is defined in this cell; both must exist before it runs).
# NOTE(review): the single sigmoid output trained with binary_crossentropy
# below fits a two-class problem with numeric 0/1 labels -- confirm that 'y'
# is encoded that way before running this cell.

# Splitting dataset into train and test sets:
# 20% of the samples are held out for testing; the fixed random_state makes
# the split reproducible between runs.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Feature extraction using TF-IDF.
# The vectorizer is fitted on the training texts only; the test texts are
# merely transformed, so nothing from the test set leaks into the features.
tfidf_vectorizer = TfidfVectorizer()
X_train_tfidf = tfidf_vectorizer.fit_transform(X_train)
X_test_tfidf = tfidf_vectorizer.transform(X_test)

# Convert sparse matrix to array for Keras.
# toarray() already returns a dense NumPy array, so wrapping it in np.array()
# only created a second full copy of the (potentially large) matrix.
X_train_array = X_train_tfidf.toarray()
X_test_array = X_test_tfidf.toarray()

# Initializing Neural Network model: two hidden ReLU layers (128 and 64
# units) followed by a single sigmoid output unit.
# NOTE(review): this rebinds the global name 'model', which the T5 labeling
# functions earlier in this notebook also read; re-run the T5 setup cell
# before calling them again after this cell has executed.
model = Sequential()
model.add(Dense(128, input_shape=(X_train_array.shape[1],), activation='relu'))
model.add(Dense(64, activation='relu'))
model.add(Dense(1, activation='sigmoid'))

# Compile and train the model; 10% of the training data is set aside for
# validation during the 5 training epochs.
model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])
model.fit(X_train_array, y_train, epochs=5, batch_size=32, validation_split=0.1)

# Evaluate the model on test data
loss, accuracy = model.evaluate(X_test_array, y_test)
