In [2]:
!pip install vaderSentiment


Collecting vaderSentiment
  Downloading vaderSentiment-3.3.2-py2.py3-none-any.whl (125 kB)
[?25l     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/126.0 kB[0m [31m?[0m eta [36m-:--:--[0m[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m126.0/126.0 kB[0m [31m4.1 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: vaderSentiment
Successfully installed vaderSentiment-3.3.2


In [3]:
import nltk
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
import re
import string
from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer
import pandas as pd

# Load the NLTK data needed for tokenization and stopword removal
nltk.download('punkt')
nltk.download('stopwords')
nltk.download('wordnet')

# Load the dataset
data = pd.read_csv('/content/final_data.csv')
dataset = pd.DataFrame(data)

# Define a function to preprocess a tweet
def preprocess_tweet(tweet_text):
    # Remove URLs
    tweet_text = re.sub(r'http\S+', '', tweet_text)

    # Remove punctuation
    tweet_text = tweet_text.translate(str.maketrans('', '', string.punctuation))

    # Remove special characters
    tweet_text = re.sub(r'[^A-Za-z0-9\s]', '', tweet_text)

    # Remove emojis
    tweet_text = re.sub(r'[^\x00-\x7F]+', '', tweet_text)

    # Tokenize the tweet
    tokens = word_tokenize(tweet_text.lower())

    # Remove stopwords
    stop_words = set(stopwords.words('english'))
    tokens = [token for token in tokens if token not in stop_words]

    # Lemmatize the tokens
    lemmatizer = WordNetLemmatizer()
    tokens = [lemmatizer.lemmatize(token) for token in tokens]

    # Join the tokens back into a string
    tweet_text = ' '.join(tokens)

    return tweet_text

# Define a function to perform sentiment analysis using VADER
def sentiment_analysis(tweet_text):
    sia = SentimentIntensityAnalyzer()
    sentiment = sia.polarity_scores(tweet_text)
    return sentiment

# Filter out rows with non-string values in the 'tweet_text' column
dataset = dataset[dataset['tweet_text'].apply(lambda x: isinstance(x, str))]

# Apply preprocessing and sentiment analysis to the dataset
dataset['preprocessed_tweet'] = dataset['tweet_text'].apply(preprocess_tweet)
dataset['sentiment'] = dataset['preprocessed_tweet'].apply(sentiment_analysis)

# Example usage:
print(dataset.head())

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package wordnet to /root/nltk_data...


                                          tweet_text  \
0   I have a 3G iPhone. After 3 hrs tweeting at #...   
1   Know about @fludapp ? Awesome iPad/iPhone app...   
2  Can not wait for #iPad 2 also. They should sal...   
3  @sxsw I hope this year's festival isn't as cra...   
4  @sxtxstate great stuff on Fri #SXSW: Marissa M...   

  emotion_in_tweet_is_directed_at  \
0                          iPhone   
1              iPad or iPhone App   
2                            iPad   
3              iPad or iPhone App   
4                          Google   

  is_there_an_emotion_directed_at_a_brand_or_product  \
0                                   Negative emotion   
1                                   Positive emotion   
2                                   Positive emotion   
3                                   Negative emotion   
4                                   Positive emotion   

                                  preprocessed_tweet  \
0  3g iphone 3 hr tweeting riseaustin dead need

In [4]:
!pip install transformers
!pip install torch
!pip install pandas


Collecting nvidia-cuda-nvrtc-cu12==12.1.105 (from torch)
  Using cached nvidia_cuda_nvrtc_cu12-12.1.105-py3-none-manylinux1_x86_64.whl (23.7 MB)
Collecting nvidia-cuda-runtime-cu12==12.1.105 (from torch)
  Using cached nvidia_cuda_runtime_cu12-12.1.105-py3-none-manylinux1_x86_64.whl (823 kB)
Collecting nvidia-cuda-cupti-cu12==12.1.105 (from torch)
  Using cached nvidia_cuda_cupti_cu12-12.1.105-py3-none-manylinux1_x86_64.whl (14.1 MB)
Collecting nvidia-cudnn-cu12==8.9.2.26 (from torch)
  Using cached nvidia_cudnn_cu12-8.9.2.26-py3-none-manylinux1_x86_64.whl (731.7 MB)
Collecting nvidia-cublas-cu12==12.1.3.1 (from torch)
  Using cached nvidia_cublas_cu12-12.1.3.1-py3-none-manylinux1_x86_64.whl (410.6 MB)
Collecting nvidia-cufft-cu12==11.0.2.54 (from torch)
  Using cached nvidia_cufft_cu12-11.0.2.54-py3-none-manylinux1_x86_64.whl (121.6 MB)
Collecting nvidia-curand-cu12==10.3.2.106 (from torch)
  Using cached nvidia_curand_cu12-10.3.2.106-py3-none-manylinux1_x86_64.whl (56.5 MB)
Collectin

In [None]:
import pandas as pd
from transformers import pipeline

# Load the emotion detection pipeline
emotion_pipeline = pipeline("text-classification", model="j-hartmann/emotion-english-distilroberta-base")

# Preprocess tweet function (if not already implemented)
def preprocess_tweet(tweet_text):
    # (Add your preprocessing steps here)
    return tweet_text

# Function to classify emotions
def classify_emotions(tweet_text):
    processed_text = preprocess_tweet(tweet_text)
    emotion_scores = emotion_pipeline(processed_text)
    # Convert the scores to a dictionary
    emotion_dict = {item['label']: item['score'] for item in emotion_scores[0]}
    return emotion_dict

# Apply the classification to the dataset
dataset['emotion_scores'] = dataset['tweet_text'].apply(classify_emotions)

# Example usage:
print(dataset.head())


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


config.json:   0%|          | 0.00/1.00k [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/329M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/294 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/798k [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/239 [00:00<?, ?B/s]



                                          tweet_text  \
0   I have a 3G iPhone. After 3 hrs tweeting at #...   
1   Know about @fludapp ? Awesome iPad/iPhone app...   
2  Can not wait for #iPad 2 also. They should sal...   
3  @sxsw I hope this year's festival isn't as cra...   
4  @sxtxstate great stuff on Fri #SXSW: Marissa M...   

  emotion_in_tweet_is_directed_at  \
0                          iPhone   
1              iPad or iPhone App   
2                            iPad   
3              iPad or iPhone App   
4                          Google   

  is_there_an_emotion_directed_at_a_brand_or_product  \
0                                   Negative emotion   
1                                   Positive emotion   
2                                   Positive emotion   
3                                   Negative emotion   
4                                   Positive emotion   

                                  preprocessed_tweet  \
0  3g iphone 3 hr tweeting riseaustin dead need

In [None]:
import pandas as pd
from transformers import pipeline
import random

# Load the sentiment analysis pipeline from Hugging Face
sentiment_pipeline = pipeline("sentiment-analysis")

# Function to perform sentiment analysis
def sentiment_analysis(text):
    return sentiment_pipeline(text)[0]

# Select a random tweet from the dataset
random_tweet = dataset.sample(1).iloc[0]

# Perform sentiment analysis on the random tweet
tweet_text = random_tweet['tweet_text']
sentiment_scores = sentiment_analysis(tweet_text)

# Get the emotion score
emotion_score = random_tweet['is_there_an_emotion_directed_at_a_brand_or_product']

print(f"Tweet: {tweet_text}")
print(f"Sentiment Scores: {sentiment_scores}")
print(f"Emotion Score: {emotion_score}")


No model was supplied, defaulted to distilbert/distilbert-base-uncased-finetuned-sst-2-english and revision af0f99b (https://huggingface.co/distilbert/distilbert-base-uncased-finetuned-sst-2-english).
Using a pipeline without specifying a model name and revision in production is not recommended.


config.json:   0%|          | 0.00/629 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/268M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

Tweet: Was intending to bring my trusty backpack for this upcoming trip to #SXSW. With the number of iPad 2s I'm helping buy back, no es posible.
Sentiment Scores: {'label': 'NEGATIVE', 'score': 0.9915809035301208}
Emotion Score: No emotion toward brand or product


In [None]:
import pandas as pd
from transformers import pipeline

# Load the dataset
file_path = '/content/final_data.csv'
dataset = pd.read_csv(file_path)

# Filter out rows with non-string values in the 'tweet_text' column
dataset = dataset[dataset['tweet_text'].apply(lambda x: isinstance(x, str))]

# Define a simple preprocessing function (you can expand this based on your needs)
def preprocess_tweet(text):
    # Convert text to lowercase
    text = text.lower()
    # You can add more preprocessing steps here
    return text

# Create a sentiment analysis pipeline
emotion_pipeline = pipeline("sentiment-analysis", model="j-hartmann/emotion-english-distilroberta-base")

# Function to classify emotions
def classify_emotions(preprocess_tweet):
    result = emotion_pipeline(preprocess_tweet)
    return result

# Apply preprocessing and emotion classification to the dataset
dataset['preprocessed_tweet'] = dataset['tweet_text'].apply(preprocess_tweet)
dataset['emotion'] = dataset['preprocessed_tweet'].apply(classify_emotions)

# Display the first 10 rows of the dataset with emotion classification
print(dataset.head(10))


                                           tweet_text  \
0    I have a 3G iPhone. After 3 hrs tweeting at #...   
1    Know about @fludapp ? Awesome iPad/iPhone app...   
2   Can not wait for #iPad 2 also. They should sal...   
3   @sxsw I hope this year's festival isn't as cra...   
4   @sxtxstate great stuff on Fri #SXSW: Marissa M...   
5   @teachntech00 New iPad Apps For #SpeechTherapy...   
7   #SXSW is just starting, #CTIA is around the co...   
8   Beautifully smart and simple idea RT @madebyma...   
9   Counting down the days to #sxsw plus strong Ca...   
10  Excited to meet the @samsungmobileus at #sxsw ...   

   emotion_in_tweet_is_directed_at  \
0                           iPhone   
1               iPad or iPhone App   
2                             iPad   
3               iPad or iPhone App   
4                           Google   
5                              NaN   
7                          Android   
8               iPad or iPhone App   
9                            A

In [None]:
import pandas as pd
from transformers import pipeline

# Load the dataset
file_path = '/content/final_data.csv'
dataset = pd.read_csv(file_path)

# Filter out rows with non-string values in the 'tweet_text' column
dataset = dataset[dataset['tweet_text'].apply(lambda x: isinstance(x, str))]

# Define a simple preprocessing function (you can expand this based on your needs)
def preprocess_tweet(text):
    # Convert text to lowercase
    text = text.lower()
    # You can add more preprocessing steps here
    return text

# Create a sentiment analysis pipeline
emotion_pipeline = pipeline("sentiment-analysis", model="j-hartmann/emotion-english-distilroberta-base")

# Function to classify emotions
def classify_emotions(preprocess_tweet):
    result = emotion_pipeline(preprocess_tweet)
    # Extract the emotion label and score from the result
    emotion_label = result[0]['label']
    emotion_score = result[0]['score']
    return f"{emotion_label} ({emotion_score:.2f})"

# Apply preprocessing and emotion classification to the dataset
dataset['preprocessed_tweet'] = dataset['tweet_text'].apply(preprocess_tweet)
dataset['emotion'] = dataset['preprocessed_tweet'].apply(classify_emotions)

# Display the first 10 rows of the dataset with emotion classification
print(dataset.head(10)[['tweet_text', 'emotion']].to_string(index=False, formatters={'emotion': lambda x: x.ljust(20)}))

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


                                                                                                                                  tweet_text              emotion
                        I have a 3G iPhone. After 3 hrs tweeting at #RISE_Austin, it was dead!  I need to upgrade. Plugin stations at #SXSW. sadness (0.67)      
           Know about @fludapp ? Awesome iPad/iPhone app that you'll likely appreciate for its design. Also, they're giving free Ts at #SXSW joy (0.69)          
                                                                         Can not wait for #iPad 2 also. They should sale them down at #SXSW. neutral (0.45)      
                                                          @sxsw I hope this year's festival isn't as crashy as this year's iPhone app. #sxsw sadness (0.63)      
         @sxtxstate great stuff on Fri #SXSW: Marissa Mayer (Google), Tim O'Reilly (tech books/conferences) &amp; Matt Mullenweg (Wordpress) joy (0.88)          
@teachntech00 New iPad Apps 

In [None]:
import pandas as pd
from transformers import pipeline

# Load the dataset
file_path = '/content/final_data.csv'
dataset = pd.read_csv(file_path)

# Filter out rows with non-string values in the 'tweet_text' column
dataset = dataset[dataset['tweet_text'].apply(lambda x: isinstance(x, str))]

# Define a simple preprocessing function (you can expand this based on your needs)
def preprocess_tweet(text):
    # Convert text to lowercase
    text = text.lower()
    # You can add more preprocessing steps here
    return text

# Create a sentiment analysis pipeline
emotion_pipeline = pipeline("sentiment-analysis", model="j-hartmann/emotion-english-distilroberta-base")

# Apply preprocessing to the dataset
dataset['preprocessed_tweet'] = dataset['tweet_text'].apply(preprocess_tweet)

# Create lists to store the emotion labels and scores
emotion_labels = []
emotion_scores = []

# Loop through the preprocessed tweets and classify emotions
for tweet in dataset['preprocessed_tweet']:
    result = emotion_pipeline(tweet)
    # Extract the emotion label and score from the result
    emotion_label = result[0]['label']
    emotion_score = result[0]['score']
    emotion_labels.append(emotion_label)
    emotion_scores.append(emotion_score)

# Add the emotion labels and scores to the dataset
dataset['emotion_label'] = emotion_labels
dataset['emotion_score'] = emotion_scores

# Display the first 10 rows of the dataset with all four columns
print(dataset.head(10)[[ 'preprocessed_tweet', 'emotion_label', 'emotion_score']].to_string(index=False))

                                                                                                                          preprocessed_tweet emotion_label  emotion_score
                        i have a 3g iphone. after 3 hrs tweeting at #rise_austin, it was dead!  i need to upgrade. plugin stations at #sxsw.       sadness       0.666496
           know about @fludapp ? awesome ipad/iphone app that you'll likely appreciate for its design. also, they're giving free ts at #sxsw           joy       0.693497
                                                                         can not wait for #ipad 2 also. they should sale them down at #sxsw.       neutral       0.450823
                                                          @sxsw i hope this year's festival isn't as crashy as this year's iphone app. #sxsw       sadness       0.634486
         @sxtxstate great stuff on fri #sxsw: marissa mayer (google), tim o'reilly (tech books/conferences) &amp; matt mullenweg (wordpress)          

In [5]:
import pandas as pd
from transformers import pipeline
from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer

# Load the emotion detection pipeline
emotion_pipeline = pipeline("text-classification", model="j-hartmann/emotion-english-distilroberta-base")

# Initialize the VADER sentiment analyzer
sentiment_analyzer = SentimentIntensityAnalyzer()

# Sample data to mimic the dataset you provided
data = {
    "tweet_text": [
        "I have a 3G iPhone. After 3 hrs tweeting at #riseaustin I am dead. Need an upgrade!",
        "Know about @fludapp? Awesome iPad/iPhone app you'll like!",
        "Can not wait for #iPad 2 also. They should sale @sxsw.",
        "@sxsw I hope this year's festival isn't as crashy as last year. iPhone issues!",
        "@sxtxstate great stuff on Fri #SXSW: Marissa Mayer Google keynote was awesome!"
    ],
    "emotion_in_tweet_is_directed_at": [
        "iPhone", "iPad or iPhone App", "iPad", "iPad or iPhone App", "Google"
    ],
    "is_there_an_emotion_directed_at_a_brand_or_product": [
        "Negative emotion", "Positive emotion", "Positive emotion", "Negative emotion", "Positive emotion"
    ],
    "preprocessed_tweet": [
        "3g iphone 3 hr tweeting riseaustin dead need upgrade",
        "know fludapp awesome ipadiphone app youll like",
        "wait ipad 2 also sale sxsw",
        "sxsw hope year festival isnt crashy year iphone issues",
        "sxtxstate great stuff fri sxsw marissa mayer google keynote awesome"
    ],
    "sentiment": [
        {'neg': 0.281, 'neu': 0.719, 'pos': 0.0, 'compound': -0.6369},
        {'neg': 0.0, 'neu': 0.468, 'pos': 0.532, 'compound': 0.802},
        {'neg': 0.0, 'neu': 1.0, 'pos': 0.0, 'compound': 0.0},
        {'neg': 0.0, 'neu': 0.567, 'pos': 0.433, 'compound': 0.5994},
        {'neg': 0.0, 'neu': 0.785, 'pos': 0.215, 'compound': 0.4215}
    ]
}

# Convert to DataFrame
dataset = pd.DataFrame(data)

# Preprocess tweet function (if not already implemented)
def preprocess_text(text):
    # Add any preprocessing steps if needed (e.g., lowercasing, removing special characters)
    return text

# Function to classify emotions
def classify_emotions(text):
    processed_text = preprocess_text(text)
    emotion_scores = emotion_pipeline(processed_text)
    # Convert the scores to a dictionary
    emotion_dict = {item['label']: item['score'] for item in emotion_scores}
    return emotion_dict

# Function to classify sentiment
def classify_sentiment(text):
    processed_text = preprocess_text(text)
    sentiment_scores = sentiment_analyzer.polarity_scores(processed_text)
    return sentiment_scores

# Fetch a sample text from the dataset
sample_text = dataset['tweet_text'].iloc[0]

# Get sentiment and emotion for the sample text
emotion_result = classify_emotions(sample_text)
sentiment_result = classify_sentiment(sample_text)

print("Sample Text: ", sample_text)
print("Emotion Scores: ", emotion_result)
print("Sentiment Scores: ", sentiment_result)


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


config.json:   0%|          | 0.00/1.00k [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/329M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/294 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/798k [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/239 [00:00<?, ?B/s]

Sample Text:  I have a 3G iPhone. After 3 hrs tweeting at #riseaustin I am dead. Need an upgrade!
Emotion Scores:  {'sadness': 0.9336184859275818}
Sentiment Scores:  {'neg': 0.223, 'neu': 0.777, 'pos': 0.0, 'compound': -0.68}
