In [1]:
import pandas as pd

# Load the labelled texts from train.csv. The frame is called test_data because
# later cells use its 'sentiment' column as the ground truth when scoring each
# classifier; no separate test file is read in this notebook.
test_data = pd.read_csv("train.csv")

# Show the frame as loaded
test_data

Unnamed: 0,textID,text,selected_text,sentiment
0,cb774db0d1,"I`d have responded, if I were going","I`d have responded, if I were going",neutral
1,549e992a42,Sooo SAD I will miss you here in San Diego!!!,Sooo SAD,negative
2,088c60f138,my boss is bullying me...,bullying me,negative
3,9642c003ef,what interview! leave me alone,leave me alone,negative
4,358bd9e861,"Sons of ****, why couldn`t they put them on t...","Sons of ****,",negative
...,...,...,...,...
27476,4eac33d1c0,wish we could come see u on Denver husband l...,d lost,negative
27477,4f4c4fc327,I`ve wondered about rake to. The client has ...,", don`t force",negative
27478,f67aae2310,Yay good for both of you. Enjoy the break - y...,Yay good for both of you.,positive
27479,ed167662a5,But it was worth it ****.,But it was worth it ****.,positive


In [2]:
import re
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer

# Text normalisation used before scoring.
# The NLTK stop-word set and lemmatizer are built on the first call and then
# shared, instead of being rebuilt for every row of the DataFrame.
_PREPROCESS_RESOURCES = {}


def preprocess_text(text):
    """Normalise one piece of text into a space-joined string of lemmas.

    Steps, in order: lowercase, drop URLs, replace non-word characters with
    spaces, drop digits, tokenize, remove English stop words, lemmatize.

    Parameters
    ----------
    text : object
        The raw text. Missing values (None / NaN / pd.NA) yield "".
        Other non-string values are converted with ``str()`` first.

    Returns
    -------
    str
        The cleaned tokens joined by single spaces ("" if nothing is left).
    """
    # Check if text is NaN or None
    if pd.isna(text):
        return ""

    # Build the shared NLTK resources lazily so defining the function never
    # touches the corpus files; only the first real call pays that cost.
    if not _PREPROCESS_RESOURCES:
        _PREPROCESS_RESOURCES['stop_words'] = frozenset(stopwords.words('english'))
        _PREPROCESS_RESOURCES['lemmatizer'] = WordNetLemmatizer()
    stop_words = _PREPROCESS_RESOURCES['stop_words']
    lemmatizer = _PREPROCESS_RESOURCES['lemmatizer']

    # Convert text to lowercase (str() guards against numeric cells)
    text = str(text).lower()

    # Remove URLs. The dot after "www" is escaped: an unescaped "." matches
    # any character, which also deleted stretched words such as "awwwww".
    text = re.sub(r'http\S+|www\.\S+', '', text)
    # Replace every non-word character with a space, then drop digits
    text = re.sub(r'\W', ' ', text)
    text = re.sub(r'\d', '', text)

    # Tokenize, drop stop words, lemmatize, and join back into one string
    tokens = word_tokenize(text)
    return ' '.join(
        lemmatizer.lemmatize(word) for word in tokens if word not in stop_words
    )

# Run preprocess_text on every value of the 'text' column and store the result
# in a new 'preprocessed_text' column of the same frame. Rows whose text is
# missing end up as "" because preprocess_text returns "" for NaN/None.
test_data['preprocessed_text'] = test_data['text'].apply(preprocess_text)

# Display the frame with the new column
test_data

Unnamed: 0,textID,text,selected_text,sentiment,preprocessed_text
0,cb774db0d1,"I`d have responded, if I were going","I`d have responded, if I were going",neutral,responded going
1,549e992a42,Sooo SAD I will miss you here in San Diego!!!,Sooo SAD,negative,sooo sad miss san diego
2,088c60f138,my boss is bullying me...,bullying me,negative,bos bullying
3,9642c003ef,what interview! leave me alone,leave me alone,negative,interview leave alone
4,358bd9e861,"Sons of ****, why couldn`t they put them on t...","Sons of ****,",negative,son put release already bought
...,...,...,...,...,...
27476,4eac33d1c0,wish we could come see u on Denver husband l...,d lost,negative,wish could come see u denver husband lost job ...
27477,4f4c4fc327,I`ve wondered about rake to. The client has ...,", don`t force",negative,wondered rake client made clear net force devs...
27478,f67aae2310,Yay good for both of you. Enjoy the break - y...,Yay good for both of you.,positive,yay good enjoy break probably need hectic week...
27479,ed167662a5,But it was worth it ****.,But it was worth it ****.,positive,worth


## Afinn

In [3]:
from afinn import Afinn

# Load the AFINN lexicon once; the scorer below reuses this instance
afinn = Afinn()

def sentiment_afinn(text, threshold=0.06):
    """Label ``text`` as 'positive', 'negative' or 'neutral' from its AFINN score.

    Parameters
    ----------
    text : str
        Text passed to ``afinn.score``.
    threshold : float, optional
        Half-width of the neutral band around zero. Scores above
        ``threshold`` are positive, below ``-threshold`` negative, anything
        else neutral. Defaults to 0.06, the value previously hard-coded here.
        NOTE(review): AFINN word valences appear to be whole numbers, so the
        default probably behaves like "> 0 / < 0" - confirm before tuning.

    Returns
    -------
    str
        One of 'positive', 'negative', 'neutral'.
    """
    score = afinn.score(text)

    if score > threshold:
        return 'positive'
    if score < -threshold:
        return 'negative'
    return 'neutral'

In [4]:
# Quick manual check of sentiment_afinn on four hand-written sentences.
# These sentences are scored as written (they are not passed through
# preprocess_text), unlike the dataset rows scored later.
texts = [
    "I love this product! It's amazing.",
    "This movie is terrible, I hated it.",
    "The weather today is neither good nor bad.",
    "I feel neutral about this situation."
]

# Print each sentence followed by the label the classifier assigns to it
for text in texts:
    sentiment = sentiment_afinn(text)
    print(f"Text: '{text}'\nSentiment: {sentiment}\n")

Text: 'I love this product! It's amazing.'
Sentiment: positive

Text: 'This movie is terrible, I hated it.'
Sentiment: negative

Text: 'The weather today is neither good nor bad.'
Sentiment: neutral

Text: 'I feel neutral about this situation.'
Sentiment: neutral



In [5]:
# Label every row with sentiment_afinn. The input is the 'preprocessed_text'
# column, not the raw 'text'; results go into 'predicted_sentiment_afinn'.
test_data['predicted_sentiment_afinn'] = test_data['preprocessed_text'].apply(sentiment_afinn)

# Display the frame with the new prediction column
test_data

Unnamed: 0,textID,text,selected_text,sentiment,preprocessed_text,predicted_sentiment_afinn
0,cb774db0d1,"I`d have responded, if I were going","I`d have responded, if I were going",neutral,responded going,neutral
1,549e992a42,Sooo SAD I will miss you here in San Diego!!!,Sooo SAD,negative,sooo sad miss san diego,negative
2,088c60f138,my boss is bullying me...,bullying me,negative,bos bullying,negative
3,9642c003ef,what interview! leave me alone,leave me alone,negative,interview leave alone,negative
4,358bd9e861,"Sons of ****, why couldn`t they put them on t...","Sons of ****,",negative,son put release already bought,neutral
...,...,...,...,...,...,...
27476,4eac33d1c0,wish we could come see u on Denver husband l...,d lost,negative,wish could come see u denver husband lost job ...,negative
27477,4f4c4fc327,I`ve wondered about rake to. The client has ...,", don`t force",negative,wondered rake client made clear net force devs...,positive
27478,f67aae2310,Yay good for both of you. Enjoy the break - y...,Yay good for both of you.,positive,yay good enjoy break probably need hectic week...,positive
27479,ed167662a5,But it was worth it ****.,But it was worth it ****.,positive,worth,positive


In [6]:
from sklearn.metrics import accuracy_score, classification_report

# Score the AFINN labels against the gold 'sentiment' column
report = classification_report(
    test_data['sentiment'],
    test_data['predicted_sentiment_afinn'],
)
accuracy = accuracy_score(
    test_data['sentiment'],
    test_data['predicted_sentiment_afinn'],
)

# A single newline-separated print emits the same three lines of output
print(f"Accuracy: {accuracy}", "Classification Report:", report, sep="\n")

Accuracy: 0.652014118845748
Classification Report:
              precision    recall  f1-score   support

    negative       0.75      0.58      0.65      7781
     neutral       0.69      0.55      0.61     11118
    positive       0.58      0.85      0.69      8582

    accuracy                           0.65     27481
   macro avg       0.67      0.66      0.65     27481
weighted avg       0.67      0.65      0.65     27481



## TextBlob

In [7]:
from textblob import TextBlob

def sentiment_textblob(text, threshold=0.06):
    """Label ``text`` as 'positive', 'negative' or 'neutral' from TextBlob polarity.

    Parameters
    ----------
    text : str
        Text wrapped in a ``TextBlob`` for analysis.
    threshold : float, optional
        Half-width of the neutral band around zero. Polarity above
        ``threshold`` is positive, below ``-threshold`` negative, anything
        else neutral. Defaults to 0.06, the value previously hard-coded here.

    Returns
    -------
    str
        One of 'positive', 'negative', 'neutral'.
    """
    polarity = TextBlob(text).sentiment.polarity

    if polarity > threshold:
        return 'positive'
    if polarity < -threshold:
        return 'negative'
    return 'neutral'

In [8]:
# Quick manual check of sentiment_textblob on four hand-written sentences.
# These sentences are scored as written (they are not passed through
# preprocess_text), unlike the dataset rows scored later.
texts = [
    "I love this product! It's amazing.",
    "This movie is terrible, I hated it.",
    "The weather today is neither good nor bad.",
    "I feel neutral about this situation."
]

# Print each sentence followed by the label the classifier assigns to it
for text in texts:
    sentiment = sentiment_textblob(text)
    print(f"Text: '{text}'\nSentiment: {sentiment}\n")

Text: 'I love this product! It's amazing.'
Sentiment: positive

Text: 'This movie is terrible, I hated it.'
Sentiment: negative

Text: 'The weather today is neither good nor bad.'
Sentiment: neutral

Text: 'I feel neutral about this situation.'
Sentiment: neutral



In [9]:
# Label every row with sentiment_textblob. The input is the 'preprocessed_text'
# column, not the raw 'text'; results go into 'predicted_sentiment_textblob'.
test_data['predicted_sentiment_textblob'] = test_data['preprocessed_text'].apply(sentiment_textblob)

# Display the frame with the new prediction column
test_data

Unnamed: 0,textID,text,selected_text,sentiment,preprocessed_text,predicted_sentiment_afinn,predicted_sentiment_textblob
0,cb774db0d1,"I`d have responded, if I were going","I`d have responded, if I were going",neutral,responded going,neutral,neutral
1,549e992a42,Sooo SAD I will miss you here in San Diego!!!,Sooo SAD,negative,sooo sad miss san diego,negative,negative
2,088c60f138,my boss is bullying me...,bullying me,negative,bos bullying,negative,neutral
3,9642c003ef,what interview! leave me alone,leave me alone,negative,interview leave alone,negative,neutral
4,358bd9e861,"Sons of ****, why couldn`t they put them on t...","Sons of ****,",negative,son put release already bought,neutral,neutral
...,...,...,...,...,...,...,...
27476,4eac33d1c0,wish we could come see u on Denver husband l...,d lost,negative,wish could come see u denver husband lost job ...,negative,neutral
27477,4f4c4fc327,I`ve wondered about rake to. The client has ...,", don`t force",negative,wondered rake client made clear net force devs...,positive,positive
27478,f67aae2310,Yay good for both of you. Enjoy the break - y...,Yay good for both of you.,positive,yay good enjoy break probably need hectic week...,positive,positive
27479,ed167662a5,But it was worth it ****.,But it was worth it ****.,positive,worth,positive,positive


In [10]:
from sklearn.metrics import accuracy_score, classification_report

# Score the TextBlob labels against the gold 'sentiment' column
report = classification_report(
    test_data['sentiment'],
    test_data['predicted_sentiment_textblob'],
)
accuracy = accuracy_score(
    test_data['sentiment'],
    test_data['predicted_sentiment_textblob'],
)

# A single newline-separated print emits the same three lines of output
print(f"Accuracy: {accuracy}", "Classification Report:", report, sep="\n")

Accuracy: 0.5938284633019177
Classification Report:
              precision    recall  f1-score   support

    negative       0.69      0.41      0.51      7781
     neutral       0.59      0.57      0.58     11118
    positive       0.56      0.79      0.66      8582

    accuracy                           0.59     27481
   macro avg       0.61      0.59      0.58     27481
weighted avg       0.61      0.59      0.59     27481



## VADER (Valence Aware Dictionary and Sentiment Reasoner)

In [11]:
from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer

# Initialize the VADER analyzer once; the scorer below reuses this instance
vader_analyzer = SentimentIntensityAnalyzer()

def sentiment_vader(text, threshold=0.06):
    """Label ``text`` as 'positive', 'negative' or 'neutral' from VADER's compound score.

    Parameters
    ----------
    text : str
        Text passed to ``vader_analyzer.polarity_scores``.
    threshold : float, optional
        Half-width of the neutral band around zero. A compound score above
        ``threshold`` is positive, below ``-threshold`` negative, anything
        else neutral. Defaults to 0.06, the value previously hard-coded here.

    Returns
    -------
    str
        One of 'positive', 'negative', 'neutral'.
    """
    # Only the 'compound' entry of the returned scores is used for the label
    compound = vader_analyzer.polarity_scores(text)['compound']

    if compound > threshold:
        return 'positive'
    if compound < -threshold:
        return 'negative'
    return 'neutral'

In [12]:
# Label every row with sentiment_vader. The input is the 'preprocessed_text'
# column, not the raw 'text'; results go into 'predicted_sentiment_vader'.
# NOTE(review): preprocess_text lowercases, strips punctuation and removes
# stop words. VADER's documentation describes using capitalisation,
# punctuation and negation words as cues - confirm whether scoring the raw
# 'text' column would be more appropriate here.
test_data['predicted_sentiment_vader'] = test_data['preprocessed_text'].apply(sentiment_vader)

# Display the frame with the new prediction column
test_data

Unnamed: 0,textID,text,selected_text,sentiment,preprocessed_text,predicted_sentiment_afinn,predicted_sentiment_textblob,predicted_sentiment_vader
0,cb774db0d1,"I`d have responded, if I were going","I`d have responded, if I were going",neutral,responded going,neutral,neutral,neutral
1,549e992a42,Sooo SAD I will miss you here in San Diego!!!,Sooo SAD,negative,sooo sad miss san diego,negative,negative,negative
2,088c60f138,my boss is bullying me...,bullying me,negative,bos bullying,negative,neutral,negative
3,9642c003ef,what interview! leave me alone,leave me alone,negative,interview leave alone,negative,neutral,negative
4,358bd9e861,"Sons of ****, why couldn`t they put them on t...","Sons of ****,",negative,son put release already bought,neutral,neutral,neutral
...,...,...,...,...,...,...,...,...
27476,4eac33d1c0,wish we could come see u on Denver husband l...,d lost,negative,wish could come see u denver husband lost job ...,negative,neutral,positive
27477,4f4c4fc327,I`ve wondered about rake to. The client has ...,", don`t force",negative,wondered rake client made clear net force devs...,positive,positive,positive
27478,f67aae2310,Yay good for both of you. Enjoy the break - y...,Yay good for both of you.,positive,yay good enjoy break probably need hectic week...,positive,positive,positive
27479,ed167662a5,But it was worth it ****.,But it was worth it ****.,positive,worth,positive,positive,positive


In [13]:
from sklearn.metrics import accuracy_score, classification_report

# Score the VADER labels against the gold 'sentiment' column
report = classification_report(
    test_data['sentiment'],
    test_data['predicted_sentiment_vader'],
)
accuracy = accuracy_score(
    test_data['sentiment'],
    test_data['predicted_sentiment_vader'],
)

# A single newline-separated print emits the same three lines of output
print(f"Accuracy: {accuracy}", "Classification Report:", report, sep="\n")

Accuracy: 0.6314544594447072
Classification Report:
              precision    recall  f1-score   support

    negative       0.74      0.56      0.64      7781
     neutral       0.71      0.49      0.58     11118
    positive       0.54      0.88      0.67      8582

    accuracy                           0.63     27481
   macro avg       0.66      0.64      0.63     27481
weighted avg       0.67      0.63      0.62     27481



## SentiWordNet

In [14]:
from nltk.corpus import sentiwordnet as swn
from nltk.corpus import wordnet as wn

def sentiment_sentiwordnet(text, threshold=0.06):
    """Label ``text`` as 'positive', 'negative' or 'neutral' using SentiWordNet.

    The text is tokenized, every WordNet synset of every token is looked up
    in SentiWordNet, and the positive and negative scores of all of them are
    summed. The two totals are then compared.

    Parameters
    ----------
    text : str
        Text to tokenize and score.
    threshold : float, optional
        Margin by which one total must exceed the other for a non-neutral
        label. Defaults to 0.06, the value previously hard-coded here.

    Returns
    -------
    str
        One of 'positive', 'negative', 'neutral'.

    Notes
    -----
    NOTE(review): because scores are summed over every sense of every token,
    tokens with many senses contribute more to the totals. Averaging per
    token may give a more balanced neutral class - confirm.
    """
    pos_score = 0
    neg_score = 0

    for token in word_tokenize(text):
        for synset in wn.synsets(token):
            senti_synset = swn.senti_synset(synset.name())
            # The lookup can return None when the synset has no SentiWordNet
            # entry; skip it instead of failing on the attribute access.
            if senti_synset is None:
                continue
            pos_score += senti_synset.pos_score()
            neg_score += senti_synset.neg_score()

    # Classify sentiment based on which total leads by more than the margin
    if pos_score > neg_score + threshold:
        return 'positive'
    if neg_score > pos_score + threshold:
        return 'negative'
    return 'neutral'

In [15]:
# Label every row with sentiment_sentiwordnet. The input is the
# 'preprocessed_text' column, not the raw 'text'; results go into
# 'predicted_sentiment_sentiwordnet'.
test_data['predicted_sentiment_sentiwordnet'] = test_data['preprocessed_text'].apply(sentiment_sentiwordnet)

# Display the frame with the new prediction column
test_data

Unnamed: 0,textID,text,selected_text,sentiment,preprocessed_text,predicted_sentiment_afinn,predicted_sentiment_textblob,predicted_sentiment_vader,predicted_sentiment_sentiwordnet
0,cb774db0d1,"I`d have responded, if I were going","I`d have responded, if I were going",neutral,responded going,neutral,neutral,neutral,neutral
1,549e992a42,Sooo SAD I will miss you here in San Diego!!!,Sooo SAD,negative,sooo sad miss san diego,negative,negative,negative,negative
2,088c60f138,my boss is bullying me...,bullying me,negative,bos bullying,negative,neutral,negative,neutral
3,9642c003ef,what interview! leave me alone,leave me alone,negative,interview leave alone,negative,neutral,negative,neutral
4,358bd9e861,"Sons of ****, why couldn`t they put them on t...","Sons of ****,",negative,son put release already bought,neutral,neutral,neutral,positive
...,...,...,...,...,...,...,...,...,...
27476,4eac33d1c0,wish we could come see u on Denver husband l...,d lost,negative,wish could come see u denver husband lost job ...,negative,neutral,positive,negative
27477,4f4c4fc327,I`ve wondered about rake to. The client has ...,", don`t force",negative,wondered rake client made clear net force devs...,positive,positive,positive,positive
27478,f67aae2310,Yay good for both of you. Enjoy the break - y...,Yay good for both of you.,positive,yay good enjoy break probably need hectic week...,positive,positive,positive,positive
27479,ed167662a5,But it was worth it ****.,But it was worth it ****.,positive,worth,positive,positive,positive,positive


In [16]:
from sklearn.metrics import accuracy_score, classification_report

# Score the SentiWordNet labels against the gold 'sentiment' column
report = classification_report(
    test_data['sentiment'],
    test_data['predicted_sentiment_sentiwordnet'],
)
accuracy = accuracy_score(
    test_data['sentiment'],
    test_data['predicted_sentiment_sentiwordnet'],
)

# A single newline-separated print emits the same three lines of output
print(f"Accuracy: {accuracy}", "Classification Report:", report, sep="\n")

Accuracy: 0.44874640660820203
Classification Report:
              precision    recall  f1-score   support

    negative       0.52      0.51      0.52      7781
     neutral       0.64      0.09      0.16     11118
    positive       0.40      0.86      0.55      8582

    accuracy                           0.45     27481
   macro avg       0.52      0.49      0.41     27481
weighted avg       0.53      0.45      0.38     27481



## Ensemble

In [17]:
from collections import Counter

# Majority vote over the AFINN, TextBlob and VADER classifiers
def ensemble_sentiment_analysis(text):
    """Return the label that most of the three classifiers assign to ``text``.

    The classifiers are called in the order AFINN, TextBlob, VADER and their
    labels are tallied. If all three disagree, the tally keeps first-seen
    order, so the AFINN label is returned.
    """
    scorers = (sentiment_afinn, sentiment_textblob, sentiment_vader)

    # Tally the labels in call order
    votes = Counter(scorer(text) for scorer in scorers)

    # most_common(1) yields a single (label, count) pair; keep only the label
    (winner, _tally), = votes.most_common(1)
    return winner

# Label every row with the majority-vote ensemble. The input is the
# 'preprocessed_text' column; ensemble_sentiment_analysis calls the AFINN,
# TextBlob and VADER classifiers again for each row rather than reading the
# predicted_sentiment_* columns computed earlier. SentiWordNet is not part of
# the vote.
test_data['predicted_sentiment_ensemble'] = test_data['preprocessed_text'].apply(ensemble_sentiment_analysis)

# Display the results
test_data


Unnamed: 0,textID,text,selected_text,sentiment,preprocessed_text,predicted_sentiment_afinn,predicted_sentiment_textblob,predicted_sentiment_vader,predicted_sentiment_sentiwordnet,predicted_sentiment_ensemble
0,cb774db0d1,"I`d have responded, if I were going","I`d have responded, if I were going",neutral,responded going,neutral,neutral,neutral,neutral,neutral
1,549e992a42,Sooo SAD I will miss you here in San Diego!!!,Sooo SAD,negative,sooo sad miss san diego,negative,negative,negative,negative,negative
2,088c60f138,my boss is bullying me...,bullying me,negative,bos bullying,negative,neutral,negative,neutral,negative
3,9642c003ef,what interview! leave me alone,leave me alone,negative,interview leave alone,negative,neutral,negative,neutral,negative
4,358bd9e861,"Sons of ****, why couldn`t they put them on t...","Sons of ****,",negative,son put release already bought,neutral,neutral,neutral,positive,neutral
...,...,...,...,...,...,...,...,...,...,...
27476,4eac33d1c0,wish we could come see u on Denver husband l...,d lost,negative,wish could come see u denver husband lost job ...,negative,neutral,positive,negative,negative
27477,4f4c4fc327,I`ve wondered about rake to. The client has ...,", don`t force",negative,wondered rake client made clear net force devs...,positive,positive,positive,positive,positive
27478,f67aae2310,Yay good for both of you. Enjoy the break - y...,Yay good for both of you.,positive,yay good enjoy break probably need hectic week...,positive,positive,positive,positive,positive
27479,ed167662a5,But it was worth it ****.,But it was worth it ****.,positive,worth,positive,positive,positive,positive,positive


In [18]:
# Score the ensemble labels against the gold 'sentiment' column
report = classification_report(
    test_data['sentiment'],
    test_data['predicted_sentiment_ensemble'],
)
accuracy = accuracy_score(
    test_data['sentiment'],
    test_data['predicted_sentiment_ensemble'],
)

# A single newline-separated print emits the same three lines of output
print(f"Accuracy: {accuracy}", "Classification Report:", report, sep="\n")

Accuracy: 0.6469924675230159
Classification Report:
              precision    recall  f1-score   support

    negative       0.76      0.57      0.65      7781
     neutral       0.70      0.53      0.60     11118
    positive       0.57      0.86      0.68      8582

    accuracy                           0.65     27481
   macro avg       0.67      0.66      0.65     27481
weighted avg       0.67      0.65      0.64     27481

