In [None]:
!pip install pandas
!pip install transformers
!pip install datasets
!pip install tensorflow
!pip install keras
!pip install sklearn
!pip install numpy
!pip install matplotlib
!pip install emoji
!pip install re

In [None]:
import pandas as pd
from transformers import AutoTokenizer, TFAutoModelForSequenceClassification, DataCollatorWithPadding
from datasets import Dataset, DatasetDict
from tensorflow.keras.optimizers import Adam
from sklearn.metrics import classification_report, precision_score, recall_score, f1_score, accuracy_score
import numpy as np
import matplotlib.pyplot as plt
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer

In [None]:
# Sample code to load and merge datasets - replace with actual dataset loading code
def load_and_merge_datasets(file_paths):
    """Read several tab-separated tweet files and stack them into one DataFrame.

    Each file is read without a header row and its three columns are named
    ``TweetID``, ``Sentiment`` and ``Tweet``.

    Parameters
    ----------
    file_paths : iterable of str
        Paths of the TSV files to load, in the order they should be stacked.

    Returns
    -------
    pandas.DataFrame
        All rows of all files with a fresh ``0..n-1`` index. When
        ``file_paths`` is empty, an empty frame with the three columns.
    """
    columns = ['TweetID', 'Sentiment', 'Tweet']
    # Collect the frames first and concatenate once: concatenating inside the loop
    # re-copies all previous rows on every iteration, and starting from an empty
    # object-dtype frame degrades the column dtypes.
    frames = [
        pd.read_csv(file_path, sep='\t', header=None, names=columns)
        for file_path in file_paths
    ]
    if not frames:
        return pd.DataFrame(columns=columns)
    return pd.concat(frames, ignore_index=True)

In [None]:
# Paths to your datasets
train_files = ['twitter-2013train-A.tsv', 'twitter-2015train-A.tsv', 'twitter-2016train-A.tsv']
test_files = ['twitter-2013test-A.tsv', 'twitter-2014test-A.tsv', 'twitter-2015test-A.tsv', 'twitter-2016test-A.tsv']
dev_files = ['twitter-2013dev-A.tsv', 'twitter-2016dev-A.tsv', 'twitter-2016devtest-A.tsv']

In [None]:
# Load and merge datasets
train_data = load_and_merge_datasets(train_files)
test_data = load_and_merge_datasets(test_files)
dev_data = load_and_merge_datasets(dev_files)
# ignore_index=True gives the combined frame its own 0..n-1 index; without it the
# row labels of the three splits would repeat in combined_data.
combined_data = pd.concat([train_data, test_data, dev_data], ignore_index=True)

In [None]:
print(combined_data)

## Visualization of the Sentiment Distribution

In [None]:
# Ensure that the 'Sentiment' column is treated as a string and strip any whitespace
combined_data['Sentiment'] = combined_data['Sentiment'].astype(str).str.strip()

In [None]:
# Map sentiment labels to a consistent format if they're not already.
# The mapping is an identity over the three expected labels; Series.map turns every
# value that is not one of its keys into NaN, so unexpected labels become missing values.
sentiment_mapping = {'positive': 'positive', 'negative': 'negative', 'neutral': 'neutral'}
combined_data['Sentiment'] = combined_data['Sentiment'].map(sentiment_mapping)

Word Cloud

In [None]:
import pandas as pd
from wordcloud import WordCloud
import matplotlib.pyplot as plt

In [None]:
# Run the preprocessing pipeline once and reuse the result for all three sentiment
# classes (the identical call was previously evaluated six times).
# NOTE(review): `preprocess_data` and `train_data_non_nan` are not defined above this
# cell in the visible part of the notebook — confirm they exist before it runs.
# This assumes preprocess_data returns the same frame for the same arguments.
preprocessed_tweets = preprocess_data(data=train_data_non_nan, use_stemming=True, use_lemmatization=True,
                                      remove_emojis=True, remove_urls_and_html_tags=True,
                                      remove_punctuation_and_special_chars=True, remove_stopwords=True)

# One Series of tweet texts per sentiment class.
positive_tweets = preprocessed_tweets[preprocessed_tweets['Sentiment'] == 'positive']['Tweet']
negative_tweets = preprocessed_tweets[preprocessed_tweets['Sentiment'] == 'negative']['Tweet']
neutral_tweets = preprocessed_tweets[preprocessed_tweets['Sentiment'] == 'neutral']["Tweet"]


In [None]:
# Generate Word Clouds
def generate_word_cloud(tweets, title):
    """Draw and show a word cloud built from a collection of tweets.

    Parameters
    ----------
    tweets : object with ``astype``
        The tweet texts; they are cast to ``str`` and joined with single spaces.
    title : str
        Title shown above the figure.
    """
    plt.figure(figsize=(10, 10))
    corpus = " ".join(tweets.astype(str))
    cloud = WordCloud(
        background_color='white',
        max_words=500,
        width=800,
        height=400,
        collocations=False,
    )
    cloud.generate(corpus)
    plt.imshow(cloud, interpolation='bilinear')
    plt.title(title)
    plt.axis('off')
    plt.show()

In [None]:
generate_word_cloud(positive_tweets, 'Word Cloud for Positive Tweets')
generate_word_cloud(negative_tweets, 'Word Cloud for Negative Tweets')
generate_word_cloud(neutral_tweets, 'Word Cloud for Neutral Tweets')

**Confusion Matrix**

In [None]:
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix
import seaborn as sns
import matplotlib.pyplot as plt

In [None]:
# Drop rows with NaN values
combined_data = combined_data.dropna(subset=['Tweet', 'Sentiment'])

In [None]:
# Split the combined dataset into features (tweets) and labels (sentiments)
X = combined_data['Tweet']
y = combined_data['Sentiment']

In [None]:
# Vectorize the features using TF-IDF
vectorizer = TfidfVectorizer(max_features=5000)
X_vectorized = vectorizer.fit_transform(X)

In [None]:
# Split the data into training and test sets (80-20 split)
X_train, X_test, y_train, y_test = train_test_split(X_vectorized, y, test_size=0.2, random_state=42)

In [None]:
# Train the Logistic Regression model
model = LogisticRegression(class_weight='balanced',C=0.8,max_iter=1000)
model.fit(X_train, y_train)

In [None]:
# Predict the labels for the test set
y_pred = model.predict(X_test)

In [None]:
# Create the confusion matrix
cm = confusion_matrix(y_test, y_pred)

In [None]:
# Plot the confusion matrix
# confusion_matrix() was called without `labels`, so its rows and columns follow the
# sorted labels that occur in y_test or y_pred. The tick labels must use that same
# order; taking them from sentiment_mapping.keys() would label the axes incorrectly.
class_labels = sorted(set(y_test) | set(y_pred))

fig, ax = plt.subplots(figsize=(8, 6))
sns.heatmap(cm, annot=True, fmt='d', cmap='Blues', ax=ax,
            xticklabels=class_labels, yticklabels=class_labels)
ax.set_xlabel('Predicted labels')
ax.set_ylabel('True labels')
ax.set_title('Confusion Matrix')
plt.show()

Line Chart of the Sentiment Distribution by Year

In [None]:
import pandas as pd
import matplotlib.pyplot as plt

In [None]:
# File paths for the datasets of each year
file_paths = {
    '2013': ['twitter-2013train-A.tsv', 'twitter-2013dev-A.tsv', 'twitter-2013test-A.tsv'],
    '2014': ['twitter-2014sarcasm-A.tsv', 'twitter-2014test-A.tsv'],
    '2015': ['twitter-2015test-A.tsv', 'twitter-2015train-A.tsv'],
    '2016': ['twitter-2016dev-A.tsv', 'twitter-2016devtest-A.tsv', 'twitter-2016test-A.tsv', 'twitter-2016train-A.tsv']
}

In [None]:
# Initialize a dictionary to hold sentiment counts
sentiment_counts_by_year = {year: {'positive': 0, 'negative': 0, 'neutral': 0} for year in file_paths}

In [None]:
# Load the datasets and count sentiments for each year
# Start from zero on every execution so that re-running this cell does not add the
# same files to the totals a second time.
sentiment_counts_by_year = {year: {'positive': 0, 'negative': 0, 'neutral': 0} for year in file_paths}

for year, paths in file_paths.items():
    for path in paths:
        # Read the dataset
        data = pd.read_csv(path, sep='\t', header=None, names=['TweetID', 'Sentiment', 'Tweet'])

        # Drop rows with NaN values
        data = data.dropna(subset=['Tweet', 'Sentiment'])

        # Count the sentiments
        sentiment_counts = data['Sentiment'].value_counts()

        # Update the sentiment counts for the year; a label missing from this file adds 0
        for sentiment in ('positive', 'negative', 'neutral'):
            sentiment_counts_by_year[year][sentiment] += int(sentiment_counts.get(sentiment, 0))


In [None]:
# Prepare the data for the plot
years = sorted(sentiment_counts_by_year.keys())
positives = [sentiment_counts_by_year[year]['positive'] for year in years]
negatives = [sentiment_counts_by_year[year]['negative'] for year in years]
neutrals = [sentiment_counts_by_year[year]['neutral'] for year in years]

In [None]:
# Create the line chart
plt.figure(figsize=(10, 6))
plt.plot(years, positives, 'g-o', label='Positive')  # Green line with dots for positives
plt.plot(years, negatives, 'r-o', label='Negative')  # Red line with dots for negatives
plt.plot(years, neutrals, 'b-o', label='Neutral')  # Blue line with dots for neutrals
# Title and axis labels so the figure can be read on its own.
plt.title('Sentiment Counts per Year')
plt.xlabel('Year')
plt.ylabel('Number of tweets')
plt.legend()
plt.show()

**Distribution of Sentiments (Bar Plot)**

In [None]:
# Calculate the sentiment counts
sentiment_counts = combined_data['Sentiment'].value_counts()

In [None]:
# Plot the sentiment distribution
# value_counts() orders the bars by frequency, so colours are looked up by label
# instead of by position. This keeps positive=green, negative=red, neutral=blue,
# matching the line chart. Any other label is drawn in gray.
colour_by_sentiment = {'positive': 'green', 'negative': 'red', 'neutral': 'blue'}
bar_colours = [colour_by_sentiment.get(label, 'gray') for label in sentiment_counts.index]

plt.figure(figsize=(8, 6))
sentiment_counts.plot(kind='bar', color=bar_colours)
plt.title('Sentiment Distribution in the Combined Dataset')
plt.xlabel('Sentiment')
plt.ylabel('Count')
plt.xticks(rotation=0)
plt.show()

In [None]:
# Map sentiments to numerical values (neutral=0, negative=1, positive=2).
# Note: this rebinds `sentiment_mapping`, which an earlier cell defined as a
# string-to-string mapping, and it overwrites the 'Sentiment' column of the three
# splits in place. Series.map turns any value that is not a key into NaN, so running
# this cell a second time (when the column already holds 0/1/2) yields all-NaN labels.
sentiment_mapping = {'neutral': 0, 'negative': 1, 'positive': 2}
train_data['Sentiment'] = train_data['Sentiment'].map(sentiment_mapping)
test_data['Sentiment'] = test_data['Sentiment'].map(sentiment_mapping)
dev_data['Sentiment'] = dev_data['Sentiment'].map(sentiment_mapping)

Baseline Model

In [None]:
# Feature extraction using TF-IDF
vectorizer = TfidfVectorizer(max_features=5000)
X_train = vectorizer.fit_transform(train_data['Tweet'].fillna(''))  # Fill NaN with empty strings
y_train = train_data['Sentiment']
X_test = vectorizer.transform(test_data['Tweet'].fillna(''))  # Also fill NaN with empty strings in the test set
y_test = test_data['Sentiment']

In [None]:
# Training a Logistic Regression model as a baseline
model = LogisticRegression(class_weight='balanced',C=0.8 ,max_iter=1000)
model.fit(X_train, y_train)

Removing the rows whose label is NaN, because NaN labels are not allowed during evaluation

In [None]:
# Boolean mask: True for the rows where y_test is not NaN
valid_indices = y_test.notna()

# Apply the same mask to X_test and y_test so features and labels stay row-aligned
X_test_filtered = X_test[valid_indices]
y_test_filtered = y_test[valid_indices]

In [None]:
# Evaluating the model
y_pred = model.predict(X_test_filtered)
report = classification_report(y_test_filtered, y_pred, target_names=['Neutral', 'Negative', 'Positive'])
precision_macro = precision_score(y_test_filtered, y_pred, average='macro')
recall_macro = recall_score(y_test_filtered, y_pred, average='macro')
f1_macro = f1_score(y_test_filtered, y_pred, average='macro')
accuracy = accuracy_score(y_test_filtered, y_pred)

In [None]:
print("Classification Report for Baseline Model:\n", report)
print("Macro-average Precision:", precision_macro)
print("Macro-average Recall:", recall_macro)
print("Macro-average F1-score:", f1_macro)
print("Accuracy:", accuracy)

# Implementing Preprocessing Methods

# Removing Punctuation



In [None]:
def remove_punctuation_and_special_characters(text):
    """Return ``text`` without any character listed in ``string.punctuation``.

    Parameters
    ----------
    text : str
        The text to clean.

    Returns
    -------
    str
        ``text`` with all ASCII punctuation characters removed; every other
        character (letters, digits, whitespace) is kept in order.
    """
    # Imported here because the notebook's `import string` only appears in a later
    # cell, so calling this function before that cell ran would raise a NameError.
    import string

    # A set gives constant-time membership tests while scanning the text.
    punctuation = set(string.punctuation)
    return ''.join(char for char in text if char not in punctuation)

# Removing URLs and HTML tags

In [None]:
import re

In [None]:
def remove_urls_and_html(text):
    """Strip URLs and HTML tags from ``text``.

    Parameters
    ----------
    text : str
        The text to clean.

    Returns
    -------
    str
        ``text`` without ``http://`` / ``https://`` / ``www.`` URLs and without
        anything enclosed in ``<...>``.
    """
    # Require "://" after the scheme and "." after "www" so that ordinary words such
    # as "awwww" or "httpd" are not partly deleted. This is the same pattern that
    # remove_urls uses further down in this notebook.
    text_without_urls = re.sub(r'https?://\S+|www\.\S+', '', text)

    # Non-greedy match so each tag is removed on its own
    text_without_html = re.sub('<.*?>', '', text_without_urls)

    return text_without_html

# Stemming/Lemmatization

In [None]:
import nltk
from nltk.stem import PorterStemmer, WordNetLemmatizer
from nltk.corpus import stopwords
nltk.download('wordnet')
nltk.download('stopwords')

In [None]:
# Initialize stemmer and lemmatizer
stemmer = PorterStemmer()
lemmatizer = WordNetLemmatizer()

def stem_and_lemmatize_text(text):
    """Stem every whitespace-separated token of ``text``, then lemmatize the stems.

    Uses the module-level ``stemmer`` and ``lemmatizer`` objects and returns the
    processed tokens joined by single spaces.
    """
    # Pass 1: stemming
    stems = [stemmer.stem(token) for token in text.split()]
    stemmed_text = ' '.join(stems)
    # Pass 2: lemmatization of the stemmed text
    lemmas = []
    for token in stemmed_text.split():
        lemmas.append(lemmatizer.lemmatize(token))
    return ' '.join(lemmas)

In [None]:
# Work on a copy so that the training set stays untouched for the preprocessing
# methods tried later; the copy holds the stemmed and lemmatized tweets.
train_data_stem_lem = train_data.copy()
# Replace missing tweets with '' before applying the function:
# stem_and_lemmatize_text calls text.split(), which fails on NaN.
train_data_stem_lem['Tweet'] = train_data_stem_lem['Tweet'].fillna('').apply(stem_and_lemmatize_text)

In [None]:
# Fill NaN values with empty strings in both datasets.
# Note: the second assignment changes test_data itself (not a copy), so every later
# cell that reads test_data['Tweet'] sees the filled values.
train_data_stem_lem['Tweet'] = train_data_stem_lem['Tweet'].fillna('')
test_data['Tweet'] = test_data['Tweet'].fillna('')

In [None]:
# Vectorization with TF-IDF
vectorizer_stem_lem = TfidfVectorizer(max_features=5000)
X_train_stem_lem = vectorizer_stem_lem.fit_transform(train_data_stem_lem['Tweet'])
y_train_stem_lem = train_data_stem_lem['Sentiment']
# The test tweets must go through the same stemming + lemmatization as the training
# tweets. Otherwise the vocabulary learned from stemmed text is matched against
# unstemmed words and the evaluation does not measure the effect of this step.
test_tweets_stem_lem = test_data['Tweet'].fillna('').apply(stem_and_lemmatize_text)
X_test_stem_lem = vectorizer_stem_lem.transform(test_tweets_stem_lem)
y_test_stem_lem = test_data['Sentiment']

In [None]:
# Training the model
model_stem_lem = LogisticRegression(class_weight='balanced',C=0.8,max_iter=1000)
model_stem_lem.fit(X_train_stem_lem, y_train_stem_lem)

In [None]:
# Filtering out rows with NaN values in y_test_stem_lem
valid_indices = y_test_stem_lem.notna()

In [None]:
# Ensuring that X_test_stem_lem and y_test_stem_lem have the same rows
X_test_stem_lem_filtered = X_test_stem_lem[valid_indices]
y_test_stem_lem_filtered = y_test_stem_lem[valid_indices]

In [None]:
# Predicting and evaluating the model
y_pred_stem_lem = model_stem_lem.predict(X_test_stem_lem_filtered)
report_stem_lem = classification_report(y_test_stem_lem_filtered, y_pred_stem_lem, target_names=['Neutral', 'Negative', 'Positive'])
precision_macro_stem_lem = precision_score(y_test_stem_lem_filtered, y_pred_stem_lem, average='macro')
recall_macro_stem_lem = recall_score(y_test_stem_lem_filtered, y_pred_stem_lem, average='macro')
f1_macro_stem_lem = f1_score(y_test_stem_lem_filtered, y_pred_stem_lem, average='macro')
accuracy_stem_lem = accuracy_score(y_test_stem_lem_filtered, y_pred_stem_lem)

In [None]:
print("Classification Report for Model with Stemming and Lematization:\n", report_stem_lem)
print("macro-average Precision:", precision_macro_stem_lem)
print("macro-average Recall:", recall_macro_stem_lem)
print("macro-average F1-score:", f1_macro_stem_lem)
print("Accuracy:", accuracy_stem_lem)

As we can tell, stemming and lemmatization decrease the results for a dataset like this.

# Stop Words Removal

In [None]:
nltk.download('stopwords')

In [None]:
from nltk.corpus import stopwords
stop_words = set(stopwords.words('english'))

In [None]:
def remove_stopwords(text):
    """Return ``text`` without the tokens whose lowercase form is in ``stop_words``.

    The text is split on whitespace; kept tokens retain their original casing and
    are joined back with single spaces.
    """
    kept_tokens = []
    for token in text.split():
        if token.lower() in stop_words:
            continue
        kept_tokens.append(token)
    return ' '.join(kept_tokens)

In [None]:
# Create new temporary datasets with stop words removed
train_data_no_stop = train_data.copy()
train_data_no_stop['Tweet'] = train_data_no_stop['Tweet'].apply(remove_stopwords)

test_data_no_stop = test_data.copy()
test_data_no_stop['Tweet'] = test_data_no_stop['Tweet'].apply(remove_stopwords)

dev_data_no_stop = dev_data.copy()
dev_data_no_stop['Tweet'] = dev_data_no_stop['Tweet'].apply(remove_stopwords)

In [None]:
# Vectorization with TF-IDF
vectorizer_no_stop = TfidfVectorizer(max_features=5000)
X_train_no_stop = vectorizer_no_stop.fit_transform(train_data_no_stop['Tweet'])
y_train_no_stop = train_data_no_stop['Sentiment']
X_test_no_stop = vectorizer_no_stop.transform(test_data_no_stop['Tweet'])
y_test_no_stop = test_data_no_stop['Sentiment']

In [None]:
# Training the Logistic Regression model
model_no_stop = LogisticRegression(class_weight='balanced',C=0.8,max_iter=1000)
model_no_stop.fit(X_train_no_stop, y_train_no_stop)

In [None]:
# Find rows where y_test_no_stop is not NaN
valid_indices = y_test_no_stop.notna()

# Filter both X_test_no_stop and y_test_no_stop to remove NaNs
X_test_no_stop_filtered = X_test_no_stop[valid_indices]
y_test_no_stop_filtered = y_test_no_stop[valid_indices]

In [None]:
# Predicting and evaluating the model
y_pred_no_stop = model_no_stop.predict(X_test_no_stop_filtered)
report_no_stop = classification_report(y_test_no_stop_filtered, y_pred_no_stop, target_names=['Neutral', 'Negative', 'Positive'])
precision_macro_no_stop = precision_score(y_test_no_stop_filtered, y_pred_no_stop, average='macro')
recall_macro_no_stop = recall_score(y_test_no_stop_filtered, y_pred_no_stop, average='macro')
f1_macro_no_stop = f1_score(y_test_no_stop_filtered, y_pred_no_stop, average='macro')
accuracy_no_stop = accuracy_score(y_test_no_stop_filtered, y_pred_no_stop)

In [None]:
print("Classification Report for Model with Stop Words Removed:\n", report_no_stop)
print("macro-average Precision:", precision_macro_no_stop)
print("macro-average Recall:", recall_macro_no_stop)
print("macro-average F1-score:", f1_macro_no_stop)
print("Accuracy:", accuracy_no_stop)

# Lowercasing

In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.svm import SVC
from sklearn.metrics import classification_report, precision_score, recall_score, f1_score, accuracy_score

In [None]:
# Copying the raw datasets
train_data = train_data.copy()
test_data = test_data.copy()
dev_data = dev_data.copy()

In [None]:
# Applying lowercasing to the copied datasets.
# Each split is copied first, so train_data / test_data / dev_data keep their casing.
# NOTE(review): TfidfVectorizer's documented default is lowercase=True, so the TF-IDF
# features built from these frames are presumably identical to the ones built from
# the non-lowercased text — confirm before attributing metric changes to this step.
train_data_lower = train_data.copy()
train_data_lower['Tweet'] = train_data_lower['Tweet'].str.lower()

test_data_lower = test_data.copy()
test_data_lower['Tweet'] = test_data_lower['Tweet'].str.lower()

dev_data_lower = dev_data.copy()
dev_data_lower['Tweet'] = dev_data_lower['Tweet'].str.lower()

In [None]:
# Vectorization with TF-IDF
vectorizer_lower = TfidfVectorizer(max_features=5000)
X_train_lower = vectorizer_lower.fit_transform(train_data_lower['Tweet'])
y_train_lower = train_data_lower['Sentiment']
X_test_lower = vectorizer_lower.transform(test_data_lower['Tweet'])
y_test_lower = test_data_lower['Sentiment']

In [None]:
# Training the SVM model (others were with Logistic Regression)
model_lower = SVC(kernel='linear')  # Using a linear kernel
model_lower.fit(X_train_lower, y_train_lower)

In [None]:
# Boolean mask: True for the rows where y_test_lower is not NaN
valid_indices = y_test_lower.notna()

# Apply the same mask to X_test_lower and y_test_lower so they stay row-aligned
X_test_lower_filtered = X_test_lower[valid_indices]
y_test_lower_filtered = y_test_lower[valid_indices]

In [None]:
# Predicting and evaluating the model
y_pred_lower = model_lower.predict(X_test_lower_filtered)
report_lower = classification_report(y_test_lower_filtered, y_pred_lower, target_names=['Neutral', 'Negative', 'Positive'])
precision_macro_lower = precision_score(y_test_lower_filtered, y_pred_lower, average='macro')
recall_macro_lower = recall_score(y_test_lower_filtered, y_pred_lower, average='macro')
f1_macro_lower = f1_score(y_test_lower_filtered, y_pred_lower, average='macro')
accuracy_lower = accuracy_score(y_test_lower_filtered, y_pred_lower)

In [None]:
# Print the evaluation metrics
print("Classification Report for Lowercased Model:\n", report_lower)
print("macro-average Precision:", precision_macro_lower)
print("macro-average Recall:", recall_macro_lower)
print("macro-average F1-score:", f1_macro_lower)
print("Accuracy:", accuracy_lower)

**Training The Model with Baseline (Logistic Regression)**

In [None]:
# Vectorization with TF-IDF
vectorizer_lower = TfidfVectorizer(max_features=5000)
X_train_lower = vectorizer_lower.fit_transform(train_data_lower['Tweet'])
y_train_lower = train_data_lower['Sentiment']
X_test_lower = vectorizer_lower.transform(test_data_lower['Tweet'])
y_test_lower = test_data_lower['Sentiment']

In [None]:
# Training the Logistic Regression model
model_lower_lr = LogisticRegression(class_weight='balanced',C=0.8,max_iter=1000)
model_lower_lr.fit(X_train_lower, y_train_lower)

In [None]:
# Filter out NaN values and align X and y
valid_indices = y_test_lower.notna()
X_test_lower_filtered = X_test_lower[valid_indices]
y_test_lower_filtered = y_test_lower[valid_indices]

In [None]:
# Predicting and evaluating the model
y_pred_lower_lr = model_lower_lr.predict(X_test_lower_filtered)
report_lower_lr = classification_report(y_test_lower_filtered, y_pred_lower_lr, target_names=['Neutral', 'Negative', 'Positive'])
precision_macro_lower_lr = precision_score(y_test_lower_filtered, y_pred_lower_lr, average='macro')
recall_macro_lower_lr = recall_score(y_test_lower_filtered, y_pred_lower_lr, average='macro')
f1_macro_lower_lr = f1_score(y_test_lower_filtered, y_pred_lower_lr, average='macro')
accuracy_lower_lr = accuracy_score(y_test_lower_filtered, y_pred_lower_lr)

In [None]:
# Print the evaluation metrics
print("Classification Report for Lowercased Model with Logistic Regression:\n", report_lower_lr)
print("Macro-average Precision:", precision_macro_lower_lr)
print("Macro-average Recall:", recall_macro_lower_lr)
print("Macro-average F1-score:", f1_macro_lower_lr)
print("Accuracy:", accuracy_lower_lr)

# Removing Punctuation and Special Characters

In [None]:
import string
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, precision_score, recall_score, f1_score, accuracy_score

In [None]:
def remove_punctuation(text):
    """Return ``text`` with every character of ``string.punctuation`` deleted."""
    # Mapping a code point to None tells str.translate to drop that character.
    deletion_table = dict.fromkeys(map(ord, string.punctuation))
    return text.translate(deletion_table)

In [None]:
# Copying the raw datasets
train_data = train_data.copy()
test_data = test_data.copy()
dev_data = dev_data.copy()

In [None]:
# Apply the function to remove punctuation and special characters
train_data_no_punct = train_data.copy()
train_data_no_punct['Tweet'] = train_data_no_punct['Tweet'].apply(remove_punctuation)

test_data_no_punct = test_data.copy()
test_data_no_punct['Tweet'] = test_data_no_punct['Tweet'].apply(remove_punctuation)

dev_data_no_punct = dev_data.copy()
dev_data_no_punct['Tweet'] = dev_data_no_punct['Tweet'].apply(remove_punctuation)

In [None]:
# Vectorization with TF-IDF
vectorizer_no_punct = TfidfVectorizer(max_features=5000)
X_train_no_punct = vectorizer_no_punct.fit_transform(train_data_no_punct['Tweet'])
y_train_no_punct = train_data_no_punct['Sentiment']
X_test_no_punct = vectorizer_no_punct.transform(test_data_no_punct['Tweet'])
y_test_no_punct = test_data_no_punct['Sentiment']

In [None]:
# Training the Random Forest model
# A fixed random_state makes the forest, and therefore the metrics reported below,
# reproducible across runs (42 matches the seed used for train_test_split above).
model_no_punct = RandomForestClassifier(random_state=42)
model_no_punct.fit(X_train_no_punct, y_train_no_punct)

In [None]:
# Boolean mask: True for the rows where y_test_no_punct is not NaN
valid_indices = y_test_no_punct.notna()

# Apply the same mask to X_test_no_punct and y_test_no_punct so they stay row-aligned
X_test_no_punct_filtered = X_test_no_punct[valid_indices]
y_test_no_punct_filtered = y_test_no_punct[valid_indices]

In [None]:
# Predicting and evaluating the model
y_pred_no_punct = model_no_punct.predict(X_test_no_punct_filtered)
report_no_punct = classification_report(y_test_no_punct_filtered, y_pred_no_punct, target_names=['Neutral', 'Negative', 'Positive'])
precision_macro_no_punct = precision_score(y_test_no_punct_filtered, y_pred_no_punct, average='macro')
recall_macro_no_punct = recall_score(y_test_no_punct_filtered, y_pred_no_punct, average='macro')
f1_macro_no_punct = f1_score(y_test_no_punct_filtered, y_pred_no_punct, average='macro')
accuracy_no_punct = accuracy_score(y_test_no_punct_filtered, y_pred_no_punct)

In [None]:
# Print the evaluation metrics
print("Classification Report for Punctuation Model:\n", report_no_punct)
print("macro-average Precision:", precision_macro_no_punct)
print("macro-average Recall:", recall_macro_no_punct)
print("macro-average F1-score:", f1_macro_no_punct)
print("Accuracy:", accuracy_no_punct)

**Training The Model with Baseline (Logistic Regression)**

In [None]:
# Training the Logistic Regression model
model_no_punct_lr = LogisticRegression(class_weight='balanced',C=0.8,max_iter=1000)
model_no_punct_lr.fit(X_train_no_punct, y_train_no_punct)

In [None]:
# Filter out rows where y_test_no_punct is NaN
valid_indices = y_test_no_punct.notna()
X_test_no_punct_filtered = X_test_no_punct[valid_indices]
y_test_no_punct_filtered = y_test_no_punct[valid_indices]

In [None]:
# Predicting and evaluating the model with the filtered data
y_pred_no_punct_lr = model_no_punct_lr.predict(X_test_no_punct_filtered)
report_no_punct_lr = classification_report(y_test_no_punct_filtered, y_pred_no_punct_lr, target_names=['Neutral', 'Negative', 'Positive'])
precision_macro_no_punct_lr = precision_score(y_test_no_punct_filtered, y_pred_no_punct_lr, average='macro')
recall_macro_no_punct_lr = recall_score(y_test_no_punct_filtered, y_pred_no_punct_lr, average='macro')
f1_macro_no_punct_lr = f1_score(y_test_no_punct_filtered, y_pred_no_punct_lr, average='macro')
accuracy_no_punct_lr = accuracy_score(y_test_no_punct_filtered, y_pred_no_punct_lr)

In [None]:
# Print the evaluation metrics
print("Classification Report for Logistic Regression Model with Punctuation and Special Characters Removed:\n", report_no_punct_lr)
print("Macro-average Precision:", precision_macro_no_punct_lr)
print("Macro-average Recall:", recall_macro_no_punct_lr)
print("Macro-average F1-score:", f1_macro_no_punct_lr)
print("Accuracy:", accuracy_no_punct_lr)

# Removing URLs and HTML Tags

In [None]:
import numpy as np
import re
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Embedding, GlobalAveragePooling1D
from tensorflow.keras.utils import to_categorical
from sklearn.metrics import classification_report, precision_score, recall_score, f1_score, accuracy_score

In [None]:
def remove_urls(text):
    """Return ``text`` with ``http://`` / ``https://`` and ``www.`` URLs removed."""
    return re.sub(r'https?://\S+|www\.\S+', '', text)

def remove_html_tags(text):
    """Return ``text`` with every ``<...>`` span removed (shortest match per tag)."""
    return re.sub('<.*?>', '', text)

In [None]:
# Copying the raw datasets
train_data_no_url_html = train_data.copy()
train_data_no_url_html['Tweet'] = train_data_no_url_html['Tweet'].apply(remove_urls).apply(remove_html_tags)

test_data_no_url_html = test_data.copy()
test_data_no_url_html['Tweet'] = test_data_no_url_html['Tweet'].apply(remove_urls).apply(remove_html_tags)

dev_data_no_url_html = dev_data.copy()
dev_data_no_url_html['Tweet'] = dev_data_no_url_html['Tweet'].apply(remove_urls).apply(remove_html_tags)

In [None]:
# Filter out NaN values
train_data_no_url_html = train_data_no_url_html.dropna()
test_data_no_url_html = test_data_no_url_html.dropna()

In [None]:
# Tokenization and padding
tokenizer = Tokenizer(num_words=5000)
tokenizer.fit_on_texts(train_data_no_url_html['Tweet'])

X_train_nn = tokenizer.texts_to_sequences(train_data_no_url_html['Tweet'])
X_train_nn = pad_sequences(X_train_nn, maxlen=100)

X_test_nn = tokenizer.texts_to_sequences(test_data_no_url_html['Tweet'])
X_test_nn = pad_sequences(X_test_nn, maxlen=100)

In [None]:
# Convert labels to categorical
# Fix the number of classes to 3 (neutral=0, negative=1, positive=2) instead of
# letting to_categorical infer it from the largest label present, so both arrays
# always have three columns, matching the 3-unit softmax output layer of the model.
y_train_nn = to_categorical(np.array(train_data_no_url_html['Sentiment']), num_classes=3)
y_test_nn = to_categorical(np.array(test_data_no_url_html['Sentiment']), num_classes=3)

In [None]:
# Neural Network Model
model_nn = Sequential()
model_nn.add(Embedding(input_dim=5000, output_dim=64, input_length=100))
model_nn.add(GlobalAveragePooling1D())
model_nn.add(Dense(3, activation='softmax'))  # Assuming 3 classes: Neutral, Negative, Positive

model_nn.compile(optimizer='adam', loss='categorical_crossentropy', metrics=['accuracy'])
model_nn.fit(X_train_nn, y_train_nn, epochs=10, batch_size=32)

In [None]:
# Predicting and evaluating the model
y_pred_nn = np.argmax(model_nn.predict(X_test_nn), axis=-1)

In [None]:
# Convert y_test from categorical to single label
y_test_single_label = np.argmax(y_test_nn, axis=-1)

In [None]:
# Classification report and other metrics
report_nn = classification_report(y_test_single_label, y_pred_nn, target_names=['Neutral', 'Negative', 'Positive'])
precision_macro_nn = precision_score(y_test_single_label, y_pred_nn, average='macro')
recall_macro_nn = recall_score(y_test_single_label, y_pred_nn, average='macro')
f1_macro_nn = f1_score(y_test_single_label, y_pred_nn, average='macro')
accuracy_nn = accuracy_score(y_test_single_label, y_pred_nn)

In [None]:
# Print the evaluation metrics
print("Classification Report for Neural Network Model:\n", report_nn)
print("macro-average Precision:", precision_macro_nn)
print("macro-average Recall:", recall_macro_nn)
print("macro-average F1-score:", f1_macro_nn)
print("Accuracy:", accuracy_nn)

Training The Model with Baseline (Logistic Regression)

In [None]:
# Filter out NaN values
train_data_no_url_html = train_data_no_url_html.dropna()
test_data_no_url_html = test_data_no_url_html.dropna()

In [None]:
# Vectorization with TF-IDF
vectorizer_no_url_html = TfidfVectorizer(max_features=5000)
X_train_no_url_html = vectorizer_no_url_html.fit_transform(train_data_no_url_html['Tweet'])
y_train_no_url_html = train_data_no_url_html['Sentiment']
X_test_no_url_html = vectorizer_no_url_html.transform(test_data_no_url_html['Tweet'])
y_test_no_url_html = test_data_no_url_html['Sentiment']

In [None]:
# Training the Logistic Regression model
model_no_url_html_lr = LogisticRegression(class_weight='balanced', C=0.8, max_iter=1000)
model_no_url_html_lr.fit(X_train_no_url_html, y_train_no_url_html)

In [None]:
# Predicting and evaluating the model
y_pred_no_url_html_lr = model_no_url_html_lr.predict(X_test_no_url_html)
report_no_url_html_lr = classification_report(y_test_no_url_html, y_pred_no_url_html_lr, target_names=['Neutral', 'Negative', 'Positive'])
precision_macro_no_url_html_lr = precision_score(y_test_no_url_html, y_pred_no_url_html_lr, average='macro')
recall_macro_no_url_html_lr = recall_score(y_test_no_url_html, y_pred_no_url_html_lr, average='macro')
f1_macro_no_url_html_lr = f1_score(y_test_no_url_html, y_pred_no_url_html_lr, average='macro')
accuracy_no_url_html_lr = accuracy_score(y_test_no_url_html, y_pred_no_url_html_lr)

In [None]:
# Print the evaluation metrics
print("Classification Report for Logistic Regression Model with URLs and HTML Tags Removed:\n", report_no_url_html_lr)
print("Macro-average Precision:", precision_macro_no_url_html_lr)
print("Macro-average Recall:", recall_macro_no_url_html_lr)
print("Macro-average F1-score:", f1_macro_no_url_html_lr)
print("Accuracy:", accuracy_no_url_html_lr)

# Removing Numbers

In [None]:
import re
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, precision_score, recall_score, f1_score, accuracy_score

In [None]:
def remove_numbers(text):
    """Return ``text`` with every run of digits deleted."""
    digit_run = re.compile(r'\d+')
    return digit_run.sub('', text)

In [None]:
# Copying the raw datasets
train_data_no_numbers = train_data.copy()
train_data_no_numbers['Tweet'] = train_data_no_numbers['Tweet'].apply(remove_numbers)

test_data_no_numbers = test_data.copy()
test_data_no_numbers['Tweet'] = test_data_no_numbers['Tweet'].apply(remove_numbers)

dev_data_no_numbers = dev_data.copy()
dev_data_no_numbers['Tweet'] = dev_data_no_numbers['Tweet'].apply(remove_numbers)

In [None]:
# Filter out NaN values
train_data_no_numbers = train_data_no_numbers.dropna()
test_data_no_numbers = test_data_no_numbers.dropna()
dev_data_no_numbers = dev_data_no_numbers.dropna()

In [None]:
# Vectorization with TF-IDF
vectorizer_no_numbers = TfidfVectorizer(max_features=5000)
X_train_no_numbers = vectorizer_no_numbers.fit_transform(train_data_no_numbers['Tweet'])
y_train_no_numbers = train_data_no_numbers['Sentiment']
X_test_no_numbers = vectorizer_no_numbers.transform(test_data_no_numbers['Tweet'])
y_test_no_numbers = test_data_no_numbers['Sentiment']

In [None]:
# Training the Logistic Regression model
model_no_numbers = LogisticRegression(class_weight='balanced',C=0.8,max_iter=1000)
model_no_numbers.fit(X_train_no_numbers, y_train_no_numbers)

In [None]:
# Predicting and evaluating the model
y_pred_no_numbers = model_no_numbers.predict(X_test_no_numbers)
report_no_numbers = classification_report(y_test_no_numbers, y_pred_no_numbers, target_names=['Neutral', 'Negative', 'Positive'])
precision_macro_no_numbers = precision_score(y_test_no_numbers, y_pred_no_numbers, average='macro')
recall_macro_no_numbers = recall_score(y_test_no_numbers, y_pred_no_numbers, average='macro')
f1_macro_no_numbers = f1_score(y_test_no_numbers, y_pred_no_numbers, average='macro')
accuracy_no_numbers = accuracy_score(y_test_no_numbers, y_pred_no_numbers)

In [None]:
# Print the evaluation metrics
print("Classification Report for Logistic Regression Model with Numbers Removed:\n", report_no_numbers)
print("macro-average Precision:", precision_macro_no_numbers)
print("macro-average Recall:", recall_macro_no_numbers)
print("macro-average F1-score:", f1_macro_no_numbers)
print("Accuracy:", accuracy_no_numbers)