## loading datasets
i'll start by loading the datasets into pandas dataframes, and then i'll perform some basic cleaning and preprocessing.

In [119]:
# import the necessary libraries
import pandas as pd
import os
import nltk
from nltk.tokenize import sent_tokenize
# define the path to the data
path_to_data = '../data/raw/'

In [120]:
# read the csv files content
with open(os.path.join(path_to_data, 'processedPositive.csv'), 'r') as file:
    happy_content = file.read()
with open(os.path.join(path_to_data, 'processedNegative.csv'), 'r') as file:
    sad_content = file.read()
with open(os.path.join(path_to_data, 'processedNeutral.csv'), 'r') as file:
    neutral_content = file.read()

# split the content into individual tweets
happy_sentences = sent_tokenize(happy_content)
happy_tweets = pd.DataFrame(happy_sentences, columns=['text'])
happy_tweets['sentiment'] = 1
sad_sentences = sent_tokenize(sad_content)
sad_tweets = pd.DataFrame(sad_sentences, columns=['text'])
sad_tweets['sentiment'] = -1
neutral_sentences = sent_tokenize(neutral_content)
neutral_tweets = pd.DataFrame(neutral_sentences, columns=['text'])
neutral_tweets['sentiment'] = 0

print(happy_tweets.head())
print(happy_tweets.shape)
print(sad_tweets.head())
print(sad_tweets.shape)
print(neutral_tweets.head())
print(neutral_tweets.shape)


                                                text  sentiment
0  An inspiration in all aspects: Fashion, fitnes...          1
1  :)KISSES TheFashionIcon,Apka Apna Awam Ka Chan...          1
2  Can you donate?,Omg he... kissed... him crying...          1
3  love love happy,thanks happy,C'mon Tweeps, Joi...          1
4                                Do spread the word.          1
(587, 2)
                                                text  sentiment
0  How unhappy  some dogs like it though,talking ...         -1
1  I got some money  I need to change into R but ...         -1
2  unhappy ,it's that A*dy guy from pop Asia and ...         -1
3                            Is this how I find out.         -1
4                                Everyone knows now.         -1
(357, 2)
                                                text  sentiment
0  Pak PM survives removal scare, but court order...          0
1  ,Supreme Court quashes criminal complaint agai...          0
2  ,FCRA slap on NGO f

##  merge and clean the data
in this step we'll clean the data by removing the duplicates and stop words, which are often meaningless words that can add noise to the dataset rather than meaningful information

In [121]:
merged_tweets = pd.concat([happy_tweets, sad_tweets, neutral_tweets], ignore_index=True)
merged_tweets.dropna(inplace=True)
merged_tweets.reset_index(drop=True, inplace=True)

print(merged_tweets.head())
print(merged_tweets.shape)

                                                text  sentiment
0  An inspiration in all aspects: Fashion, fitnes...          1
1  :)KISSES TheFashionIcon,Apka Apna Awam Ka Chan...          1
2  Can you donate?,Omg he... kissed... him crying...          1
3  love love happy,thanks happy,C'mon Tweeps, Joi...          1
4                                Do spread the word.          1
(1628, 2)


In [122]:
import nltk
from nltk.corpus import stopwords

# transform the text to lowercase
merged_tweets['text'] = merged_tweets['text'].str.lower()

nltk.download('stopwords')
# remove stop words
merged_tweets['text'] = merged_tweets['text'].apply(lambda x: ' '.join([word for word in x.split() if word not in stopwords.words('english')]))

# remove duplicates
merged_tweets.drop_duplicates(subset=['text'], inplace=True)
merged_tweets.reset_index(drop=True, inplace=True)

print(merged_tweets.head())
print(merged_tweets.shape)


[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/shrooma/miniconda3/envs/tweets/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


                                                text  sentiment
0  inspiration aspects: fashion, fitness, beauty ...          1
1  :)kisses thefashionicon,apka apna awam ka chan...          1
2  donate?,omg he... kissed... crying joy,happy a...          1
3  love love happy,thanks happy,c'mon tweeps, joi...          1
4                                       spread word.          1
(1533, 2)


## split the data with stratification
making a split on the train and test (20%) datasets with stratification.

In [123]:
from sklearn.model_selection import train_test_split

train_tweets, test_tweets = train_test_split(merged_tweets,
                                             test_size=0.2,
                                             stratify=merged_tweets['sentiment'],
                                             random_state=42)

print(train_tweets.shape)
print(test_tweets.shape)

(1226, 2)
(307, 2)


In [124]:
# check the distribution of the sentiment in the original, train and test datasets
print("Original dataset:")
print(merged_tweets['sentiment'].value_counts(normalize=True))
print("Train dataset:")
print(train_tweets['sentiment'].value_counts(normalize=True))
print("Test dataset:")
print(test_tweets['sentiment'].value_counts(normalize=True))



Original dataset:
sentiment
 0    0.397913
 1    0.371168
-1    0.230920
Name: proportion, dtype: float64
Train dataset:
sentiment
 0    0.398042
 1    0.371126
-1    0.230832
Name: proportion, dtype: float64
Test dataset:
sentiment
 0    0.397394
 1    0.371336
-1    0.231270
Name: proportion, dtype: float64


## preprocess and vectorize the data

in this step i'll prepare multiple datasets, each dataset is a combination of a preprocessing approach and a vectorization approach.

-- preprocessing approaches :
- stemming
- lemmatization
- stemming + mispellings correction
- lemmatization + mispellings correction

-- vectorization approaches :
- binary vectorization
- word counts
- tf-idf

In [125]:
import sys

parent_dir = os.path.abspath(os.path.join(os.getcwd(), os.pardir))
sys.path.append(parent_dir)

# import the preprocessing and vectorization functions
from utils.text_preprocessing import stem_text, lemmatize_text, correct_spelling
from utils.text_vectorization import binary_vectorizer, count_vectorizer, tfidf_vectorizer


In [126]:
# dictionnary to store the datasets
datasets = {}
vectorizers = {}

# preprocessing and vectorization dictionary
preprocessing_methods = {
    'stemming' : stem_text,
    'lemmatization' : lemmatize_text,
    'stemming_misspelling_correction' : lambda x: correct_spelling(stem_text(x)),
    'lemmatization_misspelling_correction' : lambda x: correct_spelling(lemmatize_text(x))
}

vectorization_methods = {
    'binary' : binary_vectorizer,
    'word_counts' : count_vectorizer,
    'tf-idf' : tfidf_vectorizer
}



In [127]:
for prep_name, prep_func in preprocessing_methods.items():
    # preprocess the train and test data
    train_preprocessed = train_tweets['text'].apply(prep_func)
    test_preprocessed = test_tweets['text'].apply(prep_func)

    for vec_name, vec_fund in vectorization_methods.items():
        train_vectorized, vectorizer = vec_fund(train_preprocessed)
        test_vectorized = vectorizer.transform(test_preprocessed)

        # store the datasets with labels
        dataset_key = f'{prep_name}_{vec_name}'
        datasets[dataset_key] = (train_vectorized, train_tweets['sentiment'])
        datasets[f'{dataset_key}_test'] = (test_vectorized, test_tweets['sentiment'])

        vectorizers[dataset_key] = vectorizer

print(datasets.keys())

dict_keys(['stemming_binary', 'stemming_binary_test', 'stemming_word_counts', 'stemming_word_counts_test', 'stemming_tf-idf', 'stemming_tf-idf_test', 'lemmatization_binary', 'lemmatization_binary_test', 'lemmatization_word_counts', 'lemmatization_word_counts_test', 'lemmatization_tf-idf', 'lemmatization_tf-idf_test', 'stemming_misspelling_correction_binary', 'stemming_misspelling_correction_binary_test', 'stemming_misspelling_correction_word_counts', 'stemming_misspelling_correction_word_counts_test', 'stemming_misspelling_correction_tf-idf', 'stemming_misspelling_correction_tf-idf_test', 'lemmatization_misspelling_correction_binary', 'lemmatization_misspelling_correction_binary_test', 'lemmatization_misspelling_correction_word_counts', 'lemmatization_misspelling_correction_word_counts_test', 'lemmatization_misspelling_correction_tf-idf', 'lemmatization_misspelling_correction_tf-idf_test'])


now, we'll try to see the content of each dataset in the dataset dictionnary

In [128]:
for dataset_name, (vectorized_data, labels) in datasets.items():
    print(f"Dataset: {dataset_name}")
    print("Data :", vectorized_data.shape)
    print("Labels :", labels.unique()) 

Dataset: stemming_binary
Data : (1226, 4563)
Labels : [ 1  0 -1]
Dataset: stemming_binary_test
Data : (307, 4563)
Labels : [ 1 -1  0]
Dataset: stemming_word_counts
Data : (1226, 4563)
Labels : [ 1  0 -1]
Dataset: stemming_word_counts_test
Data : (307, 4563)
Labels : [ 1 -1  0]
Dataset: stemming_tf-idf
Data : (1226, 4563)
Labels : [ 1  0 -1]
Dataset: stemming_tf-idf_test
Data : (307, 4563)
Labels : [ 1 -1  0]
Dataset: lemmatization_binary
Data : (1226, 4803)
Labels : [ 1  0 -1]
Dataset: lemmatization_binary_test
Data : (307, 4803)
Labels : [ 1 -1  0]
Dataset: lemmatization_word_counts
Data : (1226, 4803)
Labels : [ 1  0 -1]
Dataset: lemmatization_word_counts_test
Data : (307, 4803)
Labels : [ 1 -1  0]
Dataset: lemmatization_tf-idf
Data : (1226, 4803)
Labels : [ 1  0 -1]
Dataset: lemmatization_tf-idf_test
Data : (307, 4803)
Labels : [ 1 -1  0]
Dataset: stemming_misspelling_correction_binary
Data : (1226, 4563)
Labels : [ 1  0 -1]
Dataset: stemming_misspelling_correction_binary_test
Data 

In [129]:
first_dataset_name = list(datasets.keys())[0]
vectorized_data, labels = datasets[first_dataset_name]

index = 384

original_text = train_tweets.iloc[index]
label = labels.iloc[index]

print(f"Original Text: {original_text}")
print(f"Label: {label}")


Original Text: text         ,from may 1, vehicles: gadkari.,no cars pm, mi...
sentiment                                                    0
Name: 939, dtype: object
Label: 0


## top 10 similar tweets

In [130]:
from sklearn.metrics.pairwise import cosine_similarity
import numpy as np

In [131]:
def find_top_n_similar_tweets(vectorized_data, n=10):
    # compute cosine similarity matrix
    similarity_matrix = cosine_similarity(vectorized_data)
    
    # set the diagonal of the similarity matrix to 0 to ignore self-similarity
    np.fill_diagonal(similarity_matrix, 0)

    # since the matrix is symmetric, we need one triangle (upper or lower)
    num_tweets = similarity_matrix.shape[0]
    upper_triangle_indices = np.triu_indices(num_tweets, k=1)
    flat_similarities = similarity_matrix[upper_triangle_indices]

    # get the indices of the top 10 most similar tweet pairs
    top_10_indices_flat = np.argsort(flat_similarities)[-10:]
    top_10_pairs = [(upper_triangle_indices[0][i], upper_triangle_indices[1][i]) for i in top_10_indices_flat]

    # print the indices and similarity scores of the top 10 most similar tweet pairs
    for idx1, idx2 in top_10_pairs:
        tweet1 = train_tweets.iloc[idx1]['text']
        tweet2 = train_tweets.iloc[idx2]['text']
        print(f"Tweet indices : ({idx1}, {idx2}), Similarity: {similarity_matrix[idx1, idx2]}")
        print(f"Tweet 1 : {tweet1}")
        print(f"Tweet 2: {tweet2}")
        print("-" * 50)

In [132]:
top_similar_tweets = {}
for dataset_name, (vectorized_data, labels) in datasets.items():
    if dataset_name.endswith('_test'):
        continue
    print(" " * 100)
    print("=" * 100)
    print(f"top 10 similar pair of tweets for the {dataset_name} dataset:")
    print("=" * 100)
    print(" " * 100)
    top_similar_tweets[dataset_name] = find_top_n_similar_tweets(vectorized_data)


                                                                                                    
top 10 similar pair of tweets for the stemming_binary dataset:
                                                                                                    
Tweet indices : (678, 1196), Similarity: 0.8819171036881966
Tweet 1 : i,thanks recent follow happy connect happy great thursday.
Tweet 2: get free?,thanks recent follow happy connect happy great thursday.
--------------------------------------------------
Tweet indices : (490, 678), Similarity: 0.8819171036881966
Tweet 1 : happy,thanks recent follow happy connect happy great thursday.,thank much willow!
Tweet 2: i,thanks recent follow happy connect happy great thursday.
--------------------------------------------------
Tweet indices : (160, 1011), Similarity: 0.8819171036881966
Tweet 1 : know cute happy,sone cant stream genie!you stream ty genie!
Tweet 2: happy,sone cant stream genie!you stream ty genie!
--------------------

## sentiment analysis
now i can use some of the different datasets that i prepared for sentiment analysis using different algorithms 

datasets used for the sentiment analysis task : 
- stemming_binary 
- lemmatization_tfidf
- stemming_misspelling_correction_word_counts
- lemmatization_misspelling_correction_binary
- lemmatization_misspelling_correction_tf-idf

In [133]:
import joblib
import os

## first i'll start by defining a function that saves the model and a function that loads the model

def save_model(model, folder_name, file_name):
    if not os.path.exists(folder_name):
        os.makedirs(folder_name)
    
    # save the model
    model_path = os.path.join(folder_name, file_name)
    joblib.dump(model, model_path)
    print(f"Model saved to {model_path}")

def load_model(folder_name, file_name):
    model_path = os.path.join(folder_name, file_name)
    model = joblib.load(model_path)
    print(f"Model loaded from {model_path}")
    return model

In [134]:
# defining the datasets that i want to use
selected_datasets = [
    "stemming_binary",
    "lemmatization_tf-idf",
    "stemming_misspelling_correction_word_counts", 
    "lemmatization_misspelling_correction_binary", 
    "lemmatization_misspelling_correction_tf-idf"
    ]

## Sentiment analysis using Logistic regression

In [135]:
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report, precision_score, recall_score, f1_score

In [136]:
# a dict to store the metrics for Logistic regression model across different datasets
Logistic_regression_metrics = {}

In [137]:
for dataset_name in selected_datasets:
    # load train and test data
    X_train, y_train = datasets[dataset_name]
    X_test, y_test = datasets[dataset_name + "_test"]

    lr_model = LogisticRegression(max_iter=1000)
    lr_model.fit(X_train, y_train)

    y_pred = lr_model.predict(X_test)

    # calculate accuracy metrics
    accuracy = accuracy_score(y_test, y_pred)

    Logistic_regression_metrics[dataset_name] = {
        "accuracy": accuracy
    }

    save_model(lr_model, 'lr_models', f"lr_{dataset_name}_model.joblib")

# print results
for dataset_name, metrics in Logistic_regression_metrics.items():
    print(f"\nMetrics for {dataset_name}:")
    for metric, value in metrics.items():
        print(f"{metric.capitalize()}: {value:.4f}")

Model saved to lr_models/lr_stemming_binary_model.joblib
Model saved to lr_models/lr_lemmatization_tf-idf_model.joblib
Model saved to lr_models/lr_stemming_misspelling_correction_word_counts_model.joblib
Model saved to lr_models/lr_lemmatization_misspelling_correction_binary_model.joblib
Model saved to lr_models/lr_lemmatization_misspelling_correction_tf-idf_model.joblib

Metrics for stemming_binary:
Accuracy: 0.8958

Metrics for lemmatization_tf-idf:
Accuracy: 0.8827

Metrics for stemming_misspelling_correction_word_counts:
Accuracy: 0.8958

Metrics for lemmatization_misspelling_correction_binary:
Accuracy: 0.8925

Metrics for lemmatization_misspelling_correction_tf-idf:
Accuracy: 0.8827


## Sentiment analysis using SVM

In [138]:
from sklearn.svm import SVC

In [139]:
# a dict to store the metrics for SVM model across different datasets
SVM_metrics = {}

In [140]:
for dataset_name in selected_datasets:
    # load train and test data
    X_train, y_train = datasets[dataset_name]
    X_test, y_test = datasets[dataset_name + "_test"]

    # train the SVM model
    svm_model = SVC(kernel='linear')
    svm_model.fit(X_train, y_train)

    # predicting with SVM
    svm_predictions = svm_model.predict(X_test)

    # calculat model metrics
    accuracy = accuracy_score(y_test, svm_predictions)

    SVM_metrics[dataset_name] = {
        "accuracy" : accuracy
    }

    save_model(svm_model, 'svm_models', f"svm_{dataset_name}_model.joblib")

    
# print results
for dataset_name, metrics in SVM_metrics.items():
    print(f"\nMetrics for {dataset_name}:")
    for metric, value in metrics.items():
        print(f"{metric.capitalize()}: {value:.4f}")

Model saved to svm_models/svm_stemming_binary_model.joblib
Model saved to svm_models/svm_lemmatization_tf-idf_model.joblib
Model saved to svm_models/svm_stemming_misspelling_correction_word_counts_model.joblib
Model saved to svm_models/svm_lemmatization_misspelling_correction_binary_model.joblib
Model saved to svm_models/svm_lemmatization_misspelling_correction_tf-idf_model.joblib

Metrics for stemming_binary:
Accuracy: 0.8893

Metrics for lemmatization_tf-idf:
Accuracy: 0.9088

Metrics for stemming_misspelling_correction_word_counts:
Accuracy: 0.8925

Metrics for lemmatization_misspelling_correction_binary:
Accuracy: 0.8827

Metrics for lemmatization_misspelling_correction_tf-idf:
Accuracy: 0.9088


## Sentiment analysis using random forest classifier

In [141]:
from sklearn.ensemble import RandomForestClassifier

In [142]:
# a dict to store the metrics for random forest model across different datasets
random_forest_metrics = {}

In [143]:
for dataset_name in selected_datasets:
    # load train and test data
    X_train, y_train = datasets[dataset_name]
    X_test, y_test = datasets[dataset_name + "_test"]

    # train the random forest model
    rf_model = RandomForestClassifier(n_estimators=100, random_state=42)
    rf_model.fit(X_train, y_train)

    # predicting with random Forest
    rf_predictions = rf_model.predict(X_test)

    # calculat model metrics
    accuracy = accuracy_score(y_test, rf_predictions)

    random_forest_metrics[dataset_name] = {
        "accuracy" : accuracy
    }

    save_model(rf_model, 'rf_models', f"rf_{dataset_name}_model.joblib")

    
# print results
for dataset_name, metrics in random_forest_metrics.items():
    print(f"\nMetrics for {dataset_name}:")
    for metric, value in metrics.items():
        print(f"{metric.capitalize()}: {value:.4f}")

Model saved to rf_models/rf_stemming_binary_model.joblib
Model saved to rf_models/rf_lemmatization_tf-idf_model.joblib
Model saved to rf_models/rf_stemming_misspelling_correction_word_counts_model.joblib
Model saved to rf_models/rf_lemmatization_misspelling_correction_binary_model.joblib
Model saved to rf_models/rf_lemmatization_misspelling_correction_tf-idf_model.joblib

Metrics for stemming_binary:
Accuracy: 0.8697

Metrics for lemmatization_tf-idf:
Accuracy: 0.8599

Metrics for stemming_misspelling_correction_word_counts:
Accuracy: 0.8567

Metrics for lemmatization_misspelling_correction_binary:
Accuracy: 0.8567

Metrics for lemmatization_misspelling_correction_tf-idf:
Accuracy: 0.8599


### evaluating the model predictions

In [202]:
label_mapping = {1: "Positive", 0: "Neutral", -1: "Negative"}

def preprocess_and_vectorize(text, vectorizer):
    vectorized_text = vectorizer.transform([text])
    return vectorized_text

# defining a function to predict a sentiment
def predict_sentiment(text, load_model, vectorizer, label_mapping):
    # Preprocess and vectorize input text
    vectorized_text = preprocess_and_vectorize(text, vectorizer)
    
    # Predict the sentiment
    prediction = load_model.predict(vectorized_text)[0]
    print(prediction)
    
    # Map the prediction to the sentiment label
    sentiment = label_mapping.get(prediction, "Unknown")
    return sentiment

positive_sample_text = "The children joyfully play in the sunny park while their parents smile. One child loves running and catching butterflies, feeling free and happy under the bright sky."
negative_sample_text = "The children sit alone in the gloomy park, looking bored and unhappy. One child sighs, saying he hates being stuck inside, while another complains about not having anyone to play with."
neutral_sample_text = "The children are playing in the park while their parents watch. One child talks about running and catching butterflies, while another is sitting and observing the surroundings."
preprocessing_func = preprocessing_methods['lemmatization_misspelling_correction']
preprocessed_text = preprocessing_func(positive_sample_text)
vectorizer = vectorizers['lemmatization_misspelling_correction_binary']
loaded_model = load_model('svm_models', f"svm_lemmatization_misspelling_correction_binary_model.joblib")
# loaded_model = load_model('lr_models', f"lr_lemmatization_misspelling_correction_binary_model.joblib")
# loaded_model = load_model('rf_models', f"rf_lemmatization_misspelling_correction_binary_model.joblib")
predicted_sentiment = predict_sentiment(positive_sample_text, loaded_model, vectorizer, label_mapping)
print(f"The predicted sentiment is: {predicted_sentiment}")


Model loaded from svm_models/svm_lemmatization_misspelling_correction_binary_model.joblib
1
The predicted sentiment is: Positive
