## loading datasets
I'll start by loading the datasets into pandas dataframes, and then I'll perform some basic cleaning and preprocessing.

In [4]:
# import the necessary libraries
import pandas as pd
import os
# directory that holds the raw tweet CSV files; this is a relative path, so it
# is resolved against the notebook's current working directory
path_to_data = '../data/raw/'

In [21]:
def read_raw_file(filename):
    """Return the full text content of one raw data file.

    Parameters
    ----------
    filename : str
        Name of a file located inside ``path_to_data``.

    Returns
    -------
    str
        The decoded file content.
    """
    # explicit encoding: without it open() falls back to the platform's locale
    # encoding, so the same file could decode differently on another machine
    with open(os.path.join(path_to_data, filename), 'r', encoding='utf-8') as file:
        return file.read()


def to_labelled_frame(content, sentiment):
    """Split raw file content into rows and attach a sentiment label.

    Parameters
    ----------
    content : str
        Raw file content in which entries are separated by commas.
    sentiment : int
        Label stored in the ``sentiment`` column of every row.

    Returns
    -------
    pandas.DataFrame
        One row per comma-separated piece, with the columns ``text`` and
        ``sentiment``.
    """
    # NOTE(review): a plain split(',') also cuts at commas that occur inside a
    # tweet; the first rows of the recorded output look like pieces of a single
    # tweet. Confirm against the raw files whether that is acceptable.
    frame = pd.DataFrame(content.split(','), columns=['text'])
    frame['sentiment'] = sentiment
    return frame


# read the csv files content
happy_content = read_raw_file('processedPositive.csv')
sad_content = read_raw_file('processedNegative.csv')
neutral_content = read_raw_file('processedNeutral.csv')

# split the content into individual tweets (1 = happy, -1 = sad, 0 = neutral)
happy_tweets = to_labelled_frame(happy_content, 1)
sad_tweets = to_labelled_frame(sad_content, -1)
neutral_tweets = to_labelled_frame(neutral_content, 0)

# preview each dataset followed by its (rows, columns) shape
for frame in (happy_tweets, sad_tweets, neutral_tweets):
    print(frame.head())
    print(frame.shape)


                                                text  sentiment
0             An inspiration in all aspects: Fashion          1
1                                            fitness          1
2    beauty and personality. :)KISSES TheFashionIcon          1
3  Apka Apna Awam Ka Channel Frankline Tv Aam Adm...          1
4  Beautiful album from  the greatest unsung guit...          1
(1186, 2)
                                                text  sentiment
0              How unhappy  some dogs like it though         -1
1  talking to my over driver about where I'm goin...         -1
2  Does anybody know if the Rand's likely to fall...         -1
3         I miss going to gigs in Liverpool unhappy          -1
4      There isnt a new Riverdale tonight ? unhappy          -1
(1117, 2)
                                                text  sentiment
0                      Pak PM survives removal scare          0
1   but court orders further probe into corruptio...          0
2  Supreme Court qua

## merge and clean the data
In this step we'll merge the three datasets and clean the result by lowercasing the text and removing duplicates and stop words: very common words that carry little meaning and tend to add noise to the dataset rather than useful information.

In [22]:
# stack the three labelled datasets, drop rows with missing values and
# renumber the index from 0
merged_tweets = (
    pd.concat([happy_tweets, sad_tweets, neutral_tweets], ignore_index=True)
    .dropna()
    .reset_index(drop=True)
)

# preview the merged data, then its (rows, columns) shape
print(merged_tweets.head(), merged_tweets.shape, sep='\n')

                                                text  sentiment
0             An inspiration in all aspects: Fashion          1
1                                            fitness          1
2    beauty and personality. :)KISSES TheFashionIcon          1
3  Apka Apna Awam Ka Channel Frankline Tv Aam Adm...          1
4  Beautiful album from  the greatest unsung guit...          1
(3873, 2)


In [35]:
import nltk
# `stopwords` must be imported explicitly: `import nltk` alone does not bind
# the name, so the cell would fail with a NameError on a fresh kernel
from nltk.corpus import stopwords

# transform the text to lowercase
merged_tweets['text'] = merged_tweets['text'].str.lower()

# make sure the stop-word corpus is available locally before it is read
nltk.download('stopwords')

# build the lookup once: calling stopwords.words('english') for every single
# word re-evaluates the list each time and tests membership against a list
english_stopwords = set(stopwords.words('english'))


def remove_stopwords(text):
    """Return `text` with every whitespace-separated English stop word removed.

    The remaining words are joined with single spaces, so runs of whitespace
    in the input are collapsed as well.
    """
    return ' '.join(word for word in text.split() if word not in english_stopwords)


# remove stop words
merged_tweets['text'] = merged_tweets['text'].apply(remove_stopwords)

# remove duplicates (keeps the first occurrence of each text) and renumber the index
merged_tweets = merged_tweets.drop_duplicates(subset=['text']).reset_index(drop=True)

print(merged_tweets.head())
print(merged_tweets.shape)


[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/shrooma/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


                                                text  sentiment
0                       inspiration aspects: fashion          1
1                                            fitness          1
2        beauty personality. :)kisses thefashionicon          1
3  apka apna awam ka channel frankline tv aam adm...          1
4  beautiful album greatest unsung guitar genius ...          1
(3395, 2)


## split the data with stratification
Split the data into training (80%) and test (20%) sets, stratified by sentiment so that both sets keep the same class proportions.

In [37]:
from sklearn.model_selection import train_test_split

# hold out 20% of the rows for testing; stratifying on the sentiment column
# and fixing the random state makes the split reproducible
train_tweets, test_tweets = train_test_split(
    merged_tweets,
    test_size=0.2,
    random_state=42,
    stratify=merged_tweets['sentiment'],
)

# shapes of the train and test sets, one per line
print(train_tweets.shape, test_tweets.shape, sep='\n')

(2716, 2)
(679, 2)


## preprocess and vectorize the data

In this step I'll prepare multiple datasets; each dataset is a combination of a preprocessing approach and a vectorization approach.

preprocessing approaches : 