## loading datasets
I'll start by loading the datasets into pandas DataFrames, and then I'll perform some basic cleaning and preprocessing.

In [2]:
# import the necessary libraries
import pandas as pd
import os
import nltk
from nltk.tokenize import sent_tokenize
# directory holding the raw CSV files, written as a relative path (presumably resolved
# against the notebook's working directory); the trailing slash is harmless because
# file names are appended to it with os.path.join
path_to_data = '../data/raw/'

In [3]:
def read_raw_file(filename):
    """Return the full text of `filename`, looked up inside `path_to_data`.

    The file is decoded as UTF-8 explicitly so the result does not depend on
    the platform's default encoding.
    """
    with open(os.path.join(path_to_data, filename), 'r', encoding='utf-8') as file:
        return file.read()


def label_sentences(sentences, sentiment):
    """Build a DataFrame with one `text` row per sentence and a constant `sentiment` column."""
    tweets = pd.DataFrame(sentences, columns=['text'])
    tweets['sentiment'] = sentiment
    return tweets


# read the csv files content
happy_content = read_raw_file('processedPositive.csv')
sad_content = read_raw_file('processedNegative.csv')
neutral_content = read_raw_file('processedNeutral.csv')

# split the content into rows of text and attach the label
# (1 = positive file, -1 = negative file, 0 = neutral file)
# NOTE(review): sent_tokenize cuts on sentence boundaries, so a row is not
# guaranteed to be exactly one tweet -- the printed head shows what appear to be
# several comma-joined tweets in a single row. Confirm the raw file layout
# before relying on the row counts.
happy_sentences = sent_tokenize(happy_content)
happy_tweets = label_sentences(happy_sentences, 1)
sad_sentences = sent_tokenize(sad_content)
sad_tweets = label_sentences(sad_sentences, -1)
neutral_sentences = sent_tokenize(neutral_content)
neutral_tweets = label_sentences(neutral_sentences, 0)

# preview each frame and report its size
for tweets in (happy_tweets, sad_tweets, neutral_tweets):
    print(tweets.head())
    print(tweets.shape)


                                                text  sentiment
0  An inspiration in all aspects: Fashion, fitnes...          1
1  :)KISSES TheFashionIcon,Apka Apna Awam Ka Chan...          1
2  Can you donate?,Omg he... kissed... him crying...          1
3  love love happy,thanks happy,C'mon Tweeps, Joi...          1
4                                Do spread the word.          1
(587, 2)
                                                text  sentiment
0  How unhappy  some dogs like it though,talking ...         -1
1  I got some money  I need to change into R but ...         -1
2  unhappy ,it's that A*dy guy from pop Asia and ...         -1
3                            Is this how I find out.         -1
4                                Everyone knows now.         -1
(357, 2)
                                                text  sentiment
0  Pak PM survives removal scare, but court order...          0
1  ,Supreme Court quashes criminal complaint agai...          0
2  ,FCRA slap on NGO f

##  merge and clean the data
In this step we'll clean the data by removing duplicates and stop words. Stop words are very common words (such as "the" or "is") that carry little meaning on their own and tend to add noise to the dataset rather than useful information.

In [4]:
# stack the three labelled frames, discard rows with missing values,
# then renumber the index from zero
labelled_frames = [happy_tweets, sad_tweets, neutral_tweets]
merged_tweets = (
    pd.concat(labelled_frames, ignore_index=True)
    .dropna()
    .reset_index(drop=True)
)

print(merged_tweets.head())
print(merged_tweets.shape)

                                                text  sentiment
0  An inspiration in all aspects: Fashion, fitnes...          1
1  :)KISSES TheFashionIcon,Apka Apna Awam Ka Chan...          1
2  Can you donate?,Omg he... kissed... him crying...          1
3  love love happy,thanks happy,C'mon Tweeps, Joi...          1
4                                Do spread the word.          1
(1628, 2)


In [35]:
import nltk
# `stopwords` is used below; import it here so the cell runs on a fresh kernel
from nltk.corpus import stopwords

# transform the text to lowercase
merged_tweets['text'] = merged_tweets['text'].str.lower()

# make sure the stop-word corpus is available locally
nltk.download('stopwords')
# build the lookup once: calling stopwords.words() for every word of every row
# repeats the same work, and a set gives constant-time membership tests
english_stopwords = set(stopwords.words('english'))

# remove stop words (the text is split on whitespace, so punctuation stays attached to words)
merged_tweets['text'] = merged_tweets['text'].apply(
    lambda x: ' '.join(word for word in x.split() if word not in english_stopwords)
)

# remove duplicates
merged_tweets.drop_duplicates(subset=['text'], inplace=True)
merged_tweets.reset_index(drop=True, inplace=True)

print(merged_tweets.head())
print(merged_tweets.shape)


[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/shrooma/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


                                                text  sentiment
0                       inspiration aspects: fashion          1
1                                            fitness          1
2        beauty personality. :)kisses thefashionicon          1
3  apka apna awam ka channel frankline tv aam adm...          1
4  beautiful album greatest unsung guitar genius ...          1
(3395, 2)


## split the data with stratification
Split the data into train (80%) and test (20%) sets, stratified by sentiment so that both sets keep the class proportions of the full dataset.

In [6]:
from sklearn.model_selection import train_test_split

# hold out 20% of the rows for testing; stratify on the label column and
# fix the seed so the same split is produced on every run
split_options = {
    'test_size': 0.2,
    'stratify': merged_tweets['sentiment'],
    'random_state': 42,
}
train_tweets, test_tweets = train_test_split(merged_tweets, **split_options)

for part in (train_tweets, test_tweets):
    print(part.shape)

(1302, 2)
(326, 2)


In [8]:
# compare the normalised label frequencies of the full data and of both splits
labelled_views = (
    ("Original dataset:", merged_tweets),
    ("Train dataset:", train_tweets),
    ("Test dataset:", test_tweets),
)
for heading, frame in labelled_views:
    print(heading)
    print(frame['sentiment'].value_counts(normalize=True))



Original dataset:
sentiment
 0    0.420147
 1    0.360565
-1    0.219287
Name: proportion, dtype: float64
Train dataset:
sentiment
 0    0.420123
 1    0.360215
-1    0.219662
Name: proportion, dtype: float64
Test dataset:
sentiment
 0    0.420245
 1    0.361963
-1    0.217791
Name: proportion, dtype: float64


## preprocess and vectorize the data

In this step I'll prepare multiple datasets; each dataset is a combination of one preprocessing approach and one vectorization approach.

-- preprocessing approaches :
- text tokenization
- stemming
- lemmatization
- stemming + misspellings correction
- lemmatization + misspellings correction

-- vectorization approaches :
- binary vectorization
- word counts
- tf-idf
- word2vec