## loading datasets
I'll start by loading the datasets into pandas DataFrames, and then I'll perform some basic cleaning and preprocessing.

In [1]:
# import the necessary libraries
import pandas as pd
import os
import nltk
from nltk.tokenize import sent_tokenize
# define the path to the data
# (relative path: it is resolved against the current working directory, so the
# notebook is expected to run from a folder that sits next to `data/`)
path_to_data = '../data/raw/'

In [2]:
def load_tweets(file_name, sentiment):
    """Read one raw file and return its text pieces as a labelled DataFrame.

    Parameters
    ----------
    file_name : str
        Name of the file inside `path_to_data`.
    sentiment : int
        Label written to the `sentiment` column of every row.

    Returns
    -------
    pandas.DataFrame
        Two columns: `text` (one piece returned by `sent_tokenize` per row)
        and `sentiment`.
    """
    # explicit encoding, so decoding does not depend on the platform's default locale
    with open(os.path.join(path_to_data, file_name), 'r', encoding='utf-8') as file:
        content = file.read()

    # NOTE(review): sent_tokenize splits on sentence boundaries, not on tweet
    # boundaries, so a row can hold several comma-joined tweets or only a part
    # of one. Confirm the raw file layout and split on the real tweet delimiter.
    # NOTE(review): sent_tokenize needs NLTK's sentence tokenizer data to be
    # installed; no cell in this notebook downloads it.
    tweets = pd.DataFrame(sent_tokenize(content), columns=['text'])
    tweets['sentiment'] = sentiment
    return tweets


# one frame per class: 1 = positive, -1 = negative, 0 = neutral
happy_tweets = load_tweets('processedPositive.csv', 1)
sad_tweets = load_tweets('processedNegative.csv', -1)
neutral_tweets = load_tweets('processedNeutral.csv', 0)

# quick look at each frame
for tweets in (happy_tweets, sad_tweets, neutral_tweets):
    print(tweets.head())
    print(tweets.shape)


                                                text  sentiment
0  An inspiration in all aspects: Fashion, fitnes...          1
1  :)KISSES TheFashionIcon,Apka Apna Awam Ka Chan...          1
2  Can you donate?,Omg he... kissed... him crying...          1
3  love love happy,thanks happy,C'mon Tweeps, Joi...          1
4                                Do spread the word.          1
(587, 2)
                                                text  sentiment
0  How unhappy  some dogs like it though,talking ...         -1
1  I got some money  I need to change into R but ...         -1
2  unhappy ,it's that A*dy guy from pop Asia and ...         -1
3                            Is this how I find out.         -1
4                                Everyone knows now.         -1
(357, 2)
                                                text  sentiment
0  Pak PM survives removal scare, but court order...          0
1  ,Supreme Court quashes criminal complaint agai...          0
2  ,FCRA slap on NGO f

##  merge and clean the data
In this step we'll merge the three datasets and clean the result: lowercase the text, remove stop words (very common words that tend to add noise rather than meaningful information), and drop duplicate rows.

In [3]:
# stack the three labelled frames, drop rows with missing values and
# renumber the index so it runs 0..n-1 again
class_frames = [happy_tweets, sad_tweets, neutral_tweets]
merged_tweets = (
    pd.concat(class_frames, ignore_index=True)
    .dropna()
    .reset_index(drop=True)
)

print(merged_tweets.head())
print(merged_tweets.shape)

                                                text  sentiment
0  An inspiration in all aspects: Fashion, fitnes...          1
1  :)KISSES TheFashionIcon,Apka Apna Awam Ka Chan...          1
2  Can you donate?,Omg he... kissed... him crying...          1
3  love love happy,thanks happy,C'mon Tweeps, Joi...          1
4                                Do spread the word.          1
(1628, 2)


In [4]:
import nltk
from nltk.corpus import stopwords

# transform the text to lowercase
merged_tweets['text'] = merged_tweets['text'].str.lower()

nltk.download('stopwords')
# build the stop-word collection once, as a set: calling stopwords.words()
# inside the per-word filter rebuilt the list and scanned it for every word
english_stop_words = set(stopwords.words('english'))


def remove_stop_words(text):
    """Return `text` with every whitespace-separated stop word removed.

    Words are compared exactly as they appear after splitting on whitespace,
    so a stop word with punctuation attached is kept.
    """
    return ' '.join(word for word in text.split() if word not in english_stop_words)


# remove stop words
merged_tweets['text'] = merged_tweets['text'].apply(remove_stop_words)

# remove duplicates (rows whose cleaned text is identical) and renumber the index
merged_tweets = merged_tweets.drop_duplicates(subset=['text']).reset_index(drop=True)

print(merged_tweets.head())
print(merged_tweets.shape)


[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/shrooma/miniconda3/envs/tweets/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


                                                text  sentiment
0  inspiration aspects: fashion, fitness, beauty ...          1
1  :)kisses thefashionicon,apka apna awam ka chan...          1
2  donate?,omg he... kissed... crying joy,happy a...          1
3  love love happy,thanks happy,c'mon tweeps, joi...          1
4                                       spread word.          1
(1533, 2)


## split the data with stratification
Split the data into train (80%) and test (20%) sets, stratified by sentiment so that both sets keep the same class proportions.

In [5]:
from sklearn.model_selection import train_test_split

# 20% of the rows go to the test set; stratifying on the label column and
# fixing the seed are both passed through to train_test_split
split_options = dict(
    test_size=0.2,
    stratify=merged_tweets['sentiment'],
    random_state=42,
)
train_tweets, test_tweets = train_test_split(merged_tweets, **split_options)

for split in (train_tweets, test_tweets):
    print(split.shape)

(1226, 2)
(307, 2)


In [6]:
# compare the relative class frequencies of the full data and of both splits
frames_to_check = (
    ("Original dataset:", merged_tweets),
    ("Train dataset:", train_tweets),
    ("Test dataset:", test_tweets),
)
for heading, frame in frames_to_check:
    print(heading)
    print(frame['sentiment'].value_counts(normalize=True))



Original dataset:
sentiment
 0    0.397913
 1    0.371168
-1    0.230920
Name: proportion, dtype: float64
Train dataset:
sentiment
 0    0.398042
 1    0.371126
-1    0.230832
Name: proportion, dtype: float64
Test dataset:
sentiment
 0    0.397394
 1    0.371336
-1    0.231270
Name: proportion, dtype: float64


## preprocess and vectorize the data

In this step I'll prepare multiple datasets; each one is a combination of a preprocessing approach and a vectorization approach.

-- preprocessing approaches :
- text tokenization
- stemming
- lemmatization
- stemming + misspelling correction
- lemmatization + misspelling correction

-- vectorization approaches :
- binary vectorization
- word counts
- tf-idf

In [10]:
import sys

# make the parent of the current working directory importable, so the
# `utils` package imported below can be found
parent_dir = os.path.abspath(os.path.join(os.getcwd(), os.pardir))
# guard against duplicates: re-running this cell used to append the same
# entry to sys.path again every time
if parent_dir not in sys.path:
    sys.path.append(parent_dir)

# import the preprocessing and vectorization functions
from utils.text_preprocessing import tokenize, stem_text, lemmatize_text, correct_spelling
from utils.text_vectorization import binary_vectorizer, count_vectorizer, tfidf_vectorizer


In [15]:
# dictionary that will hold every prepared dataset
datasets = dict()


def stem_then_correct(text):
    """Stem the text, then pass the result through spelling correction."""
    return correct_spelling(stem_text(text))


def lemmatize_then_correct(text):
    """Lemmatize the text, then pass the result through spelling correction."""
    return correct_spelling(lemmatize_text(text))


# preprocessing name -> function applied to a text
# NOTE(review): `tokenize` is imported but not registered here, although the
# markdown above lists "text tokenization" as an approach - confirm intent
preprocessing_methods = {
    'stemming': stem_text,
    'lemmatization': lemmatize_text,
    'stemming_misspelling_correction': stem_then_correct,
    'lemmatization_misspelling_correction': lemmatize_then_correct,
}

# vectorization name -> vectorizer helper
vectorization_methods = {
    'binary': binary_vectorizer,
    'word_counts': count_vectorizer,
    'tf-idf': tfidf_vectorizer,
}


In [16]:
# the labels are the same for every preprocessing/vectorization combination
train_labels = train_tweets['sentiment']
test_labels = test_tweets['sentiment']

for prep_name, prep_func in preprocessing_methods.items():
    # run the current preprocessing function over both splits
    train_preprocessed = train_tweets['text'].apply(prep_func)
    test_preprocessed = test_tweets['text'].apply(prep_func)

    for vec_name, vec_func in vectorization_methods.items():
        # the helper returns the vectorized training data together with the
        # vectorizer, which is then applied to the test data
        train_vectorized, vectorizer = vec_func(train_preprocessed)
        test_vectorized = vectorizer.transform(test_preprocessed)

        # keep features and labels together; the test split gets a `_test` suffix
        dataset_key = f'{prep_name}_{vec_name}'
        datasets.update({
            dataset_key: (train_vectorized, train_labels),
            f'{dataset_key}_test': (test_vectorized, test_labels),
        })


print(datasets.keys())

dict_keys(['stemming_binary', 'stemming_binary_test', 'stemming_word_counts', 'stemming_word_counts_test', 'stemming_tf-idf', 'stemming_tf-idf_test', 'lemmatization_binary', 'lemmatization_binary_test', 'lemmatization_word_counts', 'lemmatization_word_counts_test', 'lemmatization_tf-idf', 'lemmatization_tf-idf_test', 'stemming_misspelling_correction_binary', 'stemming_misspelling_correction_binary_test', 'stemming_misspelling_correction_word_counts', 'stemming_misspelling_correction_word_counts_test', 'stemming_misspelling_correction_tf-idf', 'stemming_misspelling_correction_tf-idf_test', 'lemmatization_misspelling_correction_binary', 'lemmatization_misspelling_correction_binary_test', 'lemmatization_misspelling_correction_word_counts', 'lemmatization_misspelling_correction_word_counts_test', 'lemmatization_misspelling_correction_tf-idf', 'lemmatization_misspelling_correction_tf-idf_test'])
