In [1]:
import pandas as pd
# download data
data = pd.read_csv('Amazon Review Data Web Scrapping - Amazon Review Data Web Scrapping.csv')


In [None]:
import pandas as pd
import plotly.express as px
import seaborn as sns
import matplotlib.pyplot as plt
import re
import numpy as np

from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix, f1_score
from sklearn.ensemble import RandomForestClassifier
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer

# Training a baseline model
## Extracting features

In [8]:
from preprocessing import preprocess_text

In [9]:

processed = data['Review_text'].head(200).apply(preprocess_text) # apply the preprocess function to the first 200 rows of the dataframe 
processed

0                                                  liked
1      bought phone amazon using samsung m30s couple ...
2                 awesome book reasonable price must buy
3                                                   good
4      book fine bad contains nice concept nicely exp...
                             ...                        
195    gaming laptop snd still hang working photoshop...
196                                          mobile good
197    brought broken piece led panel o physically da...
198    back camer good full smooth battery life also ...
199                                                 good
Name: Review_text, Length: 200, dtype: object

In feature extraction, we take the existing data and transform it into a different representation that is more suitable for machine learning algorithms. This is why I choose to use TF-IDF, because it converts text data into numerical features by assigning each word in a document a numerical value based on its frequency and importance within the document and across a collection of documents,like in this case the entire dataset with all the reviews. This process doesn't create new features; it just transforms the original text data into a format that machine learning algorithms can work with, so it's not considered feature engineering but simply feature extraction which is why I am using it at this point of the analysis.
When it comes to the choice of TF-IDF specifically in feature extraction, the reason I chose TF-IDF instead of another process is because, in my analysis I want to identify words that carry more sentiment or importance, therefore using TF-IDF is a better choice for me, as using BoW only helps know the most common words in the reviews which is not enough for the sentiment analysis  I want to make.


In [None]:
tfidf = TfidfVectorizer() #  TDIDF vectorizer 
processed_vectors = tfidf.fit_transform(processed) #fit and transform
processed_vectors

<200x1006 sparse matrix of type '<class 'numpy.float64'>'
	with 2358 stored elements in Compressed Sparse Row format>

In [None]:
tfidf_df = pd.DataFrame(processed_vectors.toarray()) # create a dataframe of vectors  
tfidf_df

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,996,997,998,999,1000,1001,1002,1003,1004,1005
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
195,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
196,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
197,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
198,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [None]:
# use imshow to plot the vector representations
color_continuous_scale='RdBu_r'
px.imshow(tfidf_df, color_continuous_scale='RdBu_r', title="Representation of [200 x 1561] TF-IDF encodings")

## Label encoding and train/test split

First I need to use a label encoder, its main role is to convert text labels into numeric values, which is necessary in my analysis because the machine learning algorithms I am going to use only accept numerical data as input.

In [None]:
from sklearn.preprocessing import LabelEncoder
# Instantiate a label encoder
label_encoder = LabelEncoder()
# Fit and transform the encoder on labels
data['sentiment_enc'] = label_encoder.fit_transform(data['Own_Rating'])
# Now  we have a new column 'sentiment_enc' with encoded labels
data['sentiment_enc']


0        2
1        2
2        2
3        2
4        1
        ..
60884    2
60885    2
60886    2
60887    1
60888    1
Name: sentiment_enc, Length: 60889, dtype: int32

Before implementing a pipeline and splitting the we have to deal first with the missing values, otherwise implementing the pipeline will result in errors. As we noticed when exploring the data we have 28 missing values in the 'Review_text' column which means that there are 28 rows in the dataset where the 'Review_text' column does not have a value. 
Since there is only 28 missing reviews, which neglectable compared to the quantity of reviews in general and in each class of sentiments, deleating the rows with those missing reviws wouldn't have an influence on the analysis, that is why I chose to delete those rows.

In [None]:

missing_review_index = data[data['Review_text'].isnull()].index
missing_review_index


Int64Index([  655,  2869,  9021, 11244, 14885, 16434, 17801, 18679, 19425,
            26621, 29111, 29130, 29480, 30882, 35089, 36111, 40466, 40629,
            44107, 44866, 45377, 45443, 45940, 47607, 49696, 53490, 55664,
            59210],
           dtype='int64')

In [None]:

indices_to_remove = [655, 2869, 9021, 11244, 14885, 16434, 17801, 18679, 19425, 26621, 29111, 29130, 29480, 30882, 35089, 36111, 40466, 40629, 44107, 44866, 45377, 45443, 45940, 47607, 49696, 53490, 55664, 59210]
# Remove rows with specified indices
data = data.drop(indices_to_remove)


In [None]:
# Rechecking for missing values to make sure that the column Review_text has no longer any missing values.
print(data.isnull().sum())

Unique_ID             0
Category              0
Review_Header         1
Review_text           0
Rating                0
Own_Rating            0
sentiments_numeric    0
sentiment_enc         0
dtype: int64


In [None]:
# Split the data into training and testing sets
X = data['Review_text']    
y = data['sentiment_enc']
test_size=0.2 
random_state=42

# use the train_test_split function with the above test size and random seed
X_train, X_test, y_train, y_test =  train_test_split(X, y, test_size=0.2, random_state=42)

 MultinomialNB  is used for text classification tasks, especially when text data is represented as word counts (Bag of Words) or term frequencies (TF-IDF), this is why I choose to use it in this analysis. 

In [None]:
text_clf = Pipeline([
    ('vectorizer', CountVectorizer(preprocessor=preprocess_text)),
    ('classifier', MultinomialNB())
])
text_clf.fit(X_train, y_train)

In [None]:
y_pred = text_clf.predict(X_test) # Use the pipeline to run predictions on the test data

## Visualisation of the results

Now that we implemented the pipeline and the model, lets view the results and test different models to choose the best one.

In [None]:
class_labels = label_encoder.inverse_transform(range(3))

confusion_matrix_kwargs = dict(
    text_auto=True, 
    title="Confusion Matrix", width=1000, height=800,
    labels=dict(x="Predicted", y="True Label"),
    x=class_labels,
    y=class_labels,
    color_continuous_scale='Blues'
)

def report(y_true, y_pred, class_labels):
    print(classification_report(y_true, y_pred, target_names=class_labels))
    # print a classification report of the predictions 
    # create a confusion matrix and pass it to imshow to visualize it 
    # (the confusion_matrix_kwargs are here for styling only)
    confusion_matrix_data = confusion_matrix(y_true, y_pred) 
    fig = px.imshow(
        confusion_matrix_data , 
        **confusion_matrix_kwargs
        )
    fig.show()

In [None]:
class_labels = label_encoder.inverse_transform(range(3)) #because I have 3 sentiments negative, positive and neutral
# call the report function to visualize the classification result using the above class_labels
report(y_test, y_pred, class_labels)

              precision    recall  f1-score   support

    Negative       0.71      0.66      0.68      1800
     Neutral       0.27      0.02      0.03       937
    Positive       0.87      0.97      0.92      9436

    accuracy                           0.85     12173
   macro avg       0.62      0.55      0.54     12173
weighted avg       0.80      0.85      0.81     12173



Now that we have visualised the results, let's try with TF-IDF. But first I am creating a class that with make it clearer and easier to facilateiteration, this will also give us the ability to also experiment with different models easier.

In [None]:
class Model:
    def __init__(self, X, y, model_architecture, vectorizer, random_seed=42, test_size=0.2) -> None:
        self.X = X
        self.y = y
        self.model_instance = model_architecture
        self.vectorizer = vectorizer
        self.random_seed = random_seed
        self.test_size = test_size

        self.pipeline = Pipeline([
            ('vectorizer', self.vectorizer),
            ('model', self.model_instance)
        ]) # the pipeline as defined previously

        self.X_train, self.X_test, self.y_train, self.y_test = train_test_split(
            self.X, self.y, test_size=self.test_size, random_state=self.random_seed
        ) # train test split using the above X, y, test_size and random_state

        # Assign the training and testing data to the class attributes
        self.X_train = X_train
        self.X_test = X_test
        self.y_train = y_train
        self.y_test = y_test

    def preprocess(self, text):
        def remove_url(text):
            url_pattern = r'https?://\S+|www\.\S+'
            cleaned_text = re.sub(url_pattern, '', text)

            return cleaned_text
        
        lemmatizer = WordNetLemmatizer()
        stop_words = set(stopwords.words('english'))
        cleaned_text = remove_url(text)
        tokens = word_tokenize(cleaned_text)
        words = [word.lower() for word in tokens if word.isalnum()]
        words = [word for word in words if word not in stop_words]
        words = [lemmatizer.lemmatize(word) for word in words]
        
        return ' '.join(words)
    
    def fit(self):
        # fit self.pipeline to the training data
        self.pipeline.fit(self.X_train, self.y_train)

    def predict(self):
        return self.pipeline.predict(self.X_test)

    
    def predict_proba(self):
        return self.pipeline.predict_proba(self.X_test)

        
    
    def report(self, y_true, y_pred, class_labels):
        # the report function as defined previously
        # Print the classification report
        print(classification_report(y_true, y_pred, target_names=class_labels))

        # Create a confusion matrix
        confusion_matrix_data = confusion_matrix(y_true, y_pred)

        # Define the confusion matrix styling
        confusion_matrix_kwargs = dict(
            text_auto=True,
            title="Confusion Matrix",
            width=1000,
            height=800,
            labels=dict(x="Predicted", y="True Label"),
            x=class_labels,
            y=class_labels,
            color_continuous_scale='Blues'
        )

        # Create a heatmap of the confusion matrix
        fig = px.imshow(
            confusion_matrix_data,
            **confusion_matrix_kwargs
        )
        fig.show()

At first we are trying with TF-IDF and keeping the MultinomialNB model that I used earlier when creating the pipeline.

In [None]:
# instantiate the Model class with text and labels (X and y), a multinomial naive bayes model and a tfidf vectorizer
model = Model(X, y, MultinomialNB(), TfidfVectorizer(preprocessor=preprocess_text
                                                     ), random_seed=42, test_size=0.2)
# fit the model
model.fit()

# predict and generate classification report
y_pred = model.predict()
class_labels=label_encoder.inverse_transform(range(3))
report(y_test, y_pred, class_labels)

              precision    recall  f1-score   support

    Negative       0.86      0.33      0.47      1800
     Neutral       0.00      0.00      0.00       937
    Positive       0.82      0.99      0.90      9436

    accuracy                           0.82     12173
   macro avg       0.56      0.44      0.46     12173
weighted avg       0.76      0.82      0.77     12173




Precision and F-score are ill-defined and being set to 0.0 in labels with no predicted samples. Use `zero_division` parameter to control this behavior.


Precision and F-score are ill-defined and being set to 0.0 in labels with no predicted samples. Use `zero_division` parameter to control this behavior.


Precision and F-score are ill-defined and being set to 0.0 in labels with no predicted samples. Use `zero_division` parameter to control this behavior.



We notice that when using the TF-IDF the performance of the model decreases, this can be due to the fact that TF-IDF vectorization doesn't consider the order of words in the text. It treats each document as a bag of words, which might not be suitable for tasks where word order and context are important like the case of sentiment analysis.

This time, I am changing the model I use Logistic Regression.

In [None]:
# Instantiate the Model class with text and labels (X and y), a Logistic Regression model, and a TF-IDF vectorizer
model = Model(X, y, LogisticRegression(max_iter=1000), TfidfVectorizer(preprocessor=preprocess_text), random_seed=42, test_size=0.2)
# Fit the model
model.fit()
# Predict using the model
y_pred = model.predict()
# Generate and display the classification report
model.report(model.y_test, y_pred, class_labels)

              precision    recall  f1-score   support

    Negative       0.75      0.65      0.70      1800
     Neutral       0.36      0.03      0.06       937
    Positive       0.87      0.97      0.92      9436

    accuracy                           0.85     12173
   macro avg       0.66      0.55      0.56     12173
weighted avg       0.82      0.85      0.82     12173



Logistic regression is a more complex model than MultinomialNB. It has more parameters to learn and can model more complex decision boundaries. This is an advantage when the data is more complex, thismight be an explanation to why itsgiving a better result.

This time I try another model RandomForestClassifier.

In [None]:
model = Model(X, y, RandomForestClassifier(), TfidfVectorizer(preprocessor=preprocess_text), random_seed=42, test_size=0.2)
# Fit the model
model.fit()
# Predict using the model
y_pred = model.predict()
# Generate and display the classification report
model.report(model.y_test, y_pred, class_labels)

              precision    recall  f1-score   support

    Negative       0.75      0.59      0.66      1800
     Neutral       0.26      0.01      0.03       937
    Positive       0.86      0.98      0.92      9436

    accuracy                           0.84     12173
   macro avg       0.62      0.53      0.53     12173
weighted avg       0.80      0.84      0.81     12173



We notice that the models have a problem with predicting especially the neutral category. We also notice that we have way less neutral reviews in our dataset compared to the positive and the negative ones, this is the reason for the problems with the prediction of the neutral class.


## Improve on the baseline results

We notice that the LogisticRegression model has the best results when it comes to the f1 scores in all the classes (positive, negative, neutral) compared to the other models we tested, this is why I am using it for the rest of the analysis.

## SMOTE
Since we are dealing with imbalaced data, there is an unequal distribution among the classes, it would be a good idea to try Over-sampling (SMOTE) to improve the baseline results.
The SMOTE (Synthetic Minority Over-sampling Technique) technique is used to over-sample the training data ('X_train', 'y_train') in order to balance the classes more effectively. This is useful when classes are imbalanced, which is exactly what we have in this case, as it generates synthetic samples of the minority class.

In [None]:
data['Review_text'] = data['Review_text'].astype(str)


In [None]:
processed_all_data=data['Review_text'].apply(preprocess_text) #process all the data

In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from imblearn.over_sampling import SMOTE
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report

# Preprocess text data and convert to TF-IDF vectors
vectorizer = TfidfVectorizer()
X_tfidf = vectorizer.fit_transform(processed_all_data)
# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X_tfidf, y, test_size=0.2, random_state=42)
# Apply SMOTE to balance classes
smote = SMOTE(sampling_strategy='auto', random_state=42)
X_train_resampled, y_train_resampled = smote.fit_resample(X_train, y_train)
# Train the model (Logistic Regression) on the resampled data
model = LogisticRegression(random_state=42, max_iter=1000) 
model.fit(X_train_resampled, y_train_resampled)
# Make predictions
y_pred = model.predict(X_test)
# Classification report
class_labels = label_encoder.inverse_transform(range(3))
print(classification_report(y_test, y_pred, target_names=class_labels))


              precision    recall  f1-score   support

    Negative       0.61      0.71      0.66      1800
     Neutral       0.16      0.35      0.22       937
    Positive       0.93      0.80      0.86      9436

    accuracy                           0.75     12173
   macro avg       0.57      0.62      0.58     12173
weighted avg       0.83      0.75      0.78     12173



After applying the SMOTE method to balance the dataset and training a logistic regression model, we notice the model's performance has improved for the minority class "Neutral" (compared to the previous result). The recall for the "Neutral" class has increased from 0.03 to 0.35, that means that the model is better at identifying instances of the "Neutral" class. Even though the precision has decreased,  but the the harmonic mean of precision and recall (f1 score) has increased, which means that the overall performance has improved when it comes to predicting the neutral class.
However, it's important to note that improving the performance for the class "Neutral" had a negative impact on the f1 scores of other classes. The trade-off between different classes is common when dealing with imbalanced datasets.
In this analysis, I consider it an overall improved of the model because even if the accuracy decreased, its better than having a model that is really unable to predict a specific cass, the neutral one in this case, even if the performance with the other classes decreased.

## Hyperparameter tuning
Another method that I am trying is  hyperparameter tuning using Randomized Search Cross-Validation with the LogisticRegression model. I am using this method and combining it to the SMOTE method since I am using the X_train_resampled and y_train_resampled and I created earlier with the smote method.

In [None]:
from sklearn.model_selection import RandomizedSearchCV
from sklearn.linear_model import LogisticRegression

# Define hyperparameter grid for logistic regression
param_grid = {
    'C': [0.001, 0.01, 0.1, 1, 10, 100],  # Regularization parameter
    'penalty': ['l1', 'l2'],  # Regularization type
    'max_iter': [100, 200, 300, 400],  # Maximum number of iterations
    'solver': ['liblinear'],  
}

# Create the RandomizedSearchCV object
random_search = RandomizedSearchCV(
    LogisticRegression(random_state=42),
    param_distributions=param_grid,
    n_iter=10,  
    cv=5,  # Number of cross-validation folds
    n_jobs=-1,  # Use all available CPU cores for parallelization
    random_state=42,
    verbose=2,
    scoring='accuracy'
)

# Fit the RandomizedSearchCV object to your data
random_search.fit(X_train_resampled, y_train_resampled)

# Get the best hyperparameters
best_params = random_search.best_params_
print("Best Hyperparameters:", best_params)


Fitting 5 folds for each of 10 candidates, totalling 50 fits
Best Hyperparameters: {'solver': 'liblinear', 'penalty': 'l1', 'max_iter': 100, 'C': 100}


Now that we know the best hyperparameters, let's optimise the model.

In [None]:
from sklearn.linear_model import LogisticRegression

# Using the best hyperparameters to create the improved LogisticRegression model
best_params = {
    'solver': 'liblinear',
    'penalty': 'l1',
    'max_iter': 100,
    'C': 100
}

improved_model = LogisticRegression(**best_params, random_state=42)
# Fit the improved model on the resampled data
improved_model.fit(X_train_resampled, y_train_resampled)


In [None]:
from sklearn.metrics import classification_report
# Make predictions using the improved model
y_pred_improved = improved_model.predict(X_test)
# Classification report
class_labels = label_encoder.inverse_transform(range(3))  
print(classification_report(y_test, y_pred, target_names=class_labels))


              precision    recall  f1-score   support

    Negative       0.61      0.71      0.66      1800
     Neutral       0.16      0.35      0.22       937
    Positive       0.93      0.80      0.86      9436

    accuracy                           0.75     12173
   macro avg       0.57      0.62      0.58     12173
weighted avg       0.83      0.75      0.78     12173



We notice that there isn't an improvement. 

In [None]:
from sklearn.linear_model import LogisticRegression

# Using the best hyperparameters to create the improved LogisticRegression model
best_params = {
    'solver': 'liblinear',
    'penalty': 'l1',
    'max_iter': 100,
    'C': 100
}

improved_model = LogisticRegression(**best_params, random_state=42)
# Fit the improved model on the resampled data
improved_model.fit(X_train, y_train)


In [None]:
from sklearn.metrics import classification_report
# Make predictions using the improved model
y_pred_improved = improved_model.predict(X_test)
# Classification report
class_labels = label_encoder.inverse_transform(range(3))  
print(classification_report(y_test, y_pred, target_names=class_labels))

              precision    recall  f1-score   support

    Negative       0.62      0.72      0.67      1800
     Neutral       0.19      0.41      0.26       937
    Positive       0.94      0.80      0.87      9436

    accuracy                           0.76     12173
   macro avg       0.58      0.64      0.60     12173
weighted avg       0.84      0.76      0.79     12173



When using hyper parameter tuning parameter method without using the SMOTE before, the results are slightly better. This can happen because, using SMOTE to oversample the minority class introduces synthetic data points, which may not fully capture the characteristics of the original data. In some cases, this could lead to a slightly lower performance compared to using the original data

In [None]:
from sklearn.model_selection import cross_val_score

cross_val_scores = cross_val_score(improved_model, X_train_resampled, y_train_resampled, cv=5, scoring='accuracy')
print("Cross-Validation Accuracy:", cross_val_scores.mean())


Cross-Validation Accuracy: 0.8300504020700343


## Applying class weights
We noticed (when using the SMOTE method) that resolving the problem of umbalanced classes was the most effective to improve the results. Another way to resolve the problem of umbalanced classes (different from the smote one that I used earlier) is the use of class weights, as it gives the model a way to account for the differences in class frequencies and make better predictions.

In [None]:
from sklearn.linear_model import LogisticRegression
from sklearn.utils.class_weight import compute_class_weight

# Compute class weights
class_weights = compute_class_weight('balanced', classes=np.unique(y_train_resampled), y=y_train_resampled)

# Create a dictionary with class weights
class_weight_dict = {i: class_weights[i] for i in range(len(class_weights))}

# Create the LogisticRegression model with class weights
model = LogisticRegression(random_state=42, class_weight=class_weight_dict, max_iter=1000)

# Train your model on the resampled data
model.fit(X_train_resampled, y_train_resampled)

# Make predictions
y_pred = model.predict(X_test)

# Generate a classification report
class_labels = label_encoder.inverse_transform(range(3))  
print(classification_report(y_test, y_pred, target_names=class_labels))


              precision    recall  f1-score   support

    Negative       0.61      0.71      0.66      1800
     Neutral       0.16      0.35      0.22       937
    Positive       0.93      0.80      0.86      9436

    accuracy                           0.75     12173
   macro avg       0.57      0.62      0.58     12173
weighted avg       0.83      0.75      0.78     12173



We notice that the results haven't changed. 


I will use the same method but using  and X_train_resampled and y_train_resampled wich means using the data without applying the SMOTE method before :

In [None]:
from sklearn.linear_model import LogisticRegression
from sklearn.utils.class_weight import compute_class_weight

# Compute class weights
class_weights = compute_class_weight('balanced', classes=np.unique(y_train), y=y_train)

# Create a dictionary with class weights
class_weight_dict = {i: class_weights[i] for i in range(len(class_weights))}

# Create the LogisticRegression model with class weights
model = LogisticRegression(random_state=42, class_weight=class_weight_dict, max_iter=1000)

# Train your model on the resampled data
model.fit(X_train, y_train)

# Make predictions
y_pred = model.predict(X_test)

# Generate a classification report
class_labels = label_encoder.inverse_transform(range(3))  
print(classification_report(y_test, y_pred, target_names=class_labels))


              precision    recall  f1-score   support

    Negative       0.62      0.72      0.67      1800
     Neutral       0.19      0.41      0.26       937
    Positive       0.94      0.80      0.87      9436

    accuracy                           0.76     12173
   macro avg       0.58      0.64      0.60     12173
weighted avg       0.84      0.76      0.79     12173



We notice that f1 scores have slightly improved, this means that this method in this analysis is better that the SMOTE method to balance the classes.

# Use Tensorflow and train a sequence model of my choice.

I choosed  to use LSTM neural network for many reasons. LSTM networks are good at capturing sequential and contextual information, which makes them ideal for NLP tasks with dependencies and contextual connections among words. The model also incorporates word embeddings to represent words in a continuous vector space, this is important because it helps with finding semantic relationships within the text. In addition to that, deep learning models like LSTM are very good at detecting non-linear patterns within data, and this can be usefull when the text exhibits complex relationships between words or phrases. Plus, the model works well for multi-class sentiment classification, using the softmax activation function in the output layer, which is important for tasks with multiple sentiment categories like the case of this analysis.

In [None]:
import tensorflow as tf
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, LSTM, Dense
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report

# Tokenize the text data
tokenizer = Tokenizer(num_words=10000)  
tokenizer.fit_on_texts(processed_all_data)  # Use the preprocessed data
X_sequences = tokenizer.texts_to_sequences(processed_all_data)  # Use preprocessed data
X_padded = pad_sequences(X_sequences, maxlen=200) 

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X_padded, y, test_size=0.2, random_state=42)

# Build an LSTM model
model = Sequential()
model.add(Embedding(input_dim=10000, output_dim=128, input_length=200))
model.add(LSTM(128))
model.add(Dense(3, activation='softmax'))  # Using softmax because we are working with multi-class sentiment

# Compile the model
model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])

# One-hot encode the target labels
y_train_onehot = tf.keras.utils.to_categorical(y_train, num_classes=3)
y_test_onehot = tf.keras.utils.to_categorical(y_test, num_classes=3)

# Train the model
model.fit(X_train, y_train_onehot, epochs=5, batch_size=32) 

# Evaluate the model
y_pred_prob = model.predict(X_test)
y_pred = y_pred_prob.argmax(axis=1)  # Convert probabilities to class labels

print(classification_report(y_test, y_pred))


Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5
              precision    recall  f1-score   support

           0       0.70      0.66      0.68      1800
           1       0.20      0.12      0.15       937
           2       0.89      0.94      0.91      9436

    accuracy                           0.83     12173
   macro avg       0.60      0.57      0.58     12173
weighted avg       0.81      0.83      0.82     12173

