In [1]:
# Standard imports
import numpy as np
import pandas as pd
import seaborn as sb
import matplotlib.pyplot as plt
import warnings
warnings.filterwarnings('ignore')

In [2]:
df=pd.read_csv('product_reviews.csv')
df.head(2)

Unnamed: 0,reviewer_name,reviewer_rating,review_title,review_text,place_of_review,date_of_review,up_votes,down_votes,product_name,sentiment
0,Kamal Suresh,4.0,Nice product,"Nice product, good quality, but price is now r...","Certified Buyer, Chirakkal",Feb 2021,889.0,64.0,badminton,1
1,Flipkart Customer,1.0,Don't waste your money,They didn't supplied Yonex Mavis 350. Outside ...,"Certified Buyer, Hyderabad",Feb 2021,109.0,6.0,badminton,0


In [3]:
from sklearn.model_selection import train_test_split

# Split Data
X = df['review_text']
y = df['sentiment']

X_train, X_test, y_train, y_test = train_test_split(X,y,test_size = 0.2, random_state=0)

In [4]:
print(X_train.shape, X_test.shape)

(8604,) (2151,)


In [5]:
import mlflow

mlflow.set_experiment("sentiment_prediction")

2024/03/28 11:47:24 INFO mlflow.tracking.fluent: Experiment with name 'sentiment_prediction' does not exist. Creating a new experiment.


<Experiment: artifact_location='file:///c:/Users/USER/Documents/Data%20Science%20and%20Analytics/InnomaticsResearchLabs/sentiment_analysis/mlruns/301106958472313625', creation_time=1711622844923, experiment_id='301106958472313625', last_update_time=1711622844923, lifecycle_stage='active', name='sentiment_prediction', tags={}>

In [6]:
import string
import re
import nltk
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer
from nltk.stem import WordNetLemmatizer

In [7]:
# initialise the Stemmer
stemmer = PorterStemmer()

In [8]:
# initialise Lemmatizer
lemmatizer = WordNetLemmatizer()

## Data Preprocessing on train data

In [9]:
def preprocess(raw_text):
    
    # Removing special characters and digits

    sentence = re.sub("[^a-zA-Z]|READ MORE", " ", raw_text)
    
    # change sentence to lower case
    sentence = sentence.lower()

    # tokenize into words
    tokens = nltk.word_tokenize(sentence)
    
    # Lemmatize
    lemmatized_tokens = [lemmatizer.lemmatize(token) for token in tokens]

    # Stop word removal
    stop_words = set(stopwords.words('english'))
    cleaned_tokens = [word for word in lemmatized_tokens if word.lower() not in stop_words]
    
    # Join and return
    return " ".join(cleaned_tokens)

In [10]:
# sample of processed word
preprocess(df['review_text'][0])

'nice product good quality price rising bad sign wa affordable price especially play everyday kindly help u term price thank'

## Converting Text to Numerical vectors - BOW Representation

In [11]:
# import feature extraction methods from sklearn
from sklearn.feature_extraction.text import CountVectorizer

# instantiate a vectorizer
vect = CountVectorizer(preprocessor=preprocess)
%time
X_train_bow = vect.fit_transform(X_train)
print(X_train_bow.shape)

CPU times: total: 0 ns
Wall time: 0 ns
(8604, 2554)


In [12]:
# transform testing data (using training data's features)
%time
X_test_bow = vect.transform(X_test)
print(X_test_bow.shape)


CPU times: total: 0 ns
Wall time: 0 ns
(2151, 2554)


## **Auto Logging Naive Bayes Demo Experiment Run using MLFlow**


In [13]:
# import classifier from sklearn
from sklearn.naive_bayes import MultinomialNB

nb = MultinomialNB() # instantiate a Multinomial Naive Bayes model

mlflow.sklearn.autolog(max_tuning_runs=None)

with mlflow.start_run() as run:
    %time
    nb.fit(X_train_bow, y_train) # train the model(timing it with an IPython "magic command")



CPU times: total: 0 ns
Wall time: 0 ns


## **Create an optimal workflow**

In [14]:
from sklearn.pipeline import Pipeline

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import  RandomForestClassifier

from sklearn.model_selection import GridSearchCV

In [15]:
# Define the pipeline with caching
pipe = Pipeline(
    [
        ('vectorization', CountVectorizer()),
        ('nb', MultinomialNB())
    ]
)

MAX_FEATURES = [1000, 1500, 2000]
ALPHA = [1, 10]

# Observe the Key Value Pair format
parameter_grid = [
    {
        'vectorization__preprocessor': [preprocess],
        'vectorization__max_features': MAX_FEATURES,
        'nb__alpha': ALPHA
    }
]

clf = GridSearchCV(
    estimator=pipe,
    param_grid=parameter_grid,
    scoring='accuracy',
    cv=5,
    return_train_score=True,
    verbose=1
)

# Initialize the auto logger
# max_tuning_runs=None will make sure that all the runs are recorded.
# By default top 5 runs will be recorded for each experiment
mlflow.sklearn.autolog(max_tuning_runs=None)

with mlflow.start_run() as run:
    %time
    clf.fit(X_train, y_train)



CPU times: total: 0 ns
Wall time: 0 ns
Fitting 5 folds for each of 6 candidates, totalling 30 fits


In [16]:
# Improving the efficiency by applying cleaning the text data before hand

%time
X_train_clean = X_train.apply(lambda raw_txt: preprocess(raw_txt))

CPU times: total: 0 ns
Wall time: 0 ns


In [17]:
%time
X_test_clean = X_test.apply(lambda raw_txt: preprocess(raw_txt))

CPU times: total: 0 ns
Wall time: 0 ns


## **Auto Logging All Experiment Runs using MLFlow**

In [18]:
pipelines = {
    'naive_bayes': Pipeline([
        ('vectorization', CountVectorizer()),
        ('classifier', MultinomialNB())
    ]),
    'logistic_regression': Pipeline([
        ('vectorization', CountVectorizer()),
        ('classifier', LogisticRegression())
    ]),
    'random_forest': Pipeline([
        ('vectorization', CountVectorizer()),
        ('classifier', RandomForestClassifier())
    ])
}

# Define parameter grid for each algorithm
param_grids = {
    'naive_bayes': [
        {
            'vectorization': [CountVectorizer()],
            'vectorization__max_features' : [1000, 1500, 2000, 5000], 
            'classifier__alpha' : [1, 10]
        }
    ],
    'logistic_regression': [
        {
            'vectorization': [CountVectorizer(), TfidfVectorizer()],
            'vectorization__max_features' : [1000, 1500, 2000, 5000], 
            'classifier__C': [0.1, 1, 10], 
            'classifier__penalty': ['elasticnet'], 
            'classifier__l1_ratio': [0.4, 0.5, 0.6],
            'classifier__solver': ['saga'],
            'classifier__class_weight': ['balanced'],
        }
    ],
    'random_forest': [
        {
            'vectorization': [CountVectorizer(), TfidfVectorizer()],
            'vectorization__max_features' : [1000, 1500, 2000, 5000],
            'classifier__max_depth': [None, 5, 10],
            'classifier__n_estimators': [10,20,25],
            'classifier__min_samples_leaf': [2],
            'classifier__bootstrap': [True,False],
            'classifier__class_weight': ['balanced']
        }
    ]
}

# Perform GridSearchCV for each algorithm
best_models = {}

for algorithm in pipelines.keys():
    print("*"*10, algorithm, "*"*10)
    grid_search = GridSearchCV(estimator=pipelines[algorithm], 
                               param_grid=param_grids[algorithm], 
                               cv=5, 
                               scoring='accuracy', 
                               return_train_score=True,
                               verbose=1
                              )
    mlflow.sklearn.autolog(max_tuning_runs=None)
    
    with mlflow.start_run() as run:
        %time
        grid_search.fit(X_train_clean, y_train)
    
    best_models[algorithm] = grid_search.best_estimator_
    
    print('Train Score: ', grid_search.best_score_)
    print('Score on Test Data: ', grid_search.score(X_test_clean, y_test))

********** naive_bayes **********




CPU times: total: 0 ns
Wall time: 0 ns
Fitting 5 folds for each of 8 candidates, totalling 40 fits
Train Score:  0.9341006445684634
Score on Test Data:  0.9265457926545793
********** logistic_regression **********




CPU times: total: 0 ns
Wall time: 0 ns
Fitting 5 folds for each of 72 candidates, totalling 360 fits
Train Score:  0.9120177560369175
Score on Test Data:  0.905625290562529
********** random_forest **********




CPU times: total: 0 ns
Wall time: 0 ns
Fitting 5 folds for each of 144 candidates, totalling 720 fits
Train Score:  0.908531005499777
Score on Test Data:  0.9065550906555091


In [19]:
# Stop the auto logger

mlflow.sklearn.autolog(disable=True)

## **Custom Experiment Tracking and Database Integration with MLFlow**

In [20]:
import time
import joblib
import os

In [21]:
mlflow.set_tracking_uri("sqlite:///mlflow.db")

mlflow.set_experiment("Sentiment Prediction")

<Experiment: artifact_location=('file:///c:/Users/USER/Documents/Data Science and '
 'Analytics/InnomaticsResearchLabs/sentiment_analysis/mlruns/1'), creation_time=1711369401488, experiment_id='1', last_update_time=1711369401488, lifecycle_stage='active', name='Sentiment Prediction', tags={}>

In [22]:
dev = "Aminat Owodunni"
best_models = {}

for algo in pipelines.keys():
    print("*"*10, algo, "*"*10)
    grid_search = GridSearchCV(estimator=pipelines[algo], 
                               param_grid=param_grids[algo], 
                               cv=5, 
                               scoring='accuracy', 
                               return_train_score=True,
                               verbose=1
                              )

    # Fit
    start_fit_time = time.time()
    grid_search.fit(X_train_clean, y_train)
    end_fit_time = time.time()

    # Predict
    start_predict_time = time.time()
    y_pred = grid_search.predict(X_test_clean)
    end_predict_time = time.time()

    # Saving the best model
    joblib.dump(grid_search.best_estimator_, f'best_models/{algo}.pkl')
    model_size = os.path.getsize(f'best_models/{algo}.pkl')

    # Pring Log
    print('Train Score: ', grid_search.best_score_)
    print('Test Score: ', grid_search.score(X_test_clean, y_test))
    print("Fit Time: ", end_fit_time - start_fit_time)
    print("Predict Time: ", end_predict_time - start_predict_time)
    print("Model Size: ", model_size)
    
    print()

    # Start the experiment run
    with mlflow.start_run() as run:
        # Log tags with mlflow.set_tag()
        mlflow.set_tag("developer", dev)

        # Log Parameters with mlflow.log_param()
        mlflow.log_param("algorithm", algo)
        mlflow.log_param("hyperparameter_grid", param_grids[algo])
        mlflow.log_param("best_hyperparameter", grid_search.best_params_)

        # Log Metrics with mlflow.log_metric()
        mlflow.log_metric("train_score", grid_search.best_score_)
        mlflow.log_metric("test_score", grid_search.score(X_test_clean, y_test))
        mlflow.log_metric("fit_time", end_fit_time - start_fit_time)
        mlflow.log_metric("predict_time", end_predict_time - start_predict_time)
        mlflow.log_metric("model_size", model_size)

        # Log Model using mlflow.sklearn.log_model()
        mlflow.sklearn.log_model(grid_search.best_estimator_, f"{algo}_model")

********** naive_bayes **********
Fitting 5 folds for each of 8 candidates, totalling 40 fits
Train Score:  0.9341006445684634
Test Score:  0.9265457926545793
Fit Time:  4.472436904907227
Predict Time:  0.01397705078125
Model Size:  78490

********** logistic_regression **********
Fitting 5 folds for each of 72 candidates, totalling 360 fits
Train Score:  0.9120177560369175
Test Score:  0.9060901906090191
Fit Time:  759.0083088874817
Predict Time:  0.01399087905883789
Model Size:  84301

********** random_forest **********
Fitting 5 folds for each of 144 candidates, totalling 720 fits
Train Score:  0.906788238314663
Test Score:  0.8744769874476988
Fit Time:  193.5282428264618
Predict Time:  0.029982328414916992
Model Size:  938274

