In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.metrics import f1_score
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer,TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import GridSearchCV
from sklearn.pipeline import Pipeline

In [2]:
df = pd.read_csv('Data/reviews_badminton/data.csv')
df.shape

(8518, 8)

In [3]:
df.head()

Unnamed: 0,Reviewer Name,Review Title,Place of Review,Up Votes,Down Votes,Month,Review text,Ratings
0,Kamal Suresh,Nice product,"Certified Buyer, Chirakkal",889.0,64.0,Feb 2021,"Nice product, good quality, but price is now r...",4
1,Flipkart Customer,Don't waste your money,"Certified Buyer, Hyderabad",109.0,6.0,Feb 2021,They didn't supplied Yonex Mavis 350. Outside ...,1
2,A. S. Raja Srinivasan,Did not meet expectations,"Certified Buyer, Dharmapuri",42.0,3.0,Apr 2021,Worst product. Damaged shuttlecocks packed in ...,1
3,Suresh Narayanasamy,Fair,"Certified Buyer, Chennai",25.0,1.0,,"Quite O. K. , but nowadays the quality of the...",3
4,ASHIK P A,Over priced,,147.0,24.0,Apr 2016,Over pricedJust â?¹620 ..from retailer.I didn'...,1


In [4]:
df.dropna(inplace=True)

In [5]:
df = df[['Review Title','Review text','Ratings']]

In [6]:
df.drop_duplicates(inplace=True)

In [7]:
df['Ratings'] = df['Ratings'].apply(lambda x: 1 if x >= 4 else 0)

In [8]:
df['Review'] = df['Review Title'] +' '+ df['Review text']

In [9]:
df = df[['Review','Ratings']]

In [10]:
import re
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer

lemmatizer = WordNetLemmatizer()

def preprocessing(text):
    text = text.lower()

    text = re.sub(r'[^A-Za-z]',' ', text)

    token = word_tokenize(text)

    words = [i for i in token if i not in stopwords.words("english")]

    processed = [lemmatizer.lemmatize(word) for word in words]

    return " ".join(processed)

In [11]:
df

Unnamed: 0,Review,Ratings
0,"Nice product Nice product, good quality, but p...",1
1,Don't waste your money They didn't supplied Yo...,0
2,Did not meet expectations Worst product. Damag...,0
5,Mind-blowing purchase Good quality product. De...,1
6,Must buy! BEST PURCHASE It is a good quality a...,1
...,...,...
8495,Nice Thanks to the delivery boy ... Service is...,1
8496,Good choice Over priced even after 50% discoun...,0
8497,Awesome Too much priced. It was getting me for...,0
8499,High cost Hii flipkart customers care..why you...,1


In [12]:
tempdf = df['Review'].apply(preprocessing)

In [13]:
df= pd.concat([df,tempdf],axis=1)

In [14]:
df.head()

Unnamed: 0,Review,Ratings,Review.1
0,"Nice product Nice product, good quality, but p...",1,nice product nice product good quality price r...
1,Don't waste your money They didn't supplied Yo...,0,waste money supplied yonex mavis outside cover...
2,Did not meet expectations Worst product. Damag...,0,meet expectation worst product damaged shuttle...
5,Mind-blowing purchase Good quality product. De...,1,mind blowing purchase good quality product del...
6,Must buy! BEST PURCHASE It is a good quality a...,1,must buy best purchase good quality durable av...


In [15]:
df.columns = ['Review','Ratings','Reviews']

In [16]:
xtrain, xtest, ytrain, ytest = train_test_split(df['Reviews'],df['Ratings'], test_size=0.4 , random_state=42)

In [17]:
import mlflow
mlflow.set_experiment('Sentiment Analysis Experiment')

<Experiment: artifact_location='file:///D:/Project%20Files/Jupyter%20NoteBook/DataScience%20Internship/7.%20Flipkart%20Text%20Sentiment%20Deployment/mlruns/833338031240615773', creation_time=1714170657325, experiment_id='833338031240615773', last_update_time=1714170657325, lifecycle_stage='active', name='Sentiment Analysis Experiment', tags={}>

In [18]:
import mlflow.sklearn

In [19]:
pipe1 = Pipeline([
    ("vectorization", CountVectorizer()),
    ("classifier", MultinomialNB())
])

parameter_grid_1 = [
    {
        'vectorization': [CountVectorizer()],
        'vectorization__max_features': [100, 150],
        'classifier__alpha': [10, 20]
    }
]

clf = GridSearchCV(
    estimator=pipe1,
    param_grid=parameter_grid_1,
    scoring='accuracy',
    return_train_score=True,
    verbose=1
)

In [20]:
import time,os,joblib
with mlflow.start_run() as run:
    start_time = time.time()
    clf.fit(xtrain, ytrain)
    end_time = time.time()
    
    # Log parameters, metrics, and model
    mlflow.log_params(clf.best_params_)
    mlflow.log_metric("mean_accuracy", clf.best_score_)
    mlflow.sklearn.log_model(clf.best_estimator_, "model")
    
    # Log elapsed time
    mlflow.log_metric("training_time", end_time - start_time)

Fitting 5 folds for each of 4 candidates, totalling 20 fits


In [21]:
pipelines = {
    'naive_bayes': Pipeline([
        ('vectorization', CountVectorizer()),
        ('classifier', MultinomialNB())
    ]),
    
    'decision_tree': Pipeline([
        ('vectorization', CountVectorizer()),
        ('classifier', DecisionTreeClassifier())
    ]),
    
    'logistic_regression': Pipeline([
        ('vectorization', CountVectorizer()),
        ('classifier', LogisticRegression())
    ]),

    'random_forest': Pipeline([
        ('vectorization', CountVectorizer()),
        ('classifier', RandomForestClassifier())
    ])
}

param_grids = {
    'naive_bayes': [
        {
            'vectorization': [CountVectorizer(), TfidfVectorizer()],
            'vectorization__max_features': [1000, 2000],
            'classifier__alpha': [1, 10, 20]
        }
    ],
    
    'decision_tree': [
        {
            'vectorization': [CountVectorizer(), TfidfVectorizer()],
            'vectorization__max_features': [500, 1000, 2000],
            'classifier__max_depth': [None, 5, 10, 20, 50],
        }
    ],
    
    'logistic_regression': [
        {
            'vectorization': [CountVectorizer(), TfidfVectorizer()],
            'vectorization__max_features': [1000, 2000],
            'classifier__C': [0.1, 1, 10],
            'classifier__penalty': ['elasticnet'],
            'classifier__l1_ratio': [0.4, 0.5, 0.6],
            'classifier__solver': ['saga'],
            'classifier__class_weight': ['balanced']
        }
    ],

    'random_forest': [
        {
            'vectorization__max_features': [1000, 2000],
            'classifier__n_estimators': [50, 100, 200],
            'classifier__max_depth': [None, 10, 20, 50],
        }
    ]
}



In [22]:
best_models = {}
for key in pipelines.keys():
    grid_search = GridSearchCV(estimator=pipelines[key], 
                               param_grid=param_grids[key], 
                               cv=5, 
                               scoring='accuracy', 
                               return_train_score=True,
                               verbose=1)

    start_fit_time = time.time()
    grid_search.fit(xtrain, ytrain)
    end_fit_time = time.time()

    start_predict_time = time.time()
    y_pred = grid_search.predict(xtest)
    end_predict_time = time.time()

    if not os.path.exists('best_models'):
        os.makedirs('best_models')

    joblib.dump(grid_search.best_estimator_, f'best_models/{key}.pkl')
    model_size = os.path.getsize(f'best_models/{key}.pkl')

    # Print log
    print("Train Score: ", grid_search.best_score_)
    print("Test Score:", grid_search.score(xtest, ytest))
    print('Fit Time:', end_fit_time - start_fit_time)
    print('Predict Time:', end_predict_time - start_predict_time)
    print("Model size:", model_size)
    print("." * 15)
    

    with mlflow.start_run() as run:
        mlflow.set_tag("developer", "Aravinth")
        
        # Log parameters with mlflow.log_param()
        mlflow.log_param('algorithm', key)
        mlflow.log_param('hyperparameter_grid', param_grids[key])
        mlflow.log_param('best_hyperparameter', grid_search.best_params_)
        
        # Log metrics with mlflow.log_metrics()
        mlflow.log_metric('train_score', grid_search.best_score_)
        mlflow.log_metric('test_score', grid_search.score(xtest, ytest))
        mlflow.log_metric('fit_time', end_fit_time - start_fit_time)
        mlflow.log_metric('predict_time', end_predict_time - start_predict_time)
        mlflow.log_metric('model_size', model_size)
        
        # Log model using mlflow.sklearn.log_model()
        mlflow.sklearn.log_model(grid_search.best_estimator_, f'{key}_model')

Fitting 5 folds for each of 12 candidates, totalling 60 fits
Train Score:  0.8352015353894219
Test Score: 0.8302169035153328
Fit Time: 3.322333335876465
Predict Time: 0.01588273048400879
Model size: 76170
...............




Fitting 5 folds for each of 30 candidates, totalling 150 fits
Train Score:  0.8272245910752384
Test Score: 0.8178758414360509
Fit Time: 18.81926155090332
Predict Time: 0.017499685287475586
Model size: 69644
...............




Fitting 5 folds for each of 36 candidates, totalling 180 fits




Train Score:  0.8097685425291068
Test Score: 0.8066566940912491
Fit Time: 209.42925357818604
Predict Time: 0.04059338569641113
Model size: 69229
...............




Fitting 5 folds for each of 24 candidates, totalling 120 fits
Train Score:  0.8431815852647336
Test Score: 0.8350785340314136
Fit Time: 267.49485421180725
Predict Time: 0.07535958290100098
Model size: 9170228
...............


