In [1]:
# import libraries
import pandas as pd

import nltk

from nltk.sentiment.vader import SentimentIntensityAnalyzer

from nltk.corpus import stopwords

from nltk.tokenize import word_tokenize

from nltk.stem import WordNetLemmatizer


# download nltk corpus (first time only)
import nltk

nltk.download('all')




# Load the amazon review dataset

df = pd.read_excel(r"C:\Users\nidhi\Downloads\Copy of Ayurvedic_medicine_final.xlsx")

df

[nltk_data] Downloading collection 'all'
[nltk_data]    | 
[nltk_data]    | Downloading package abc to
[nltk_data]    |     C:\Users\nidhi\AppData\Roaming\nltk_data...
[nltk_data]    |   Package abc is already up-to-date!
[nltk_data]    | Downloading package alpino to
[nltk_data]    |     C:\Users\nidhi\AppData\Roaming\nltk_data...
[nltk_data]    |   Package alpino is already up-to-date!
[nltk_data]    | Downloading package averaged_perceptron_tagger to
[nltk_data]    |     C:\Users\nidhi\AppData\Roaming\nltk_data...
[nltk_data]    |   Package averaged_perceptron_tagger is already up-
[nltk_data]    |       to-date!
[nltk_data]    | Downloading package averaged_perceptron_tagger_eng to
[nltk_data]    |     C:\Users\nidhi\AppData\Roaming\nltk_data...
[nltk_data]    |   Package averaged_perceptron_tagger_eng is already
[nltk_data]    |       up-to-date!
[nltk_data]    | Downloading package averaged_perceptron_tagger_ru to
[nltk_data]    |     C:\Users\nidhi\AppData\Roaming\nltk_data...
[

Unnamed: 0,Customer ID,Product,Website,Commend,Label,Disease
0,1,Gurmar,Amazon,Great sir,1.0,Diabetes
1,2,Gurmar,Amazon,Thank You so much,1.0,Diabetes
2,3,Gurmar,Amazon,Very helpful,1.0,Diabetes
3,4,Gurmar,Amazon,Best medicine,1.0,Diabetes
4,5,Gurmar,Amazon,Best Ayurvedic medicine,1.0,Diabetes
5,6,Gurmar,Amazon,Amazing result,1.0,Diabetes
6,7,Gurmar,Amazon,Recommended,1.0,Diabetes
7,8,Gurmar,Amazon,Would definitely recommend,1.0,Diabetes
8,9,Gurmar,Amazon,Good,1.0,Diabetes
9,10,Gurmar,Amazon,Now under control,1.0,Diabetes


In [2]:
# create preprocess_text function
def preprocess_text(text):

    # Tokenize the text

    tokens = word_tokenize(text.lower())




    # Remove stop words

    filtered_tokens = [token for token in tokens if token not in stopwords.words('english')]




    # Lemmatize the tokens

    lemmatizer = WordNetLemmatizer()

    lemmatized_tokens = [lemmatizer.lemmatize(token) for token in filtered_tokens]




    # Join the tokens back into a string

    processed_text = ' '.join(lemmatized_tokens)

    return processed_text

# apply the function df

df['Commend'] = df['Commend'].apply(preprocess_text)
df

Unnamed: 0,Customer ID,Product,Website,Commend,Label,Disease
0,1,Gurmar,Amazon,great sir,1.0,Diabetes
1,2,Gurmar,Amazon,thank much,1.0,Diabetes
2,3,Gurmar,Amazon,helpful,1.0,Diabetes
3,4,Gurmar,Amazon,best medicine,1.0,Diabetes
4,5,Gurmar,Amazon,best ayurvedic medicine,1.0,Diabetes
5,6,Gurmar,Amazon,amazing result,1.0,Diabetes
6,7,Gurmar,Amazon,recommended,1.0,Diabetes
7,8,Gurmar,Amazon,would definitely recommend,1.0,Diabetes
8,9,Gurmar,Amazon,good,1.0,Diabetes
9,10,Gurmar,Amazon,control,1.0,Diabetes


In [3]:
# initialize NLTK sentiment analyzer

analyzer = SentimentIntensityAnalyzer()


# create get_sentiment function

def get_sentiment(text):

    scores = analyzer.polarity_scores(text)

    sentiment = 1 if scores['pos'] > 0 else 0

    return sentiment




# apply get_sentiment function

df['sentiment'] = df['Commend'].apply(get_sentiment)

df

Unnamed: 0,Customer ID,Product,Website,Commend,Label,Disease,sentiment
0,1,Gurmar,Amazon,great sir,1.0,Diabetes,1
1,2,Gurmar,Amazon,thank much,1.0,Diabetes,1
2,3,Gurmar,Amazon,helpful,1.0,Diabetes,1
3,4,Gurmar,Amazon,best medicine,1.0,Diabetes,1
4,5,Gurmar,Amazon,best ayurvedic medicine,1.0,Diabetes,1
5,6,Gurmar,Amazon,amazing result,1.0,Diabetes,1
6,7,Gurmar,Amazon,recommended,1.0,Diabetes,1
7,8,Gurmar,Amazon,would definitely recommend,1.0,Diabetes,1
8,9,Gurmar,Amazon,good,1.0,Diabetes,1
9,10,Gurmar,Amazon,control,1.0,Diabetes,0


In [5]:
# Round the 'Label' column to the nearest integer
df['Label'] = df['Label'].round()


In [7]:
# Check for non-finite values (NaN, inf) in 'Label' column
non_finite_values = df['Label'][~df['Label'].apply(pd.to_numeric, errors='coerce').notnull()]
print(non_finite_values)


397    NaN
2032   NaN
Name: Label, dtype: float64


In [8]:
# Replace NaN with a default value, such as 0
df['Label'].fillna(0, inplace=True)

# Replace infinite values with a large number or 0
df['Label'].replace([float('inf'), float('-inf')], 0, inplace=True)


In [9]:
# Convert 'Label' column to integers
df['Label'] = df['Label'].astype(int)


In [10]:
# Print specific rows, e.g., the first 10 rows
print(df.head(10))


   Customer ID Product Website                     Commend  Label   Disease  \
0            1  Gurmar  Amazon                   great sir      1  Diabetes   
1            2  Gurmar  Amazon                  thank much      1  Diabetes   
2            3  Gurmar  Amazon                     helpful      1  Diabetes   
3            4  Gurmar  Amazon               best medicine      1  Diabetes   
4            5  Gurmar  Amazon     best ayurvedic medicine      1  Diabetes   
5            6  Gurmar  Amazon              amazing result      1  Diabetes   
6            7  Gurmar  Amazon                 recommended      1  Diabetes   
7            8  Gurmar  Amazon  would definitely recommend      1  Diabetes   
8            9  Gurmar  Amazon                        good      1  Diabetes   
9           10  Gurmar  Amazon                     control      1  Diabetes   

   sentiment  
0          1  
1          1  
2          1  
3          1  
4          1  
5          1  
6          1  
7         

In [11]:
from sklearn.metrics import confusion_matrix

print(confusion_matrix(df['Label'], df['sentiment']))

[[   0  213  169]
 [   0   49  126]
 [   0  275 1420]]


In [12]:
from sklearn.metrics import classification_report

print(classification_report(df['Label'], df['sentiment']))

              precision    recall  f1-score   support

          -1       0.00      0.00      0.00       382
           0       0.09      0.28      0.14       175
           1       0.83      0.84      0.83      1695

    accuracy                           0.65      2252
   macro avg       0.31      0.37      0.32      2252
weighted avg       0.63      0.65      0.64      2252



  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
