<a href="https://colab.research.google.com/github/ajij2021160085/CSE-412/blob/main/Amazon_Dot_Reviews_Sentiment_Analysis.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# IMPORTANT: RUN THIS CELL IN ORDER TO IMPORT YOUR KAGGLE DATA SOURCES,
# THEN FEEL FREE TO DELETE THIS CELL.
# NOTE: THIS NOTEBOOK ENVIRONMENT DIFFERS FROM KAGGLE'S PYTHON
# ENVIRONMENT SO THERE MAY BE MISSING LIBRARIES USED BY YOUR
# NOTEBOOK.
import kagglehub
# Fetch the 'sid321axn/amazon-alexa-reviews' dataset and keep whatever
# kagglehub returns (presumably the local download location - TODO confirm).
# NOTE(review): this variable is not read by the later `pd.read_csv` cell, which
# uses a hard-coded /kaggle/input path to a differently named CSV - confirm which
# data source is intended.
sid321axn_amazon_alexa_reviews_path = kagglehub.dataset_download('sid321axn/amazon-alexa-reviews')

print('Data source import complete.')


In [None]:
# PRELIMINARIES

In [None]:
'''

link: https://www.kaggle.com/sid321axn/amazon-alexa-reviews/home

This dataset consists of nearly 3000 Amazon customer reviews
(input text), star ratings, date of review, variant and feedback
of various Amazon Alexa products like Alexa Echo, Echo Dots,
Alexa Firesticks etc. for learning how to train a machine for
sentiment analysis.

You can use this data to analyze Amazon’s Alexa product;
discover insights into consumer reviews and assist with machine learning
models. You can also train your machine models for sentiment analysis and
analyze customer reviews: how many positive reviews are there,
and how many negative reviews?

Extracted from Amazon's website
'''

In [None]:
import pandas as pd, numpy as np
# Absolute location of the reviews CSV inside a Kaggle input directory.
# NOTE(review): hard-coded path; it only resolves where this Kaggle input is
# mounted - confirm it matches the dataset downloaded in the first cell.
PATH = "/kaggle/input/combinedataset-all-device/combined_smart_devices_sentiment.csv"
# Read the whole CSV into memory as the untouched starting frame.
raw_data = pd.read_csv(PATH)

In [None]:
# Show complete cell contents instead of truncating long review text.
# `None` means "no limit"; the older `-1` spelling is deprecated and is
# rejected by current pandas releases.
pd.set_option('display.max_colwidth', None)
raw_data.head()

In [None]:
# Mapping ratings to sentiment classes
# 5, 4 -> Positive (1), 1, 2 -> Negative (0), 3 -> Neutral (2)
# Any other rating value falls back to 0, as before. The previous lambda
# also sent 3-star reviews to 0, contradicting the mapping documented here.
raw_data['sentiment'] = raw_data['ratings'].map(
    lambda rating: 1 if rating in (5, 4) else (2 if rating == 3 else 0)
)

# Verifying the updated dataset
raw_data.head()

In [None]:
# Display the raw frame, which now also carries the derived `sentiment` column.
raw_data


In [None]:
# Number of reviews per star rating.
raw_data['ratings'].value_counts()

In [None]:
# Remove Rating = 3 categories as they are likely to confuse the model.
# response: 1 for ratings 4/5, 2 for rating 3, 0 for everything else.
raw_data['response'] = 0
raw_data.loc[raw_data['ratings'].isin([4, 5]), 'response'] = 1
raw_data.loc[raw_data['ratings'] == 3, 'response'] = 2
bad_reviews = raw_data[raw_data['response'] == 0]
good_reviews = raw_data[raw_data['response'] == 1]
# Keep only the good and bad rows (good first); response == 2 rows are left out.
# DataFrame.append no longer exists in pandas 2.x; pd.concat stacks the same
# rows in the same order and keeps the original index labels.
all_reviews = pd.concat([good_reviews, bad_reviews])

In [None]:
# Two-column modelling frame: the label plus the review text (renamed to `text`).
# Built as an independent copy so the many `data[...] = ...` assignments in
# later cells write to a real frame rather than to a slice of `all_reviews`.
data = (
    all_reviews[['response', 'verified_reviews']]
    .rename(columns={'verified_reviews': 'text'})
    .copy()
)

In [None]:
# EXPLORATORY DATA ANALYSIS

In [None]:
# Show complete review text in the preview. `None` disables truncation;
# the older `-1` value is deprecated and rejected by current pandas releases.
pd.set_option('display.max_colwidth', None)
data.head()

In [None]:
# Dimensions of `data` as a (rows, columns) tuple.
data.shape

In [None]:
# Event Rate
data.response.value_counts()

In [None]:
# Check for Nulls
data.isnull().sum()

In [None]:
# Check Data Types
# Series.astype returns a new Series, so the conversion must be stored; the
# original statement discarded its result. Missing reviews become empty strings
# so the `.split()`-based lambdas in later cells always receive a str.
data = data.assign(text=data['text'].fillna('').astype(str))
data.dtypes

In [None]:
import warnings
# Suppress every warning for the rest of the session.
# NOTE(review): this is a blanket filter, so library deprecation warnings
# raised by later cells are hidden as well.
warnings.filterwarnings("ignore")

In [None]:
# SYNTACTICAL FEATURES (PHYSICAL DESCRIPTIONS)

In [None]:
# 1. Size Measurements
review_text = data['text']

# characters per review (also text size)
data['char_cnt'] = review_text.str.len()

# whitespace-separated words per review
data['word_cnt'] = review_text.map(lambda review: len(str(review).split()))

# sentences per review, approximated by splitting on ". "
data['sentence_cnt'] = review_text.map(lambda review: len(str(review).split(". ")))

data[['text', 'char_cnt', 'word_cnt', 'sentence_cnt']].head()

In [None]:
# 2. Derived Ratios (element-wise divisions of the size measurements)

# characters per word
data['avg_word_size'] = data['char_cnt'].div(data['word_cnt'])

# characters per sentence
data['avg_char_per_sent'] = data['char_cnt'].div(data['sentence_cnt'])

# words per sentence
data['avg_word_per_sent'] = data['word_cnt'].div(data['sentence_cnt'])

data[['text','avg_word_size','avg_char_per_sent', 'avg_word_per_sent']].head()

In [None]:
# 3. Stopwords/filler Words

import nltk
nltk.download('stopwords')
from nltk.corpus import stopwords
stop = stopwords.words('english')
# Set copy of the stopword list: constant-time membership tests instead of
# scanning the whole list for every token. `stop` itself stays a list.
stop_lookup = set(stop)

# no. of stopwords (case-sensitive match against the stopword list)
data['stop_cnt'] = data['text'].apply(
    lambda text: sum(1 for word in text.split() if word in stop_lookup)
)

# no. of stopwords in every sentence
data['avg_stop_per_sent'] = data['stop_cnt']/data['sentence_cnt']

# no. of stopwords to total words
data['avg_stop_per_word'] = data['stop_cnt']/data['word_cnt']

data[['text','stop_cnt', 'avg_stop_per_sent', 'avg_stop_per_word']].head()

In [None]:
# 4. Counts of Key Characters/Words

# Token-level tests, applied to each whitespace-separated token of a review:
# hashtags, @tags, tokens ending in "!" and tokens ending in "?".
token_tests = {
    'hash_cnt': lambda token: token.startswith('#'),
    'tag_cnt': lambda token: token.startswith('@'),
    'excl_cnt': lambda token: token.endswith('!'),
    'ques_cnt': lambda token: token.endswith('?'),
}
for column, matches in token_tests.items():
    # `matches=matches` binds the current test to this lambda.
    data[column] = data['text'].apply(
        lambda text, matches=matches: sum(1 for token in text.split() if matches(token))
    )

# no. of numeric chars (counted per character, not per token)
data['num_cnt'] = data['text'].apply(lambda text: sum(ch.isdigit() for ch in text))

# no. of uppercase words (SHOUTING?)
data['upper_cnt'] = data['text'].apply(
    lambda text: sum(1 for token in text.split() if token.isupper())
)

data[['text','hash_cnt', 'tag_cnt', 'excl_cnt','ques_cnt', 'num_cnt', 'upper_cnt']].head()

In [None]:
# 5. Counts of Parts of Speech (POS Counts)
from collections import Counter

# tokenize
nltk.download('punkt')
nltk.download('averaged_perceptron_tagger')
# NOTE(review): recent NLTK releases appear to look these models up under the
# ids below; requesting them is harmless if they are already present - confirm
# against the installed NLTK version.
nltk.download('punkt_tab')
nltk.download('averaged_perceptron_tagger_eng')
from nltk import word_tokenize, pos_tag
data['word_tokens'] = data['text'].apply(word_tokenize)

# Tag each review exactly once and tally the tags; every *_cnt column below is
# then a cheap lookup instead of a fresh pos_tag() run per column.
tag_counts = data['word_tokens'].apply(
    lambda tokens: Counter(tag for _, tag in pos_tag(tokens))
)


def _count_tags(*tags):
    """Per-review total of tokens whose POS tag is exactly one of `tags`."""
    return tag_counts.apply(lambda counts: sum(counts[tag] for tag in tags))


# number of nouns
data['noun_cnt'] = _count_tags("NN", "NNS", "NNP", "NNPS")
data['proper_noun_cnt'] = _count_tags("NNP", "NNPS")

# number of pronouns
data['pronoun_cnt'] = _count_tags("PRP", "PRP$")
data['wh_pronoun_cnt'] = _count_tags("WP", "WP$")
data['pronoun_tot_cnt'] = data['pronoun_cnt'] + data['wh_pronoun_cnt']

# number of adjectives
data['adj_cnt'] = _count_tags("JJ", "JJR", "JJS")

# number of verbs
# VBD/VBN are the past-tense / past-participle tags and VBP/VBZ/VBG the
# present-tense / gerund tags; the two groups were previously assigned to the
# opposite column names.
data['verb_past_cnt'] = _count_tags("VBD", "VBN")
data['verb_present_cnt'] = _count_tags("VBP", "VBZ", "VBG")
data['verb_cnt'] = _count_tags("VB")
data['verb_tot_cnt'] =  data['verb_past_cnt'] + data['verb_present_cnt'] + data['verb_cnt']

# number of adverbs
data['adverb_cnt'] = _count_tags("RB", "RBR", "RBS")

# number of modals
data['modal_cnt'] = _count_tags("MD")

# number of foreign words
data['foreign_cnt'] = _count_tags("FW")

# number of determiners (the Penn Treebank tag is "DT"; "DET" never matched)
data['det_cnt'] = _count_tags("DT")

# number of conjunctions
data['cc_cnt'] = _count_tags("CC")
data['in_cnt'] = _count_tags("IN")
data['ccin_cnt'] = data['cc_cnt'] + data['in_cnt']

data[['text','noun_cnt', 'proper_noun_cnt',
                'pronoun_cnt','wh_pronoun_cnt','pronoun_tot_cnt',
                'adj_cnt', 'adverb_cnt','foreign_cnt','det_cnt', 'modal_cnt',
                'verb_cnt', 'verb_past_cnt', 'verb_present_cnt','verb_tot_cnt'
                ,'cc_cnt', 'in_cnt', 'ccin_cnt']].head()

In [None]:
# Prepare the Data
# Columns grouped by the section that produced them; `features` keeps the
# label first, followed by the groups in the order listed here.
size_features = ['char_cnt', 'word_cnt', 'sentence_cnt',
                 'avg_word_size', 'avg_char_per_sent', 'avg_word_per_sent']
stopword_features = ['stop_cnt', 'avg_stop_per_sent', 'avg_stop_per_word']
symbol_features = ['hash_cnt', 'tag_cnt', 'excl_cnt', 'ques_cnt', 'num_cnt', 'upper_cnt']
pos_features = ['noun_cnt', 'proper_noun_cnt',
                'pronoun_cnt', 'wh_pronoun_cnt', 'pronoun_tot_cnt',
                'adj_cnt', 'adverb_cnt', 'foreign_cnt', 'det_cnt', 'modal_cnt',
                'verb_cnt', 'verb_past_cnt', 'verb_present_cnt', 'verb_tot_cnt',
                'cc_cnt', 'in_cnt', 'ccin_cnt']

features = ['response'] + size_features + stopword_features + symbol_features + pos_features

# Select those columns and drop every row that has a missing value in any of them.
data_temp = data[features].dropna()
data_temp.head()

In [None]:
# Dimensions of `data_temp` as a (rows, columns) tuple.
data_temp.shape

In [None]:
!pip install catboost

In [None]:
# PRE PROCESSING FOR ADVANCED FEATURES

In [None]:
# lowercase all
# Split on whitespace, lowercase each token, and re-join with single spaces.
data['text_clean_v1'] = data['text'].map(
    lambda text: " ".join(token.lower() for token in text.split())
)
data[['text', 'text_clean_v1']].head()

In [None]:
# remove punctuation
# Delete every character that is neither a word character nor whitespace.
# regex=True is required: on current pandas str.replace treats the pattern as
# a literal string by default, which would leave the text unchanged. The raw
# string avoids the invalid "\w" escape in a normal literal.
data['text_clean_v2'] = data['text_clean_v1'].str.replace(r'[^\w\s]', '', regex=True)
data[['text', 'text_clean_v1', 'text_clean_v2']].head()

In [None]:
# remove stopwords
import nltk
nltk.download('stopwords')
from nltk.corpus import stopwords
stop = stopwords.words('english')
# Set copy for constant-time membership tests; `stop` itself stays a list.
stop_lookup = set(stop)

# Continue the cleaning chain from `text_clean_v2` (lowercased, punctuation
# stripped). Reading the raw `text` column here skipped both earlier steps, so
# capitalised stopwords such as "The" were never removed.
data['text_clean_v3'] = data['text_clean_v2'].apply(
    lambda text: " ".join(word for word in text.split() if word not in stop_lookup)
)
data[['text','text_clean_v3']].head()

In [None]:
'''
# correct spelling (takes time)
from textblob import TextBlob
data['text_clean_v4'] = data['text_clean_v3'].apply(lambda x: str(TextBlob(x).correct()))
data[['text_clean_v3', 'text_clean_v4']].head()
'''

In [None]:
# remove rare words
# value_counts() sorts by descending frequency, so the last 2500 entries are
# the 2500 least frequent tokens across all reviews.
rare_words = pd.Series(' '.join(data['text_clean_v3']).split()).value_counts()[-2500:]
rare_words_list = list(rare_words.index)
# Set copy so each token check is constant-time rather than a scan of the list.
rare_words_lookup = set(rare_words_list)
data['text_clean_v5'] = data['text_clean_v3'].apply(
    lambda text: " ".join(word for word in text.split() if word not in rare_words_lookup)
)

In [None]:
# stemming
#import nltk
# No nltk.download(...) call here: the stemmer below is used without any
# downloaded corpus, and 'stem' is not, as far as I can tell, an NLTK data
# package id (TODO confirm).
from nltk.stem import PorterStemmer
from textblob import Word
# One stemmer instance reused for every word instead of building one per word.
porter = PorterStemmer()
data['text_clean_v6'] = data['text_clean_v5'].apply(
    lambda text: " ".join(porter.stem(word) for word in text.split())
)
data[['text', 'text_clean_v6']].head()

In [None]:
# lemmatization
import nltk
nltk.download('wordnet')
from textblob import Word

# Lemmatize each whitespace token of `text_clean_v3`; the result replaces the
# `text_clean_v6` column written by the stemming cell.
data['text_clean_v6'] = data['text_clean_v3'].map(
    lambda text: " ".join(Word(token).lemmatize() for token in text.split())
)
data[['text', 'text_clean_v6']].head()

In [None]:
# to check the effects of text processing just undo it and proceed, then compare result
# TLDR: it doesn't help much!
# Overwrites `text_clean_v6` with the raw review text, so every later cell that
# reads `text_clean_v6` works on unprocessed text.
data['text_clean_v6'] = data['text']

In [None]:
# ADVANCED FEATURE ENGINEERING

In [None]:
# Physical Features (recomputed from `text_clean_v6`)
cleaned_text = data['text_clean_v6']

# characters, whitespace-separated words, and ". "-separated sentences
data['char_cnt'] = cleaned_text.str.len()
data['word_cnt'] = cleaned_text.map(lambda doc: len(str(doc).split()))
data['sentence_cnt'] = cleaned_text.map(lambda doc: len(str(doc).split(". ")))

# characters per word
data['avg_word_size'] = data['char_cnt'].div(data['word_cnt'])

# characters per sentence
data['avg_char_per_sent'] = data['char_cnt'].div(data['sentence_cnt'])

# words per sentence
data['avg_word_per_sent'] = data['word_cnt'].div(data['sentence_cnt'])

In [None]:
# TERM FREQUENCY/WORD COUNTS

In [None]:
# find common words
# Pool every token of every review, count occurrences, keep the 100 most frequent.
all_tokens = ' '.join(data['text_clean_v6']).split()
common_words = pd.Series(all_tokens).value_counts().head(100)
common_words.head(10)

In [None]:
# Bar chart of the ten most frequent tokens in `data['text_clean_v6']`.
import matplotlib.pyplot as plt

# Token frequencies over the whole corpus, ten most frequent only
corpus_tokens = ' '.join(data['text_clean_v6']).split()
common_words = pd.Series(corpus_tokens).value_counts().head(10)

fig, ax = plt.subplots(figsize=(10, 6))
common_words.plot(kind='bar', color='skyblue', alpha=0.7, ax=ax)
ax.set_title('Top 10 Most Common Words')
ax.set_xlabel('Words')
ax.set_ylabel('Frequency')
plt.xticks(rotation=45)
plt.show()


In [None]:
# Counts of Negative/Positive Words

# Hand-picked lexicons; tokens are lowercased before being compared to them.
negative_words = ['bad', 'horrible', 'sad','wrong','no','worst','worse',
                  'not', 'terrible', 'okay', 'sorrow', 'kill', 'negative', 'empty',
                 'hollow', 'poor', 'upset', 'why', 'unfair', 'eliminate','not',
                  'never', 'hate', 'dislike']

positive_words = ['good', 'great', 'awesome', 'happy', 'joy', 'enjoy', 'use', 'useful',
                  'wonder', 'wonderful', 'love', 'like', 'amazing',
                  'thanks', 'thank','hey', 'haha','nice', 'cool', 'lol',
                 'right', 'yeah', 'fun', 'well', 'enjoyable', 'crazy', 'super', 'kickass']


def _lexicon_hits(lexicon):
    """Build a function counting the whitespace tokens of a text found in `lexicon`."""
    return lambda text: sum(1 for token in text.split() if token.lower() in lexicon)


data['neg_word_cnt'] = data['text_clean_v6'].apply(_lexicon_hits(negative_words))
data['pos_word_cnt'] = data['text_clean_v6'].apply(_lexicon_hits(positive_words))

# derived ratios: hits per word (ratio1) and hits per sentence (ratio2)
data['neg_word_cnt_ratio1'] = data['neg_word_cnt'].div(data['word_cnt'])
data['neg_word_cnt_ratio2'] = data['neg_word_cnt'].div(data['sentence_cnt'])
data['pos_word_cnt_ratio1'] = data['pos_word_cnt'].div(data['word_cnt'])
data['pos_word_cnt_ratio2'] = data['pos_word_cnt'].div(data['sentence_cnt'])

new_features_2 = ['neg_word_cnt', 'pos_word_cnt',
                  'neg_word_cnt_ratio1', 'neg_word_cnt_ratio2',
                  'pos_word_cnt_ratio1', 'pos_word_cnt_ratio2']

data[['text_clean_v6'] + new_features_2].head()

In [None]:
# Count Vectors (Single Word)

from sklearn.feature_extraction.text import CountVectorizer
num_features = 50
vectorizer = CountVectorizer(ngram_range=(1,1),
                            max_features = num_features,
                            max_df=1.0, min_df=0.0)
count_vectors = vectorizer.fit_transform(list(data['text_clean_v6']))


# reshape to pandas
from scipy import sparse
# get_feature_names() has been removed from scikit-learn; use
# get_feature_names_out() when available and fall back for old installs.
if hasattr(vectorizer, 'get_feature_names_out'):
    vocabulary_terms = vectorizer.get_feature_names_out()
else:
    vocabulary_terms = vectorizer.get_feature_names()
# Plain list of str (later cells join these lists with `+`). The "cv_" prefix
# keeps these columns from sharing labels with the TF-IDF columns, which are
# built from the same text, or with existing columns of `data`.
count_vector_features = [f'cv_{term}' for term in vocabulary_terms]
count_vectors_pd = pd.DataFrame(count_vectors.toarray(), columns=count_vector_features)
data = pd.concat([data.reset_index(drop=True),count_vectors_pd.reset_index(drop=True)], axis=1)


In [None]:
# Count Vectors (2-Gram)

from sklearn.feature_extraction.text import CountVectorizer
num_features = 25
vectorizer = CountVectorizer(ngram_range=(2,2),
                            max_features = num_features,
                            max_df=1.0, min_df=0.0)
count_vectors = vectorizer.fit_transform(list(data['text_clean_v6']))

# reshape to pandas
from scipy import sparse
# get_feature_names() has been removed from scikit-learn; use
# get_feature_names_out() when available and fall back for old installs.
if hasattr(vectorizer, 'get_feature_names_out'):
    vocabulary_terms = vectorizer.get_feature_names_out()
else:
    vocabulary_terms = vectorizer.get_feature_names()
# Plain list of str (later cells join these lists with `+`); the "cv2_" prefix
# keeps the 2-gram columns distinct from every other column of `data`.
count_vector_2gram_features = [f'cv2_{term}' for term in vocabulary_terms]
count_vectors_pd = pd.DataFrame(count_vectors.toarray(), columns=count_vector_2gram_features)
data = pd.concat([data.reset_index(drop=True),count_vectors_pd.reset_index(drop=True)], axis=1)


In [None]:
# TFIDF Vectors

from sklearn.feature_extraction.text import TfidfVectorizer
num_features = 50
vectorizer = TfidfVectorizer(ngram_range=(1,1),
                            max_features = num_features,
                            max_df=1.0, min_df=0.0)
count_vectors = vectorizer.fit_transform(list(data['text_clean_v6']))


# reshape to pandas
from scipy import sparse
# get_feature_names() has been removed from scikit-learn; use
# get_feature_names_out() when available and fall back for old installs.
if hasattr(vectorizer, 'get_feature_names_out'):
    vocabulary_terms = vectorizer.get_feature_names_out()
else:
    vocabulary_terms = vectorizer.get_feature_names()
# Plain list of str (later cells join these lists with `+`). The "tfidf_"
# prefix keeps these columns from sharing labels with the single-word count
# columns, which are built from the same text.
tfidf_features = [f'tfidf_{term}' for term in vocabulary_terms]
count_vectors_pd = pd.DataFrame(count_vectors.toarray(), columns=tfidf_features)
data = pd.concat([data.reset_index(drop=True),count_vectors_pd.reset_index(drop=True)], axis=1)


In [None]:
#!pip install afinn

In [None]:
'''
from afinn import Afinn

def Afinn_apply(var):
    afinn = Afinn(emoticons=True)
    return afinn.score(var)

data['Afinn'] = data['text_clean_v6'].apply(Afinn_apply)
data[['text_clean_v6', 'Afinn']].head()
'''

In [None]:
 from textblob import TextBlob

def TextBlobPolarity(var):
  """Return the `sentiment.polarity` value TextBlob computes for the text `var`."""
  return TextBlob(var).sentiment.polarity

def TextBlobSubjectivity(var):
  """Return the `sentiment.subjectivity` value TextBlob computes for the text `var`."""
  return TextBlob(var).sentiment.subjectivity

# Analyse each review once and read both scores from the same result; the two
# helper functions each build their own TextBlob, so calling both doubles the work.
review_sentiments = [TextBlob(text).sentiment for text in data['text_clean_v6']]
data['TextBlobSubjectivity'] = [sentiment.subjectivity for sentiment in review_sentiments]
data['TextBlobPolarity'] = [sentiment.polarity for sentiment in review_sentiments]
# Interaction term: product of the two scores.
data['TextBlobSubPol_Interaction'] = data['TextBlobSubjectivity']*data['TextBlobPolarity']
data[['text_clean_v6', 'TextBlobPolarity', 'TextBlobSubjectivity']].head()

In [None]:
# Prepare the Data
# Label, hand-built numeric features, then the vectorizer and lexicon features.
engineered_columns = ['response',
                      'char_cnt', 'word_cnt', 'sentence_cnt',
                      'avg_word_size', 'avg_char_per_sent', 'avg_word_per_sent',
                      'TextBlobSubPol_Interaction', 'TextBlobPolarity', 'TextBlobSubjectivity',
                      'noun_cnt', 'proper_noun_cnt',
                      'pronoun_cnt', 'wh_pronoun_cnt', 'pronoun_tot_cnt',
                      'adj_cnt', 'adverb_cnt', 'foreign_cnt', 'det_cnt', 'modal_cnt',
                      'verb_cnt', 'verb_past_cnt', 'verb_present_cnt', 'verb_tot_cnt',
                      'cc_cnt', 'in_cnt', 'ccin_cnt']
model_columns = (engineered_columns
                 + count_vector_features
                 + tfidf_features
                 + count_vector_2gram_features
                 + new_features_2)

# Drop rows with any missing value, then exact duplicate rows (first one kept).
data_temp = data[model_columns].dropna().drop_duplicates(keep='first')
data_temp.head()

In [None]:
# Train Test Split
from sklearn.model_selection import train_test_split

# Target is the integer `response` column; everything else is a predictor.
y = data_temp['response'].astype('int')
X = data_temp.drop(columns='response')

# 70% of rows go to training; the fixed random_state makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, train_size=0.7, random_state=20
)

In [None]:
# Using the inverse of the class frequency to balance the classes; let's see what the output looks like. -Azizur

# Accuracy decreased, so this approach was not good!

#from sklearn.utils.class_weight import compute_class_weight
#class_weights_val = compute_class_weight('balanced', classes=np.unique(y_train), y=y_train)


In [None]:
#class_weights

In [None]:
from catboost import CatBoostClassifier, Pool

# Positions of the object-dtype columns, passed to CatBoost as categorical features.
categorical_idx = np.where(X.dtypes == 'object')[0]

train_pool = Pool(X_train, y_train, cat_features=categorical_idx)
test_pool = Pool(X_test, y_test, cat_features=categorical_idx)

# NOTE(review): the test pool doubles as the eval set while use_best_model=True,
# so the kept iteration is chosen on the same data that is scored later - confirm
# this is intended.
# NOTE(review): class_weights=[1, 3] presumably gives weight 1 to class 0 and 3
# to class 1 - confirm this matches the class meant to be up-weighted.
model = CatBoostClassifier(
    random_state=1,
    eval_metric='AUC',
    use_best_model=True,
    verbose=200,
    class_weights=[1, 3],
)
model.fit(train_pool, eval_set=test_pool)

In [None]:
#CAT FEATURE IMPORTANCE

# Importance scores computed by the fitted model on the training pool, paired
# with the training column names.
feature_importance = model.get_feature_importance(train_pool)
feature_names = X_train.columns
# Row 0 holds the names and row 1 the scores; transposing gives one row per
# feature, with column 0 = name and column 1 = importance.
feature_imp = pd.DataFrame([feature_names, feature_importance])
final = feature_imp.transpose().sort_values(by = 1, ascending = False)
# `None` disables column-width truncation; the older `-1` value is deprecated
# and rejected by current pandas releases.
pd.set_option('display.max_colwidth', None)
final.head(10)

In [None]:
# Horizontal bar chart of the ten highest importance scores.
# Relies on `feature_names` and `feature_importance` from the previous cell.
import pandas as pd
import matplotlib.pyplot as plt

# One row per feature, highest importance first
feature_imp = pd.DataFrame(
    {'Feature': feature_names, 'Importance': feature_importance}
).sort_values(by='Importance', ascending=False)
top_features = feature_imp.head(10)

fig, ax = plt.subplots(figsize=(10, 6))
ax.barh(top_features['Feature'], top_features['Importance'], color='skyblue')
ax.invert_yaxis()  # put the most important feature at the top
ax.set_title('Top 10 Most Important Features')
ax.set_xlabel('Feature Importance')
ax.set_ylabel('Features')
plt.show()


In [None]:
# RESULTS
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score, confusion_matrix, roc_curve, auc

probs = model.predict_proba(test_pool)
positive_scores = probs[:, 1]
# A review is predicted positive only when its class-1 probability exceeds 0.90.
pred = np.where(positive_scores > 0.90, 1, 0)

print('Predicted Class and Probabilities: \n')
print(pred[:5]) # predicted class
print(probs[:5]) # probability scores

# Threshold-based metrics use `pred`; AUC and GINI use the raw class-1 scores.
test_auc = roc_auc_score(y_test, positive_scores)
score_report = [
    ('\nAccuracy: ', accuracy_score(y_test, pred)),
    ('Precision: ', precision_score(y_test, pred)),
    ('Recall: ', recall_score(y_test, pred)),
    ('F1: ', f1_score(y_test, pred)),
    ('Area under ROC Curve: ', test_auc),
    ('GINI: ', -1 + 2*test_auc),
]
for label, value in score_report:
    print(label, str(value))

# The flattened confusion matrix is unpacked in this order.
tn, fp, fn, tp = confusion_matrix(y_test, pred).ravel()

confusion_report = [
    ('\nTrue Negatives: ', tn),
    ('True Positives: ', tp),
    ('False Negatives: ', fn),
    ('False Positives: ', fp),
    ('\nTotal Reviews: ', tn+fp+fn+tp),
    ('Reviews Predicted as Negative: ', fn+tn),
    ('Total Negative Reviews in Actuality: ', fp+tn),
    ('Negative Reviews that were Correctly Predicted: ', tn),
]
for label, value in confusion_report:
    print(label, str(value))

# Note (Aziz): this second model works better because it uses class weights,
# which make the model pay more attention to the minority class by penalizing
# its errors more heavily. The previous model did not account for class imbalance.
# NOTE(review): the original note quoted weights [1,2], while the classifier
# cell passes class_weights=[1, 3] - confirm which is intended.


In [None]:
import matplotlib.pyplot as plt
import seaborn as sns

# Confusion matrix of the thresholded predictions against the true labels
cm = confusion_matrix(y_test, pred)
class_labels = ['Negative', 'Positive']

# Annotated heatmap: one row per actual class, one column per predicted class
fig, ax = plt.subplots(figsize=(8, 6))
sns.heatmap(cm, annot=True, fmt='d', cmap='Blues',
            xticklabels=class_labels, yticklabels=class_labels, ax=ax)
ax.set_title('Confusion Matrix')
ax.set_xlabel('Predicted')
ax.set_ylabel('Actual')
plt.show()

In [None]:
import matplotlib.pyplot as plt
import numpy as np

# Class probabilities of the first five rows of `probs`, labelled Example 1..5
example_ids = range(1, 6)
negative_probs = probs[:5, 0]  # column 0: negative class
positive_probs = probs[:5, 1]  # column 1: positive class

width = 0.4  # bar width; each pair sits side by side around its tick
positions = np.arange(len(example_ids))

fig, ax = plt.subplots(figsize=(8, 6))
ax.bar(positions - width / 2, negative_probs, width=width,
       label='Negative Probability', color='orange', alpha=0.7)
ax.bar(positions + width / 2, positive_probs, width=width,
       label='Positive Probability', color='blue', alpha=0.7)

# Axis formatting
ax.set_xticks(positions)
ax.set_xticklabels([f'Example {i}' for i in example_ids])
ax.set_ylabel('Probability')
ax.set_title('Predicted Class Probabilities for First 5 Examples')
ax.legend()
plt.show()


In [None]:
# ROC curve of the class-1 probabilities on the test labels
from sklearn.metrics import roc_curve, auc
import matplotlib.pyplot as plt

# False/true positive rates across thresholds, and the area under that curve
fpr, tpr, thresholds = roc_curve(y_test, probs[:, 1])
roc_auc = auc(fpr, tpr)

fig, ax = plt.subplots(figsize=(8, 6))
ax.plot(fpr, tpr, color='blue', lw=2, label=f'ROC Curve (AUC = {roc_auc:.2f})')
ax.plot([0, 1], [0, 1], color='gray', linestyle='--', lw=1)  # diagonal reference line
ax.set_xlabel('False Positive Rate (FPR)')
ax.set_ylabel('True Positive Rate (TPR)')
ax.set_title('Receiver Operating Characteristic (ROC) Curve')
ax.legend(loc='lower right')
ax.grid()
plt.show()


In [None]:
# GINI coefficient derived from the ROC AUC computed in the previous cell
gini = -1 + 2 * roc_auc

# Single-bar chart of the coefficient
fig, ax = plt.subplots(figsize=(6, 4))
ax.bar(['GINI Coefficient'], [gini], color='lightcoral', alpha=0.7)
ax.set_ylim(0, 1)  # y-axis fixed to the 0-1 range
ax.set_title('Model GINI Coefficient')
ax.set_ylabel('Score')

# Print the value just above the bar
ax.text(0, gini + 0.02, f'{gini:.2f}', ha='center', fontsize=12)

plt.show()
