In [None]:
!pip install spacy

In [1]:
# Import libraries and load data
import pandas as pd
import spacy
from collections import Counter


df = pd.read_csv("three_sentiments.csv")
df['average_sentiment'] = (df['bert_sentiment'] + df['vader_sentiment'] + 0.5*df['textblob_sentiment'])/2.5

data_neg = df[df['NPS'] < 6]
data_pos = df[df['NPS'] > 7]

In [2]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.decomposition import NMF
import numpy as np

def NMF_TM (dataset):
    # Preparing the text data
    text_data = dataset['translated_comment'].values

    # TF-IDF Vectorization
    tfidf_vectorizer = TfidfVectorizer(max_df=0.95, min_df=2, stop_words='english')
    tfidf = tfidf_vectorizer.fit_transform(text_data)

    # Applying NMF for Topic Modeling
    nmf_model = NMF(n_components=22, random_state=42)  # Number of topics
    nmf_topic = nmf_model.fit_transform(tfidf)

    # Displaying the top words for each topic
    def display_topics(model, feature_names, no_top_words):
        for topic_idx, topic in enumerate(model.components_):
            print(f"Topic {topic_idx}:")
            print(" ".join([feature_names[i] for i in topic.argsort()[:-no_top_words - 1:-1]]))

    no_top_words = 10
    display_topics(nmf_model, tfidf_vectorizer.get_feature_names_out(), no_top_words)
    dataset['topic'] = np.argmax(nmf_topic, axis=1)
    # dataset.reset_index(inplace=True)
    # Grouping the dataset by topic and calculating mean NPS and sentiment scores for each topic
    topic_summary = dataset.groupby('topic').agg({
        'NPS': 'mean',
        'avg_sentiment': 'mean',
        'ID': 'count'
    }).rename(columns={'ID': 'responses_count'}).reset_index()

    return topic_summary

In [3]:
# Run it on positive and negative data
neg = NMF_TM(data_neg)
pos = NMF_TM(data_pos)

Topic 0:
know did doesn didn just doing companies little right moment
Topic 1:
service customer poor years unfriendly just new friendly lousy good
Topic 2:
price reduce high increase compared increases offer year gouging best
Topic 3:
answer questions didn question clear called received waiting got previous
Topic 4:
prices high increase higher bills little quite current increased lowering
Topic 5:
time long took waiting takes wait process times response connection
Topic 6:
don like talk understand think pay advertise good bills ask
Topic 7:
contact personal possible telephone got employee like impossible poor essent
Topic 8:
contract new year years essent gas offer customers number old
Topic 9:
bad communication service good accessibility advice website help experiences support
Topic 10:
recommend company friends reason companies rarely essent services decide eon
Topic 11:
expensive think way eon bills compared getting electricity quite essent
Topic 12:
lower costs network fees fee pri

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  dataset['topic'] = np.argmax(nmf_topic, axis=1)


Topic 0:
satisfaction services overall absolute maximum term complete great long communication
Topic 1:
satisfied services essent completely just eon years company overall work
Topic 2:
good communication just information explanation experience contact cooperation advice prices
Topic 3:
friendly helpful employee lady knowledgeable phone staff help dealings correct
Topic 4:
problems eon essent solved years haven ve company works didn
Topic 5:
service nice excellent quality professional customer good provider online super
Topic 6:
ok invoice communication works online contact didn resolved came receive
Topic 7:
clear explanation information simple answer easy understandable answers website use
Topic 8:
time long invoice comes came took invoices waiting arrive pay
Topic 9:
fast processing simple communication uncomplicated correct reliable professional nice accurate
Topic 10:
helped kindly quickly employee neatly lot clearly correctly nice lady
Topic 11:
problem solved solving solution eo

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  dataset['topic'] = np.argmax(nmf_topic, axis=1)


In [4]:
neg

Unnamed: 0,topic,NPS,avg_sentiment,responses_count
0,0,2.983914,4.846844,373
1,1,1.981567,4.287044,217
2,2,3.156951,5.399914,223
3,3,2.255682,4.207262,352
4,4,3.045455,5.340006,220
5,5,2.840951,4.842709,547
6,6,2.973529,4.656176,340
7,7,2.147368,4.195277,285
8,8,2.387931,4.482617,464
9,9,2.072131,3.130514,305


In [5]:
pos

Unnamed: 0,topic,NPS,avg_sentiment,responses_count
0,0,9.536477,6.932941,2248
1,1,9.348741,7.695614,2185
2,2,9.100114,7.904831,1748
3,3,9.093927,7.816172,1235
4,4,9.201183,6.477791,845
5,5,9.277881,7.553877,1076
6,6,9.324786,6.794746,585
7,7,9.080036,7.117654,2224
8,8,9.182631,5.946476,2683
9,9,9.513223,6.975635,1210


## Coherence Score

In [11]:
# Re-importing necessary libraries and redefining required functions after code execution state reset
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.decomposition import NMF
from gensim.models.coherencemodel import CoherenceModel
from gensim.corpora.dictionary import Dictionary
from gensim.matutils import Sparse2Corpus
import numpy as np

# Loading the dataset again
file_path = 'three_sentiments.csv'
data = pd.read_csv(file_path)

# Preparing the text data
text_data = data['translated_comment_preprocessed'].values

# TF-IDF Vectorization
tfidf_vectorizer = TfidfVectorizer(max_df=0.95, min_df=2, stop_words='english')
tfidf = tfidf_vectorizer.fit_transform(text_data)

# Convert TF-IDF to Gensim format
corpus = Sparse2Corpus(tfidf, documents_columns=False)
id_map = dict((v, k) for k, v in tfidf_vectorizer.vocabulary_.items())
dictionary = Dictionary.from_corpus(corpus, id2word=id_map)

# Function to calculate coherence score for NMF
def calculate_coherence_score(tfidf_vectorizer, tfidf, n_topics, texts, dictionary):
    # Fit NMF model
    nmf_model = NMF(n_components=n_topics, random_state=42)
    W = nmf_model.fit_transform(tfidf)
    H = nmf_model.components_
    
    # Create topics
    topics = [[tfidf_vectorizer.get_feature_names_out()[i] for i in topic.argsort()[:-11:-1]] for topic in H]
    
    # Calculate Coherence
    coherence_model = CoherenceModel(topics=topics, texts=texts, dictionary=dictionary, coherence='c_v')
    return coherence_model.get_coherence()

# Preparing texts for coherence calculation
texts = [doc.split() for doc in data['translated_comment_preprocessed'].tolist()]


(range(2, 11),
 [0.394961724744598,
  0.43604674196968896,
  0.39131220346762885,
  0.4397453043724028,
  0.407316580580849,
  0.4405973959930031,
  0.4650447109770488,
  0.47632757287299426,
  0.4695741054541352])

In [None]:
# Range of topics to evaluate
n_topics_range = range(2, 101)
coherence_scores = []

for n_topics in n_topics_range:
    coherence_score = calculate_coherence_score(tfidf_vectorizer, tfidf, n_topics, texts, dictionary)
    coherence_scores.append(coherence_score)

# Displaying the coherence scores for each number of topics
coherence_scores


In [13]:
coherence_scores

[0.394961724744598,
 0.43604674196968896,
 0.39131220346762885,
 0.4397453043724028,
 0.407316580580849,
 0.4405973959930031,
 0.4650447109770488,
 0.47632757287299426,
 0.4695741054541352,
 0.4735428468011607,
 0.4904712668978726,
 0.49725634812279523,
 0.5046920629027244,
 0.5109240103407463,
 0.5100003708019907,
 0.5044256031500398,
 0.5008038956081855,
 0.5035056114086479,
 0.5194522220440649,
 0.5125026483508022,
 0.5218825170345145,
 0.5277660877483594,
 0.5164933684893136,
 0.5180214460913636,
 0.5181022271765825,
 0.5154812831305873,
 0.5136278569341051,
 0.4995457327954947,
 0.5069793140080893,
 0.49907494946693753,
 0.5000046416635883,
 0.49800641729564293,
 0.501982117474703,
 0.5035260443807442,
 0.5030294779936528,
 0.5010388734741908,
 0.48837888798833495,
 0.49355617403636187,
 0.49307179549826874,
 0.4951750449358819,
 0.49638908053142267,
 0.48916492829041874,
 0.48032531943971885,
 0.48083974910542643,
 0.4895530052779714,
 0.4799764091644315,
 0.4726065170752123,
 0.

## Service Attributes Extraction

In [7]:
# Load the English language model
nlp = spacy.load("en_core_web_sm")

# Function to extract complex noun phrases using spaCy
def extract_complex_noun_phrases(text):
    # Process the text with spaCy
    doc = nlp(text)
    
    # Extract noun phrases
    noun_phrases = [chunk.text for chunk in doc.noun_chunks]
    
    return noun_phrases

# Sample customer comment
sample_comment = "The customer service was excellent, and the staff was very helpful and friendly."

# Extract complex noun phrases from the comment
attributes = extract_complex_noun_phrases(sample_comment)

print("Extracted Complex Noun Phrases:", attributes)


Extracted Complex Noun Phrases: ['The customer service', 'the staff']


In [8]:
def extract_noun_phrases(comments):
    noun_phrases = []
    for doc in nlp.pipe(comments, batch_size=50):
        for chunk in doc.noun_chunks:
            noun_phrases.append(chunk.text.lower())  # Convert to lower case for consistent counting
    return noun_phrases

comments = df['translated_comment_preprocessed']

# Extract noun phrases from all comments
all_noun_phrases = extract_noun_phrases(comments)

# Count the frequency of each noun phrase
phrase_counts = Counter(all_noun_phrases)

In [9]:
# Identify the most common noun phrases
most_common_phrases = phrase_counts.most_common(100)  # Adjust the number to get more or fewer phrases

# Extract just the phrases into a list, excluding their counts
most_common_phrases_list = [phrase for phrase, count in most_common_phrases]

print("Most Common Noun Phrases:", most_common_phrases_list)


Most Common Noun Phrases: ['everything', 'satisfaction', 'i', 'eon', 'problem', 'no problem', 'anything', 'nothing', 'speed', 'invoice', 'question', 'customer', 'employee', 'contract', 'email', 'someone', 'price', 'anyone', 'people', 'essent', 'something', 'good service', 'u', 'time', 'information', 'that', 'request', 'satisfied service', 'everyone', 'reliability', 'company', 'no one', 'answer question', 'service', 'phone', 'work', 'care', 'bill', 'long time', 'money', 'communication', 'touch', 'good communication', 'electricity', 'customer service', 'website', 'chat', 'place', 'thing', 'lady', 'no contact', 'client', 'contact', 'you', 'site', 'supplier', 'answer', 'good help', 'nee', 'professionalism', 'satisfied customer', 'fast processing', 'month', 'lower price', 'others', 'good experience', 'day', 'consumption', 'lot', 'chatbot', 'meter', 'no complaint', 'conversation', 'satisfaction service', 'internet', 'no answer question', 'account', 'app', 'satisfied eon', 'appointment', 'tra

In [10]:
loyalty_drivers = {
    "Service Quality": ["speed", "responsiveness", "customer service", "professionalism", "efficiency", "problem", "no problem", "service", "satisfied service", "satisfaction service"],
    "Product Quality": ["reliability", "technology"],
    "Value": ["price", "lower price", "value", "billing issues"],
    "Customer Experience": ["satisfaction", "customer experience", "accessibility", "transparency", "invoice"],
    "Communication": ["communication", "good communication", "information", "good information"],
}
service_attributes = [
    "satisfaction", "problem", "no problem", "speed", "invoice",
    "customer service", "communication", "good communication", "price",
    "lower price", "reliability", "information", "good information",
    "professionalism", "service", "satisfied service", "satisfaction service",
    "customer experience", "efficiency", "accessibility", "transparency",
    "responsiveness", "billing issues", "technology"
]

In [15]:
from collections import defaultdict

driver_sentiment = defaultdict(list)

def classify_and_aggregate_feedback(row):
    comment = row['translated_comment'].lower()  # Ensure matching in lowercase
    sentiment = row['avg_sentiment']
    
    # for driver, attributes in loyalty_drivers.items():        # For categories
    #     if any(attribute in comment for attribute in attributes):
    #         driver_sentiment[driver].append(sentiment)
    
    for attribute in service_attributes:            # For each attribute
        if attribute in comment:
            driver_sentiment[attribute].append(sentiment)

# Apply the function to each row in the DataFrame
data_neg.apply(classify_and_aggregate_feedback, axis=1)

# Calculate average sentiment for each loyalty driver
average_sentiment_per_driver = {driver: sum(sentiments) / len(sentiments) if sentiments else None for driver, sentiments in driver_sentiment.items()}

sorted_attributes_by_sentiment = sorted(average_sentiment_per_driver.items(), key=lambda x: x[1], reverse=True)

# Display the sorted average sentiment for each service attribute
for attribute, avg_sentiment in sorted_attributes_by_sentiment:
    print(f"{attribute}: {round(avg_sentiment, 2)}")

customer experience: 8.16
efficiency: 7.35
good communication: 6.58
reliability: 5.97
price: 5.27
speed: 5.14
responsiveness: 5.1
good information: 4.73
lower price: 4.39
transparency: 4.38
satisfaction: 4.29
technology: 4.23
no problem: 4.22
communication: 4.14
professionalism: 4.14
information: 4.1
problem: 4.06
invoice: 4.06
customer service: 4.05
service: 3.96
accessibility: 3.93
