In [18]:
# Similarity sentences helper function

def similarity_sentences(s1, s2):
    """Score how similar two texts are, based only on their content words.

    Each text is reduced to the lemmas of its verbs, adjectives, nouns and
    proper nouns (see ``_keyword_lemmas``); the two reduced strings are then
    re-parsed with the notebook-level ``nlp`` pipeline and compared with
    ``Doc.similarity``.

    Args:
        s1: First text (str).
        s2: Second text (str).

    Returns:
        The value of ``Doc.similarity`` for the two keyword strings, or
        ``0.0`` when either text yields no keywords at all.
    """
    keywords_1 = _keyword_lemmas(s1)
    keywords_2 = _keyword_lemmas(s2)

    # Nothing to compare: an empty Doc has no vector, so spaCy would only
    # emit an "empty vectors" warning before reporting 0.0 anyway.
    # NOTE(review): relies on spaCy returning 0.0 for zero-norm vectors - confirm
    # against the installed spaCy version.
    if not keywords_1 or not keywords_2:
        return 0.0

    return nlp(keywords_1).similarity(nlp(keywords_2))


def _keyword_lemmas(text, pos_tags=('VERB', 'ADJ', 'NOUN', 'PROPN')):
    """Return the space-joined lemmas of the content words in ``text``."""
    lemmas = [
        token.lemma_
        for sent in nlp(text).sents
        # The first token of every sentence is always skipped
        # (presumably the subject shared by the generated sentences - TODO confirm).
        for position, token in enumerate(sent)
        if position > 0 and token.pos_ in pos_tags
    ]
    return ' '.join(lemmas)

In [19]:
def doc_based_similarity(corpus):
    """Build the pairwise similarity matrix for a list of sentences.

    Args:
        corpus: Sequence of sentences (str).

    Returns:
        A square ``numpy`` float array where entry ``[i, j]`` is
        ``similarity_sentences(corpus[i], corpus[j])``. An empty corpus
        yields an empty ``(0, 0)`` matrix.
    """
    size = len(corpus)
    matrix = np.zeros((size, size), dtype=float)

    for row in range(size):
        # Only the upper triangle (diagonal included) is scored; each value is
        # mirrored to the lower triangle, halving the number of spaCy calls.
        # NOTE(review): assumes similarity_sentences(a, b) == similarity_sentences(b, a).
        for column in range(row, size):
            score = similarity_sentences(corpus[row], corpus[column])
            matrix[row, column] = score
            matrix[column, row] = score

    return matrix


In [20]:
def text_rank(input_, factor):
    """Rank matrix rows with PageRank and keep the best-scoring ones.

    Args:
        input_: Square numpy array handed to ``networkx.from_numpy_array``.
        factor: Multiplier applied to the highest PageRank score to obtain
            the cut-off.

    Returns:
        List of node indices whose score is at least ``factor * max_score``,
        ordered from highest to lowest score.
    """
    # Build the graph from the matrix and score its nodes
    graph = networkx.from_numpy_array(input_)
    pagerank_scores = networkx.pagerank(graph)

    # (score, index) pairs, best first; equal scores order by larger index
    ranking = sorted(
        zip(pagerank_scores.values(), pagerank_scores.keys()),
        reverse=True,
    )

    # Keep everything at or above the threshold derived from the top score
    cutoff = factor * np.max([score for score, _ in ranking])
    return [index for score, index in ranking if score >= cutoff]

In [21]:
def summarization(corpus, factor=0.5, as_string=False):
    """Pick the top-ranked sentences of ``corpus``.

    Args:
        corpus: Sequence of sentences (str).
        factor: Threshold factor forwarded to ``text_rank``.
        as_string: When truthy, return one bulleted string instead of an array.

    Returns:
        A numpy array of the selected sentences in rank order, or - with
        ``as_string`` - a single string with one ``● sentence`` line each.
    """
    # Pairwise similarity matrix -> ranked indices -> selected sentences
    similarity_matrix = doc_based_similarity(corpus)
    top_indices = text_rank(similarity_matrix, factor=factor)
    selected = np.array(corpus)[top_indices]

    if not as_string:
        return selected

    bullet_lines = []
    for sentence in selected:
        bullet_lines.append('● ' + sentence + '\n')
    return ''.join(bullet_lines)

In [22]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

import spacy
import networkx

import re
import os
import json

import ast

from tqdm import tqdm

In [23]:
# Load the spaCy pipeline 'en_core_web_lg' into the notebook-level `nlp`
# object, which similarity_sentences() reads as a global.
nlp = spacy.load('en_core_web_lg')

In [24]:
# Load the review records from 'data-2.json' into `data`
with open('data-2.json', 'r') as json_file:
    data = json.load(json_file)

# Show the loaded records as the cell output
data

{'0': {'id': 1,
  'review': 'The staff were incredibly helpful and patient, helping me find the perfect phone!',
  'date': '2024-11-01',
  'aspect': {'staff': {'ADJ': [[['helpful'], 'staff were helpful.', 'Neutral'],
     [['patient'], 'staff were patient.', 'Neutral']],
    'VERB': [],
    'OTHER': []}},
  'sentiment': 'Neutral'},
 '1': {'id': 2,
  'review': 'I had a great experience purchasing my phone here, the process was smooth and quick.',
  'date': '2024-11-01',
  'aspect': {'process': {'ADJ': [[['quick'], 'process was quick.', 'Positive'],
     [['smooth'], 'process was smooth.', 'Positive']],
    'VERB': [],
    'OTHER': []}},
  'sentiment': 'Positive'},
 '2': {'id': 3,
  'review': 'Their selection of phones is amazing, and the prices are very competitive!',
  'date': '2024-11-01',
  'aspect': {'selection': {'ADJ': [[['amazing'],
      'selection is amazing.',
      'Positive']],
    'VERB': [],
    'OTHER': []},
   'price': {'ADJ': [[['competitive'], 'price are competitive.',

In [25]:
def adjustment_aspect_data(input_, aspect):
    """Collect the sentences recorded for one aspect, grouped by sentiment.

    Args:
        input_: Mapping of record id -> record dict. A record may be empty;
            otherwise its optional ``'aspect'`` entry maps aspect names to
            ``{label: [entry, ...]}`` where every entry unpacks into three
            items ``(keywords, sentence, sentiment)``.
        aspect: Name of the aspect to extract.

    Returns:
        ``{'positive' | 'negative' | 'neutral': {'ADJ' | 'VERB' | 'OTHER':
        [sentence, ...]}}`` holding the sentences found for ``aspect``.

    Raises:
        KeyError: If an entry carries a sentiment or label outside the sets
            listed above.
    """
    labels = ('ADJ', 'VERB', 'OTHER')
    storage = {
        sentiment: {label: [] for label in labels}
        for sentiment in ('positive', 'negative', 'neutral')
    }

    for record in input_.values():
        # Empty records and records without aspect information carry nothing
        if not record:
            continue

        # Aspect names are dict keys, so a direct lookup replaces the scan
        labelled_entries = (record.get('aspect') or {}).get(aspect)
        if not labelled_entries:
            continue

        for label, entries in labelled_entries.items():
            for _, sentence, sentiment in entries:
                storage[sentiment.lower()][label].append(sentence)

    return storage

In [33]:
def summarize_swot(input_, aspect):
    """Turn sentiment-grouped sentences of one aspect into bulleted summaries.

    Args:
        input_: ``{sentiment: {label: [sentence, ...]}}`` with sentiments
            ``'positive'``, ``'negative'``, ``'neutral'`` and labels
            ``'ADJ'``, ``'VERB'``, ``'OTHER'`` (other labels are ignored).
        aspect: Aspect name the sentences belong to.

    Returns:
        Dict with keys ``'aspect'``, ``'strength'`` (positive), ``'weakness'``
        (negative) and ``'fine'`` (neutral); each summary is a string of
        ``● ...`` lines, empty when there is nothing to report.

    Raises:
        KeyError: If ``input_`` contains a sentiment other than the three above.
    """
    mapper = {
        'positive': 'strength',
        'negative': 'weakness',
        'neutral': 'fine',
    }

    storage = {
        'aspect': aspect,
        'strength': '',
        'weakness': '',
        'fine': '',
    }

    for sentiment, by_label in input_.items():
        target = mapper[sentiment]
        parts = []
        for label, sentences in by_label.items():
            if len(sentences) == 0:
                continue
            if label in ('VERB', 'OTHER'):
                if len(sentences) < 3:
                    # Too few sentences to rank: list them all
                    parts.append(''.join('● ' + s + '\n' for s in sentences))
                else:
                    parts.append(summarization(sentences, as_string=True))
            elif label == 'ADJ':
                # Merge all descriptors into a single "<aspect> is a, b, c." line
                descriptors = [_strip_aspect_prefix(s, aspect) for s in sentences]
                parts.append('● ' + aspect + ' is ' + ', '.join(descriptors) + '.\n')
        storage[target] = ''.join(parts)

    return storage


def _strip_aspect_prefix(sentence, aspect):
    """Drop the leading "<aspect> <verb>" words and all periods from ``sentence``."""
    words = sentence.split()
    aspect_words = aspect.lower().split()
    count = len(aspect_words)

    if count and [w.lower() for w in words[:count]] == aspect_words:
        # Skip every word of the aspect plus the verb that follows it, so
        # multi-word aspects ("phone plan is cheap.") do not leak the verb.
        skip = count + 1
    else:
        # Sentence does not start with the aspect: keep the two-word offset
        skip = 2

    return ' '.join(words[skip:]).replace('.', '')

In [34]:
def processing_swot(data, aspect):
    """Produce the summary dict for a single aspect.

    Args:
        data: Review records, passed to ``adjustment_aspect_data``.
        aspect: Name of the aspect to summarise.

    Returns:
        Whatever ``summarize_swot`` returns for the sentences of ``aspect``.
    """
    # Step 1: gather the sentences of this aspect, grouped by sentiment
    grouped_sentences = adjustment_aspect_data(data, aspect)
    # Step 2: condense them into the strength / weakness / fine summary
    return summarize_swot(grouped_sentences, aspect)

In [38]:
# Re-display the loaded review records as the cell output
data

{'0': {'id': 1,
  'review': 'The staff were incredibly helpful and patient, helping me find the perfect phone!',
  'date': '2024-11-01',
  'aspect': {'staff': {'ADJ': [[['helpful'], 'staff were helpful.', 'Neutral'],
     [['patient'], 'staff were patient.', 'Neutral']],
    'VERB': [],
    'OTHER': []}},
  'sentiment': 'Neutral'},
 '1': {'id': 2,
  'review': 'I had a great experience purchasing my phone here, the process was smooth and quick.',
  'date': '2024-11-01',
  'aspect': {'process': {'ADJ': [[['quick'], 'process was quick.', 'Positive'],
     [['smooth'], 'process was smooth.', 'Positive']],
    'VERB': [],
    'OTHER': []}},
  'sentiment': 'Positive'},
 '2': {'id': 3,
  'review': 'Their selection of phones is amazing, and the prices are very competitive!',
  'date': '2024-11-01',
  'aspect': {'selection': {'ADJ': [[['amazing'],
      'selection is amazing.',
      'Positive']],
    'VERB': [],
    'OTHER': []},
   'price': {'ADJ': [[['competitive'], 'price are competitive.',

In [40]:
def get_all_aspects(data):
    """List every distinct aspect name found in the records.

    Args:
        data: Mapping of record id -> record dict; a record's optional
            ``'aspect'`` entry is a mapping keyed by aspect name.

    Returns:
        List of aspect names in order of first appearance. Empty records and
        records without an ``'aspect'`` entry are skipped.
    """
    # A dict keeps insertion order and gives constant-time de-duplication
    seen = {}
    for record in data.values():
        if not record:
            continue
        for name in (record.get('aspect') or {}):
            seen.setdefault(name, None)
    return list(seen)

# Collect the distinct aspect names present in the loaded records
aspects = get_all_aspects(data)

# Print how many aspects were found, then show the list as the cell output
print(len(aspects))
aspects

31


['staff',
 'process',
 'selection',
 'price',
 'question',
 'technician',
 'phones issue',
 'experience',
 'variety',
 'service',
 'deal',
 'buying',
 'fixing',
 'phone',
 'store',
 'warranty service',
 'layout',
 'one',
 'time',
 'place',
 'plan',
 'phone plan',
 'warranty',
 'repair',
 'same issue',
 'policy',
 'job',
 'representative',
 'support',
 'return window',
 'long lines']

In [41]:
def processing_swot_all(data, aspects=None):
    """Run ``processing_swot`` for a list of aspects.

    Args:
        data: Review records, passed to ``processing_swot`` for each aspect.
        aspects: Optional iterable of aspect names. When omitted, the aspects
            are taken from ``data`` itself via ``get_all_aspects`` rather than
            from a notebook-level variable, so the result depends only on the
            arguments.

    Returns:
        List with one ``processing_swot`` result per aspect, in the order of
        ``aspects``.
    """
    if aspects is None:
        aspects = get_all_aspects(data)

    return [processing_swot(data, name) for name in aspects]


# Build the summary for every aspect and show it as the cell output
result = processing_swot_all(data)
result

  similarity = doc1.similarity(doc2)


[{'aspect': 'staff',
  'strength': '● staff is friendly, ready.\n● I could not resist offer amazing deals on phones.\n● I made right purchase.\n● I appreciate walked through setting new device.\n● I got good deal on old phone.\n● I appreciate were able.\n● I had great experience with program.\n● staff really went mile.\n● I very disorganized waited forever.\n● I m satisfied.\n',
  'weakness': '● staff is rude.\n● I had took too long.\n',
  'fine': '● staff is helpful, patient, knowledgeable, helpful in setting phone, patient with questions, informative, accommodating, reliable, not apologetic, unhelpful, unprofessional.\n● I found needed what.\n● I had to call multiple times.\n● I had to return faulty phone.\n● staff took time.\n● I felt overcharged.\n● I had took way.\n● staff gave incorrect information about phone plan.\n● I felt pressured.\n● staff made.\n● I always leave store.\n● I appreciate walked me.\n● I could not resist upgrading.\n● I bought phone.\n● staff seemed.\n● store 

In [None]:
# pd.DataFrame.from_dict(result).to_excel('fail.xlsx', index=False)

In [44]:
# # Save dictionary to a JSON file
# with open('data-table-3.json', 'w') as json_file:
#     json.dump(result, json_file, indent=4)  # 'indent=4' makes the JSON pretty-printed