In [66]:
# Helper: keyword-based similarity between two sentences

def _extract_keywords(text, pos_tags=('VERB', 'ADJ', 'NOUN', 'PROPN')):
    """Return the space-joined lemmas of the keyword tokens in ``text``.

    ``text`` is parsed with the module-level ``nlp`` pipeline. A token is kept
    when its coarse part-of-speech tag is in ``pos_tags``.
    """
    lemmas = []
    for sent in nlp(text).sents:
        for i, token in enumerate(sent):
            # The first token of every sentence is dropped regardless of its tag.
            # NOTE(review): presumably because the inputs start with the
            # aspect/subject word, which should not drive the score - confirm.
            if i == 0:
                continue
            if token.pos_ in pos_tags:
                lemmas.append(token.lemma_)
    return ' '.join(lemmas)


def similarity_sentences(s1, s2):
    """Score the similarity of two texts using only their keyword lemmas.

    Each text is reduced to the lemmas of its verbs, adjectives, nouns and
    proper nouns (see ``_extract_keywords``); the two reduced strings are then
    re-parsed with ``nlp`` and compared with ``Doc.similarity``.

    Parameters
    ----------
    s1, s2 : str
        The texts to compare.

    Returns
    -------
    float
        The value of ``Doc.similarity`` for the two keyword documents, or
        ``0.0`` when either text yields no keywords at all.
    """
    keywords_1 = _extract_keywords(s1)
    keywords_2 = _extract_keywords(s2)

    # An empty keyword string would become an empty Doc without a vector;
    # spaCy then emits an "empty vectors" warning and returns 0.0 anyway.
    # Returning 0.0 directly gives the same score without the warning noise.
    if not keywords_1 or not keywords_2:
        return 0.0

    return nlp(keywords_1).similarity(nlp(keywords_2))

In [67]:
def doc_based_similarity(corpus):
    """Build the pairwise similarity matrix of the texts in ``corpus``.

    Parameters
    ----------
    corpus : sequence of str
        The texts to compare; must support ``len`` and integer indexing.

    Returns
    -------
    numpy.ndarray
        Float array of shape ``(len(corpus), len(corpus))`` whose ``[i, j]``
        entry is ``similarity_sentences(corpus[i], corpus[j])``. An empty
        corpus yields an array of shape ``(0, 0)``.
    """
    size = len(corpus)
    matrix = np.zeros((size, size))

    for row in range(size):
        # Only the upper triangle (diagonal included) is computed and then
        # mirrored. This relies on similarity_sentences(a, b) being equal to
        # similarity_sentences(b, a), which holds for the vector similarity it
        # returns, and halves the number of expensive NLP comparisons.
        for column in range(row, size):
            score = similarity_sentences(corpus[row], corpus[column])
            matrix[row, column] = score
            matrix[column, row] = score

    return matrix


In [68]:
def text_rank(input_, factor):
    """Select the best-ranked node indices of a similarity matrix.

    ``input_`` is turned into a graph with ``networkx.from_numpy_array`` and
    scored with ``networkx.pagerank``. A node is kept when its score is at
    least ``factor`` times the highest score.

    Returns the kept node indices as a list, ordered from the highest score
    to the lowest (equal scores are ordered by descending index).
    """
    # Graph construction and PageRank scoring
    graph = networkx.from_numpy_array(input_)
    page_scores = networkx.pagerank(graph)

    # (score, index) pairs, best first
    ranking = sorted(
        [(value, node) for node, value in page_scores.items()],
        reverse=True,
    )

    # Keep everything scoring at least `factor` times the top score
    cutoff = factor * np.max([value for value, _ in ranking])
    return [node for value, node in ranking if value >= cutoff]

In [69]:
def summarization(corpus, factor=0.5, as_string=False):
    """Pick the top-ranked texts of ``corpus``.

    The corpus is converted to a similarity matrix with
    ``doc_based_similarity`` and the indices to keep are chosen by
    ``text_rank`` using ``factor``.

    Returns a numpy array of the selected texts, or - when ``as_string`` is
    true - a single string with one ``'● <text>\\n'`` bullet per selected text.
    """
    similarity_matrix = doc_based_similarity(corpus)
    selected = text_rank(similarity_matrix, factor=factor)
    picked = np.array(corpus)[selected]

    if not as_string:
        return picked
    return ''.join('● ' + sentence + '\n' for sentence in picked)

In [70]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

import spacy
import networkx

import re
import os
import json

import ast

from tqdm import tqdm

In [71]:
# Load the spaCy `en_core_web_lg` pipeline once; `similarity_sentences` uses this
# module-level `nlp` object for POS tags, lemmas and document similarity.
nlp = spacy.load('en_core_web_lg')

In [72]:
# Load JSON data from a file.
# The encoding is given explicitly so decoding does not depend on the
# platform's default locale (JSON text is UTF-8).
with open('data-2.json', 'r', encoding='utf-8') as json_file:
    data = json.load(json_file)

data

{'0': {'staff': {'data': {'ADJ': [[['helpful'],
      'staff were helpful.',
      'Neutral'],
     [['patient'], 'staff were patient.', 'Neutral']],
    'VERB': [],
    'OTHER': []},
   'sentiment': 'Neutral'}},
 '1': {'process': {'data': {'ADJ': [[['smooth'],
      'process was smooth.',
      'Positive'],
     [['quick'], 'process was quick.', 'Positive']],
    'VERB': [],
    'OTHER': []},
   'sentiment': 'Positive'}},
 '2': {'selection': {'data': {'ADJ': [[['amazing'],
      'selection is amazing.',
      'Positive']],
    'VERB': [],
    'OTHER': []},
   'sentiment': 'Positive'},
  'price': {'data': {'ADJ': [[['competitive'],
      'price are competitive.',
      'Neutral']],
    'VERB': [],
    'OTHER': []},
   'sentiment': 'Neutral'}},
 '3': {'staff': {'data': {'ADJ': [],
    'VERB': [[['appreciate', 'walk'], 'I appreciate walked me.', 'Neutral'],
     [['appreciate', 'walk'],
      'I appreciate walked through setting new device.',
      'Positive']],
    'OTHER': []},
   'sen

In [73]:
def get_data_aspect(input_, aspect):
    """Collect the texts recorded for one aspect, grouped by sentiment and label.

    ``input_`` maps an id to a dict of ``{aspect_name: {'data': {...}}}``.
    Under ``'data'`` each label maps to a list of 3-item entries; the second
    item is the text and the third the sentiment, the first is ignored.

    Returns a dict with the keys ``'positive'``, ``'negative'`` and
    ``'neutral'``; each holds ``{'ADJ': [...], 'VERB': [...], 'OTHER': [...]}``
    with the matching texts in encounter order.
    """
    labels = ('ADJ', 'VERB', 'OTHER')
    storage = {
        polarity: {label: [] for label in labels}
        for polarity in ('positive', 'negative', 'neutral')
    }

    for aspects_in_review in input_.values():
        for name, payload in aspects_in_review.items():
            # Only the requested aspect contributes
            if name == aspect:
                for label, entries in payload['data'].items():
                    for _, text, sentiment in entries:
                        # Sentiment is matched case-insensitively ('Neutral' -> 'neutral')
                        storage[sentiment.lower()][label].append(text)

    return storage


In [75]:
def summarize_swot(input_, aspect):
    """Turn the grouped texts of one aspect into bullet-point summaries.

    ``input_`` maps a sentiment to ``{label: [texts]}``. For every sentiment
    one summary string is built from the non-empty labels, in dict order:

    * ``'VERB'`` / ``'OTHER'``: fewer than three texts are listed verbatim as
      bullets; three or more are condensed with ``summarization``.
    * ``'ADJ'``: everything after the second word of each text (full stops
      removed) is merged into one ``'● <aspect> is a, b, c.'`` bullet.

    Returns ``{'aspect': [aspect], 'positive': [...], 'negative': [...],
    'neutral': [...]}`` where each sentiment list receives its summary string.
    """
    storage = {'aspect': [aspect], 'positive': [], 'negative': [], 'neutral': []}

    for sentiment, by_label in input_.items():
        pieces = []
        for label, sentences in by_label.items():
            if len(sentences) == 0:
                continue

            if label in ('VERB', 'OTHER'):
                if len(sentences) >= 3:
                    pieces.append(summarization(sentences, as_string=True))
                else:
                    pieces.extend('● ' + sentence + '\n' for sentence in sentences)
            elif label == 'ADJ':
                adjectives = [
                    ' '.join(sentence.split()[2:]).replace('.', '')
                    for sentence in sentences
                ]
                pieces.append('● ' + aspect + ' is ' + ', '.join(adjectives) + '.\n')

        storage[sentiment].append(''.join(pieces))

    return storage

In [76]:
def processing_swot(data, aspect):
    """Summarise a single aspect of ``data``.

    The texts for ``aspect`` are gathered with ``get_data_aspect`` and the
    grouped result is handed to ``summarize_swot``, whose return value is
    passed back unchanged.
    """
    return summarize_swot(get_data_aspect(data, aspect), aspect)

{'aspect': ['price'],
 'positive': ['● price is reasonable, high.\n'],
 'negative': [''],
 'neutral': ['● price is competitive.\n']}

In [80]:
def get_all_aspects(data):
    """Return every distinct aspect name in ``data``, in first-seen order.

    ``data`` maps an id to a dict keyed by aspect name; only those inner keys
    are read.
    """
    # dict.fromkeys de-duplicates while preserving insertion order, avoiding
    # the linear `in` scan of a list for every key.
    return list(dict.fromkeys(
        name
        for aspects_in_review in data.values()
        for name in aspects_in_review
    ))

# Distinct aspect names found across all entries of `data`, in first-seen order.
aspects = get_all_aspects(data)

In [81]:
def processing_swot_all(data, aspects=None):
    """Summarise every aspect of ``data`` into one column-oriented dict.

    Parameters
    ----------
    data : dict
        The nested review data passed on to ``processing_swot``.
    aspects : iterable of str, optional
        Aspect names to process, in output order. When omitted they are
        derived from ``data`` with ``get_all_aspects`` so the function does
        not depend on a notebook-level global being defined beforehand.

    Returns
    -------
    dict
        ``{'aspect': [...], 'positive': [...], 'negative': [...],
        'neutral': [...]}`` where the lists are the concatenation of the
        per-aspect lists returned by ``processing_swot``.
    """
    if aspects is None:
        aspects = get_all_aspects(data)

    storage = {
        'aspect': [],
        'positive': [],
        'negative': [],
        'neutral': [],
    }

    for a in aspects:
        temp = processing_swot(data, a)
        # Append this aspect's entries to the matching columns
        for k, values in temp.items():
            storage[k].extend(values)

    return storage


# Run the per-aspect summary over the loaded data and display the resulting dict.
result = processing_swot_all(data)
result

  similarity = doc1.similarity(doc2)


{'aspect': ['staff',
  'process',
  'selection',
  'price',
  'question',
  'phones issue',
  'technician',
  'experience',
  'variety',
  'service',
  'deal',
  'buying',
  'fixing',
  'phone',
  'store',
  'warranty service',
  'layout',
  'one',
  'time',
  'place',
  'plan',
  'phone plan',
  'warranty',
  'repair',
  'same issue',
  'policy',
  'job',
  'representative',
  'support',
  'return window',
  'long lines'],
 'positive': ['● staff is friendly, ready.\n● I could not resist offer amazing deals on phones.\n● I made right purchase.\n● I appreciate walked through setting new device.\n● I got good deal on old phone.\n● I appreciate were able.\n● I had great experience with program.\n● staff really went mile.\n● I very disorganized waited forever.\n● I m satisfied.\n',
  '● process is smooth, quick.\n',
  '● selection is amazing, great.\n',
  '● price is reasonable, high.\n',
  '',
  '● technician fixed phones issue.\n',
  '● technician fixed phones issue.\n● technician fixed 

In [82]:
# Show the summaries as a table: each key of `result` becomes a column.
pd.DataFrame.from_dict(result)

Unnamed: 0,aspect,positive,negative,neutral
0,staff,"● staff is friendly, ready.\n● I could not res...",● staff is rude.\n● I had took too long.\n,"● staff is helpful, patient, knowledgeable, he..."
1,process,"● process is smooth, quick.\n",,● process is simple.\n
2,selection,"● selection is amazing, great.\n",● selection is limited.\n,● selection be phone.\n
3,price,"● price is reasonable, high.\n",,● price is competitive.\n
4,question,,,● question answered.\n
5,phones issue,● technician fixed phones issue.\n,,
6,technician,● technician fixed phones issue.\n● technician...,,● technician damaged during repair.\n● technic...
7,experience,● experience really know stuff.\n,,
8,variety,"● variety is impressive, excellent.\n",,● variety is patient with questions.\n
9,service,"● service is excellent, quick, outstanding, qu...","● service is poor, slow.\n","● service is reliable, notch, efficient.\n● se..."


In [84]:
# pd.DataFrame.from_dict(result).to_excel('fail.xlsx', index=False)