In [81]:
# Utility: Preparation text

from contractions import CONTRACTION_MAP

##========== PREPARATION TEXT ===========##

# Contraction
def expand_contractions(sentence, contraction_mapping=CONTRACTION_MAP):
    """
    Expand the contractions in a sentence. For example don't => do not.

    Matching is case-insensitive. The first character of every match is kept,
    so the capitalisation of the original text survives the expansion
    (e.g. "Don't" => "Do not").

    Parameters:
    sentence (str): The input sentence to clean.
    contraction_mapping (dict): A dictionary mapping contractions to their
        expanded form.

    Returns:
    str: The sentence with its contractions expanded. The sentence is returned
    unchanged when the mapping has no usable keys.
    """

    # Longest keys first, so that a long contraction such as "can't've" is not
    # consumed by a shorter key it starts with ("can't"). Empty keys are
    # skipped because they would match the empty string at every position.
    keys = sorted((key for key in contraction_mapping if key), key=len, reverse=True)
    if not keys:
        return sentence

    # Case-insensitive fallback table for matches whose exact spelling (or
    # lowercase spelling) is not itself a key of the mapping.
    lowercase_mapping = {}
    for key in keys:
        lowercase_mapping.setdefault(key.lower(), contraction_mapping[key])

    # re.escape guarantees that every key is matched literally.
    contractions_pattern = re.compile('({})'.format('|'.join(map(re.escape, keys))),
                                      flags=re.IGNORECASE|re.DOTALL)

    def expanded_match(contraction):
        """
        Filter for expanding the matched contraction.

        Parameters:
        contraction (re.Match): The match object of one contraction.

        Returns:
        str: The expanded contraction, or the matched text itself when no
        expansion can be found for it.
        """
        match = contraction.group(0)
        expanded_contraction = (contraction_mapping.get(match)
                                or contraction_mapping.get(match.lower())
                                or lowercase_mapping.get(match.lower()))
        if not expanded_contraction:
            return match

        # Keep the original first character to preserve capitalisation.
        return match[0] + expanded_contraction[1:]

    return contractions_pattern.sub(expanded_match, sentence)


def remove_extra_spaces(sentence):
    """
    Collapse whitespace runs and trim both ends of a sentence.

    Parameters:
    sentence (str): The input sentence to clean.

    Returns:
    str: The sentence with every run of whitespace replaced by a single space
    and with leading/trailing whitespace removed.
    """
    whitespace_run = re.compile(r'\s+')
    collapsed = whitespace_run.sub(' ', sentence)
    return collapsed.strip()


def remove_non_ascii(text):
    """
    Drop every character whose code point lies outside the ASCII range.

    Parameters:
    text (str): The input text to clean.

    Returns:
    str: The input with only its ASCII characters (code point < 128) kept,
    in their original order.
    """

    def is_ascii(char):
        # ASCII covers code points 0..127.
        return ord(char) < 128

    return ''.join(filter(is_ascii, text))

In [None]:
import re
import time

import networkx as nx
import nltk
import numpy as np
import pandas as pd
from nltk.tokenize import sent_tokenize, word_tokenize
from rank_bm25 import BM25Okapi
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from tqdm import tqdm

In [None]:
# Improving the efficiency of the analysis:
# 1. Semantic clustering
# 2. PageRank for selecting representative documents

In [92]:
# Load the McDonald's reviews and keep only the columns used downstream.
# For a quick run on a smaller file, read 'sample-mcd.csv' instead.
# NOTE(review): the preview of the loaded data shows mojibake in some reviews,
# so the file is presumably not really latin1-encoded — confirm the encoding.
df = pd.read_csv('McDonald_s_Reviews.csv', encoding='latin1')
df = df[['reviewer_id', 'review_time', 'review', 'rating']].copy()
# DataFrame.info() prints its report itself and returns None, so it must not
# be wrapped in print() (that would append a stray "None" to the output).
df.info()
df.head()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 33396 entries, 0 to 33395
Data columns (total 4 columns):
 #   Column       Non-Null Count  Dtype 
---  ------       --------------  ----- 
 0   reviewer_id  33396 non-null  int64 
 1   review_time  33396 non-null  object
 2   review       33396 non-null  object
 3   rating       33396 non-null  object
dtypes: int64(1), object(3)
memory usage: 1.0+ MB
None


Unnamed: 0,reviewer_id,review_time,review,rating
0,1,3 months ago,Why does it look like someone spit on my food?...,1 star
1,2,5 days ago,It'd McDonalds. It is what it is as far as the...,4 stars
2,3,5 days ago,Made a mobile order got to the speaker and che...,1 star
3,4,a month ago,My mc. Crispy chicken sandwich was ï¿½ï¿½ï¿½ï¿...,5 stars
4,5,2 months ago,"I repeat my order 3 times in the drive thru, a...",1 star


In [93]:
# Apply extraction

def process_text(x):
    """
    Clean one raw review.

    The steps are chained, each one consuming the previous step's output:
    contractions are expanded, non-ASCII characters are dropped, and finally
    whitespace is collapsed.

    Parameters:
    x (str): The raw review text.

    Returns:
    str: The cleaned review text.
    """
    # Every step must feed the next one; passing `x` to each helper would
    # discard all but the last transformation.
    texts = expand_contractions(x)
    texts = remove_non_ascii(texts)
    # Whitespace is normalised last because deleting non-ASCII characters can
    # leave new runs of spaces behind.
    texts = remove_extra_spaces(texts)

    return texts

# Register `progress_apply` on pandas objects, then clean every review while
# a progress bar is shown.
tqdm.pandas()
processed_reviews = df['review'].progress_apply(process_text)
df['review_processed'] = processed_reviews

In [113]:
# Work on the first MAX_DOCUMENTS cleaned reviews only; the cell's output is
# the number of documents actually selected.
MAX_DOCUMENTS = 10_000
documents = df['review_processed'].values[:MAX_DOCUMENTS]
len(documents)

10000

**Semantic Clustering**

**Selecting Representative Documents Using PageRank**

In [119]:
def bm25_similarity(documents):
    """
    Compute pairwise BM25 scores between documents.

    Every document is lower-cased, tokenized with ``word_tokenize`` and then
    used as a query against the whole corpus, so row ``i`` of the result holds
    the scores of document ``i`` against every document.

    Parameters:
    - documents (sequence of str): Documents to compare.

    Returns:
    - bm25_matrix (np.ndarray): Array of shape (n, n) with n = len(documents).
      NOTE(review): BM25 scores depend on which document acts as the query, so
      the matrix is presumably not symmetric — confirm before relying on it.
    """
    num_documents = len(documents)
    bm25_matrix = np.zeros((num_documents, num_documents))
    # BM25Okapi cannot be built from an empty corpus, so the empty (0, 0)
    # matrix is returned directly in that case.
    if num_documents == 0:
        return bm25_matrix

    tokenized_documents = [word_tokenize(document.lower()) for document in documents]
    bm25 = BM25Okapi(tokenized_documents)

    progress = tqdm(tokenized_documents, desc="Calculating BM25 Scores")
    for i, query_tokens in enumerate(progress):
        # Scores of the current document (as a query) against every document.
        bm25_matrix[i] = bm25.get_scores(query_tokens)

    return bm25_matrix

def cosine_similarity_matrix(documents):
    """
    Build the pairwise cosine-similarity matrix of the documents' TF-IDF vectors.

    Parameters:
    - documents (iterable of str): Documents to compare.

    Returns:
    - The matrix produced by ``cosine_similarity`` for the TF-IDF
      representation of ``documents``.
    """
    document_vectors = TfidfVectorizer().fit_transform(documents)
    return cosine_similarity(document_vectors)

def hybrid_similarity(documents, alpha=0.5):
    """
    Blend the BM25 and TF-IDF cosine similarity matrices into one matrix.

    Parameters:
    - documents (list of str): Documents to compare.
    - alpha (float): Weight of the BM25 matrix; the cosine matrix is weighted
      by (1 - alpha).

    Returns:
    - hybrid_matrix (np.array): alpha * BM25 + (1 - alpha) * cosine.
    """
    lexical_scores = bm25_similarity(documents)
    tfidf_scores = cosine_similarity_matrix(documents)
    # Log both shapes as a quick sanity check before blending.
    print(f"BM25 {lexical_scores.shape}\nCOSINE_MATRIX {tfidf_scores.shape}")

    # NOTE(review): the two matrices are blended without any rescaling; BM25
    # scores are presumably not on the same scale as cosine similarity, so
    # confirm that `alpha` weights them as intended.
    bm25_weight, cosine_weight = alpha, 1 - alpha
    return bm25_weight * lexical_scores + cosine_weight * tfidf_scores

def pagerank_hybrid_summarization(documents, threshold=0.75, min_documents=2, alpha=0.5):
    """
    Select representative documents by running PageRank on a hybrid
    BM25 / cosine similarity graph.

    Documents are ranked by PageRank score and the highest-ranked ones are
    kept until their cumulative score exceeds ``threshold``. The selected
    documents are returned in their original order.

    Parameters:
    - documents (sequence of str): Documents to choose from.
    - threshold (float): Cumulative PageRank score the selection has to
      exceed. NOTE(review): presumably a fraction of the total PageRank mass
      (networkx normalises the scores) — confirm.
    - min_documents (int): Inputs with fewer documents than this are returned
      unchanged.
    - alpha (float): Weight for BM25 similarity in the hybrid similarity calculation.

    Returns:
    - pagerank_documents (list of str): The selected documents, or the input
      object itself when it holds fewer than ``min_documents`` documents.
    """

    def num_documents(scores):
        """Return how many top-ranked scores are needed to exceed ``threshold``."""
        cumulative_score = 0
        for count, score in enumerate(sorted(scores, reverse=True), start=1):
            cumulative_score += score
            if cumulative_score > threshold:
                # `count` scores were summed to get here, so the document that
                # crosses the threshold belongs to the selection.
                return count
        # The threshold was never exceeded: keep every document.
        return len(scores)

    if len(documents) < min_documents:
        return documents  # Return original documents

    print("COMPUTE HYBRID SIMILARITY")
    similarity_matrix = hybrid_similarity(documents, alpha)

    print("APPLY PAGERANK")
    graph = nx.from_numpy_array(similarity_matrix)
    scores = nx.pagerank(graph)

    print("DETERMINE NUMBER OF DOCUMENTS")
    n = num_documents(list(scores.values()))
    print("OPTIMUM SENTENCES: ", n)

    print("SELECT TOP N DOCUMENTS")
    top_indices = sorted(scores, key=scores.get, reverse=True)[:n]
    # Sorting the indices restores the documents' original order.
    pagerank_documents = [documents[idx] for idx in sorted(top_indices)]

    return pagerank_documents



# Alternative using compression

# def calculate_optimal_num_sentences(text, compression_ratio=0.2):
#     sentences = sent_tokenize(text)
#     optimal_num_sentences = max(1, int(len(sentences) * compression_ratio))  # Ensure at least 1 sentence
#     return optimal_num_sentences

# n = calculate_optimal_num_sentences(text, compression_ratio=0.75)

In [None]:
# Run the selection pipeline and measure its wall-clock duration.
start_time = time.time()
summary = pagerank_hybrid_summarization(documents, alpha=0.7)
end_time = time.time()
# Report the elapsed time measured around the call.
print(f"Elapsed: {end_time - start_time:.2f} s")
print(len(summary))
# Show only a short preview; the full selection can hold thousands of reviews.
print(summary[:5])