In [1]:
import pandas as pd
from glob import glob

from sklearn.feature_extraction.text import TfidfVectorizer
from wordcloud import WordCloud
from matplotlib import pyplot as plt
from termcolor import colored

import string
from wordcloud import WordCloud
from nltk.corpus import stopwords

In [2]:
import string
from collections import defaultdict
from typing import Dict, List, Optional, Tuple

import nltk
import pandas as pd
import treetaggerwrapper as tt
from nltk.corpus import stopwords as nltk_stopwords
from sklearn.feature_extraction.text import TfidfVectorizer


def sort_coo(coo_matrix):
    """Return ``(column, value)`` pairs of a COO matrix, highest value first.

    Pairs with equal values are ordered by descending column index.
    """
    pairs = list(zip(coo_matrix.col, coo_matrix.data))
    pairs.sort(key=lambda pair: (pair[1], pair[0]), reverse=True)
    return pairs


class Extractor:
    """Corpus-level keyword extractor built on TreeTagger output and TF-IDF.

    Each document is lower-cased, stripped of punctuation (periods are kept)
    and digits, passed through TreeTagger, and scored with a
    ``TfidfVectorizer`` fitted on the whole corpus. The per-document top
    keywords are then merged into one rank-weighted score per word.
    """

    def __init__(
        self,
        top_k_keywords: int = 10,
        top_n: int = 30,
        stopwords: Optional[List[str]] = None,
        download_nltk=False,
        tagdir: str = "tree_tagger_lib",
    ):
        """Configure the extractor and start the TreeTagger wrapper.

        Args:
            top_k_keywords: Number of highest-scoring terms kept per document.
            top_n: Number of words kept in the final corpus-level ranking.
            stopwords: Extra stop words appended to NLTK's English list.
                The passed list itself is not modified.
            download_nltk: When true, always call ``nltk.download("stopwords")``
                before loading the list. When false, a download is attempted
                only if the corpus cannot be found locally.
            tagdir: TreeTagger installation directory, passed as ``TAGDIR``.
        """
        self.top_k_keywords = top_k_keywords
        self.top_n = top_n

        if download_nltk:
            nltk.download("stopwords")
        try:
            english_stopwords = nltk_stopwords.words("english")
        except LookupError:
            # Corpus not installed locally: fall back to downloading it once.
            nltk.download("stopwords")
            english_stopwords = nltk_stopwords.words("english")

        self.stopwords = list(english_stopwords)
        if stopwords is not None:
            self.stopwords.extend(stopwords)

        self.t_tagger = tt.TreeTagger(TAGLANG="en", TAGDIR=tagdir)

    def _preprocess(self, df: pd.Series) -> List[str]:
        """Normalize every text in ``df`` and return the results as a list.

        Args:
            df: Texts to clean. The ``.str`` accessor is used, so this is
                expected to be a Series; values are coerced with ``str()``.

        Returns:
            One cleaned string per input row, in the original row order.
        """
        # Every punctuation character except "." is deleted.
        drop_punctuation = str.maketrans("", "", string.punctuation.replace(".", ""))

        texts = df.apply(lambda x: str(x).lower()).reset_index(drop=True)
        texts = texts.str.translate(drop_punctuation)
        # regex=True is explicit: the pandas default for `regex` is version
        # dependent, and a literal replace would leave every digit in place.
        texts = texts.str.replace(r"\d+", "", regex=True)

        # lemmatization: keep the last tab-separated field of each tagger
        # output line (presumably the lemma column -- confirm against the
        # TreeTagger output format in use).
        return [
            " ".join(t.split("\t")[-1] for t in self.t_tagger.tag_text(text))
            for text in texts
        ]

    def _extract_topn_from_vector(
        self, feature_names: List[str], sorted_items: List[Tuple[int, float]]
    ) -> Dict[str, float]:
        """Map the first ``top_k_keywords`` items to ``{feature name: score}``.

        Args:
            feature_names: Vocabulary indexed by feature/column id.
            sorted_items: ``(feature index, score)`` pairs, already sorted.

        Returns:
            Dict in the same order as ``sorted_items``; scores are rounded to
            three decimals.
        """
        return {
            feature_names[idx]: round(score, 3)
            for idx, score in sorted_items[: self.top_k_keywords]
        }

    def _get_keywords(self, vectorizer, feature_names, doc):
        """Return top k keywords from a doc using TF-IDF method"""
        # generate tf-idf for the given document
        tf_idf_vector = vectorizer.transform([doc])

        # sort the tf-idf vectors by descending order of scores
        sorted_items = sort_coo(tf_idf_vector.tocoo())

        # extract only TOP_K_KEYWORDS
        keywords = self._extract_topn_from_vector(feature_names, sorted_items)
        return list(keywords.keys())

    def get_top_keywords(self, df: pd.Series) -> Dict[str, float]:
        """Rank the most prominent keywords across all documents in ``df``.

        Every document contributes ``1 / (1 + rank)`` to each of its top
        keywords, where ``rank`` is the keyword's 0-based position in that
        document's TF-IDF ordering.

        Args:
            df: Raw texts (see ``_preprocess`` for the expected type).

        Returns:
            Up to ``top_n`` words mapped to their accumulated weight, ordered
            from highest to lowest weight.
        """
        corpora = self._preprocess(df)
        vectorizer = TfidfVectorizer(stop_words=self.stopwords, smooth_idf=True, use_idf=True)
        vectorizer.fit(corpora)
        feature_names = vectorizer.get_feature_names_out()

        # Count weight for each word based on its position in top_keywords
        word_frequency = defaultdict(float)
        for doc in corpora:
            top_keywords = self._get_keywords(vectorizer, feature_names, doc)
            for rank, word in enumerate(top_keywords):
                word_frequency[word] += 1 / (1 + rank)

        ranked = sorted(word_frequency.items(), key=lambda item: item[1], reverse=True)
        return dict(ranked[: self.top_n])

  punct2find_re = re.compile("([^ ])([[" + ALONEMARKS + "])",
  DnsHostMatch_re = re.compile("(" + DnsHost_expression + ")",
  UrlMatch_re = re.compile(UrlMatch_expression, re.VERBOSE | re.IGNORECASE)
  EmailMatch_re = re.compile(EmailMatch_expression, re.VERBOSE | re.IGNORECASE)


# code

In [3]:
# Load the `review` column and remove embedded line feeds / carriage returns
# from every value (values are coerced to str first).
df = (
    pd.read_csv('aws_reviews_sample.csv')
    .review
    .apply(lambda text: str(text).replace('\n', '').replace('\r', ''))
)

In [4]:
# WordNetLemmatizer is instantiated by the lemmatizer demo cell, so the import
# has to be live for the notebook to run on a fresh kernel.
from nltk.stem import WordNetLemmatizer
import nltk
# Fetch the WordNet corpus; the call evaluates to False when the download fails.
nltk.download('wordnet')

[nltk_data] Error loading wordnet: <urlopen error unknown url type:
[nltk_data]     https>


False

In [11]:
# Small demo of NLTK's WordNet lemmatizer. Lemmatizing needs the `wordnet`
# corpus to be installed; otherwise a LookupError is raised.
lemmatizer = WordNetLemmatizer()
  
# No `pos` argument given: the lemmatizer's default part of speech is used.
print("rocks :", lemmatizer.lemmatize("rocks"))
print("corpora :", lemmatizer.lemmatize("corpora"))
  
# pos="a" asks for the adjective reading of the word.
print("better :", lemmatizer.lemmatize("better", pos ="a"))

LookupError: 
**********************************************************************
  Resource [93mwordnet[0m not found.
  Please use the NLTK Downloader to obtain the resource:

  [31m>>> import nltk
  >>> nltk.download('wordnet')
  [0m
  For more information see: https://www.nltk.org/data.html

  Attempted to load [93mcorpora/wordnet[0m

  Searched in:
    - 'C:\\Users\\nogay/nltk_data'
    - 'c:\\Users\\nogay\\Desktop\\dash_zircon\\venv\\nltk_data'
    - 'c:\\Users\\nogay\\Desktop\\dash_zircon\\venv\\share\\nltk_data'
    - 'c:\\Users\\nogay\\Desktop\\dash_zircon\\venv\\lib\\nltk_data'
    - 'C:\\Users\\nogay\\AppData\\Roaming\\nltk_data'
    - 'C:\\nltk_data'
    - 'D:\\nltk_data'
    - 'E:\\nltk_data'
**********************************************************************


In [9]:
# Build the extractor with its defaults (top_k_keywords=10, top_n=30, no extra
# stop words). The TreeTagger directory "tree_tagger_lib" is resolved relative
# to the current working directory and must exist.
extractor = Extractor()

[nltk_data] Error loading stopwords: <urlopen error unknown url type:
[nltk_data]     https>


TreeTaggerError: Bad TreeTagger directory: c:\Users\nogay\Desktop\dash_zircon\nlp model\tree_tagger_lib

In [6]:
import treetaggerwrapper as tt
def textclean(df, tagdir='TreeTagger/tree-tagger-MacOSX-3.2.3', tagger=None):
    """Lower-case, strip and TreeTagger-process a Series of texts.

    Args:
        df: Series of texts. Values are coerced with ``str()`` so that
            non-string entries (e.g. NaN) do not raise.
        tagdir: TreeTagger installation directory, used only when ``tagger``
            is not supplied.
        tagger: Optional ready-made tagger exposing ``tag_text(str)`` that
            returns tab-separated lines. Passing one avoids starting a new
            TreeTagger for every call.

    Returns:
        Series with a fresh 0..n-1 index holding the cleaned text of each row.
    """
    df = df.apply(lambda x: str(x).lower())
    df = df.reset_index(drop=True)
    # Remove every punctuation character except "."
    df = df.str.translate(str.maketrans('', '', string.punctuation.replace('.', '')))
    # regex=True is explicit: the pandas default for `regex` is version
    # dependent, and a literal replace would leave every digit in place.
    df = df.str.replace(r'\d+', '', regex=True)

    #lemmatization
    if tagger is None:
        tagger = tt.TreeTagger(TAGLANG='en', TAGDIR=tagdir)

    # Keep the last tab-separated field of each tagger output line
    # (presumably the lemma column -- confirm against the TreeTagger format).
    df = df.apply(lambda x: ' '.join(t.split('\t')[-1] for t in tagger.tag_text(x)))
    return df

  punct2find_re = re.compile("([^ ])([[" + ALONEMARKS + "])",
  DnsHostMatch_re = re.compile("(" + DnsHost_expression + ")",
  UrlMatch_re = re.compile(UrlMatch_expression, re.VERBOSE | re.IGNORECASE)
  EmailMatch_re = re.compile(EmailMatch_expression, re.VERBOSE | re.IGNORECASE)


In [7]:
# Clean and lemmatize the reviews.
# NOTE(review): this rebinds `df`, so re-running the cell feeds already-cleaned
# text back into textclean instead of the raw reviews.
df = textclean(df)

  df = df.str.replace('\d+', '')


TreeTaggerError: Bad TreeTagger directory: c:\Users\nogay\Desktop\dash_zircon\nlp model\TreeTagger\tree-tagger-MacOSX-3.2.3

In [None]:
def get_top_keywords(df, STOPWORDS, top_n = 30):
    """Rank the most prominent TF-IDF keywords across all documents in ``df``.

    Every document contributes ``1 / (1 + rank)`` to each of its top keywords,
    where ``rank`` is the keyword's 0-based position in the list returned by
    ``get_keywords`` for that document.

    Args:
        df: Series of (already cleaned) texts, one document per row.
        STOPWORDS: Extra stop words; NLTK's English list is added to them.
            The passed list is left unmodified. ``None`` means no extras.
        top_n: Number of words kept in the final ranking.

    Returns:
        Up to ``top_n`` words mapped to their accumulated weight, ordered from
        highest to lowest weight.
    """
    # Build a new list rather than using `+=`, which would grow the caller's
    # list with another copy of the NLTK words on every call.
    stop_words = list(STOPWORDS or []) + list(stopwords.words('english'))
    corpora = df.to_list()
    vectorizer = TfidfVectorizer(stop_words=stop_words, smooth_idf=True, use_idf=True)
    vectorizer.fit(corpora)
    feature_names = vectorizer.get_feature_names_out()

    word_frequency = defaultdict(float)

    # Count weight for each word based on its position in top_keywords
    for doc in corpora:
        # NOTE(review): `get_keywords` is not defined in the visible cells;
        # `Extractor._get_keywords` looks like the equivalent -- confirm.
        top_keywords = get_keywords(vectorizer, feature_names, doc)
        for i, word in enumerate(top_keywords):
            word_frequency[word] += 1 / (1 + i)

    ranked = sorted(word_frequency.items(), key=lambda item: item[1], reverse=True)
    return dict(ranked[:top_n])

In [None]:
# cloud = WordCloud(background_color = 'white').generate_from_frequencies(dictionary)
# plt.figure(figsize=(16,12))
# plt.imshow(cloud, interpolation='bilinear')
# plt.axis('off')
# plt.show()

In [None]:
# NOTE(review): in the visible cells `feature_names` is only assigned inside
# functions, so this display presumably relies on kernel state from a cell
# not shown -- verify it survives Restart & Run All.
feature_names

In [None]:
# Word cloud for the keyword 'hold'.
# NOTE(review): `keyword_wordcloud` and `original` are not defined in the
# visible cells -- confirm where they come from.
keyword = 'hold'
x = keyword_wordcloud(df, original, keyword)

In [None]:
# Word cloud for the keyword 'basket'.
# NOTE(review): `keyword_wordcloud` and `original` are not defined in the
# visible cells -- confirm where they come from.
keyword = 'basket'
x = keyword_wordcloud(df, original, keyword)

In [None]:
# Word cloud for the keyword 'long'.
# NOTE(review): `res` is read before any visible cell assigns it and is then
# overwritten with the call's result, so this cell is not idempotent -- verify.
keyword = 'long'
res = keyword_wordcloud(res.lemmatized, res.original, keyword)

In [None]:
# Word cloud for the keyword 'wish'.
# NOTE(review): `res` is read before any visible cell assigns it and is then
# overwritten with the call's result, so this cell is not idempotent -- verify.
keyword = 'wish'
res = keyword_wordcloud(res.lemmatized, res.original, keyword)

In [None]:
# NOTE(review): `all_wordcloud` is not defined in the visible cells -- confirm
# where it comes from.
all_wordcloud(res.lemmatized)