In [1]:
import json
import nltk
import enum
import pandas as pd
from nltk.tokenize import word_tokenize
from collections import defaultdict
import numpy as np
from typing import List,Tuple
import spacy

In [2]:
class Query_type(enum.Enum):
    """Selects which index a query is run against.

    Each member's value is a human-readable label for the mode.
    """
    # Rank documents using the title lemma/TF/IDF tables.
    TITLE = "Title Based"
    # Rank documents using the abstract lemma/TF/IDF tables.
    ABSTRACT = "Abstract Based"
def is_int(s):
    """Return True if ``int(s)`` succeeds, False otherwise.

    Args:
        s: Any value; typically a string that may hold an integer literal.

    Returns:
        bool: True when ``s`` can be converted with ``int()``, else False.
    """
    try:
        int(s)
    # Only conversion failures mean "not an int". A bare ``except`` would also
    # swallow KeyboardInterrupt / SystemExit and hide unrelated errors.
    # OverflowError covers ``int(float("inf"))``.
    except (TypeError, ValueError, OverflowError):
        return False
    return True
class TF_IDF_IR:
    """TF-IDF ranking over pre-computed title and abstract indexes.

    The constructor reads the document store and the lemma / TF / IDF tables
    from JSON files under ``DATA/`` and loads the spaCy ``en_core_web_sm``
    pipeline, which is used to lemmatize query text.

    Attributes set by ``__init__``:
        documents: parsed ``NLP.json``; indexed by the keys of the TF tables.
        lemma_title / lemma_abs: lemma string -> integer lemma id.
        idf_title / idf_abs: integer lemma id -> float IDF weight.
        tf_title / tf_abs: document key -> {integer lemma id -> float TF}.
        nlp: the loaded spaCy pipeline.
        tokenizer: callable turning a string into a list of lemmas with
            spaCy's default stop words removed.
    """

    def __init__(self):
        self.documents = self._load_json("DATA/crawler/data/NLP.json")
        lemma_title_raw = self._load_json("DATA/Module_data/title_lemma.json")
        lemma_abs_raw = self._load_json("DATA/Module_data/abstract_lemma.json")
        idf_abs_raw = self._load_json("DATA/Module_data/idf_abstract.json")
        idf_title_raw = self._load_json("DATA/Module_data/idf_title.json")
        tf_title_raw = self._load_json("DATA/Module_data/title_tf.json")
        tf_abs_raw = self._load_json("DATA/Module_data/asb_tf.json")

        self.nlp = spacy.load("en_core_web_sm")
        self.tokenizer = self._lemmatize

        # JSON object keys are always strings, so lemma ids are converted back
        # to int and weights are coerced to float.
        self.tf_title = {
            doc_key: {int(lemma_id): float(weight) for lemma_id, weight in weights.items()}
            for doc_key, weights in tf_title_raw.items()
        }
        self.tf_abs = {
            doc_key: {int(lemma_id): float(weight) for lemma_id, weight in weights.items()}
            for doc_key, weights in tf_abs_raw.items()
        }
        self.lemma_title = {lemma: int(lemma_id) for lemma, lemma_id in lemma_title_raw.items()}
        self.lemma_abs = {lemma: int(lemma_id) for lemma, lemma_id in lemma_abs_raw.items()}
        self.idf_abs = {int(lemma_id): float(weight) for lemma_id, weight in idf_abs_raw.items()}
        self.idf_title = {int(lemma_id): float(weight) for lemma_id, weight in idf_title_raw.items()}

    @staticmethod
    def _load_json(path: str):
        """Parse the JSON file at ``path`` and close the file handle afterwards."""
        with open(path, "r") as handle:
            return json.load(handle)

    def _lemmatize(self, s: str) -> List[str]:
        """Return the lemmas of ``s``, dropping spaCy's default stop words."""
        stop_words = self.nlp.Defaults.stop_words
        return [token.lemma_ for token in self.nlp(s) if token.lemma_ not in stop_words]

    def process_q(self, q: List, tf, idf, k) -> List[Tuple]:
        """Score every document against the query and return the top ``k``.

        Args:
            q: Lemma ids of the query terms; repeated ids are counted each time.
            tf: Mapping of document key -> {lemma id -> term frequency}.
            idf: Mapping of lemma id -> inverse document frequency.
            k: Number of results to keep.

        Returns:
            ``(document key, score)`` tuples sorted by descending score and
            truncated to the first ``k``. A document's score is the sum of
            ``tf * idf`` over the query terms; terms missing from either table
            contribute 0.
        """
        # The IDF of a query term does not depend on the document, so look it
        # up once instead of once per document.
        weighted_terms = [(term, idf.get(term, 0)) for term in q]
        scored = [
            (doc_key, sum(weights.get(term, 0) * term_idf for term, term_idf in weighted_terms))
            for doc_key, weights in tf.items()
        ]
        scored.sort(key=lambda pair: pair[1], reverse=True)
        return scored[:k]

    # ``type`` is annotated as a string (forward reference) so that defining
    # this class does not require ``Query_type`` to exist yet.
    def query(self, type: "Query_type", input_string: str, k: int = 10) -> Tuple[List, List[Tuple]]:
        """Rank documents for a free-text query.

        Args:
            type: ``Query_type.TITLE`` or ``Query_type.ABSTRACT``; selects
                which lemma / TF / IDF tables are used.
            input_string: Raw query text; it is stripped, lower-cased and
                lemmatized with ``self.tokenizer``.
            k: Maximum number of results (default 10).

        Returns:
            ``(articles, result)`` where ``result`` is the list of
            ``(document key, score)`` tuples from ``process_q`` and
            ``articles`` holds the matching entries of ``self.documents`` in
            the same order.

        Raises:
            ValueError: If ``type`` is not a supported ``Query_type`` member.
        """
        if type == Query_type.TITLE:
            vocabulary, tf, idf = self.lemma_title, self.tf_title, self.idf_title
        elif type == Query_type.ABSTRACT:
            vocabulary, tf, idf = self.lemma_abs, self.tf_abs, self.idf_abs
        else:
            # Previously this fell through and failed later with an
            # UnboundLocalError on ``result``.
            raise ValueError(f"Unsupported query type: {type!r}")

        lemmas = self.tokenizer(input_string.strip().lower())
        # Words missing from the vocabulary are skipped. Mapping them to id 0,
        # as before, would score them as whichever real lemma owns id 0.
        q = [int(vocabulary[lemma]) for lemma in lemmas if lemma in vocabulary]
        result = self.process_q(q, tf, idf, k)
        articles = [self.documents[doc_key] for doc_key, _score in result]
        return (articles, result)

### Make sure you have run ./DATA/preprocess.ipynb first.

In [3]:
# Build the retriever; the constructor reads the JSON index files and loads the spaCy model.
ir = TF_IDF_IR()

In [4]:
# Query the abstract index with a passage of text and keep the top 10 hits.
articles, ranking = ir.query(
    Query_type.ABSTRACT,
    "widget app from Brain& Puzzle category and a game app from Personalization category. We extracted six hundred textual reviews for each app from Google Play Android App Store. SAS® Enterprise Miner TM 7.1 is used for summarizing reviews and pulling out features, and SAS® Sentiment Analysis Studio 12.1 is used for performing sentiment analysis",
    k=10,
)
# For each hit print the (document key, score) pair, then the title, then the abstract.
for hit, doc in zip(ranking, articles):
    print(hit, doc["title"], doc["abstract"], sep="\n")

('236f98e183fbd13c29eb10ebb6702616305c80c4', 99.8666452134997)
Feature-based Sentiment Analysis on Android App Reviews Using SAS® Text Miner and SAS® Sentiment Analysis Studio
Sentiment analysis is a popular technique for summarizing and analyzing consumers’ textual reviews about products and services. There are two major approaches for performing sentiment analysis; statistical model based approaches and Natural Language Processing (NLP) based approaches to create rules. In this study, we first apply text mining to summarize users’ reviews of Android Apps and extract features of the apps mentioned in the reviews. We then use NLP approach for writing rules. We use reviews of two recent apps; a widget app from Brain& Puzzle category and a game app from Personalization category. We extracted six hundred textual reviews for each app from Google Play Android App Store. SAS® Enterprise Miner TM 7.1 is used for summarizing reviews and pulling out features, and SAS® Sentiment Analysis Studio 