##  Keyword Mining

Mine candidate keywords associated with the category name. The keywords are used as (noisy) candidate seed words for dataless classification.

In [1]:
from collections import Counter, defaultdict
from math import log
from tqdm import tnrange
import pandas as pd
import yaml
import math

from sklearn.datasets import load_files

import spacy
from spacy.lang.en import English
nlp = spacy.load("en_core_web_sm")
tokenizer = English().Defaults.create_tokenizer(nlp)

In [245]:
""" available config files:
    20NG-baseball-hockey.yaml 20NG-space-med.yaml 20NG-ibm-mac.yaml   
    AGNews-world-tech.yaml AGNews-business-sports.yaml 
    NYT-football-soccer.yaml NYT-movies-television.yaml NYT-international_business-economy.yaml
    Yelp-pos-neg.yaml
    IMDB-pos-neg.yaml
"""
config_file_folder = 'configs/'
config_file = 'IMDB-pos-neg.yaml'

with open(config_file_folder + config_file) as f:
    config = yaml.load(f, Loader=yaml.Loader)
    
categories = config['categories']
seed_words = config['seed_words']
output_file = config['kw_file']
corpus_path = config['train_corpus_path']

In [213]:
from nltk import pos_tag
# the acceptable POS tags, may vary based on the task (topic/sentiment classification)
ACCEPTABLE_TAGS_TOPIC = set(['FW', 'NN', 'NNS', 'NNP', 'NNPS']) 
ACCEPTABLE_TAGS_SENTIMENT = set(['JJ', 'JJR', 'JJS', 'RB', 'RBR', 'RBS'])
ACCEPTABLE_TAGS_SENTIMENT = set(['JJ', 'JJR', 'JJS'])
def is_valid_topic(word):
    tag = pos_tag([word])[0][1]  # pos_tag returns [('cat', 'NN')]
    if tag in ACCEPTABLE_TAGS_TOPIC:
        return True
    else:
        return False

def is_valid_sentiment(word):
    tag = pos_tag([word])[0][1]  # pos_tag returns [('cat', 'NN')]
    if tag in ACCEPTABLE_TAGS_SENTIMENT:
        return True
    else:
        return False

In [214]:
# the frequency threshold. Words with lower frequency will be filtered out
FREQ_THRESHOLD = 3  

In [215]:
def tokenize(text, lower_case=True):
    if lower_case:
        text = text.lower()
    tokens = tokenizer(text)
    return [token.text for token in tokens if token.text.isalpha()]

### 1: Read the corpus

Please run only one subsection based on the dataset you're using.

#### 1.1 Load 20NG dataset

**TODO:** uncomment the list of categories and seed words to use.

In [None]:
from sklearn.datasets import fetch_20newsgroups
# note that dataset.target_names may not be in the same order as categories
newsgroups_train = fetch_20newsgroups(subset='train', categories= categories, remove=('headers', 'footers', 'quotes'))

In [None]:
data_train = newsgroups_train.data
target_train = newsgroups_train.target
target_names = newsgroups_train.target_names
num_train_docs = len(data_train)
print("[INFO] Total", num_train_docs, "documents.")

#### 1.2 Load AG news dataset

In [190]:
corpus_path = '/Users/admin/corpora/TextClassification/AG News Corpus/'
all_classes = ['World', 'Sports', 'Business', 'Tech']

In [191]:
df = pd.read_csv(corpus_path+"train.csv", names=['label', 'title', 'description'])

# replace the label index with the label
df['label'] = df['label'].apply(lambda x: all_classes[int(x)-1])
df['text'] = df.apply(lambda x: x['title'].lower() + ' ' + x['description'].lower(), axis=1)
print(len(df))
df = df[df['label'].isin(categories)]
print(len(df))

120000
60000


In [192]:
data_train = df['text'].tolist()
num_train_docs = len(data_train)

#### 1.3 Load IMDB dataset

In [246]:
corpus_path = '/Users/admin/corpora/SentimentAnalysis/aclImdb/train/'

In [247]:
dataset = load_files(corpus_path)
data_train = dataset.data
num_train_docs = len(data_train)

#### 1.4 Load Yelp dataset

In [232]:
corpus_path = '/Users/admin/corpora/SentimentAnalysis/yelp/train/'

In [233]:
dataset = load_files(corpus_path)
data_train = dataset.data
num_train_docs = len(data_train)

#### 1.5 Load NYT dataset

In [216]:
dataset = load_files(corpus_path, categories=categories)
data_train = dataset.data
num_train_docs = len(data_train)

### 2. Create inverted index

In [248]:
counter = Counter()
for i in tnrange(num_train_docs):
    tokens = tokenize(str(data_train[i]))
    counter.update(tokens)

HBox(children=(IntProgress(value=0, max=75000), HTML(value='')))




In [249]:
vocab = { x: count for x, count in counter.items() if count >= FREQ_THRESHOLD}
print("[INFO] Vocab size:", len(vocab))

[INFO] Vocab size: 58215


In [250]:
inverted_index = defaultdict(set)
for i in tnrange(num_train_docs):
    unique_tokens = set(tokenize(str(data_train[i])))
    [inverted_index[tok].add(i) for tok in unique_tokens if tok in vocab]

HBox(children=(IntProgress(value=0, max=75000), HTML(value='')))




### 4. Use PMI to rank the keywords 

In [251]:
# sanity check. make sure the seed words are present
for s in seed_words:
    print(s, len(inverted_index[s]))

great 18666
worst 6521


In [252]:
# PMI
MIN_COOCCURENCE = 3
output_keywords = list()
for s in seed_words:
    seed_docs = inverted_index[s]
    result = dict()
    for w, docs in inverted_index.items():
        #if w == s:  # skip the seed word itself
        #    continue
        cand_docs = inverted_index[w]
        intersection = len(seed_docs.intersection(cand_docs))
        if intersection < MIN_COOCCURENCE:
            continue
        pmi = log(intersection*num_train_docs/(len(cand_docs)*len(seed_docs)))
        pmi_freq = pmi * log(intersection)
        result[w] = pmi_freq
    top_keywords = sorted(result, key=result.get, reverse=True)[:200]
    print("Seedword:", s)
    print(top_keywords[:16])
    if output_file.startswith('20NG') or output_file.startswith('AGNews') or output_file.startswith("NYT"):
        filtered_top_keywords = [kw for kw in top_keywords if is_valid_topic(kw)]
    elif output_file.startswith('IMDB') or output_file.startswith('Yelp'):
        filtered_top_keywords = [kw for kw in top_keywords if is_valid_sentiment(kw)]
    else:
        print("Dataset unknown:", output_file)
    print(filtered_top_keywords[:16])
    output_keywords.append(filtered_top_keywords[:16])

Seedword: great
['great', 'deal', 'greatest', 'fantastic', 'fenn', 'awesome', 'paco', 'yokai', 'greatness', 'shakes', 'doubtlessly', 'sherilyn', 'composers', 'corbucci', 'job', 'wonderful']
['great', 'greatest', 'fantastic', 'unkillable', 'underrated', 'rumble', 'influential', 'gorgeous', 'classic', 'fabulous', 'nyree', 'solid', 'scorsese', 'memorable', 'meticulous', 'outstanding']
Seedword: worst
['worst', 'ever', 'manos', 'awful', 'terrible', 'horrible', 'medved', 'seen', 'misfortune', 'worse', 'crap', 'garbage', 'badness', 'atrocious', 'displeasure', 'boll']
['worst', 'terrible', 'horrible', 'worse', 'atrocious', 'bad', 'unfunny', 'stupid', 'laughable', 'pathetic', 'horrendous', 'unintentional', 'ridiculous', 'crappy', 'redeemable', 'entire']


In [253]:
# MMR
MIN_COOCCURENCE = 3
output_keywords = list()
result = dict()

for w, docs in inverted_index.items():
    cand_docs = inverted_index[w]
    scores = list()
    if output_file.startswith('20NG') or output_file.startswith('AGNews') or output_file.startswith("NYT"):
        if not is_valid_topic(w):
            continue
    elif output_file.startswith('IMDB') or output_file.startswith('Yelp'):
        if not is_valid_sentiment(w):
            continue
    else:
        print("Dataset unknown:", output_file)
        continue
        
    for s in seed_words:
        seed_docs = inverted_index[s]
        #if w == s:  # skip the seed word itself
        #    continue
        intersection = len(seed_docs.intersection(cand_docs))
        if intersection < MIN_COOCCURENCE:
            pmi_freq == 0
        else: 
            pmi = log(intersection*num_train_docs/(len(cand_docs)*len(seed_docs)))
            pmi = log(len(cand_docs)) * pmi
        scores.append(pmi)
    result[w] = scores  

In [254]:
for i, s in enumerate(seed_words):
    top_keywords = sorted(result.items(), key=lambda kv: kv[1][i]-max(kv[1][1-i], 0), reverse=True)
    print(top_keywords[:16])
    output_keywords.append([k for k, _ in top_keywords[:16]])

[('great', [13.677612372086209, -3.7359466932543532]), ('greatest', [3.8321081557267003, -2.2257754820635496]), ('fantastic', [3.814703435769346, -4.82743346873582]), ('underrated', [3.1029038301076493, -5.260466778555113]), ('influential', [3.065025737381599, -2.851025732579256]), ('gorgeous', [3.0007295066517226, -2.576981045893402]), ('fabulous', [2.9568074446922963, -4.469156114835475]), ('classic', [2.8889776195575174, -1.7343578428969344]), ('meticulous', [2.856367263879169, -2.2652857887620335]), ('solid', [2.7903928822935664, -3.7371440823140123]), ('ian', [2.7887092955681654, -0.7084548605945855]), ('marvelous', [2.7707050643325997, -8.183672703791448]), ('outstanding', [2.7552408962843566, -4.940259611889717]), ('memorable', [2.7521923136617965, -2.2101482148649056]), ('ensemble', [2.6649206306128796, -4.085265448866313]), ('best', [2.641498169595701, -0.057255340055329104])]
[('worst', [-3.336432566795995, 21.451599503305832]), ('terrible', [-1.5736557516764749, 7.6991494908

In [255]:
print(output_keywords)

[['great', 'greatest', 'fantastic', 'underrated', 'influential', 'gorgeous', 'fabulous', 'classic', 'meticulous', 'solid', 'ian', 'marvelous', 'outstanding', 'memorable', 'ensemble', 'best'], ['worst', 'terrible', 'horrible', 'worse', 'atrocious', 'bad', 'unfunny', 'stupid', 'laughable', 'pathetic', 'horrendous', 'unintentional', 'ridiculous', 'redeemable', 'crappy', 'lowest']]
