In [1]:
import os
import nltk
import time
import random
import string
import pickle
import warnings
import wikipedia
import numpy as np
import pandas as pd
from pprint import pprint
import multiprocessing as mp
from collections import Counter
from nltk import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
from nltk.stem.wordnet import WordNetLemmatizer
from wikipedia import DisambiguationError, PageError
from sklearn.feature_extraction.text import TfidfVectorizer
from summarizer import Summarizer

# Suppress every warning category for the rest of the session.
warnings.filterwarnings('ignore')
# Seed Python's built-in `random` module (evaluate() relies on random.shuffle).
# NOTE(review): only `random` is seeded here; other libraries are not.
random.seed(42)

In [2]:
from gensim.models.doc2vec import Doc2Vec, TaggedDocument
from nltk.tokenize import word_tokenize

In [3]:
def download_content(title, ln=100):
    """Fetch the text that precedes the first '==' marker of a Wikipedia page.

    Args:
        title: Page title handed to ``wikipedia.page``.
        ln: Minimum number of whitespace-separated words the extracted text
            must contain for the page to be accepted.

    Returns:
        ``(title, text)`` when the page exists and the text has at least ``ln``
        words, otherwise ``(None, None)`` (missing page, disambiguation page,
        or text too short).
    """
    try:
        content = wikipedia.page(title).content
    except (DisambiguationError, PageError):
        return None, None
    # partition() yields the whole string when no '==' is present. The previous
    # slice `content[:content.find('==')]` turned into `content[:-1]` in that
    # case and silently dropped the final character.
    lead = content.partition('==')[0].strip()
    if len(lead.split()) >= ln:
        return title, lead
    return None, None

def downloader(k=50, pool_size=10):
    """Download random Wikipedia pages in parallel until ``k`` are collected.

    Args:
        k: Number of accepted pages to collect. Also used as the batch size
            requested from ``wikipedia.random`` on every round.
        pool_size: Number of worker processes used to fetch a batch.

    Returns:
        dict mapping page title -> extracted text, containing exactly ``k``
        entries (empty when ``k <= 0``).
    """
    pages_fetch = {}
    if k <= 0:
        return pages_fetch
    with mp.Pool(pool_size) as pool:
        while len(pages_fetch) < k:
            # NOTE(review): the `wikipedia` package documents a small cap on the
            # number of titles returned per call, so a large k presumably just
            # means more rounds -- confirm against the library docs.
            titles = wikipedia.random(k)
            # wikipedia.random returns a bare string when a single title comes
            # back; mapping over it would iterate its characters.
            if isinstance(titles, str):
                titles = [titles]
            for title, content in pool.map(download_content, titles):
                # Rejected pages come back as (None, None). Skipping them here
                # (instead of storing a None key and deleting it afterwards)
                # avoids the KeyError the old `del pages_fetch[None]` raised
                # whenever no download had failed.
                if title is None:
                    continue
                pages_fetch[title] = content
                if len(pages_fetch) >= k:
                    break
    return pages_fetch

In [4]:
# Module-level state used by later cells.
data = []                          # search() appends one record per printed hit
n_wiki = 10000                     # number of pages requested from downloader()
wiki_pickle = 'wiki_content.pkl'   # cache file looked up in the working directory

t0 = time.time()
if wiki_pickle in os.listdir():
    # Cache hit: reuse the previously pickled {title: content} mapping.
    # NOTE: pickle.load can execute arbitrary code -- only load a cache file
    # that this notebook produced itself.
    with open(wiki_pickle, 'rb') as f:
        page_dict = pickle.load(f)
else:
    # Cache miss: download the pages and persist them for the next run.
    page_dict = downloader(k=n_wiki)
    with open(wiki_pickle, 'wb') as f:
        pickle.dump(page_dict, f)
# NOTE(review): the message reports n_wiki and "downloaded" on both paths,
# including when the pages were only read back from the cache.
print(f'{n_wiki} wiki content downloaded in {time.time() - t0}')

10000 wiki content downloaded in 0.02422642707824707


In [5]:
def tokenizer(sent):
    """Turn a sentence into a list of normalised tokens.

    The text is lower-cased and split with ``word_tokenize``; tokens found in
    ``string.punctuation`` are discarded; every remaining token is stemmed
    with ``PorterStemmer`` and the stem is then passed through
    ``WordNetLemmatizer``.

    Note that ``string.punctuation`` is a ``str``, so the membership check is
    a substring test rather than a per-character set lookup.
    """
    stem = PorterStemmer().stem
    lemmatize = WordNetLemmatizer().lemmatize
    kept = (tok for tok in word_tokenize(sent.lower())
            if tok not in string.punctuation)
    return [lemmatize(stem(tok)) for tok in kept]

In [6]:
# Load the Doc2Vec model stored at "d2v.model" (path relative to the working directory).
# NOTE(review): presumably the file produced by the commented-out training cell
# In [8] further down -- confirm before relying on it.
model= Doc2Vec.load("d2v.model")

In [13]:
def pred(model, df, q):
    """Rank every document in ``df`` by dot-product similarity to query ``q``.

    Args:
        model: Object exposing ``infer_vector(tokens)`` (a Doc2Vec model).
        df: Frame with a ``content`` column holding the document texts.
        q: Free-text query string.

    Returns:
        List of ``{"index": <row position>, "similarity": <float>}`` dicts,
        one per document, sorted by similarity in descending order.
    """
    # Infer the query vector once. It was previously re-inferred for every
    # document, which is wasted work and -- since gensim's infer_vector is not
    # guaranteed to return the same vector on repeated calls -- scored each
    # document against a slightly different query.
    query_vec = model.infer_vector(word_tokenize(q.lower()))
    res = [
        {
            "index": pos,
            # float() replaces np.asscalar, which no longer exists in NumPy.
            "similarity": float(
                np.dot(query_vec, model.infer_vector(word_tokenize(text.lower())))
            ),
        }
        for pos, text in enumerate(df.content)
    ]
    res.sort(key=lambda hit: hit["similarity"], reverse=True)
    return res

def evaluate(df, X, vec=None):
    """Compute recall@{2, 5, 10} using queries sampled from each document.

    For every row of ``df`` a query is built from up to five randomly chosen
    terms of its ``top10pct`` list; the row counts as a hit at cut-off ``k``
    when ``pred`` ranks that row's position within its first ``k`` results.

    Args:
        df: Frame whose rows carry a ``top10pct`` list of terms.
        X: Document frame forwarded to ``pred`` (must expose ``content``).
            Row positions in ``df`` are assumed to match positions in ``X``.
        vec: Model forwarded to ``pred``. Defaults to the module-level
            ``model``; the function previously read an undefined global
            ``vec`` and failed with NameError.

    Returns:
        dict mapping each cut-off (2, 5, 10) to its recall in [0, 1].
    """
    if vec is None:
        vec = model
    group_size = [2, 5, 10]
    if len(df) == 0:
        # Nothing to evaluate; avoid dividing by zero below.
        return {gs: 0.0 for gs in group_size}
    recall_k = dict.fromkeys(group_size, 0)
    for pos, (_, row) in enumerate(df.iterrows()):
        # Shuffle a copy: shuffling row.top10pct directly reorders the list
        # object stored inside the frame.
        terms = list(row.top10pct)
        random.shuffle(terms)
        query = ' '.join(terms[:5])
        # pred returns dicts keyed by row position; compare positions, not the
        # dicts themselves (the old membership test could never succeed).
        ranked = [hit['index'] for hit in pred(vec, X, query)]
        for gs in group_size:
            if pos in ranked[:gs]:
                recall_k[gs] += 1
    for gs in group_size:
        recall_k[gs] /= len(df)
    return recall_k

def search_helper(df, vec, X, query, k=5):
    """Return the ``(title, content)`` pairs of the top ``k`` hits for ``query``.

    Args:
        df: Frame with ``title`` and ``content`` columns, positionally aligned
            with the documents ranked by ``pred``.
        vec: Model forwarded to ``pred``.
        X: Document frame forwarded to ``pred``.
        query: Free-text query string.
        k: Maximum number of results to return (previously ignored in favour
            of a hard-coded 5).

    Returns:
        List of ``(title, content)`` tuples in ranking order.
    """
    res = []
    for hit in pred(vec, X, query)[:k]:
        # pred yields dicts; the row position lives under "index". Passing the
        # dict itself to .iloc raised a TypeError.
        row = df.iloc[hit['index']]
        res.append((row['title'], row['content']))
    return res

# def main():
#     main_df = pd.DataFrame(list(page_dict.items()), columns=['title', 'content'])
#     main_df['top10pct'] = None
#     for i, row in main_df.iterrows():
#         tokens = word_tokenize(row.content.lower())
#         stopwords_eng = stopwords.words('english')
#         tokens = [w for w in tokens if not (w in stopwords_eng or w in string.punctuation)]
#         freq = Counter(tokens)
#         top10 = sorted(freq.items(), key=lambda x: -x[1])[:int(len(freq) * 0.3)]
#         row.top10pct = [w for w, v in top10]
    
#     i = 100

#     while i <= len(main_df):
#         df = main_df[:i]
#         vec = TfidfVectorizer(tokenizer=tokenizer)
#         #X = vec.fit_transform(df.content)
#         #recall_k = evaluate(df, df.content)
#         print(f'Recall score for dataset of size {len(df)}.')
#         for gs, score in recall_k.items():
#             print(f'Recall@{gs}: {score}')
#         print()
#         i *= 2
#         if i > len(main_df) and len(df) < len(main_df):
#             i = len(main_df)
# main()

In [8]:
# content_list = []
# for key in page_dict:
#     content_list.append(page_dict[key])
# tagged_data = [TaggedDocument(words=word_tokenize(_d.lower()), tags=[str(i)]) for i, _d in enumerate(content_list)]
# max_epochs = 100
# vec_size = 20
# alpha = 0.025
# model = Doc2Vec(size=vec_size, alpha=alpha, min_alpha=0.00025, min_count=1, dm =1)
# model.build_vocab(tagged_data)
# for epoch in range(max_epochs):
#     print('iteration {0}'.format(epoch))
#     model.train(tagged_data,
#                 total_examples=model.corpus_count,
#                 epochs=model.iter)
#     # decrease the learning rate
#     model.alpha -= 0.0002
#     # fix the learning rate, no decay
#     model.min_alpha = model.alpha

In [14]:
################################################# Rocchio's Algorithm ###############################################
def relevance_feedback(selected_ids, data, query, model=None, beta=0.75, gamma=0.25):
    """Re-rank all documents with a Rocchio-style updated query vector.

    The updated query is ``q + beta * mean(selected) - gamma * mean(others)``
    where "selected" are the documents the user marked relevant and "others"
    is every remaining document in ``data``.

    Args:
        selected_ids: Iterable of row positions (ints or numeric strings) the
            user marked as relevant. Blank strings are ignored; the iterable
            itself is not modified.
        data: Frame with a ``content`` column holding the document texts.
        query: The original free-text query.
        model: Optional object exposing ``infer_vector``; when omitted the
            Doc2Vec model is loaded from "d2v.model" as before.
        beta: Weight of the selected-documents centroid.
        gamma: Weight of the remaining-documents centroid.

    Returns:
        List of ``{"index": <row position>, "similarity": <float>}`` dicts for
        every document, sorted by similarity in descending order.

    Raises:
        ValueError: If an entry of ``selected_ids`` is not blank and cannot be
            converted to ``int``.
    """
    if model is None:
        model = Doc2Vec.load("d2v.model")

    # Normalise the ids into a set without touching the caller's list.
    relevant = set()
    for raw in selected_ids:
        if isinstance(raw, str):
            raw = raw.strip()
            if not raw:
                continue
        relevant.add(int(raw))

    old_query = np.asarray(model.infer_vector(word_tokenize(query.lower())), dtype=float)

    # Infer each document vector once and reuse it for both the centroids and
    # the final scoring.
    doc_vectors = [
        np.asarray(model.infer_vector(word_tokenize(text.lower())), dtype=float)
        for text in data.content
    ]

    # Accumulators take their size from the query vector instead of assuming
    # a 20-dimensional model.
    relevant_sum = np.zeros_like(old_query)
    other_sum = np.zeros_like(old_query)
    n_relevant = 0
    n_other = 0
    for pos, vector in enumerate(doc_vectors):
        if pos in relevant:
            relevant_sum = relevant_sum + vector
            n_relevant += 1
        else:
            other_sum = other_sum + vector
            n_other += 1

    # Each term is applied only when its group is non-empty, so selecting no
    # documents (or all of them) no longer divides by zero.
    updated_query = old_query
    if n_relevant:
        updated_query = updated_query + beta * relevant_sum / n_relevant
    if n_other:
        updated_query = updated_query - gamma * other_sum / n_other
    print(updated_query)
    print(old_query)

    res = [
        # float() replaces np.asscalar, which no longer exists in NumPy.
        {"index": pos, "similarity": float(np.dot(updated_query, vector))}
        for pos, vector in enumerate(doc_vectors)
    ]
    res.sort(key=lambda hit: hit['similarity'], reverse=True)
    return res
            
    
    
    

In [15]:
def _record_results(frame, ids, render):
    """Append each hit to the module-level ``data`` list and print it.

    ``render`` maps a document's content to the text shown under its title.
    """
    for i in ids:
        row = frame.iloc[i]
        title, content = row['title'], row['content']
        data.append({"id": i, "title": title, "content": content})
        print(f'{i}\t{title}\n\n{render(content)}\n\n\n')


def search():
    """Interactive search with one round of relevance feedback.

    Reads the desired number of results and a query from stdin, prints the
    top hits with a three-sentence summary, asks which of them are relevant,
    and then prints the re-ranked hits in full. Every printed hit is also
    appended to the module-level ``data`` list. Returns ``None``.
    """
    # The per-row "top10pct" computation that used to run here has been
    # dropped: it assigned to the temporary rows yielded by iterrows() and
    # nothing in this function read the column.
    main_df = pd.DataFrame(list(page_dict.items()), columns=['title', 'content'])
    model = Doc2Vec.load("d2v.model")
    min_res = int(input('Input mininum number of results: '))
    query = input('> ')

    first_ids = [hit['index'] for hit in pred(model, main_df, query)[:min_res]]
    # Kept under its own name so the Doc2Vec `model` is not shadowed.
    summarizer = Summarizer()
    _record_results(main_df, first_ids,
                    lambda text: summarizer(text, num_sentences=3))

    raw_ids = input('Please enter the document ids that you feel are relevant(separated by commas): ')
    # Drop blanks so an empty answer or a trailing comma does not reach int().
    n_no = [part.strip() for part in raw_ids.split(",") if part.strip()]
    if not n_no:
        print('No relevant documents selected; skipping relevance feedback.')
        return

    ans = relevance_feedback(n_no, main_df, query)[:min_res]
    second_ids = [hit['index'] for hit in ans]
    _record_results(main_df, second_ids, lambda text: text)
    
# Run the interactive search; it prompts via input(), so this cell blocks until answered.
search()

Input mininum number of results: 4
> games
1469	List of Sesame Street puppeteers

This list of Sesame Street puppeteers includes all who have worked on the show, as a regular, backup, guest puppeteer, etc., 1990–present) Aunt Edna, Betty Lou's Mommy, Blecka the Grouch, Clucky Clucky Chicken, the Cook singing "Most Important Meal", First Female Yip Yip Martian, the Goat "offended by the portrayal of goats", Louise (rabbit), Mama Countess, The Old Woman Who Lives in a Shoe (in "Fairytale Detective"), Pauline, Sherry Netherland (1993–1995), Sophie (in Episode 3364), Storybook, various characters
Tyler Bunch: ( Hair"), Sharpay (voice only), Sissy Friendly, Slam, The Super Foods cheese, one of the Three Little Pigs
Michael Earl Davis: ( 1969–1978) Ernie's left hand (1969)
Rick Lyon: (



2275	List of Nature Conservation Act rare flora of Queensland

This is a list of the flora of Queensland listed as Rare under the Nature Conservation Act 1992. ( Eungella A.N.Rodd 3798)
Lobelia douglasiana
