In [1]:
import json

from bs4 import BeautifulSoup
import urllib
import nltk

In [2]:
# nltk.download()

In [3]:
# The number in the brackets ([]), indicates the number of records to process. 
# Change the number for a larger sample size. 
title_dict = {}
with open('history.json') as json_data:
    d = json.load(json_data)
    for entry in d[:500]:
        if entry['title'] != '':
            if entry['url'] not in title_dict:
                title_dict[entry['url']] = entry['title']
        else:
            try:
                r = urllib.urlopen(entry['url']).read()
                soup = BeautifulSoup(r)
                title_dict[entry['url']] = soup.title.string
            except: 
                pass

In [4]:
title_dict.values()

dict_values(['Inbox - ce.sandoval09@gmail.com - Gmail', 'Download Anaconda Now! | Continuum', 'ACT Monday Workshop - ce.sandoval09@gmail.com - Gmail', 'Sent Mail - ce.sandoval09@gmail.com - Gmail', 'Drafts (57) - ce.sandoval09@gmail.com - Gmail', 'Messenger', 'Installing NLTK Data — NLTK 3.0 documentation', 'nltk download - Google Search', '20.6. urllib2 — extensible library for opening URLs — Python 2.7.13 documentation', 'Anaconda package list | Continuum Analytics: Documentation', 'Jupyter Notebook', 'parse_history.py (editing)', 'Dropbox (MIT)/Workshops_2/17.Spring ACT/', 'Dropbox (MIT)/Workshops_2/', 'Dropbox (MIT)/', 'Home', 'Messenger', 'Command not found: jupyter · Issue #2247 · jupyter/notebook', '3. Running the Jupyter Notebook — Jupyter/IPython Notebook Quick Start Guide 0.1 documentation', 'ipython - Unable to set up Jupyter Notebook - Stack Overflow', 'python - Jupyter notebook command does not work on Mac - Stack Overflow', '-bash: jupyter: command not found - Google Sear

In [5]:
from os import listdir
from os.path import isfile, join
from nltk.corpus import stopwords

from gensim import corpora, models, similarities
import numpy as np

import json
import csv

In [6]:
titles = title_dict.values()

clean_titles = []
for ctitle in titles:
    if ctitle != None:
        clean_titles.append(ctitle)
dictionary = corpora.Dictionary([word.lower().split(' ') for word in clean_titles])
browsing_titles = [word.lower().split(' ') for word in clean_titles]

In [7]:
with open("wordsEn.txt") as word_file:
    english_words = set(word.strip().lower() for word in word_file)

def is_english_word(word):
    return word.lower() in english_words

only_english_ids = [word[1] for word in dictionary.token2id.items() if not is_english_word(word[0])]      

In [8]:
# get a list of stop words from the nltk library
stoplist = stopwords.words('english')

# DICTERATOR: remove stop words and words that appear only once 
stop_ids = [dictionary.token2id[stopword] for stopword in stoplist if stopword in dictionary.token2id]
once_ids = [tokenid for tokenid, docfreq in dictionary.dfs.items() if docfreq == 1]

# filter the tokens from the corpora dict
dictionary.filter_tokens(stop_ids + once_ids + only_english_ids)

In [9]:
# remove gaps in id sequence after words that were removed
dictionary.compactify() 
dictionary

<gensim.corpora.dictionary.Dictionary at 0x1274ca3c8>

In [10]:
# given a dictionary and a list of ids, get the words that correspond to the ids
def get_singles(dictionary, ids):
    for word_id in ids:
        yield dictionary.get(word_id)

# eliminate the words that appear only once        
def filter_singles(singles, texts):
    for text in texts:
        new_list = []
        for word in text:
            if word not in singles:
                new_list.append(word)
        yield new_list

In [11]:
singles = get_singles(dictionary, once_ids)
filtered_texts = filter_singles(singles, [word.lower().split(' ') for word in clean_titles])

In [12]:
# # Create Bag of words
mm = [dictionary.doc2bow(text) for text in filtered_texts]

In [13]:
# define the number of topics for the classification
num_topics = 2

# Trains the LDA models with the corpus and dictionary previously created
lda = models.ldamodel.LdaModel(corpus=list(mm), id2word=dictionary, num_topics=num_topics, 
                               update_every=1, chunksize=10000, passes=10, iterations=50)

In [14]:
# prints all groups and their main words
lda.print_topics(num_topics=num_topics, num_words=25)

[(0,
  '0.076*"history" + 0.055*"search" + 0.049*"export" + 0.038*"browsing" + 0.034*"google" + 0.025*"climbing" + 0.024*"support" + 0.022*"ice" + 0.022*"community" + 0.017*"home" + 0.015*"last" + 0.015*"welcome" + 0.014*"download" + 0.014*"sign" + 0.014*"new" + 0.013*"browser" + 0.012*"campgrounds" + 0.012*"moments" + 0.012*"buses" + 0.012*"yosemite" + 0.012*"trip" + 0.012*"job" + 0.012*"save" + 0.012*"messenger" + 0.012*"safari"'),
 (1,
  '0.048*"google" + 0.034*"sales" + 0.034*"ticket" + 0.029*"data" + 0.023*"tax" + 0.022*"banking" + 0.020*"contact" + 0.020*"first" + 0.018*"blue" + 0.018*"information" + 0.018*"painting" + 0.018*"credit" + 0.017*"bank" + 0.016*"search" + 0.015*"maps" + 0.015*"account" + 0.014*"stack" + 0.014*"overflow" + 0.013*"america" + 0.012*"health" + 0.012*"tickets" + 0.012*"washington" + 0.012*"april" + 0.012*"register" + 0.012*"free"')]

In [15]:
# get a list of processed topics obtained by training an LDA model, and return them as individual lists of topics and frequencies
def parse_topics(filepath):
    with open(filepath, 'rU') as f:
        reader = list(csv.reader(f))
        header = reader[0]
        reader.pop(0)
        topics = []
        freqs = []
        for row in reader:
            freq = []
            topic = []
            row.pop(0)
            for ind, element in enumerate(row):
                if ind%2 == 0:
                    try: 
                        fr = row[ind+1]
                    except: 
                        fr = ''
                    if fr != '':
                        topic.append(element)
                        freq.append(row[ind+1])
            topics.append(topic)
            freqs.append(freq)
        return topics, freqs

In [16]:
# Assigns the topics to the documents in corpus
lda_corpus = lda[mm]
threshold = 1/float(num_topics)

In [17]:
import random
import requests
from knowledge_ontologies.spellcheck import spellcheck
import re
import json
from pprint import pprint
from knowledge_ontologies.Scraper import *
import nltk
nltk.download('wordnet')
from nltk.corpus import wordnet as wn
import sys
import spacy
import itertools
import numpy as np
from knowledge_ontologies.check_lemmas import check_lemmas
from sklearn.metrics.pairwise import cosine_similarity
from knowledge_ontologies.config import L1,L2,L3

[nltk_data] Downloading package wordnet to
[nltk_data]     /Users/csandova/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [18]:
full_ontology=L1+L2+L3
full_list=set(full_ontology)
nlp = spacy.load('en_core_web_md')

In [19]:
def classify_closest_topics(user_keyword):
    with open ('knowledge_ontologies/vectors_map.json','r') as f:
        vector_dict=json.load(f)

        keyword=None
        while keyword!= 'N':
            if keyword== 'N':
                break

            keyword=user_keyword.lower()

            checked=spellcheck(keyword)

            ##check if it is an existing key in the ontology
            if keyword in full_list:
                print(keyword,11111111111)
                best = keyword
                return best
            else:
                secondary_check=check_lemmas(checked)
                if secondary_check:
                    print(secondary_check,2222222222222222)
#                     continue

            keyword_doc = list(nlp.pipe(checked,
              batch_size=10000,
              n_threads=1))


            if keyword_doc[0].has_vector:
                keyword_vector=np.array([keyword_doc[0].vector])
            else:
                spacy.vocab[0].vector

            intermidiate_results=[]
            best=None
            best_similarity=0
            ## First level

            l1_keys=treeL1.keys()
            arrays=[]
            order=[]
            for k in l1_keys:
                order.append(k)
                arrays.append(np.array(vector_dict[k]))

            simple_sim = cosine_similarity(keyword_vector, arrays)
            topic_idx = simple_sim.argmax(axis=1)[0]
            best_similarity=np.amax(simple_sim)
            result=order[topic_idx]
            best=result
            # print('r1: ', result )

            ##second LEvel
            l2_keys=[word for word in treeL1[result] if word in treeL2]
            # print('choices: ', l2_keys)
            arrays=[]
            order=[]
            for k in l2_keys:
                order.append(k)
                arrays.append(np.array(vector_dict[k]))

            simple_sim = cosine_similarity(keyword_vector, arrays)
            topic_idx = simple_sim.argmax(axis=1)[0]
            result=order[topic_idx]
            maxv=np.amax(simple_sim)
            if (maxv>=best_similarity):
                best_similarity=maxv
                best=result
            # print('r2: ',result)


            options=[ word for word in treeL2[result] if word in full_ontology]
            # print('choices: ', options)
            #print('options',options)
            arrays=[]
            for k in options:
                #print(k,len(vector_dict))
                arrays.append(np.array(vector_dict[k]))

            simple_sim = cosine_similarity(keyword_vector, arrays)
            topic_idx = simple_sim.argmax(axis=1)[0]
            maxv=np.amax(simple_sim)
            result=options[topic_idx]
            if (maxv>=best_similarity):
                best_similarity=maxv
                best=result
            print( 'classified as: '+best)
            return best
#             break

In [20]:
def usertopics_to_ontologies(all_topics):
    all_ontologies = []
    print(all_topics)
    for topics in all_topics:
        ontologies = []
        for topic in topics:
            ontologies.append(classify_closest_topics(topic))
        all_ontologies.append(ontologies)
    print (all_ontologies)
    return all_ontologies

In [21]:
topics, frequencies = parse_topics('knowledge_topic_classification.csv')
topics, frequencies

  This is separate from the ipykernel package so we can avoid doing imports until


([['science', 'economics'], ['asfasf', 'afsafsa']],
 [['0.5', '0.25'], ['0.7', '0.8']])

In [22]:
topics = usertopics_to_ontologies(topics)

[['science', 'economics'], ['asfasf', 'afsafsa']]
food preparation 2222222222222222
classified as: leisure
economics 11111111111
classified as: community
classified as: community
[['leisure', 'economics'], ['community', 'community']]


In [23]:
# given a corpus trained with the LDA classifier, and a threshold, classify the browsing history into the groups 
def classify(lda_corpus, texts, cluster_num, threshold, words=None, frequencies=None):
    for i,j in zip(lda_corpus, texts):
        try: 
            if i[cluster_num][1] > threshold :
                classified_list = [j, words[cluster_num], frequencies[cluster_num]]
                yield classified_list
        except: pass

In [24]:
# function that takes the topic classification of a given topic, and other data of the topics and writes a new json to be spatially joined
def topic_to_json(topic_num, topics, frequencies):
    for i, record in enumerate(classify(lda_corpus, clean_titles, topic_num, threshold, topics, frequencies)):  
        title, topic, frequency = record
        with open('topics/%stopic_history.json' %(str(topic_num)+'_'+str(i)), 'w') as f:
            f.write( json.dumps({'id': str(topic_num)+'_'+str(i), 'title':title, 'topic':topic, 'frequency':frequency}))
            #print 'wrote tweet %s' %(tid)

In [25]:
# for every topic group, write json files for every tweet
for topic_num in np.arange(num_topics):#lda_corpus, jsons_to_mm_tuple(twi_path), topic_num, threshold, num_topics): 
    topic_to_json(topic_num, topics, frequencies)


In [26]:
from os import listdir
from os.path import isfile, join

onlyfiles = [f for f in listdir('topics') if isfile(join('topics', f))]
count_topics = {}
for file in onlyfiles:
    if file.endswith('.json'):
        with open('topics/'+file, 'r') as f:
            curr_record = json.load(f)
            curr_topics = curr_record['topic']
            for curr_topic in curr_topics:
                if curr_topic not in count_topics:
                    count_topics[curr_topic] = 0
                else: 
                    count_topics[curr_topic] += 1


count_topics

{'community': 433,
 'leisure': 203,
 'economics': 203,
 'travel': 4,
 'places to eat and drink': 4}

In [27]:
import seaborn as sns
import matplotlib.pyplot as plt

x_vals = count_topics.keys()
y_vals = count_topics.values()


y_pos = np.arange(len(x_vals))
 
plt.bar(y_pos, y_vals, align='center', alpha=0.5)
plt.xticks(y_pos, x_vals)
plt.ylabel('Number of Ocurrences')
plt.title('Domains of Knowledge')
 
plt.show()

<Figure size 640x480 with 1 Axes>

2020-01-31T01:59:40+0000 [stdout#info] Dictionary(0 unique tokens: [])
2020-01-31T01:59:40+0000 [twisted.web.wsgi._ErrorStream#error] [2020-01-31 01:59:40,207] ERROR in app: Exception on /analyzer/titles/bbKaPJCMN9WM9LQ4VNNQ [POST]
	Traceback (most recent call last):
	  File "/home/ubuntu/backend/backend/env/lib/python3.5/site-packages/flask/app.py", line 2446, in wsgi_app
	    response = self.full_dispatch_request()
	  File "/home/ubuntu/backend/backend/env/lib/python3.5/site-packages/flask/app.py", line 1951, in full_dispatch_request
	    rv = self.handle_user_exception(e)
	  File "/home/ubuntu/backend/backend/env/lib/python3.5/site-packages/flask/app.py", line 1820, in handle_user_exception
	    reraise(exc_type, exc_value, tb)
	  File "/home/ubuntu/backend/backend/env/lib/python3.5/site-packages/flask/_compat.py", line 39, in reraise
	    raise value
	  File "/home/ubuntu/backend/backend/env/lib/python3.5/site-packages/flask/app.py", line 1949, in full_dispatch_request
	    rv = self.dispatch_request()
	  File "/home/ubuntu/backend/backend/env/lib/python3.5/site-packages/flask/app.py", line 1935, in dispatch_request
	    return self.view_functions[rule.endpoint](**req.view_args)
	  File "/home/ubuntu/backend/backend/backend/routes/analyzer.py", line 11, in handle_titles
	    results = classifier.process_titles(request.json)
	  File "/home/ubuntu/backend/backend/backend/ml/classifier.py", line 80, in process_titles
	    update_every=1, chunksize=10000, passes=10, iterations=50)
	  File "/home/ubuntu/backend/backend/env/lib/python3.5/site-packages/gensim/models/ldamodel.py", line 441, in __init__
	    raise ValueError("cannot compute LDA over an empty collection (no terms)")
	ValueError: cannot compute LDA over an empty collection (no terms)
2020-01-31T01:59:40+0000 [twisted.web.wsgi._ErrorStream#error] 