In [16]:
import spacy
import nltk
import pandas as pd
import numpy as np
import random
import math

from spacy import displacy
from spacy.symbols import nsubj, VERB

In [2]:
class ApiWorker:
    """Placeholder for the class that will deal with getting the data out as an API response."""




class KeywordExtractor:
    """Placeholder: takes a message as input, runs it through an NLP pipeline and extracts the keywords."""


class MetadataEnhancer:
    """Placeholder: takes in the data/metadata and enhances it.

    Example: the oEmbed image and description for links, which enable unfurling.
    """

class Connecter:
    """Placeholder: connects a block or its keywords to the various parts of the app.

    In a sense, this is what will form the associations.
    """

class Updater:
    """Placeholder: will perform update actions on a block.

    TODO: clarify how this differs from MetadataEnhancer in its current functionality.
    """

In [3]:


class Payload:
    """Multimedia block that can store different types of payloads (e.g. text, image, voice, ...)."""

    def __init__(self, payload):
        """Store ``payload`` unchanged on the instance."""
        # No validation or conversion: the value is kept exactly as passed in.
        self.payload = payload

class Description:
    """Natural-language description added by the user.

    This text is what underpins the self-organizing memory model part of the app.
    """

    def __init__(self, description):
        """Store ``description`` unchanged on the instance."""
        self.description = description


class ThoughtBlock:
    """Stores the content exactly as it is received.

    Composition (wrapping the values in dedicated payload/description objects)
    is deliberately skipped for now to keep things simple; it is planned for later.
    """

    def __init__(self, payload, description):
        """Keep the raw ``payload`` and ``description`` as instance attributes."""
        self.payload = payload
        self.description = description

    def print_attributes(self):
        """Print the payload and the description on two consecutive lines."""
        template = "Payload: {}\n Description: {}"
        print(template.format(self.payload, self.description))


In [48]:
import spacy


class Pipeline:
    """NLP workhorse of the app.

    One spaCy pipeline (``nlp``) is loaded as a class attribute, so it is
    shared by every instance of the class. Creating a ``Pipeline`` parses the
    thought's description and stores the features pulled out of it. The list
    of features is expected to keep growing; currently:

    - tokens and their ``tag_`` values
    - named entities
    - nouns and noun chunks
    - dependencies
    - sentences
    """

    nlp = spacy.load('en_core_web_sm')

    class Extractor:
        """Pulls the individual features out of an already-parsed spaCy doc."""

        def __init__(self, doc):
            self.doc = doc

        def tokens(self):
            """Return every token of the doc as a list."""
            return list(self.doc)

        def tags(self):
            """Return the ``tag_`` string of every token, in document order."""
            return [tok.tag_ for tok in self.doc]

        def entities(self):
            """Return ``(text, label)`` pairs for the doc's entities."""
            return [(entity.text, entity.label_) for entity in self.doc.ents]

        def nouns(self):
            """Return ``(token, tag)`` pairs for tokens whose tag contains ``'NN'``."""
            return [(tok, tok.tag_) for tok in self.doc if 'NN' in tok.tag_]

        def nounchunks(self):
            """Return the text of every noun chunk."""
            return [np_chunk.text for np_chunk in self.doc.noun_chunks]

        def dependencies(self):
            """Return ``(dep label, [children])`` for every token."""
            return [(tok.dep_, list(tok.children)) for tok in self.doc]

        def display(self):
            """Render the dependency parse with displaCy and return its result."""
            return displacy.render(self.doc, style="dep")

        def arcs(self):
            """Return the set of ``(subject token, head)`` pairs where the head is a verb."""
            return {
                (tok, tok.head)
                for tok in self.doc
                if tok.dep == nsubj and tok.head.pos == VERB
            }

        def sentences(self):
            """Return the doc's sentence spans as a list."""
            return list(self.doc.sents)

    def __init__(self, thought):
        """Parse ``thought.description`` and cache every extracted feature."""
        self.thought = thought
        self.doc = self.nlp(self.thought.description)

        extractor = self.Extractor(self.doc)
        self.extractor = extractor

        self.tokens = extractor.tokens()
        self.tags = extractor.tags()
        self.entities = extractor.entities()
        self.nouns = extractor.nouns()
        self.nounchunks = extractor.nounchunks()
        self.dependencies = extractor.dependencies()
        self.sentences = extractor.sentences()

    def print_attributes(self):
        """Print the cached features, each preceded by a dashed rule.

        The thought's own attributes, the tokens and the tags are not printed.
        After the feature dump the dependency parse is rendered and the
        subject/verb arcs are printed.
        """
        rule = '----------------' * 5
        sections = (
            f"entity: {self.entities}",
            self.nouns,  # printed without a label
            f"noun chunks: {self.nounchunks}",
            f"dependencies: {self.dependencies}",
            f"sents: {self.sentences}",
        )
        for section in sections:
            print(rule)
            print(section)

        self.extractor.display()
        print(self.extractor.arcs())


In [47]:
# Run the pipeline on the second sample thought and dump what it extracted.
# NOTE(review): `t2` is assigned in a cell further down this notebook
# (execution count 44), so on a fresh kernel that cell has to be run before
# this one, otherwise `t2` is undefined here.
p = Pipeline(t2)
print(t2.description)
p.print_attributes()

Found from Lenny Rachitsky's website. Looks like a great set of reads for founders right from fundraising, Investor relations, to Product and Hiring.
--------------------------------------------------------------------------------
entity: [("Lenny Rachitsky's", 'PERSON')]
--------------------------------------------------------------------------------
[(Lenny, 'NNP'), (Rachitsky, 'NNP'), (website, 'NN'), (set, 'NN'), (reads, 'NNS'), (founders, 'NNS'), (fundraising, 'NN'), (Investor, 'NN'), (relations, 'NNS'), (Product, 'NN'), (Hiring, 'NN')]
--------------------------------------------------------------------------------
noun chunks: ["Lenny Rachitsky's website", 'a great set', 'reads', 'founders', 'fundraising', 'Product', 'Hiring']
--------------------------------------------------------------------------------
dependencies: [('ROOT', [from, .]), ('prep', [website]), ('compound', []), ('poss', [Lenny, 's]), ('case', []), ('pobj', [Rachitsky]), ('punct', []), ('ROOT', [like, right, ,,

set()


In [44]:
# Two sample thoughts: one with a plain-text payload, one with a link payload.
t = ThoughtBlock(
    'Anudeep',
    'This is me, and I am the owner of all forthcoming thoughts, with a primary interest in Sidebrain',
)
t2 = ThoughtBlock(
    'https://techcrunch.com/2021/11/22/jina-ai-raises-30m-for-its-for-its-neural-search-platform/',
    """Found from Lenny Rachitsky's website. Looks like a great set of reads for founders right from fundraising, Investor relations, to Product and Hiring.""",
)

print("\n", t.payload, t.description, '\n')

# Pipeline run on the first thought, currently disabled:
#p = Pipeline(t)
#p.print_attributes()


 Anudeep This is me, and I am the owner of all forthcoming thoughts, with a primary interest in Sidebrain 



In [90]:
# Parse the second thought's description with a separately loaded pipeline and
# print one row per token: text, dependency label, head token and lexeme norm.
nlp2 = spacy.load('en_core_web_sm')
doc2 = nlp2(t2.description)

# The loop variable is deliberately NOT called `t`: `t` is the ThoughtBlock
# created in the sample-data cell, and reusing the name here would leave `t`
# bound to the last spaCy token once this cell has run.
for token in doc2:
    # str() on the head is required: the width/alignment spec is applied to a
    # string, not to the Token object itself.
    print(f"\n{token.text:12} {token.dep_:>10} {str(token.head):>15} {str(token.lex.norm_):>15}\n", end ='-'*64)


Found              ROOT           Found           found
----------------------------------------------------------------
from               prep           Found            from
----------------------------------------------------------------
Lenny          compound       Rachitsky           lenny
----------------------------------------------------------------
Rachitsky          poss         website       rachitsky
----------------------------------------------------------------
's                 case       Rachitsky              's
----------------------------------------------------------------
website            pobj            from         website
----------------------------------------------------------------
.                 punct           Found               .
----------------------------------------------------------------
Looks              ROOT           Looks           looks
----------------------------------------------------------------
like               prep        