In [1]:
import pandas as pd
import requests
import re
import scrapy
import translators as ts
import yake
import gensim
import gensim.corpora as corpora
import pickle

Using state North Rhine-Westphalia server backend.


In [2]:
from bs4 import BeautifulSoup
from deep_translator import GoogleTranslator
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
from nltk.tokenize import word_tokenize
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer

In [3]:
data= pd.read_csv("angellist.csv")
data

Unnamed: 0.1,Unnamed: 0,joined,kind,link,location,market,name,pitch,raised,size,stage,website,query,type_,tech
0,0,Jul 10,Startup,https://angel.co/vufind-1,Berkeley,Enterprise Resource Planning,Vufind (DBA DeepVu),\r\nDeep learning as a service for maximizing ...,"$680,000",11-50,Seed,deepvu.co,artificial intelligence,Startup,Python
1,1,Dec 10,Startup,https://angel.co/nmodes,Toronto,Customer Service,nmodes,\r\nnmodes creates Artificial Intelligence sol...,"$100,000",1-10,Seed,nmodes.com,artificial intelligence,Startup,Python
2,2,Jan 12,Startup,https://angel.co/tagasauris,Santa Monica,Data Mining,Tagasauris,\r\nVideo Search and Discovery Platform \r\n,"$1,000,000",1-10,Seed,tagasauris.com,artificial intelligence,Startup,Python
3,3,Apr 12,Startup,https://angel.co/zoemob,San Francisco,Location Based Services,ZoeMob,\r\nGlobal Family Assistant\r\n,"$800,000",1-10,Seed,zoemob.com,artificial intelligence,Startup,Python
4,4,Nov 12,Startup,https://angel.co/encorealert,San Francisco,Machine Learning,Encore,\r\nThe first automated data scientist for mar...,"$987,000",1-10,Seed,encorehq.com,artificial intelligence,Startup,Python
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
10146,10146,Feb 19,Startup,https://angel.co/turutangi-studio,West Jakarta,Animation,Turutangi Studio,\r\nWhen avatar:last air bender meet ghost bus...,-,1-10,Seed,turutangiproject.wixsite.com,time series,,Java
10147,10147,Jan 19,Startup,https://angel.co/infinite-light,New York City,Technology,Infinite Light,\r\nCreating the ground floor of the sustainab...,-,1-10,Series A,Not,neural network,,Java
10148,10148,Jan 19,Startup,https://angel.co/hands-on-artificial-neural-ne...,-,-,Hands-On Artificial Neural Networks,\r\n,-,-,Series A,-,neural network,,
10149,10149,Jan 19,Startup,https://angel.co/brief-analytics,Be?ikta?,Big Data,Brief Analytics,\r\nAI Powered Analytics\r\n,-,1-10,Series C,datainbrief.com,neural network,,HTML5


In [4]:
stemmer = PorterStemmer()

stop_words = stopwords.words("english")
# Add custom stop words (frequently occuring but add no value)
stop_words += ['about', 'us', 'contact', 'how','login', 'hello','email','home','blog','terms','conditions',
               'jobs','openings','careers','privacy','policy','legal','imprint','demo','support','team',
              'conditions']

In [5]:
def clean_text(text):
    
    # remove white spaces, html tags, numbers, special characters, punctuations
    RE_WSPACE = re.compile(r"\s+", re.IGNORECASE)
    RE_TAGS = re.compile(r"<[^>]+>")
    RE_ASCII = re.compile(r"[^A-Za-zÀ-ž ]", re.IGNORECASE)
    RE_SINGLECHAR = re.compile(r"\b[A-Za-zÀ-ž]\b", re.IGNORECASE)

    text = re.sub(RE_TAGS, " ", text)
    text = re.sub(RE_ASCII, " ", text)
    text = re.sub(RE_SINGLECHAR, " ", text)
    text = re.sub(RE_WSPACE, " ", text)

    word_tokens = word_tokenize(text)
    words_tokens_lower = [word.lower() for word in word_tokens]

    # perform stemming on each word
    words_filtered = [
        #stemmer.stem(word)
        word for word in words_tokens_lower if word not in stop_words
    ]

    text_clean = " ".join(words_filtered)
    return text_clean

In [6]:
def read_url_content(page):
    # read the content
    soup = BeautifulSoup(page.content, "html.parser")
    
    # remove the website cookies content
    for div in soup.find_all('div', attrs={'data-nosnippet' : 'true'}):
        div.decompose()
        
    # remove footer
    for footer in soup.find_all('footer'):
        footer.decompose()
        
    # translate the content to English  
    translator = GoogleTranslator(source='auto', target='en')
    translated_text = translator.translate(soup.text[:4999])
        
    # return the cleaned content
    return clean_text(translated_text)

In [7]:
def get_cleaned_webdata(dataframe):
    # create an empty list of page data
    page_data = []
    
    for index, row in dataframe.iterrows():
        URL = "https://" + row["website"]
        print(URL)
        try:
            # access the URL
            headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 6.1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/41.0.2228.0 Safari/537.36',}
            page = requests.get(URL, headers = headers, verify=False)
            # append the URL content to the list
            page_data.append(read_url_content(page))
        except requests.exceptions.HTTPError as errh:
            print ("Http Error:",errh)
            # for websites not accessible append empty string to the list
            page_data.append("")
        except requests.exceptions.ConnectionError as errc:
            print ("Error Connecting:",errc)
            # for websites not accessible append empty string to the list
            page_data.append("")
        except requests.exceptions.Timeout as errt:
            print ("Timeout Error:",errt)
            # for websites not accessible append empty string to the list
            page_data.append("")
        except requests.exceptions.RequestException as err:
            print ("OOps: Something Else",err)
            # for websites not accessible append empty string to the list
            page_data.append("")
    return page_data

In [8]:
def extract_topics(long_string):
    num_topics = 300
    # Create Dictionary
    id2word = corpora.Dictionary([long_string.split()])
    # Create Corpus
    texts = [long_string.split()]
    # Term Document Frequency
    corpus = [id2word.doc2bow(text) for text in texts]
    try:
        lda_model = gensim.models.LdaMulticore(corpus=corpus,
                                       id2word=id2word,
                                       num_topics=num_topics)
        x=lda_model.show_topics(num_topics=1, num_words=300,formatted=False)
        topics_words = [(tp[0], [wd[0] for wd in tp[1]]) for tp in x]

        #Below Code returns Only Words 
        for topic,words in topics_words:
            return " ".join(words)
    except Exception as e:
        print(e)
        return ""

In [80]:
test_dataset = data.sample(30)
test_dataset["WebData"] = get_cleaned_webdata(test_dataset)

https://businessevolution.be
Error Connecting: HTTPSConnectionPool(host='businessevolution.be', port=443): Max retries exceeded with url: / (Caused by NewConnectionError('<urllib3.connection.HTTPSConnection object at 0x0000027219E87910>: Failed to establish a new connection: [WinError 10060] A connection attempt failed because the connected party did not properly respond after a period of time, or established connection failed because connected host has failed to respond'))
https://collab.house
Error Connecting: HTTPSConnectionPool(host='collab.house', port=443): Max retries exceeded with url: / (Caused by NewConnectionError('<urllib3.connection.HTTPSConnection object at 0x0000027209F310A0>: Failed to establish a new connection: [WinError 10060] A connection attempt failed because the connected party did not properly respond after a period of time, or established connection failed because connected host has failed to respond'))
https://HealthHeritage.org
Error Connecting: HTTPSConnecti



https://datatoweb.com




https://streetcontxt.com




https://siftr.co
Error Connecting: HTTPSConnectionPool(host='siftr.co', port=443): Max retries exceeded with url: / (Caused by NewConnectionError('<urllib3.connection.HTTPSConnection object at 0x000002721A000A00>: Failed to establish a new connection: [WinError 10060] A connection attempt failed because the connected party did not properly respond after a period of time, or established connection failed because connected host has failed to respond'))
https://Pillow.com
Error Connecting: HTTPSConnectionPool(host='pillow.com', port=443): Max retries exceeded with url: / (Caused by NewConnectionError('<urllib3.connection.HTTPSConnection object at 0x000002721A0002B0>: Failed to establish a new connection: [Errno 11001] getaddrinfo failed'))
https://paidy.com




https://buildgauge.com
Error Connecting: HTTPSConnectionPool(host='buildgauge.com', port=443): Max retries exceeded with url: / (Caused by NewConnectionError('<urllib3.connection.HTTPSConnection object at 0x0000027219658E20>: Failed to establish a new connection: [WinError 10061] No connection could be made because the target machine actively refused it'))
https://legalsifter.com




https://moarstack.net




https://precily.com




https://lightboard.io




https://culturebase.io
Error Connecting: HTTPSConnectionPool(host='culturebase.io', port=443): Max retries exceeded with url: / (Caused by NewConnectionError('<urllib3.connection.HTTPSConnection object at 0x0000027219E2E1C0>: Failed to establish a new connection: [Errno 11001] getaddrinfo failed'))
https://asktetra.com




https://sightly.com




Error Connecting: ('Connection aborted.', RemoteDisconnected('Remote end closed connection without response'))
https://3scale.net




https://openbinacle.com,www.gleeon.com
Error Connecting: HTTPSConnectionPool(host='openbinacle.com,www.gleeon.com', port=443): Max retries exceeded with url: / (Caused by NewConnectionError('<urllib3.connection.HTTPSConnection object at 0x0000027219E53790>: Failed to establish a new connection: [Errno 11001] getaddrinfo failed'))
https://cleanwaveproducts.com




https://midas-solutions.com.mx
Error Connecting: HTTPSConnectionPool(host='midas-solutions.com.mx', port=443): Max retries exceeded with url: / (Caused by NewConnectionError('<urllib3.connection.HTTPSConnection object at 0x0000027219FA12E0>: Failed to establish a new connection: [Errno 11001] getaddrinfo failed'))
https://k5ventures.com




https://nxtwv.com
Error Connecting: HTTPSConnectionPool(host='nxtwv.com', port=443): Max retries exceeded with url: / (Caused by NewConnectionError('<urllib3.connection.HTTPSConnection object at 0x00000272190BEE20>: Failed to establish a new connection: [WinError 10060] A connection attempt failed because the connected party did not properly respond after a period of time, or established connection failed because connected host has failed to respond'))
https://infinario.com




https://tronic.fm




https://mindolia.com
Error Connecting: HTTPSConnectionPool(host='mindolia.com', port=443): Max retries exceeded with url: / (Caused by NewConnectionError('<urllib3.connection.HTTPSConnection object at 0x000002721A0BEFD0>: Failed to establish a new connection: [WinError 10060] A connection attempt failed because the connected party did not properly respond after a period of time, or established connection failed because connected host has failed to respond'))
https://mapillary.com




https://yourefound.com
Error Connecting: HTTPSConnectionPool(host='yourefound.com', port=443): Max retries exceeded with url: / (Caused by NewConnectionError('<urllib3.connection.HTTPSConnection object at 0x000002721A963190>: Failed to establish a new connection: [WinError 10060] A connection attempt failed because the connected party did not properly respond after a period of time, or established connection failed because connected host has failed to respond'))
https://theopenlabel.com
Error Connecting: HTTPSConnectionPool(host='theopenlabel.com', port=443): Max retries exceeded with url: / (Caused by NewConnectionError('<urllib3.connection.HTTPSConnection object at 0x000002721A963D90>: Failed to establish a new connection: [WinError 10060] A connection attempt failed because the connected party did not properly respond after a period of time, or established connection failed because connected host has failed to respond'))
https://pupptech.com
Error Connecting: HTTPSConnectionPool(hos

In [81]:
test_dataset["Topics"] = test_dataset["WebData"].apply(extract_topics)

cannot compute LDA over an empty collection (no terms)
cannot compute LDA over an empty collection (no terms)
cannot compute LDA over an empty collection (no terms)
cannot compute LDA over an empty collection (no terms)
cannot compute LDA over an empty collection (no terms)
cannot compute LDA over an empty collection (no terms)
cannot compute LDA over an empty collection (no terms)
cannot compute LDA over an empty collection (no terms)
cannot compute LDA over an empty collection (no terms)
cannot compute LDA over an empty collection (no terms)
cannot compute LDA over an empty collection (no terms)
cannot compute LDA over an empty collection (no terms)
cannot compute LDA over an empty collection (no terms)
cannot compute LDA over an empty collection (no terms)
cannot compute LDA over an empty collection (no terms)


In [82]:
pickled_model = pickle.load(open('finalized_model.sav', 'rb'))
pickled_vectorizer = pickle.load(open('vectorizer.sav', 'rb'))

In [83]:
test_input = pickled_vectorizer.transform(test_dataset['Topics']).toarray().tolist()
prediction = pickled_model.predict(test_input)
print(prediction)

[0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 1. 1. 0. 0. 0. 0. 0. 0. 0. 1. 0. 0.
 1. 0. 0. 0. 0. 0.]


In [84]:
test_dataset["Prediction"] = prediction

In [85]:
test_dataset

Unnamed: 0.1,Unnamed: 0,joined,kind,link,location,market,name,pitch,raised,size,stage,website,query,type_,tech,WebData,Topics,Prediction
6929,6929,Feb 17,Startup,https://angel.co/business-evolution-1,Dallas,Artificial Neural Networks,Business Evolution,\r\nSalesforce.com without the constraints and...,-,-,Seed,businessevolution.be,neural network,,Python,,,0.0
4308,4308,Oct 15,Startup,https://angel.co/collab-house,Hyderabad,Product Design,Collab House,\r\nCollaborative Hub for Individuals and Star...,-,1-10,,collab.house,artificial intelligence,Incubator,Python,,,0.0
91,91,Oct 14,Startup,https://angel.co/health-heritage,Evanston,Bioinformatics,Health Heritage,\r\nAncestry.com for personalized healthcare\r\n,"$1,000,000",1-10,Seed,HealthHeritage.org,natural language processing,Startup,Python,,,0.0
777,777,Feb 14,Startup,https://angel.co/marianaiq,San Mateo,Lead Generation,MarianaIQ,\r\nAI-Powered ABM Platform\r\n,"$2,000,000",1-10,Seed,marianaiq.com,deep learning,Startup,,site configured found site looking found domai...,account successfully still steps site signed r...,0.0
207,207,Oct 13,Startup,https://angel.co/datatoweb,Montreal,Internet of Things,DataToWeb,\r\nSaaS for Time Series\r\n,"$650,000",1-10,Seed,datatoweb.com,big data,Startup,Javascript,data web data web skip content data web data w...,able played play platforms pg people paid play...,0.0
1052,1052,Apr 14,Startup,https://angel.co/street-contxt,Toronto,Finance Technology,Street Contxt,\r\n,"$8,000,000",11-50,Series A,streetcontxt.com,machine learning,Startup,Javascript,street context skip content solutions markets ...,ability nurture officially opportunity outcome...,0.0
3642,3642,Apr 15,Startup,https://angel.co/siftr,Noida,Messaging,Siftr Labs,\r\nAI powered Media Recognition Platform \r\n,-,1-10,,siftr.co,artificial intelligence,Private Company,Python,,,0.0
5680,5680,Aug 14,Startup,https://angel.co/pillow,San Francisco,Real Estate Investors,Pillow,\r\n#1 technology driven hospitality platform ...,"$16,150,000",11-50,Acquired,Pillow.com,ai,Mobile App,Python,,,0.0
6777,6777,Nov 10,Startup,https://angel.co/paidy,Japan,Payments,Paidy,\r\nCardless online payments in Asia\r\n,"$39,100,000",11-50,Seed,paidy.com,big data,,Python,postpaid paydy next month postpaid paidy use p...,able purchase qis qoo questions questionsqabou...,0.0
8369,8369,Dec 12,Startup,https://angel.co/buildgauge,London,Construction,BuildGauge,\r\nGitHub for Construction\r\n,-,-,Seed,buildgauge.com,data engineering,,Python,,,0.0


In [18]:
pickled_spp_embeddings = pickle.load(open('spp_embeddings.sav', 'rb'))
pickled_pd_embeddings = pickle.load(open('pd_embeddings.sav', 'rb'))
pickled_psd_embeddings = pickle.load(open('psd_embeddings.sav', 'rb'))

In [30]:
test_dataset["Topics"][6915]

'access scanning scanner scanifier scan robust request range pricing previous platform perspective performs owasp oriented one nutshell next newsletter neural networks network navigation modules scans scheduled secure sent went web vulnerability vulnerabilities utilizes user unconventional trained touch top toggle minds time tests testing test technologies successfully sqli speed specializes something signup sign think wide message logics cross core complex complete combining code business bug bounty better best based automating authentication audit apps approachable approach applications application also algorithms ai deeply detecting detection early logic like lets learning learn last integration integrating integrated inclusive hunters machine hunter hacker get friendly flaws flaw features extremely exteremly every error ensuring hacking wrong'

In [31]:
from sentence_transformers import SentenceTransformer
model = SentenceTransformer('all-MiniLM-L6-v2')
interana_embeddings = model.encode(test_dataset["Topics"][6915])

In [32]:
from sklearn.metrics.pairwise import cosine_similarity
def find_cosine_sim(vector1,vector2):
    return(cosine_similarity(vector1.reshape(1, -1),vector2.reshape(1, -1))[0][0])

In [33]:
def cycle_result(list1):
    maximum = max(list1)
    return list1.index(maximum)

In [34]:
list1=[ find_cosine_sim(pickled_spp_embeddings, interana_embeddings),
            find_cosine_sim(pickled_pd_embeddings, interana_embeddings),
            find_cosine_sim(pickled_psd_embeddings, interana_embeddings)]
cycle_result(list1)

1