In [1]:
import pandas as pd
import requests
import re
import scrapy
import translators as ts
import yake
import gensim
import gensim.corpora as corpora
import pickle

Using state North Rhine-Westphalia server backend.


In [2]:
from bs4 import BeautifulSoup
from deep_translator import GoogleTranslator
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
from nltk.tokenize import word_tokenize
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer

In [3]:
data= pd.read_csv("angellist.csv")
data

Unnamed: 0.1,Unnamed: 0,joined,kind,link,location,market,name,pitch,raised,size,stage,website,query,type_,tech
0,0,Jul 10,Startup,https://angel.co/vufind-1,Berkeley,Enterprise Resource Planning,Vufind (DBA DeepVu),\r\nDeep learning as a service for maximizing ...,"$680,000",11-50,Seed,deepvu.co,artificial intelligence,Startup,Python
1,1,Dec 10,Startup,https://angel.co/nmodes,Toronto,Customer Service,nmodes,\r\nnmodes creates Artificial Intelligence sol...,"$100,000",1-10,Seed,nmodes.com,artificial intelligence,Startup,Python
2,2,Jan 12,Startup,https://angel.co/tagasauris,Santa Monica,Data Mining,Tagasauris,\r\nVideo Search and Discovery Platform \r\n,"$1,000,000",1-10,Seed,tagasauris.com,artificial intelligence,Startup,Python
3,3,Apr 12,Startup,https://angel.co/zoemob,San Francisco,Location Based Services,ZoeMob,\r\nGlobal Family Assistant\r\n,"$800,000",1-10,Seed,zoemob.com,artificial intelligence,Startup,Python
4,4,Nov 12,Startup,https://angel.co/encorealert,San Francisco,Machine Learning,Encore,\r\nThe first automated data scientist for mar...,"$987,000",1-10,Seed,encorehq.com,artificial intelligence,Startup,Python
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
10146,10146,Feb 19,Startup,https://angel.co/turutangi-studio,West Jakarta,Animation,Turutangi Studio,\r\nWhen avatar:last air bender meet ghost bus...,-,1-10,Seed,turutangiproject.wixsite.com,time series,,Java
10147,10147,Jan 19,Startup,https://angel.co/infinite-light,New York City,Technology,Infinite Light,\r\nCreating the ground floor of the sustainab...,-,1-10,Series A,Not,neural network,,Java
10148,10148,Jan 19,Startup,https://angel.co/hands-on-artificial-neural-ne...,-,-,Hands-On Artificial Neural Networks,\r\n,-,-,Series A,-,neural network,,
10149,10149,Jan 19,Startup,https://angel.co/brief-analytics,Be?ikta?,Big Data,Brief Analytics,\r\nAI Powered Analytics\r\n,-,1-10,Series C,datainbrief.com,neural network,,HTML5


In [4]:
stemmer = PorterStemmer()

stop_words = stopwords.words("english")
# Add custom stop words (frequently occuring but add no value)
stop_words += ['about', 'us', 'contact', 'how','login', 'hello','email','home','blog','terms','conditions',
               'jobs','openings','careers','privacy','policy','legal','imprint','demo','support','team',
              'conditions']

In [5]:
def clean_text(text):
    
    # remove white spaces, html tags, numbers, special characters, punctuations
    RE_WSPACE = re.compile(r"\s+", re.IGNORECASE)
    RE_TAGS = re.compile(r"<[^>]+>")
    RE_ASCII = re.compile(r"[^A-Za-zÀ-ž ]", re.IGNORECASE)
    RE_SINGLECHAR = re.compile(r"\b[A-Za-zÀ-ž]\b", re.IGNORECASE)

    text = re.sub(RE_TAGS, " ", text)
    text = re.sub(RE_ASCII, " ", text)
    text = re.sub(RE_SINGLECHAR, " ", text)
    text = re.sub(RE_WSPACE, " ", text)

    word_tokens = word_tokenize(text)
    words_tokens_lower = [word.lower() for word in word_tokens]

    # perform stemming on each word
    words_filtered = [
        #stemmer.stem(word)
        word for word in words_tokens_lower if word not in stop_words
    ]

    text_clean = " ".join(words_filtered)
    return text_clean

In [6]:
def read_url_content(page):
    # read the content
    soup = BeautifulSoup(page.content, "html.parser")
    
    # remove the website cookies content
    for div in soup.find_all('div', attrs={'data-nosnippet' : 'true'}):
        div.decompose()
        
    # remove footer
    for footer in soup.find_all('footer'):
        footer.decompose()
        
    # translate the content to English  
    translator = GoogleTranslator(source='auto', target='en')
    translated_text = translator.translate(soup.text[:4999])
        
    # return the cleaned content
    return clean_text(translated_text)

In [7]:
def get_cleaned_webdata(dataframe):
    # create an empty list of page data
    page_data = []
    
    for index, row in dataframe.iterrows():
        URL = "https://" + row["website"]
        print(URL)
        try:
            # access the URL
            headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 6.1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/41.0.2228.0 Safari/537.36',}
            page = requests.get(URL, headers = headers, verify=False)
            # append the URL content to the list
            page_data.append(read_url_content(page))
        except requests.exceptions.HTTPError as errh:
            print ("Http Error:",errh)
            # for websites not accessible append empty string to the list
            page_data.append("")
        except requests.exceptions.ConnectionError as errc:
            print ("Error Connecting:",errc)
            # for websites not accessible append empty string to the list
            page_data.append("")
        except requests.exceptions.Timeout as errt:
            print ("Timeout Error:",errt)
            # for websites not accessible append empty string to the list
            page_data.append("")
        except requests.exceptions.RequestException as err:
            print ("OOps: Something Else",err)
            # for websites not accessible append empty string to the list
            page_data.append("")
    return page_data

In [8]:
def extract_topics(long_string):
    num_topics = 300
    # Create Dictionary
    id2word = corpora.Dictionary([long_string.split()])
    # Create Corpus
    texts = [long_string.split()]
    # Term Document Frequency
    corpus = [id2word.doc2bow(text) for text in texts]
    try:
        lda_model = gensim.models.LdaMulticore(corpus=corpus,
                                       id2word=id2word,
                                       num_topics=num_topics)
        x=lda_model.show_topics(num_topics=1, num_words=300,formatted=False)
        topics_words = [(tp[0], [wd[0] for wd in tp[1]]) for tp in x]

        #Below Code returns Only Words 
        for topic,words in topics_words:
            return " ".join(words)
    except Exception as e:
        print(e)
        return ""

In [10]:
test_dataset = data.sample(30)
test_dataset["WebData"] = get_cleaned_webdata(test_dataset)

https://launch.berkeley.edu




https://andia.io
Error Connecting: HTTPSConnectionPool(host='andia.io', port=443): Max retries exceeded with url: / (Caused by NewConnectionError('<urllib3.connection.HTTPSConnection object at 0x0000016636BF1DC0>: Failed to establish a new connection: [WinError 10060] A connection attempt failed because the connected party did not properly respond after a period of time, or established connection failed because connected host has failed to respond'))
https://teem.com




https://conversature.com




https://-
Error Connecting: HTTPSConnectionPool(host='-', port=443): Max retries exceeded with url: / (Caused by NewConnectionError('<urllib3.connection.HTTPSConnection object at 0x0000016636B4ECD0>: Failed to establish a new connection: [Errno 11001] getaddrinfo failed'))
https://myclientnotes.com
Error Connecting: HTTPSConnectionPool(host='myclientnotes.com', port=443): Max retries exceeded with url: / (Caused by NewConnectionError('<urllib3.connection.HTTPSConnection object at 0x0000016636B4EDC0>: Failed to establish a new connection: [WinError 10060] A connection attempt failed because the connected party did not properly respond after a period of time, or established connection failed because connected host has failed to respond'))
https://humm.tech
Error Connecting: HTTPSConnectionPool(host='humm.tech', port=443): Max retries exceeded with url: / (Caused by NewConnectionError('<urllib3.connection.HTTPSConnection object at 0x0000016636BB6370>: Failed to establish a new connection:



https://-
Error Connecting: HTTPSConnectionPool(host='-', port=443): Max retries exceeded with url: / (Caused by NewConnectionError('<urllib3.connection.HTTPSConnection object at 0x0000016636B8ACD0>: Failed to establish a new connection: [Errno 11001] getaddrinfo failed'))
https://-
Error Connecting: HTTPSConnectionPool(host='-', port=443): Max retries exceeded with url: / (Caused by NewConnectionError('<urllib3.connection.HTTPSConnection object at 0x0000016636B8AD00>: Failed to establish a new connection: [Errno 11001] getaddrinfo failed'))
https://motivemetrics.com




https://narrativedx.com




https://tradagongroup.com
Error Connecting: HTTPSConnectionPool(host='tradagongroup.com', port=443): Max retries exceeded with url: / (Caused by NewConnectionError('<urllib3.connection.HTTPSConnection object at 0x00000166370455B0>: Failed to establish a new connection: [WinError 10060] A connection attempt failed because the connected party did not properly respond after a period of time, or established connection failed because connected host has failed to respond'))
https://coolhouselabs.com




https://alikeaudience.com




https://usb4app.com
Error Connecting: HTTPSConnectionPool(host='usb4app.com', port=443): Max retries exceeded with url: / (Caused by NewConnectionError('<urllib3.connection.HTTPSConnection object at 0x0000016636BA1B20>: Failed to establish a new connection: [Errno 11001] getaddrinfo failed'))
https://singulogic.com
Error Connecting: HTTPSConnectionPool(host='singulogic.com', port=443): Max retries exceeded with url: / (Caused by NewConnectionError('<urllib3.connection.HTTPSConnection object at 0x0000016636BA1280>: Failed to establish a new connection: [Errno 11001] getaddrinfo failed'))
https://wiser.com




https://MineNess.com
Error Connecting: HTTPSConnectionPool(host='mineness.com', port=443): Max retries exceeded with url: / (Caused by NewConnectionError('<urllib3.connection.HTTPSConnection object at 0x0000016637007700>: Failed to establish a new connection: [WinError 10060] A connection attempt failed because the connected party did not properly respond after a period of time, or established connection failed because connected host has failed to respond'))
https://about.bigterminal.com
Error Connecting: HTTPSConnectionPool(host='about.bigterminal.com', port=443): Max retries exceeded with url: / (Caused by NewConnectionError('<urllib3.connection.HTTPSConnection object at 0x0000016636C079A0>: Failed to establish a new connection: [Errno 11002] getaddrinfo failed'))
https://yalochat.com




https://sciencebehindsweat.com




https://armoredthings.com




Error Connecting: ('Connection aborted.', RemoteDisconnected('Remote end closed connection without response'))
https://bri.com.tw




https://grik.ly
Error Connecting: HTTPSConnectionPool(host='grik.ly', port=443): Max retries exceeded with url: / (Caused by NewConnectionError('<urllib3.connection.HTTPSConnection object at 0x0000016636EB72B0>: Failed to establish a new connection: [Errno 11001] getaddrinfo failed'))
https://theventure.city




https://quillapp.io
Error Connecting: HTTPSConnectionPool(host='quillapp.io', port=443): Max retries exceeded with url: / (Caused by NewConnectionError('<urllib3.connection.HTTPSConnection object at 0x0000016636F78100>: Failed to establish a new connection: [Errno 11001] getaddrinfo failed'))
https://ThEmployer.com
Error Connecting: HTTPSConnectionPool(host='themployer.com', port=443): Max retries exceeded with url: / (Caused by NewConnectionError('<urllib3.connection.HTTPSConnection object at 0x0000016636F78130>: Failed to establish a new connection: [WinError 10060] A connection attempt failed because the connected party did not properly respond after a period of time, or established connection failed because connected host has failed to respond'))
https://petbot.ca




https://munivestor.com
Error Connecting: HTTPSConnectionPool(host='munivestor.com', port=443): Max retries exceeded with url: / (Caused by NewConnectionError('<urllib3.connection.HTTPSConnection object at 0x0000016636F76C70>: Failed to establish a new connection: [WinError 10061] No connection could be made because the target machine actively refused it'))


In [11]:
test_dataset["Topics"] = test_dataset["WebData"].apply(extract_topics)

cannot compute LDA over an empty collection (no terms)
cannot compute LDA over an empty collection (no terms)
cannot compute LDA over an empty collection (no terms)
cannot compute LDA over an empty collection (no terms)
cannot compute LDA over an empty collection (no terms)
cannot compute LDA over an empty collection (no terms)
cannot compute LDA over an empty collection (no terms)
cannot compute LDA over an empty collection (no terms)
cannot compute LDA over an empty collection (no terms)
cannot compute LDA over an empty collection (no terms)
cannot compute LDA over an empty collection (no terms)
cannot compute LDA over an empty collection (no terms)
cannot compute LDA over an empty collection (no terms)
cannot compute LDA over an empty collection (no terms)
cannot compute LDA over an empty collection (no terms)
cannot compute LDA over an empty collection (no terms)
cannot compute LDA over an empty collection (no terms)


In [12]:
pickled_model = pickle.load(open('finalized_model.sav', 'rb'))
pickled_vectorizer = pickle.load(open('vectorizer.sav', 'rb'))

In [13]:
test_input = pickled_vectorizer.transform(test_dataset['Topics']).toarray().tolist()
prediction = pickled_model.predict(test_input)
print(prediction)

[1. 0. 1. 0. 0. 0. 0. 1. 0. 0. 1. 0. 0. 0. 0. 0. 0. 1. 0. 0. 0. 0. 0. 1.
 0. 1. 0. 0. 0. 0.]


In [14]:
test_dataset["Prediction"] = prediction

In [15]:
print(test_dataset)

      Unnamed: 0   joined     kind  \
4367        4367  Nov 15  Startup   
506          506  Feb 18  Startup   
4751        4751  Sep 14  Startup   
9824        9824  Jul 16  Startup   
7077        7077  Nov 17  Startup   
9074        9074  Mar 16  Startup   
3816        3816  Jun 17  Startup   
9705        9705  Aug 16  Startup   
7083        7083  Dec 17  Startup   
7923        7923  Jul 16  Startup   
4823        4823  Aug 15  Startup   
4561        4561  Jan 14  Startup   
9278        9278  Nov 16  Startup   
3068        3068  Mar 13  Startup   
538          538  May 15  Startup   
2710        2710  Sep 13  Startup   
7616        7616  Oct 17  Startup   
5180        5180  Apr 15  Startup   
8117        8117  Sep 13  Startup   
6620        6620  Jan 15  Startup   
4768        4768  May 15  Startup   
6540        6540  Jun 12  Startup   
1884        1884  May 17  Startup   
4012        4012  Nov 16  Startup   
2910        2910  Nov 14  Startup   
4330        