In [59]:
import pandas as pd
import requests
import re
import scrapy
import translators as ts
import yake
import gensim
import gensim.corpora as corpora
import pickle

In [60]:
from bs4 import BeautifulSoup
from deep_translator import GoogleTranslator
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
from nltk.tokenize import word_tokenize
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer

In [6]:
data= pd.read_csv("angellist.csv")
data

Unnamed: 0.1,Unnamed: 0,joined,kind,link,location,market,name,pitch,raised,size,stage,website,query,type_,tech
0,0,Jul 10,Startup,https://angel.co/vufind-1,Berkeley,Enterprise Resource Planning,Vufind (DBA DeepVu),\r\nDeep learning as a service for maximizing ...,"$680,000",11-50,Seed,deepvu.co,artificial intelligence,Startup,Python
1,1,Dec 10,Startup,https://angel.co/nmodes,Toronto,Customer Service,nmodes,\r\nnmodes creates Artificial Intelligence sol...,"$100,000",1-10,Seed,nmodes.com,artificial intelligence,Startup,Python
2,2,Jan 12,Startup,https://angel.co/tagasauris,Santa Monica,Data Mining,Tagasauris,\r\nVideo Search and Discovery Platform \r\n,"$1,000,000",1-10,Seed,tagasauris.com,artificial intelligence,Startup,Python
3,3,Apr 12,Startup,https://angel.co/zoemob,San Francisco,Location Based Services,ZoeMob,\r\nGlobal Family Assistant\r\n,"$800,000",1-10,Seed,zoemob.com,artificial intelligence,Startup,Python
4,4,Nov 12,Startup,https://angel.co/encorealert,San Francisco,Machine Learning,Encore,\r\nThe first automated data scientist for mar...,"$987,000",1-10,Seed,encorehq.com,artificial intelligence,Startup,Python
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
10146,10146,Feb 19,Startup,https://angel.co/turutangi-studio,West Jakarta,Animation,Turutangi Studio,\r\nWhen avatar:last air bender meet ghost bus...,-,1-10,Seed,turutangiproject.wixsite.com,time series,,Java
10147,10147,Jan 19,Startup,https://angel.co/infinite-light,New York City,Technology,Infinite Light,\r\nCreating the ground floor of the sustainab...,-,1-10,Series A,Not,neural network,,Java
10148,10148,Jan 19,Startup,https://angel.co/hands-on-artificial-neural-ne...,-,-,Hands-On Artificial Neural Networks,\r\n,-,-,Series A,-,neural network,,
10149,10149,Jan 19,Startup,https://angel.co/brief-analytics,Be?ikta?,Big Data,Brief Analytics,\r\nAI Powered Analytics\r\n,-,1-10,Series C,datainbrief.com,neural network,,HTML5


In [45]:
stemmer = PorterStemmer()

stop_words = stopwords.words("english")
# Add custom stop words (frequently occuring but add no value)
stop_words += ['about', 'us', 'contact', 'how','login', 'hello','email','home','blog','terms','conditions',
               'jobs','openings','careers','privacy','policy','legal','imprint','demo','support','team',
              'conditions']

In [46]:
def clean_text(text):
    
    # remove white spaces, html tags, numbers, special characters, punctuations
    RE_WSPACE = re.compile(r"\s+", re.IGNORECASE)
    RE_TAGS = re.compile(r"<[^>]+>")
    RE_ASCII = re.compile(r"[^A-Za-zÀ-ž ]", re.IGNORECASE)
    RE_SINGLECHAR = re.compile(r"\b[A-Za-zÀ-ž]\b", re.IGNORECASE)

    text = re.sub(RE_TAGS, " ", text)
    text = re.sub(RE_ASCII, " ", text)
    text = re.sub(RE_SINGLECHAR, " ", text)
    text = re.sub(RE_WSPACE, " ", text)

    word_tokens = word_tokenize(text)
    words_tokens_lower = [word.lower() for word in word_tokens]

    # perform stemming on each word
    words_filtered = [
        #stemmer.stem(word)
        word for word in words_tokens_lower if word not in stop_words
    ]

    text_clean = " ".join(words_filtered)
    return text_clean

In [47]:
def read_url_content(page):
    # read the content
    soup = BeautifulSoup(page.content, "html.parser")
    
    # remove the website cookies content
    for div in soup.find_all('div', attrs={'data-nosnippet' : 'true'}):
        div.decompose()
        
    # remove footer
    for footer in soup.find_all('footer'):
        footer.decompose()
        
    # translate the content to English  
    translator = GoogleTranslator(source='auto', target='en')
    translated_text = translator.translate(soup.text[:4999])
        
    # return the cleaned content
    return clean_text(translated_text)

In [52]:
def get_cleaned_webdata(dataframe):
    # create an empty list of page data
    page_data = []
    
    for index, row in dataframe.iterrows():
        URL = "https://" + row["website"]
        print(URL)
        try:
            # access the URL
            headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 6.1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/41.0.2228.0 Safari/537.36',}
            page = requests.get(URL, headers = headers, verify=False)
            # append the URL content to the list
            page_data.append(read_url_content(page))
        except requests.exceptions.HTTPError as errh:
            print ("Http Error:",errh)
            # for websites not accessible append empty string to the list
            page_data.append("")
        except requests.exceptions.ConnectionError as errc:
            print ("Error Connecting:",errc)
            # for websites not accessible append empty string to the list
            page_data.append("")
        except requests.exceptions.Timeout as errt:
            print ("Timeout Error:",errt)
            # for websites not accessible append empty string to the list
            page_data.append("")
        except requests.exceptions.RequestException as err:
            print ("OOps: Something Else",err)
            # for websites not accessible append empty string to the list
            page_data.append("")
    return page_data

In [57]:
def extract_topics(long_string):
    num_topics = 30
    # Create Dictionary
    id2word = corpora.Dictionary([long_string.split()])
    # Create Corpus
    texts = [long_string.split()]
    # Term Document Frequency
    corpus = [id2word.doc2bow(text) for text in texts]
    try:
        lda_model = gensim.models.LdaMulticore(corpus=corpus,
                                       id2word=id2word,
                                       num_topics=num_topics)
        x=lda_model.show_topics(num_topics=1, num_words=30,formatted=False)
        topics_words = [(tp[0], [wd[0] for wd in tp[1]]) for tp in x]

        #Below Code returns Only Words 
        for topic,words in topics_words:
            return " ".join(words)
    except Exception as e:
        print(e)
        return ""

In [54]:
test_dataset = data.sample(20)
test_dataset["WebData"] = get_cleaned_webdata(test_dataset)
test_dataset["WebData"]

https://marax.ai
Error Connecting: HTTPSConnectionPool(host='marax.ai', port=443): Max retries exceeded with url: / (Caused by NewConnectionError('<urllib3.connection.HTTPSConnection object at 0x0000026647EF1850>: Failed to establish a new connection: [Errno 11001] getaddrinfo failed'))
https://bgx.ai
Error Connecting: HTTPSConnectionPool(host='bgx.ai', port=443): Max retries exceeded with url: / (Caused by NewConnectionError('<urllib3.connection.HTTPSConnection object at 0x0000026647EF1910>: Failed to establish a new connection: [Errno 11002] getaddrinfo failed'))
https://ytribe.co
Error Connecting: HTTPSConnectionPool(host='ytribe.co', port=443): Max retries exceeded with url: / (Caused by NewConnectionError('<urllib3.connection.HTTPSConnection object at 0x0000026647EF1940>: Failed to establish a new connection: [Errno 11002] getaddrinfo failed'))
https://leapcommerce.com




https://projectrec.com
Error Connecting: HTTPSConnectionPool(host='projectrec.com', port=443): Max retries exceeded with url: / (Caused by NewConnectionError('<urllib3.connection.HTTPSConnection object at 0x0000026647EE7A00>: Failed to establish a new connection: [Errno 11001] getaddrinfo failed'))
https://hoppstech.com




https://robin8.com




https://hotquant.com
Error Connecting: ('Connection aborted.', TimeoutError(10060, 'A connection attempt failed because the connected party did not properly respond after a period of time, or established connection failed because connected host has failed to respond', None, 10060, None))
https://SimplyInsight.co
Error Connecting: HTTPSConnectionPool(host='simplyinsight.co', port=443): Max retries exceeded with url: / (Caused by NewConnectionError('<urllib3.connection.HTTPSConnection object at 0x0000026647FFD700>: Failed to establish a new connection: [WinError 10061] No connection could be made because the target machine actively refused it'))
https://bmwgroup.jobs




https://mgemi.com




https://relimetrics.com




https://tomis.tech




https://stravito.com




https://tielaunchpad.com




https://greencatapult.com




https://dealstampede.com
Error Connecting: HTTPSConnectionPool(host='dealstampede.com', port=443): Max retries exceeded with url: / (Caused by NewConnectionError('<urllib3.connection.HTTPSConnection object at 0x0000026647E2FA60>: Failed to establish a new connection: [Errno 11001] getaddrinfo failed'))
https://grabr.io




https://ampl1fe.com
Error Connecting: HTTPSConnectionPool(host='ampl1fe.com', port=443): Max retries exceeded with url: / (Caused by NewConnectionError('<urllib3.connection.HTTPSConnection object at 0x0000026648114FD0>: Failed to establish a new connection: [Errno 11001] getaddrinfo failed'))
https://evenues.com




4807                                                     
7124                                                     
7620                                                     
61      internal server errorinternal server errorthe ...
6089                                                     
3921    patrons order drinks venues drive revenue hopp...
4775    robin digital pr marketing celebrities influen...
8403                                                     
2218                                                     
9411    bmw group worldwide menu worldwide explore opp...
6524    gemi handcrafted women men italian shoes avail...
1997    relimetrics code quality automation revolution...
5063    digital marketing tour operators tomis marketi...
5553    stravito user friendly insights platform skip ...
4359    tie launchpad enterprise accelerator homeabout...
4390    green catapult inc advisory consulting leaders...
2514                                                     
1269    shop o

In [58]:
test_dataset["Topics"] = test_dataset["WebData"].apply(extract_topics)

cannot compute LDA over an empty collection (no terms)
cannot compute LDA over an empty collection (no terms)
cannot compute LDA over an empty collection (no terms)
cannot compute LDA over an empty collection (no terms)
cannot compute LDA over an empty collection (no terms)
cannot compute LDA over an empty collection (no terms)
cannot compute LDA over an empty collection (no terms)
cannot compute LDA over an empty collection (no terms)


In [76]:
pickled_model = pickle.load(open('finalized_model.sav', 'rb'))
pickled_vectorizer = pickle.load(open('vectorizer.sav', 'rb'))

In [77]:
test_input = pickled_vectorizer.transform(test_dataset['Topics']).toarray().tolist()
prediction = pickled_model.predict(test_input)
print(prediction)

[0. 0. 0. 1. 0. 1. 1. 0. 0. 1. 1. 0. 1. 0. 1. 0. 0. 1. 0. 1.]


In [82]:
test_dataset["Prediction"] = prediction

In [84]:
print(test_dataset)

      Unnamed: 0   joined     kind                                link  \
4807        4807  Jul 14  Startup            https://angel.co/maraxai   
7124        7124  Jun 18  Startup              https://angel.co/bgx-1   
7620        7620  Nov 17  Startup      https://angel.co/young-tribe-1   
61            61  Apr 13  Startup             https://angel.co/leap-4   
6089        6089  Oct 14  Startup  https://angel.co/project-recuerdos   
3921        3921  Jul 17  Startup          https://angel.co/hoppstech   
4775        4775  Apr 15  Startup             https://angel.co/robin8   
8403        8403  Jan 16  Startup           https://angel.co/hotquant   
2218        2218  Dec 13  Startup      https://angel.co/simplyinsight   
9411        9411  May 17  Startup   https://angel.co/bmw-technology-5   
6524        6524  Sep 15  Startup             https://angel.co/m-gemi   
1997        1997  Nov 16  Startup        https://angel.co/relimetrics   
5063        5063  Oct 16  Startup    