In [3]:
import pandas as pd 
from sklearn.feature_extraction.text import TfidfVectorizer
import numpy as np
import lightgbm as lgb
from sklearn.model_selection import GridSearchCV, train_test_split
from sklearn.metrics import accuracy_score
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
import re
from nltk.stem import WordNetLemmatizer
from sklearn.decomposition import LatentDirichletAllocation
from sklearn.model_selection import GridSearchCV
import time

In [4]:
# Load the 'Request 2' sheet of startups.xlsx (path is relative to the notebook's working directory).
startups_df = pd.read_excel('startups.xlsx', sheet_name='Request 2')

In [5]:
# Preview the first five rows to check the columns that were loaded.
startups_df.head(5)

Unnamed: 0,ID,Company Name,Date,Description,Industry,Industry2
0,1,Enclarity Inc,2005-01-01,"Enclarity, Inc. is a United States-based healt...",Information Technology,Computer Software and Services
1,2,Ocean Entertainment Inc,2014-01-16,Ocean Entertainment Inc. is introducing the fi...,Non-High Technology,Consumer Related
2,3,Ocean Entertainment Inc,2014-01-16,Ocean Entertainment Inc. is introducing the fi...,Non-High Technology,Consumer Related
3,4,Hengyang Jinzeli Special Allop Co Ltd,1999-12-01,"Hengyang Jinzeli Special Alloy Co., Ltd. is a ...",Non-High Technology,Industrial/Energy
4,5,Verge Solutions LLC,2001-01-01,Verge Solutions LLC is a United States-based c...,Information Technology,Computer Software and Services


In [6]:
stop_words = set(stopwords.words('english'))
# Shared lemmatizer instance: constructing one inside clean_text would repeat
# that work for every row of the dataframe.
lemmatizer = WordNetLemmatizer()


def clean_text(text):
    """Normalize one description string for vectorization.

    Steps: lower-case, delete every character that is not a letter, digit or
    space, collapse runs of spaces, tokenize, drop English stop words and
    tokens of three characters or fewer, then lemmatize what remains.

    Parameters
    ----------
    text : str
        Raw description. Any non-string value (for example a missing cell
        read as NaN) is treated as an empty description.

    Returns
    -------
    str
        The surviving lemmatized tokens joined by single spaces.
    """
    if not isinstance(text, str):
        return ""

    text = text.lower()
    # NOTE(review): characters are deleted, not replaced by a space, so
    # hyphenated words are fused ("states-based" -> "statesbased") and words
    # separated only by a tab/newline are glued together. Changing this would
    # alter the vocabulary and therefore every topic result derived from it.
    text = re.sub(r'[^a-zA-Z0-9 ]', '', text)
    text = re.sub(r'\s+', ' ', text)

    word_tokens = word_tokenize(text)
    # Stop-word removal, the length filter and lemmatization happen in one pass.
    tokens = [
        lemmatizer.lemmatize(w)
        for w in word_tokens
        if w not in stop_words and len(w) > 3
    ]
    return " ".join(tokens)


# Overwrites the raw descriptions with their cleaned form.
startups_df['Description'] = startups_df['Description'].apply(clean_text)
startups_df['Description'].head(10)

0    enclarity united statesbased healthcare inform...
1    ocean entertainment introducing first high tec...
2    ocean entertainment introducing first high tec...
3    hengyang jinzeli special alloy chinabased comp...
4    verge solution united statesbased company deve...
5    cadre technology united statesbased company pr...
6    covario united statesbased independent search ...
7    latis network also known stillsecure united st...
8    cloudblue technology united statesbased compan...
9    provides online commercial property informatio...
Name: Description, dtype: object

In [7]:
# Turn the cleaned descriptions into a TF-IDF matrix limited to 1000 features
# (one column per retained term); the stop-word list is passed to the vectorizer as well.
vect = TfidfVectorizer(
    max_features=1000,
    stop_words=list(stop_words),
)
vect_text = vect.fit_transform(startups_df['Description'])

# Show the matrix dimensions followed by its stored entries.
print(vect_text.shape, vect_text, sep="\n")

(60089, 1000)
  (0, 578)	0.07666078205302546
  (0, 649)	0.06628732984908636
  (0, 633)	0.1114309150778018
  (0, 686)	0.2131205891354204
  (0, 265)	0.05261161672562631
  (0, 319)	0.12298781629724369
  (0, 77)	0.06147150428162164
  (0, 8)	0.04918164222581738
  (0, 764)	0.06831761399118941
  (0, 480)	0.07248757529051318
  (0, 41)	0.05242260042267137
  (0, 990)	0.057703511031325136
  (0, 43)	0.07014694730730867
  (0, 473)	0.05955805007420204
  (0, 413)	0.06929308266431074
  (0, 823)	0.12392758583169272
  (0, 647)	0.06533178252385463
  (0, 623)	0.12313940434957549
  (0, 276)	0.044500523091125635
  (0, 804)	0.05998687612115804
  (0, 535)	0.05866243324940094
  (0, 926)	0.053956026199345926
  (0, 706)	0.06983356193209303
  (0, 118)	0.07120671049041384
  (0, 147)	0.06873488750129904
  :	:
  (60086, 1)	0.2191043591685591
  (60086, 847)	0.1316392061804365
  (60086, 574)	0.1918629742700391
  (60087, 618)	0.40502166693689146
  (60087, 395)	0.40702005712155154
  (60087, 393)	0.41385947278176455
  (6

### LDA topic modeling for 3 topics

In [8]:
# Fit a 3-topic LDA model on the TF-IDF matrix and report how long the fit took.
start_time = time.time()
lda_model = LatentDirichletAllocation(
    n_components=3,
    random_state=42,
    n_jobs=-1,
)
# One row per document, one column per topic.
lda_top = lda_model.fit_transform(vect_text)
elapsed = time.time() - start_time
print("--- %s seconds ---" % elapsed)

--- 15.854220867156982 seconds ---


In [9]:
# Shape is (no_of_doc, no_of_topics); the matrix itself is printed underneath.
print(lda_top.shape, lda_top, sep="\n")

(60089, 3)
[[0.06843945 0.05650093 0.87505962]
 [0.34286984 0.18709981 0.47003034]
 [0.34433222 0.19096773 0.46470005]
 ...
 [0.69775733 0.16254875 0.13969392]
 [0.08810856 0.8263619  0.08552955]
 [0.81323908 0.0915391  0.09522182]]


In [10]:
# Topic mixture of one document, as percentages.
# The heading is built from the same index that selects the row, so the label
# can never disagree with the data being shown.
doc_index = 9
print("Document %d: " % doc_index)
for i, topic in enumerate(lda_top[doc_index]):
    print("Topic ", i, ": ", topic * 100, "%")

Document 0: 
Topic  0 :  8.7682743414047 %
Topic  1 :  67.72619846360291 %
Topic  2 :  23.505527194992393 %


In [11]:
# Label every company with its dominant topic, i.e. the column of lda_top
# holding the largest weight in that company's row.
topics = np.argmax(lda_top, axis=1)
print(topics)
# Assign the whole column positionally in one step: a per-row `.loc[i, ...]`
# write is slow for a frame of this size and is only correct while the frame
# keeps its default 0..n-1 index.
startups_df['Predicted_Topic'] = ["Topic" + str(topic) for topic in topics]
    

[2 2 2 ... 0 1 0]


In [12]:
# Number of companies assigned to each predicted topic label.
startups_df['Predicted_Topic'].value_counts()

Topic2    24808
Topic1    17909
Topic0    17372
Name: Predicted_Topic, dtype: int64

In [13]:
# Find the topic most similar to the industry "Information Technology":
# count companies per (Industry, Predicted_Topic) pair, then express the
# Information Technology counts as a share of that industry's total.
grouped_counts = (
    startups_df
    .groupby(['Industry', 'Predicted_Topic'])
    .size()
    .reset_index(name='Counts')
)
is_it = grouped_counts['Industry'] == 'Information Technology'
total_IT_counts = grouped_counts.loc[is_it, 'Counts'].sum()
# Vectorized division; rows of other industries are masked to NaN.
grouped_counts['Proportion'] = (grouped_counts['Counts'] / total_IT_counts).where(is_it)
print(grouped_counts[is_it])
# In the recorded run shown below, Topic2 holds the largest share (about 63%)
# of the Information Technology companies, so it is the most similar topic.

                 Industry Predicted_Topic  Counts  Proportion
0  Information Technology          Topic0    4081    0.113557
1  Information Technology          Topic1    9278    0.258167
2  Information Technology          Topic2   22579    0.628276


In [14]:
# Full industry-by-topic count table; Proportion is only filled for Information Technology rows.
grouped_counts

Unnamed: 0,Industry,Predicted_Topic,Counts,Proportion
0,Information Technology,Topic0,4081,0.113557
1,Information Technology,Topic1,9278,0.258167
2,Information Technology,Topic2,22579,0.628276
3,Medical/Health/Life Science,Topic0,7471,
4,Medical/Health/Life Science,Topic1,546,
5,Medical/Health/Life Science,Topic2,563,
6,Non-High Technology,Topic0,5820,
7,Non-High Technology,Topic1,8085,
8,Non-High Technology,Topic2,1666,


In [15]:
# Precision and recall for "Information Technology", treating the topic that
# contains most Information Technology companies as the positive prediction.

is_it = grouped_counts['Industry'] == 'Information Technology'
# Derive the matching topic from the counts instead of hard-coding its label,
# so the metrics stay correct if a re-fit numbers the topics differently.
best_topic = grouped_counts.loc[
    grouped_counts.loc[is_it, 'Counts'].idxmax(), 'Predicted_Topic'
]
in_topic = grouped_counts['Predicted_Topic'] == best_topic
counts = grouped_counts['Counts']

tp = counts[is_it & in_topic].sum()    # IT companies placed in the matching topic
fp = counts[~is_it & in_topic].sum()   # other industries placed in the matching topic
fn = counts[is_it & ~in_topic].sum()   # IT companies placed in another topic
tn = counts[~is_it & ~in_topic].sum()  # other industries placed in another topic

print("Best-matching topic = ", best_topic)
print("True Positives = ", tp)
print("False Positives = ", fp)
print("False Negatives = ", fn)
print("True Negatives = ", tn)
# Sanity check: the four cells must add up to the number of companies.
print("TP+TN+FP+FN = ", tp + tn + fn + fp)

# precision: share of the matching topic that really is Information Technology
precision = tp / (tp + fp)
# recall: share of Information Technology companies captured by the matching topic
recall = tp / (tp + fn)

print("\n3-topic Modelling with LDA\n")
print("Precision = ", precision)
print("Recall = ", recall)

True Positives =  22579
False Positives =  2229
False Negatives =  13359
True Negatives =  21922
TP+TN+FP+TN =  60089

3-topic Modelling with LDA

Precision =  0.910149951628507
Recall =  0.6282764761533752


In [16]:
# Ten highest-weighted vocabulary terms of every topic.
vocab = vect.get_feature_names_out()

for topic_idx, weights in enumerate(lda_model.components_):
    # Rank (term, weight) pairs by weight, heaviest first.
    ranked = sorted(zip(vocab, weights), key=lambda pair: pair[1], reverse=True)
    top_terms = [pair[0] for pair in ranked[:10]]  # change the slice bound to show a different top-X
    print(f"Topic {topic_idx}: ")
    # Each term is followed by a space, then a blank line separates the topics.
    print("".join(term + " " for term in top_terms) + "\n")

Topic 0: 
company product technology system medical development manufacture develops research treatment 

Topic 1: 
company service operates engaged business online provides product offer chinabased 

Topic 2: 
company software solution platform data provides service user application united 



### LDA topic modelling for 10 topics

In [17]:
# Fit a 10-topic LDA model on the TF-IDF matrix and report how long the fit took.
# This rebinds lda_model and lda_top, replacing the objects they referred to before.
start_time = time.time()
lda_model = LatentDirichletAllocation(
    n_components=10,
    random_state=42,
    n_jobs=-1,
)
# One row per document, one column per topic.
lda_top = lda_model.fit_transform(vect_text)
elapsed = time.time() - start_time
print("--- %s seconds ---" % elapsed)

--- 10.886080026626587 seconds ---


In [18]:
# Shape is (no_of_doc, no_of_topics); the matrix itself is printed underneath.
print(lda_top.shape, lda_top, sep="\n")

(60089, 10)
[[0.01580244 0.01580247 0.01580496 ... 0.85777522 0.01580385 0.01580245]
 [0.01451276 0.0145164  0.58466519 ... 0.01451321 0.01451413 0.01451187]
 [0.01464448 0.01464818 0.58164035 ... 0.01464489 0.01464584 0.01464355]
 ...
 [0.02543878 0.02540887 0.0254035  ... 0.02540637 0.02542085 0.02540337]
 [0.35717625 0.02378617 0.02378466 ... 0.0237856  0.0237843  0.45253947]
 [0.17522868 0.02552638 0.02552787 ... 0.02553444 0.02553169 0.02552516]]


In [19]:
# Topic mixture of one document, as percentages.
# The heading is built from the same index that selects the row, so the label
# can never disagree with the data being shown.
doc_index = 9
print("Document %d: " % doc_index)
for i, topic in enumerate(lda_top[doc_index]):
    print("Topic ", i, ": ", topic * 100, "%")

Document 0: 
Topic  0 :  1.459473646250183 %
Topic  1 :  1.4595883737527218 %
Topic  2 :  1.4597166810091065 %
Topic  3 :  1.4594148177155974 %
Topic  4 :  1.4591601356368409 %
Topic  5 :  47.21702547880194 %
Topic  6 :  1.459297713860205 %
Topic  7 :  1.4596330857614523 %
Topic  8 :  1.459461565667403 %
Topic  9 :  41.10722850154454 %


In [20]:
# Label every company with its dominant topic, i.e. the column of lda_top
# holding the largest weight in that company's row.
topics2 = np.argmax(lda_top, axis=1)
print(topics2)
# Assign the whole column positionally in one step: a per-row `.loc[i, ...]`
# write is slow for a frame of this size and is only correct while the frame
# keeps its default 0..n-1 index.
startups_df['Predicted_Topic_2'] = ["Topic" + str(topic) for topic in topics2]

[7 2 2 ... 3 9 4]


In [21]:
# Number of companies assigned to each predicted topic label.
startups_df['Predicted_Topic_2'].value_counts()

Topic7    11304
Topic2     8330
Topic4     7238
Topic3     6905
Topic5     4958
Topic8     4863
Topic0     4452
Topic9     4313
Topic6     4022
Topic1     3704
Name: Predicted_Topic_2, dtype: int64

In [22]:
# Find the topic most similar to the industry2 "Computer Software and Services":
# count companies per (Industry2, Predicted_Topic_2) pair, then express the
# Computer Software and Services counts as a share of that industry's total.
grouped_counts_t2 = (
    startups_df
    .groupby(['Industry2', 'Predicted_Topic_2'])
    .size()
    .reset_index(name='Counts')
)
is_css = grouped_counts_t2['Industry2'] == 'Computer Software and Services'
total_CSS_counts = grouped_counts_t2.loc[is_css, 'Counts'].sum()
# Vectorized division; rows of other industries are masked to NaN.
grouped_counts_t2['Proportion'] = (grouped_counts_t2['Counts'] / total_CSS_counts).where(is_css)
print(grouped_counts_t2[is_css])
# In the recorded run shown below, Topic7 holds the largest share (about 47%)
# of the Computer Software and Services companies, so it is the most similar topic.

                         Industry2 Predicted_Topic_2  Counts  Proportion
30  Computer Software and Services            Topic0    1091    0.070647
31  Computer Software and Services            Topic1     784    0.050767
32  Computer Software and Services            Topic2    3428    0.221978
33  Computer Software and Services            Topic3     277    0.017937
34  Computer Software and Services            Topic4     350    0.022664
35  Computer Software and Services            Topic5     363    0.023506
36  Computer Software and Services            Topic6     159    0.010296
37  Computer Software and Services            Topic7    7286    0.471800
38  Computer Software and Services            Topic8    1005    0.065078
39  Computer Software and Services            Topic9     700    0.045328


In [23]:
# Precision and recall for "Computer Software and Services", treating the topic
# that contains most of those companies as the positive prediction.

is_css = grouped_counts_t2['Industry2'] == 'Computer Software and Services'
# Derive the matching topic from the counts instead of hard-coding its label,
# so the metrics stay correct if a re-fit numbers the topics differently.
best_topic2 = grouped_counts_t2.loc[
    grouped_counts_t2.loc[is_css, 'Counts'].idxmax(), 'Predicted_Topic_2'
]
in_topic2 = grouped_counts_t2['Predicted_Topic_2'] == best_topic2
counts2 = grouped_counts_t2['Counts']

tp2 = counts2[is_css & in_topic2].sum()    # CSS companies placed in the matching topic
fp2 = counts2[~is_css & in_topic2].sum()   # other industries placed in the matching topic
fn2 = counts2[is_css & ~in_topic2].sum()   # CSS companies placed in another topic
tn2 = counts2[~is_css & ~in_topic2].sum()  # other industries placed in another topic

print("Best-matching topic = ", best_topic2)
print("True Positives = ", tp2)
print("False Positives = ", fp2)
print("False Negatives = ", fn2)
print("True Negatives = ", tn2)
# Sanity check: the four cells must add up to the number of companies.
print("TP+TN+FP+FN = ", tp2 + tn2 + fn2 + fp2)

# precision: share of the matching topic that really is Computer Software and Services
precision2 = tp2 / (tp2 + fp2)
# recall: share of Computer Software and Services companies captured by the matching topic
recall2 = tp2 / (tp2 + fn2)

print("\n10-topic modelling with LDA\n")
print("Precision = ", precision2)
print("Recall = ", recall2)

True Positives =  7286
False Positives =  4018
False Negatives =  8157
True Negatives =  40628
TP+TN+FP+TN =  60089

10-topic modelling with LDA

Precision =  0.644550601556971
Recall =  0.4717995208184938


In [24]:
# Ten highest-weighted vocabulary terms of every topic.
vocab = vect.get_feature_names_out()

for topic_idx, weights in enumerate(lda_model.components_):
    # Rank (term, weight) pairs by weight, heaviest first.
    ranked = sorted(zip(vocab, weights), key=lambda pair: pair[1], reverse=True)
    top_terms = [pair[0] for pair in ranked[:10]]  # change the slice bound to show a different top-X
    print(f"Topic {topic_idx}: ")
    # Each term is followed by a space, then a blank line separates the topics.
    print("".join(term + " " for term in top_terms) + "\n")

Topic 0: 
chinabased engaged mainly technology company development provision service beijing product 

Topic 1: 
financial company service loan payment education provides credit bank student 

Topic 2: 
company user mobile platform game content application video provides social 

Topic 3: 
company product system manufacture technology material energy equipment power water 

Topic 4: 
company disease drug medical treatment patient therapeutic therapy cancer product 

Topic 5: 
company service energy project investment operates property management estate provides 

Topic 6: 
company product brand offer operates online accessory care home others 

Topic 7: 
company data software solution management service platform provides united statesbased 

Topic 8: 
network wireless service solution company communication internet mobile develops system 

Topic 9: 
company food online service offer restaurant provides operates product platform 



### GridSearch CV with LDA

In [25]:
# Grid-search LDA hyperparameters on the TF-IDF matrix and time the whole search.
start_time = time.time()

# Candidate values tried by the search (every combination is evaluated).
search_params = {'n_components': [15, 30], 'learning_decay': [.5, .9]}

lda = LatentDirichletAllocation(
    n_components=3,            # number of topics; the search also supplies 'n_components' values
    max_iter=10,               # maximum learning iterations
    learning_method='online',
    random_state=100,          # fixed seed
    batch_size=128,            # documents per learning step
    evaluate_every=-1,         # -1: do not compute perplexity during fitting
    n_jobs=-1,                 # use all available CPUs
)

model = GridSearchCV(lda, param_grid=search_params)
model.fit(vect_text)

elapsed = time.time() - start_time
print("--- %s seconds ---" % elapsed)

--- 523.822723865509 seconds ---


In [26]:
# Best Model
# Report the winning candidate of the grid search stored in `model`.
best_lda_model = model.best_estimator_
print("Best Model's Params: ", model.best_params_)
# best_score_ is the search's score for the winning parameter combination.
print("Best Log Likelihood Score: ", model.best_score_)
# Perplexity is evaluated on vect_text, the same matrix the search was fitted on.
print("Model Perplexity: ", best_lda_model.perplexity(vect_text))

Best Model's Params:  {'learning_decay': 0.9, 'n_components': 15}
Best Log Likelihood Score:  -363083.0114933131
Model Perplexity:  1350.58003762671


In [27]:
# Second grid search over the same parameter grid and the same data.
# NOTE(review): the only difference from the previous search is n_components=10
# on the base estimator, but search_params also lists 'n_components', so that
# value is presumably replaced for every candidate. The recorded best params,
# score and perplexity are identical to the previous search, which suggests this
# cell repeats it - confirm whether a different grid was intended.
start_time = time.time()
search_params = {'n_components': [15, 30], 'learning_decay': [.5, .9]}

lda = LatentDirichletAllocation(n_components=10,               # Number of topics
                                max_iter=10,               # Max learning iterations
                                learning_method='online',   
                                random_state=100,          # Random state
                                batch_size=128,            # n docs in each learning iter
                                evaluate_every = -1,       # compute perplexity every n iters, default: Don't
                                n_jobs = -1,               # Use all available CPUs
                                )

model = GridSearchCV(lda, param_grid=search_params)
model.fit(vect_text)
# Elapsed wall-clock time for the whole search.
print("--- %s seconds ---" % (time.time() - start_time))

--- 757.1203529834747 seconds ---


In [28]:
# Best Model
# Report the winning candidate of the most recent grid search stored in `model`.
best_lda_model = model.best_estimator_
print("Best Model's Params: ", model.best_params_)
# best_score_ is the search's score for the winning parameter combination.
print("Best Log Likelihood Score: ", model.best_score_)
# Perplexity is evaluated on vect_text, the same matrix the search was fitted on.
print("Model Perplexity: ", best_lda_model.perplexity(vect_text))

Best Model's Params:  {'learning_decay': 0.9, 'n_components': 15}
Best Log Likelihood Score:  -363083.0114933131
Model Perplexity:  1350.58003762671
