In [66]:
import pandas as pd 
from sklearn.feature_extraction.text import TfidfVectorizer
import numpy as np
import lightgbm as lgb
from sklearn.model_selection import GridSearchCV, train_test_split
from sklearn.metrics import accuracy_score
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
import re
from nltk.stem import WordNetLemmatizer
from sklearn.decomposition import LatentDirichletAllocation
from sklearn.model_selection import GridSearchCV
import time

In [67]:
startups_df = pd.read_excel('startups.xlsx', sheet_name='Request 2')

In [68]:
stop_words = set(stopwords.words('english'))
def clean_text(text):
    #converting the text into lower case
    text = text.lower() 
    #remove special characters
    text = re.sub(r'[^a-zA-Z0-9 ]', '', text)
    text = re.sub(r'\s+', ' ', text)
    #remove the stopwords 
    word_tokens = word_tokenize(text)
    filtered_text = [word for word in word_tokens if word.lower() not in stop_words]
    text = ' '.join(filtered_text)
    #Apply lemmatization
    le=WordNetLemmatizer()
    tokens=[le.lemmatize(w) for w in word_tokens if w not in stop_words and len(w)>3]
    text = " ".join(tokens)
    return text
startups_df['Description'] = startups_df['Description'].apply(clean_text)

In [69]:
# create TF-IDF vector, apply stop words, keep only 1000 features (1000 columns)
vect =TfidfVectorizer(stop_words=list(stop_words),max_features=1000)
vect_text=vect.fit_transform(startups_df['Description'])

### LDA topic modeling for 3 topics

In [70]:
#Applying the classical LDA topic-modeling for 3 topics
start_time = time.time()
lda_model=LatentDirichletAllocation(n_components=3,random_state=42,n_jobs=-1)
lda_top=lda_model.fit_transform(vect_text)
print("--- %s seconds ---" % (time.time() - start_time))

--- 15.493796825408936 seconds ---


In [71]:
# Adding the predicted topics to the startups dataframe
topics = np.argmax(lda_top, axis=1)
startups_df['Predicted_Topic'] = None
for i in range(0, len(startups_df)):
    topic_str = "Topic" + str(topics[i])
    startups_df.loc[i, 'Predicted_Topic'] = topic_str

In [72]:
startups_df['Predicted_Topic'].value_counts()

Topic2    24808
Topic1    17909
Topic0    17372
Name: Predicted_Topic, dtype: int64

In [73]:
#Finding the most similar topic to the given industry "Information Technology"
grouped_counts = startups_df.groupby(['Industry', 'Predicted_Topic']).size().reset_index(name='Counts')
total_IT_counts = grouped_counts[grouped_counts['Industry'] == 'Information Technology']['Counts'].sum()
grouped_counts['Proportion'] = grouped_counts.apply(lambda row: row['Counts'] / total_IT_counts if row['Industry'] == 'Information Technology' else None, axis=1)
print(grouped_counts[grouped_counts['Industry'] == 'Information Technology'])
#From the below table it is clear that Topic2 is the most similar to Industry Information Technology

                 Industry Predicted_Topic  Counts  Proportion
0  Information Technology          Topic0    4081    0.113557
1  Information Technology          Topic1    9278    0.258167
2  Information Technology          Topic2   22579    0.628276


In [74]:
#calculating precision and recall of Industry Information Technology

tp = grouped_counts[(grouped_counts['Industry'] == 'Information Technology') & (grouped_counts['Predicted_Topic'] == 'Topic2')]['Counts'].sum()#True Postives calculation
fp = grouped_counts[(grouped_counts['Industry'] != 'Information Technology') & (grouped_counts['Predicted_Topic'] == 'Topic2')]['Counts'].sum()
fn = grouped_counts[(grouped_counts['Industry'] == 'Information Technology') & (grouped_counts['Predicted_Topic'] != 'Topic2')]['Counts'].sum()
tn = grouped_counts[(grouped_counts['Industry'] != 'Information Technology') & (grouped_counts['Predicted_Topic'] != 'Topic2')]['Counts'].sum()

print("True Positives = ", tp)
print("False Positives = ",fp)
print("False Negatives = ",fn)
print("True Negatives = ",tn)
print("TP+TN+FP+TN = ", tp+tn+fn+fp)

#calculate precision
precision = tp/(tp+fp)
#calculate recall
recall = tp/(tp+fn)

print("\n3-topic Modelling with LDA\n")
print("Precision (for 3-topic LDA) = ", precision)
print("Recall (for 3-topic LDA) = ", recall)

True Positives =  22579
False Positives =  2229
False Negatives =  13359
True Negatives =  21922
TP+TN+FP+TN =  60089

3-topic Modelling with LDA

Precision (for 3-topic LDA) =  0.910149951628507
Recall (for 3-topic LDA) =  0.6282764761533752


In [75]:
# most important words (top 10) for each topic
vocab = vect.get_feature_names_out()

for i, comp in enumerate(lda_model.components_):
    vocab_comp = zip(vocab, comp)
    sorted_words = sorted(vocab_comp, key= lambda x:x[1], reverse=True)[:10] # this is where you change the top 10 to topX
    print("Topic "+str(i)+": ")
    for t in sorted_words:
        print(t[0],end=" ")
    print("\n")

Topic 0: 
company product technology system medical development manufacture develops research treatment 

Topic 1: 
company service operates engaged business online provides product offer chinabased 

Topic 2: 
company software solution platform data provides service user application united 



### LDA topic modelling for 10 topics

In [76]:
#Applying the classical LDA topic-modeling for 3 topics
start_time = time.time()
lda_model_2=LatentDirichletAllocation(n_components=10,random_state=42,n_jobs=-1)
lda_top_2=lda_model_2.fit_transform(vect_text)
print("--- %s seconds ---" % (time.time() - start_time))

--- 10.480187892913818 seconds ---


In [77]:
# Adding the predicted topics to the startups dataframe
topics2 = np.argmax(lda_top_2, axis=1)
startups_df['Predicted_Topic_2'] = None
for i in range(0, len(startups_df)):
    topic_str = "Topic" + str(topics2[i])
    startups_df.loc[i, 'Predicted_Topic_2'] = topic_str

In [78]:
startups_df['Predicted_Topic_2'].value_counts()

Topic7    11304
Topic2     8330
Topic4     7238
Topic3     6905
Topic5     4958
Topic8     4863
Topic0     4452
Topic9     4313
Topic6     4022
Topic1     3704
Name: Predicted_Topic_2, dtype: int64

In [79]:
#Finding the most similar topic to the given industry2 "Computer Software and Services"
grouped_counts_t2 = startups_df.groupby(['Industry2', 'Predicted_Topic_2']).size().reset_index(name='Counts')
total_CSS_counts = grouped_counts_t2[grouped_counts_t2['Industry2'] == 'Computer Software and Services']['Counts'].sum()
grouped_counts_t2['Proportion'] = grouped_counts_t2.apply(lambda row: row['Counts'] / total_CSS_counts if row['Industry2'] == 'Computer Software and Services' else None, axis=1)
print(grouped_counts_t2[grouped_counts_t2['Industry2'] == 'Computer Software and Services']) 
#From the below table it is clear that Topic7 is the most similar to industry2 "Computer Software and Services"

                         Industry2 Predicted_Topic_2  Counts  Proportion
30  Computer Software and Services            Topic0    1091    0.070647
31  Computer Software and Services            Topic1     784    0.050767
32  Computer Software and Services            Topic2    3428    0.221978
33  Computer Software and Services            Topic3     277    0.017937
34  Computer Software and Services            Topic4     350    0.022664
35  Computer Software and Services            Topic5     363    0.023506
36  Computer Software and Services            Topic6     159    0.010296
37  Computer Software and Services            Topic7    7286    0.471800
38  Computer Software and Services            Topic8    1005    0.065078
39  Computer Software and Services            Topic9     700    0.045328


In [80]:
#calculating precision and recall of industry2 "Computer Software and Services"

tp2 = grouped_counts_t2[(grouped_counts_t2['Industry2'] == 'Computer Software and Services') & (grouped_counts_t2['Predicted_Topic_2'] == 'Topic7')]['Counts'].sum()#True Postives calculation
fp2 = grouped_counts_t2[(grouped_counts_t2['Industry2'] != 'Computer Software and Services') & (grouped_counts_t2['Predicted_Topic_2'] == 'Topic7')]['Counts'].sum()
fn2 = grouped_counts_t2[(grouped_counts_t2['Industry2'] == 'Computer Software and Services') & (grouped_counts_t2['Predicted_Topic_2'] != 'Topic7')]['Counts'].sum()
tn2 = grouped_counts_t2[(grouped_counts_t2['Industry2'] != 'Computer Software and Services') & (grouped_counts_t2['Predicted_Topic_2'] != 'Topic7')]['Counts'].sum()

print("True Positives = ", tp2)
print("False Positives = ",fp2)
print("False Negatives = ",fn2)
print("True Negatives = ",tn2)
print("TP+TN+FP+TN = ", tp2+tn2+fn2+fp2)

#calculate precision
precision2 = tp2/(tp2+fp2)
#calculate recall
recall2 = tp2/(tp2+fn2)

print("\n10-topic modelling with LDA\n")
print("Precision (for 10-topic LDA) = ", precision2)
print("Recall (for 10-topic LDA) = ", recall2)

True Positives =  7286
False Positives =  4018
False Negatives =  8157
True Negatives =  40628
TP+TN+FP+TN =  60089

10-topic modelling with LDA

Precision (for 10-topic LDA) =  0.644550601556971
Recall (for 10-topic LDA) =  0.4717995208184938


In [81]:
# most important words (top 10) for each topic
vocab = vect.get_feature_names_out()

for i, comp in enumerate(lda_model.components_):
    vocab_comp = zip(vocab, comp)
    sorted_words = sorted(vocab_comp, key= lambda x:x[1], reverse=True)[:10] # this is where you change the top 10 to topX
    print("Topic "+str(i)+": ")
    for t in sorted_words:
        print(t[0],end=" ")
    print("\n")

Topic 0: 
company product technology system medical development manufacture develops research treatment 

Topic 1: 
company service operates engaged business online provides product offer chinabased 

Topic 2: 
company software solution platform data provides service user application united 



### GridSearch CV with LDA

#### For 3-topic modelling

In [26]:
start_time = time.time()
search_params = {'n_components': [5, 10, 15, 30], 'learning_decay': [.5, .9]}

lda = LatentDirichletAllocation(n_components=3,               # Number of topics
                                max_iter=25,               # Max learning iterations
                                learning_method='online',   
                                random_state=100,          # Random state
                                batch_size=128,            # n docs in each learning iter
                                evaluate_every = -1,       # compute perplexity every n iters, default: Don't
                                n_jobs = -1,               # Use all available CPUs
                                )

model = GridSearchCV(lda, param_grid=search_params)
model.fit(vect_text)
print("--- %s seconds ---" % (time.time() - start_time))



--- 4269.514829158783 seconds ---


In [28]:
# Best Model
best_lda_model = model.best_estimator_
print("Best Model's Params: ", model.best_params_)
print("Best Log Likelihood Score: ", model.best_score_)
print("Model Perplexity: ", best_lda_model.perplexity(vect_text))

Best Model's Params:  {'learning_decay': 0.9, 'n_components': 5}
Best Log Likelihood Score:  -334429.1910823734
Model Perplexity:  934.2787310307783


#### For 10-topic modelling

In [29]:
start_time = time.time()
search_params = {'n_components': [5, 10, 15, 30], 'learning_decay': [.5, .9]}

lda = LatentDirichletAllocation(n_components=10,               # Number of topics
                                max_iter=10,               # Max learning iterations
                                learning_method='online',   
                                random_state=100,          # Random state
                                batch_size=128,            # n docs in each learning iter
                                evaluate_every = -1,       # compute perplexity every n iters, default: Don't
                                n_jobs = -1,               # Use all available CPUs
                                )

model = GridSearchCV(lda, param_grid=search_params)
model.fit(vect_text)
print("--- %s seconds ---" % (time.time() - start_time))

--- 1033.7638320922852 seconds ---


In [30]:
# Best Model
best_lda_model = model.best_estimator_
print("Best Model's Params: ", model.best_params_)
print("Best Log Likelihood Score: ", model.best_score_)
print("Model Perplexity: ", best_lda_model.perplexity(vect_text))

Best Model's Params:  {'learning_decay': 0.9, 'n_components': 5}
Best Log Likelihood Score:  -334578.2370875835
Model Perplexity:  937.9188838850542


3-topic modelling has better model perplexity.