In [None]:
# import required packages and libraries
import pandas as pd
import time
import matplotlib.pyplot as plt
from gensim.models import CoherenceModel
from gensim.models.ldamodel import LdaModel
from gensim.corpora import Dictionary
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
import re

In [None]:
# import linkedin data csv file
# The file is decoded as ISO-8859-1 rather than pandas' default encoding.
# NOTE(review): the path is an absolute, machine-specific location; the
# notebook only runs where this exact file exists — confirm or make configurable.
df = pd.read_csv("/Users/apurva/Downloads/linkedin-instructor-cleaned.csv", encoding='ISO-8859-1')
# Bare expression as the last line of the cell: shows the loaded frame as the cell output.
df


In [None]:
# Function to calculate coherence score
def calculate_coherence(dictionary, corpus, texts, limit, start=2, step=3):
    """Train one LDA model per candidate topic count and score it with c_v coherence.

    The candidate topic counts are ``range(start, limit, step)``, so ``limit``
    itself is never tried.

    Args:
        dictionary: Passed to ``LdaModel`` as ``id2word`` and to
            ``CoherenceModel`` as ``dictionary``.
        corpus: Training corpus handed to ``LdaModel``.
        texts: Texts handed to ``CoherenceModel`` for the 'c_v' measure.
        limit: Exclusive upper bound of the candidate topic counts.
        start: First candidate topic count.
        step: Increment between consecutive candidates.

    Returns:
        A tuple ``(model_list, coherence_values)`` of two parallel lists, one
        entry per candidate in ascending candidate order. Both lists are empty
        when the candidate range is empty.
    """
    scored_models = []
    for topic_count in range(start, limit, step):
        # Train and score inside the same iteration so each model is
        # evaluated right after it is fitted.
        lda = LdaModel(corpus, num_topics=topic_count, id2word=dictionary)
        scorer = CoherenceModel(model=lda, texts=texts, dictionary=dictionary, coherence='c_v')
        scored_models.append((lda, scorer.get_coherence()))

    model_list = [lda for lda, _ in scored_models]
    coherence_values = [score for _, score in scored_models]
    return model_list, coherence_values

In [None]:
# Function to find the optimal number of topics
def find_optimal_topics(dictionary, corpus, texts, limit, start=2, step=3):
    """Return the candidate topic count whose LDA model has the highest coherence.

    All arguments are forwarded unchanged to ``calculate_coherence``, which
    evaluates the topic counts ``range(start, limit, step)``.

    Args:
        dictionary: Forwarded to ``calculate_coherence``.
        corpus: Forwarded to ``calculate_coherence``.
        texts: Forwarded to ``calculate_coherence``.
        limit: Exclusive upper bound of the candidate topic counts.
        start: First candidate topic count.
        step: Increment between consecutive candidates.

    Returns:
        The topic count with the highest coherence score; on ties, the
        smallest such count.

    Raises:
        ValueError: From ``max`` when the candidate range is empty.
    """
    # The trained models are not needed here, only their scores.
    _, scores = calculate_coherence(dictionary, corpus, texts, limit, start, step)

    # The i-th score belongs to the model with ``start + i * step`` topics;
    # ``list.index`` returns the first position holding the best score.
    top_score = max(scores)
    winner_position = scores.index(top_score)
    return start + winner_position * step

In [None]:
# Create a function to apply topic modeling with stopword removal and stemming
def apply_topic_modeling(text):
    """Fit a small LDA model to a single text and return its topics.

    Processing steps:
      1. Split the text into sentences on '.', '!' and '?', strip surrounding
         whitespace and drop repeated sentences (first occurrence kept,
         original order preserved).
      2. Turn every remaining sentence into one document: tokenise into
         words, drop English stopwords, Porter-stem, then drop a few extra
         common words.
      3. Pick the number of topics with ``find_optimal_topics`` over the
         candidates 2..9 and train a final model with that many topics.

    Each sentence is its own bag-of-words document. Wrapping every single
    word in a separate one-token document would give LDA no within-document
    co-occurrence to learn from, and would disagree with the texts used for
    the coherence score.

    Args:
        text: The raw text to model. Any non-string value (for example a
            missing cell) is skipped.

    Returns:
        ``None`` when ``text`` is not a string; an empty list when the text
        contains no usable words; otherwise the list produced by
        ``LdaModel.print_topics`` with five words per topic.
    """
    # Check if the input is a non-null string
    if not isinstance(text, str):
        return None

    # Remove repetitions of sentences. dict.fromkeys keeps the first
    # occurrence in the original order, so the result does not depend on
    # set iteration order; stripping lets " foo" and "foo" count as repeats.
    stripped = (sentence.strip() for sentence in re.split(r'[.!?]', text))
    unique_sentences = list(dict.fromkeys(s for s in stripped if s))

    # Split each sentence into words
    tokenised = [re.findall(r'\b\w+\b', sentence) for sentence in unique_sentences]
    if not any(tokenised):
        # Nothing to model (empty / punctuation-only text). Bail out before
        # LdaModel is asked to train on an empty collection, which would
        # abort a whole DataFrame.apply run.
        return []

    stop_words = set(stopwords.words('english'))
    stemmer = PorterStemmer()
    extra_common_words = {'of', 'the', 'also'}

    documents = []
    for sentence_words in tokenised:
        # Remove stopwords
        kept = [word for word in sentence_words if word.lower() not in stop_words]
        # Apply stemming
        stems = [stemmer.stem(word) for word in kept]
        # Remove specific common words
        stems = [stem for stem in stems if stem.lower() not in extra_common_words]
        if stems:
            documents.append(stems)

    if not documents:
        # Every word was a stopword or an excluded common word.
        return []

    # Create a Dictionary and a bag-of-words corpus with one entry per sentence
    dictionary = Dictionary(documents)
    corpus = [dictionary.doc2bow(document) for document in documents]

    # Find the optimal number of topics
    optimal_num_topics = find_optimal_topics(dictionary, corpus, documents, limit=10, start=2, step=1)

    # Train the LDA model with the optimal number of topics
    lda_model = LdaModel(corpus, num_topics=optimal_num_topics, id2word=dictionary, passes=10)

    # Return the topics and associated keywords
    return lda_model.print_topics(num_topics=optimal_num_topics, num_words=5)


In [None]:
# Apply the function to each entry in 'about_section_linkedin'
# and store the per-row result in a new 'lda_topics' column.
# Non-string entries (e.g. missing values) yield None, because
# apply_topic_modeling only processes str inputs.
df['lda_topics'] = df['about_section_linkedin'].apply(apply_topic_modeling)

# Display the updated DataFrame (print emits the plain-text rendering)
print(df)

In [None]:
# Write the results (the DataFrame including the 'lda_topics' column) to a CSV file
# NOTE(review): the output path is an absolute, machine-specific location — confirm or make configurable.
df.to_csv('/Users/apurva/Downloads/Topic_Modelling.csv') 