## Question_3_Solution:

In [1]:
import pandas as pd
import nltk
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.decomposition import LatentDirichletAllocation

In [None]:
# Load the CSV file containing the text data.
# The file must provide a 'text' column; it is read when the documents are
# preprocessed further down.
df = pd.read_csv('data.csv')

# Preprocess the text data: make sure NLTK's stopword corpus is available
# locally, then load the English list that preprocess_text() filters against.
nltk.download('stopwords')
stopwords = nltk.corpus.stopwords.words('english')

def preprocess_text(text, stop_words=None):
    """Normalise one document for bag-of-words vectorisation.

    Keeps only alphabetic characters and whitespace, lower-cases the result
    and drops every token found in the stopword container.

    Args:
        text: Raw document. Anything that is not a ``str`` (for example the
            float ``NaN`` pandas produces for an empty CSV cell, or ``None``)
            is treated as an empty document instead of raising.
        stop_words: Optional container of words to remove. Defaults to the
            module-level ``stopwords``; pass a ``set`` for faster membership
            checks on long documents.

    Returns:
        The surviving tokens joined by single spaces, or ``''`` when nothing
        survives.
    """
    if not isinstance(text, str):
        # Missing cells are not iterable; map them to an empty document.
        return ''
    if stop_words is None:
        stop_words = stopwords

    # Remove punctuation/digits and convert to lowercase.
    # NOTE: apostrophes are stripped here, so a contraction such as "don't"
    # becomes "dont" and is only filtered if that exact form is a stopword.
    letters_only = ''.join(c for c in text if c.isalpha() or c.isspace()).lower()

    # Remove stopwords; split() also collapses runs of whitespace.
    return ' '.join(word for word in letters_only.split() if word not in stop_words)

# Clean every document; the cleaned text is what gets vectorised below.
df['processed_text'] = df['text'].apply(preprocess_text)

# Perform keyword extraction using TF-IDF
tfidf = TfidfVectorizer()
tfidf_matrix = tfidf.fit_transform(df['processed_text'])

# Vocabulary terms indexed by TF-IDF column, looked up once rather than once
# per document. get_feature_names() was removed in scikit-learn 1.2;
# get_feature_names_out() is its replacement (available since 1.0).
feature_names = tfidf.get_feature_names_out()

# Get the most important keywords for each document
top_keywords = []
for row_idx in range(tfidf_matrix.shape[0]):
    # Densify a single row at a time instead of the whole sparse matrix.
    tfidf_scores = tfidf_matrix[row_idx].toarray().flatten()
    top_indices = tfidf_scores.argsort()[-5:][::-1]  # Top 5, highest score first
    # A score of 0 means the term does not occur in this document. Without
    # this filter, documents with fewer than 5 distinct terms would be padded
    # with unrelated vocabulary words.
    keywords = [feature_names[idx] for idx in top_indices if tfidf_scores[idx] > 0]
    top_keywords.append(', '.join(keywords))

df['keywords'] = top_keywords

# Perform topic modeling using Latent Dirichlet Allocation (LDA)
# NOTE(review): the model is fitted on TF-IDF weights. LDA is usually applied
# to raw term counts (e.g. CountVectorizer output) -- confirm that using the
# TF-IDF matrix here is intentional.
num_topics = 5
lda = LatentDirichletAllocation(n_components=num_topics, random_state=42)
lda_matrix = lda.fit_transform(tfidf_matrix)

# Get the top words for each topic.
# get_feature_names() was removed in scikit-learn 1.2; get_feature_names_out()
# is its replacement (available since 1.0).
feature_names = tfidf.get_feature_names_out()
top_words = [
    # Each row of components_ holds one weight per vocabulary term; take the
    # 10 largest, highest weight first.
    ', '.join(feature_names[idx] for idx in topic.argsort()[-10:][::-1])
    for topic in lda.components_
]

# Print the topics and their associated top words (topics numbered from 1)
for topic_number, words in enumerate(top_words, start=1):
    print(f'Topic {topic_number}: {words}')
