## **Q3**

In [14]:
import csv
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from gensim import corpora, models

# Fetch the NLTK resources this cell relies on: the stopword lists and the
# 'punkt' tokenizer models used by word_tokenize.
# NOTE(review): newer NLTK releases may additionally need 'punkt_tab' -- confirm
# against the installed version.
for resource in ('stopwords', 'punkt'):
    nltk.download(resource)

# A set gives O(1) membership checks when filtering tokens.
stop_words = set(stopwords.words('english'))

# Load text from CSV
def load_text_from_csv(file_path):
    """Return the first column of a CSV file joined into a single string.

    The first row is treated as a header and skipped. Values from the first
    column of every remaining row are joined with a single space.

    Args:
        file_path: Path of a UTF-8 encoded CSV file.

    Returns:
        The joined first-column text; an empty string when the file is empty
        or contains only a header row.

    Raises:
        OSError: If the file cannot be opened.
    """
    # newline='' is what the csv module requires so that newlines embedded in
    # quoted fields are parsed correctly.
    with open(file_path, 'r', encoding='utf-8', newline='') as file:
        reader = csv.reader(file)
        # Skip the header row; the default keeps an empty file from raising
        # StopIteration.
        next(reader, None)
        # Blank lines come back as empty rows, which have no first column.
        text = ' '.join(row[0] for row in reader if row)
    return text

# Preprocess text
def preprocess_text(text):
    """Tokenize *text* and return its lowercase, alphabetic, non-stopword tokens.

    Tokens are produced by ``word_tokenize``. A token is kept only if it is
    purely alphabetic and its lowercase form is not in the module-level
    ``stop_words`` set. Kept tokens are returned in lowercase, in their
    original order.
    """
    words = []
    for token in word_tokenize(text):
        # Drop punctuation, numbers and mixed tokens such as "3rd".
        if not token.isalpha():
            continue
        lowered = token.lower()
        # Stopwords are compared in lowercase.
        if lowered in stop_words:
            continue
        words.append(lowered)
    return words

# Perform keyword extraction
def extract_keywords(text, num_keywords=5):
    """Return the ``num_keywords`` most frequent items of *text*.

    *text* is counted with ``nltk.FreqDist``; the main program passes the
    token list produced by ``preprocess_text``. The result is a list of the
    items only (counts are dropped), most frequent first.
    """
    ranked = nltk.FreqDist(text).most_common(num_keywords)
    # Each entry is an (item, count) pair; keep just the item.
    return [entry[0] for entry in ranked]

# Perform topic modeling
def perform_topic_modeling(text, num_topics=5, random_state=None):
    """Fit an LDA model and return a textual description of each topic.

    Args:
        text: Either one flat list of string tokens (treated as a single
            document, as before) or a list of token lists (one per
            document). LDA needs several documents to separate topics; with
            a single document the topics come out nearly identical.
        num_topics: Number of topics to fit and to report.
        random_state: Optional seed forwarded to ``LdaModel`` so that results
            are reproducible. ``None`` keeps the library default.

    Returns:
        A list of topic strings as formatted by ``LdaModel.print_topics``
        (for example ``'0.515*"text" + 0.054*"boring" + ...'``). An empty list
        is returned when the input contains no tokens.
    """
    text = list(text)

    # A flat token list is wrapped as one document; otherwise every element
    # is taken to be its own tokenized document.
    if text and isinstance(text[0], str):
        documents = [text]
    else:
        documents = [list(document) for document in text]

    # Create dictionary and corpus
    dictionary = corpora.Dictionary(documents)
    if len(dictionary) == 0:
        # LdaModel cannot be fitted on an empty vocabulary.
        return []
    corpus = [dictionary.doc2bow(document) for document in documents]

    # Perform LDA topic modeling
    lda_model = models.LdaModel(
        corpus,
        num_topics=num_topics,
        id2word=dictionary,
        random_state=random_state,
    )

    # print_topics yields (topic_id, description) pairs; keep the description.
    return [
        description
        for _, description in lda_model.print_topics(num_topics=num_topics)
    ]

# Main program: runs the pipeline load -> preprocess -> keywords -> topics
# and prints the results. Runs only when this file is the entry point.
if __name__ == '__main__':
    # Load text from CSV: the first column of every data row in 'output.csv',
    # joined into one string.
    csv_file = 'output.csv'
    text = load_text_from_csv(csv_file)

    # Preprocess text into a flat list of lowercase, alphabetic,
    # non-stopword tokens.
    preprocessed_text = preprocess_text(text)

    # Perform keyword extraction: the 5 most frequent tokens.
    keywords = extract_keywords(preprocessed_text, num_keywords=5)
    print('Keywords:', keywords)

    # Perform topic modeling on the same token list and print one line per
    # topic, numbered from 1.
    topics = perform_topic_modeling(preprocessed_text, num_topics=5)
    print('Topics:')
    for i, topic in enumerate(topics):
        print(f'Topic {i+1}: {topic}')

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


Keywords: ['text', 'boring', 'file', 'simple', 'pdf']
Topics:
Topic 1: 0.515*"text" + 0.054*"boring" + 0.041*"file" + 0.028*"pdf" + 0.028*"continued" + 0.028*"simple" + 0.028*"page" + 0.015*"use" + 0.015*"even" + 0.015*"virtual"
Topic 2: 0.041*"text" + 0.040*"file" + 0.040*"boring" + 0.040*"page" + 0.040*"continued" + 0.040*"simple" + 0.040*"pdf" + 0.040*"watching" + 0.040*"mechanics" + 0.040*"demonstration"
Topic 3: 0.040*"text" + 0.040*"file" + 0.040*"boring" + 0.040*"simple" + 0.040*"page" + 0.040*"pdf" + 0.040*"continued" + 0.040*"tutorials" + 0.040*"oh" + 0.040*"watching"
Topic 4: 0.040*"text" + 0.040*"boring" + 0.040*"file" + 0.040*"page" + 0.040*"continued" + 0.040*"pdf" + 0.040*"simple" + 0.040*"little" + 0.040*"tutorials" + 0.040*"yet"
Topic 5: 0.040*"text" + 0.040*"file" + 0.040*"boring" + 0.040*"page" + 0.040*"simple" + 0.040*"pdf" + 0.040*"continued" + 0.040*"watching" + 0.040*"tutorials" + 0.040*"yet"
