In [27]:
import pandas as pd
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.decomposition import LatentDirichletAllocation
import re

In [28]:
# Download the NLTK resources this notebook relies on:
#   - 'punkt' / 'punkt_tab': tokenizer models used by word_tokenize
#     (newer NLTK releases look up 'punkt_tab'; older ones use 'punkt')
#   - 'stopwords': the English stopword list
#   - 'wordnet': the lexical database behind WordNetLemmatizer
# Packages that are already up to date are not downloaded again.
for resource in ('punkt', 'punkt_tab', 'stopwords', 'wordnet'):
    nltk.download(resource)

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\HP\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\HP\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\HP\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

In [29]:
# Read the results CSV (path is relative to the notebook's working directory)
csv_path = '../data/universitas_brawijaya_scholar_results2.csv'
df = pd.read_csv(csv_path)

In [30]:
# Single WordNet lemmatizer instance; preprocess_text calls its .lemmatize() on every kept token.
lemmatizer = WordNetLemmatizer()

# Cache for the English stopword set, so the NLTK corpus is read once
# instead of once per token.
_STOPWORD_CACHE = {}


def _english_stopwords():
    """Return the NLTK English stopwords as a frozenset, loading them on first use."""
    if 'english' not in _STOPWORD_CACHE:
        _STOPWORD_CACHE['english'] = frozenset(stopwords.words('english'))
    return _STOPWORD_CACHE['english']


def preprocess_text(text):
    """Normalise a title into a space-separated string of lemmatized tokens.

    Steps: lowercase, strip "[pdf]" markers, drop punctuation, tokenize,
    remove English stopwords, then lemmatize each remaining token.

    Parameters
    ----------
    text : str
        Raw title. Any non-string value (for example the NaN that pandas
        uses for a missing cell) is treated as an empty title.

    Returns
    -------
    str
        The cleaned tokens joined by single spaces; '' for non-string input.
    """
    # A missing title is not a str and would otherwise fail on .lower().
    if not isinstance(text, str):
        return ''
    # Convert to lowercase
    text = text.lower()
    # Remove "[PDF]" or "[pdf]" using regular expression (case-insensitive)
    text = re.sub(r'\[pdf\]', '', text, flags=re.IGNORECASE)
    # Remove punctuation: anything that is neither a word character nor whitespace
    text = re.sub(r'[^\w\s]', '', text)
    # Set membership is O(1); the original rebuilt and scanned the list per token.
    stop_words = _english_stopwords()
    tokens = [
        lemmatizer.lemmatize(word)
        for word in word_tokenize(text)
        if word not in stop_words
    ]
    return ' '.join(tokens)

In [31]:
# Clean every title and store the result in a new 'Processed_Title' column
df['Processed_Title'] = df['Title'].map(preprocess_text)

# Bag-of-words vectorizer settings:
#   stop_words='english' -> also drop scikit-learn's built-in English stopwords
#   max_features=1000    -> keep at most the 1000 most frequent terms
#   min_df=2             -> ignore terms that occur in fewer than 2 titles
#   max_df=0.95          -> ignore terms that occur in more than 95% of titles
vectorizer = CountVectorizer(
    stop_words='english',
    max_features=1000,
    min_df=2,
    max_df=0.95,
)

# Learn the vocabulary and build the document-term matrix in one step
dtm = vectorizer.fit_transform(df['Processed_Title'])

In [32]:
# Fit a 5-topic LDA model on the document-term matrix.
# random_state is fixed so repeated runs produce the same topics.
lda = LatentDirichletAllocation(n_components=5, random_state=42)
lda.fit(dtm)

# Fetch the vocabulary once; the original rebuilt it for every printed word.
feature_names = vectorizer.get_feature_names_out()
n_top_words = 10

# Display the top words for each topic. argsort() is ascending, so within
# each list the highest-weighted word comes last.
for index, topic in enumerate(lda.components_):
    print(f'Top {n_top_words} words for Topic #{index}:')
    print([feature_names[i] for i in topic.argsort()[-n_top_words:]])
    print('\n')

Top 10 words for Topic #0:
['yield', 'effect', 'stock', 'study', 'soil', 'tourism', 'east', 'development', 'java', 'indonesia']


Top 10 words for Topic #1:
['use', 'quality', 'growth', 'consumer', 'tourism', 'brand', 'perceived', 'intention', 'study', 'effect']


Top 10 words for Topic #2:
['strategy', 'used', 'learning', 'language', 'program', 'universitas', 'english', 'brawijaya', 'study', 'student']


Top 10 words for Topic #3:
['indonesia', 'community', 'area', 'change', 'main', 'study', 'character', 'land', 'analysis', 'movie']


Top 10 words for Topic #4:
['plant', 'malang', 'land', 'organic', 'potential', 'mining', 'case', 'study', 'using', 'soil']




In [33]:
# Topic distribution for every document: one row per title, one column per topic
topic_results = lda.transform(dtm)

# Label each title with the index of its highest-weighted topic
df['Topic'] = topic_results.argmax(axis=1)

# Preview the first five titles next to their assigned topic
print(df.head(5)[['Title', 'Topic']])

                                               Title  Topic
0  [PDF] Management of Public Information Disclos...      2
1  [PDF] Director Rector of Universitas Brawijaya...      2
2  Refusal Strategies Used By Male And Female Stu...      2
3  Achieving World Class University Through Inter...      2
4  the Translation of Demonstrative References fr...      2
