In [20]:
import os
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.cluster import AgglomerativeClustering
import nltk
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
import string
nltk.download('punkt')
nltk.download('stopwords')
nltk.download('wordnet')



def preprocess_text(text):
    tokens = word_tokenize(text)
    tokens = [word.lower() for word in tokens if word.isalpha()]
    stop_words = set(stopwords.words('english'))
    stop_words.add("embeddedurl")
    tokens = [word for word in tokens if word not in stop_words]
    lemmatizer = WordNetLemmatizer()
    tokens = [lemmatizer.lemmatize(word) for word in tokens]
    return tokens


def vectorize(df):
    vectorizer = TfidfVectorizer(stop_words='english')
    X = vectorizer.fit_transform(df['clean_text'])
    return X


def cluster(df, X, n_clusters):
    cluster = AgglomerativeClustering(n_clusters=n_clusters, affinity='cosine', linkage='complete')
    df['cluster_label'] = cluster.fit_predict(X.toarray())
    return df


def main(file_path, num_clusters):
    df = pd.read_csv(file_path)
    df = df.head(100)
    df = df[(df['clean_text'].str.contains('spirituality', case=False)) & 
            (~df['clean_text'].str.contains('religion', case=False))]
    df['tokenized_text'] = df['clean_text'].apply(preprocess_text)

    X = vectorize(df)
    df = cluster(df=df, X=X, n_clusters=num_clusters)


    os.makedirs('output', exist_ok=True)
    df.to_csv(os.path.join('output', 'ac_10_topics.csv'), index=False)



if __name__ == "__main__":
    file_path = "/Users/shtosti/Dropbox/study/UZH/FW23/SMA/topic_modelling_DEPO/data/full_year.csv"
    num_clusters = 10
    main(file_path, num_clusters)


[nltk_data] Downloading package punkt to /Users/shtosti/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/shtosti/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to /Users/shtosti/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


ValueError: empty vocabulary; perhaps the documents only contain stop words

In [13]:
df = pd.read_csv("./output/ac_10_topics.csv")
df[['clean_text', 'tokenized_text', 'cluster_label']].head()

Unnamed: 0,clean_text,tokenized_text,cluster_label
0,attayyiby barackobama youtube the phrase that ...,"['attayyiby', 'barackobama', 'youtube', 'phras...",0
1,"maarblek without religion or sports, people fi...","['maarblek', 'without', 'religion', 'sport', '...",1
2,biggestrevelationof2021 romain roland predicti...,"['romain', 'roland', 'prediction', 'saint', 'r...",0
3,crime/politics/religion pays ... embeddedurl,['pay'],0
4,"uh oh, the ""religion of peace"" is at it again....","['uh', 'oh', 'religion', 'peace', 'hey', 'pale...",8
