In [7]:
import pandas as pd
from nltk.corpus import stopwords 
import string
from nltk.stem.wordnet import WordNetLemmatizer
from nltk import word_tokenize 

In [2]:
# Path to the raw posts CSV, given relative to the notebook's working directory.
file_path = '../raw_data/fake_posts.csv'

# Read the whole CSV into the notebook-wide DataFrame `df`.
df = pd.read_csv(filepath_or_buffer=file_path)

In [3]:
# Number the rows 1..len(df) in an 'ID' column and promote that column to the
# index. Built as a chain (no `inplace=True`) so the step reads as one
# expression that produces the re-indexed frame.
df = df.assign(ID=range(1, len(df) + 1)).set_index('ID')

In [5]:
# Trim leading/trailing whitespace from both free-text columns.
for text_column in ('Title', 'Content'):
    df[text_column] = df[text_column].str.strip()

In [12]:


# Translation table that maps every character of string.punctuation to a
# space, so punctuation is removed in one pass instead of one replace per symbol.
_PUNCTUATION_TO_SPACE = str.maketrans(string.punctuation, ' ' * len(string.punctuation))

# Filled on first use: the stop-word set and the lemmatizer do not depend on
# the input text, so they are built once rather than on every call to clean().
_CLEAN_RESOURCES = {}


def _get_clean_resources():
    """Return the (stop_words, lemmatizer) pair, building it on first use."""
    if not _CLEAN_RESOURCES:
        _CLEAN_RESOURCES['stop_words'] = frozenset(stopwords.words('english'))
        _CLEAN_RESOURCES['lemmatizer'] = WordNetLemmatizer()
    return _CLEAN_RESOURCES['stop_words'], _CLEAN_RESOURCES['lemmatizer']


def clean(text):
    """Normalise one raw text into a space-separated string of lemmas.

    Steps, in order: replace punctuation with spaces, lower-case, tokenize
    with ``word_tokenize``, keep purely alphabetic tokens, drop NLTK's English
    stop words, lemmatize each remaining token with ``WordNetLemmatizer``
    (default settings), and join the result with single spaces.

    Parameters
    ----------
    text : str
        The raw text. Any non-string value (for example the NaN that pandas
        uses for an empty CSV field, or None) is treated as empty text.

    Returns
    -------
    str
        The cleaned text; ``''`` when the input is not a string or contains
        nothing but punctuation/whitespace.
    """
    # Missing cells arrive as float NaN / None and have no string methods;
    # return an empty document instead of raising AttributeError.
    if not isinstance(text, str):
        return ''

    lowercased = text.translate(_PUNCTUATION_TO_SPACE).lower()
    if not lowercased.strip():
        # Nothing left to tokenize.
        return ''

    stop_words, lemmatizer = _get_clean_resources()
    lemmatized = [
        lemmatizer.lemmatize(word)
        for word in word_tokenize(lowercased)
        # isalpha() discards numbers and any other non-alphabetic token.
        if word.isalpha() and word not in stop_words
    ]
    return ' '.join(lemmatized)



In [13]:
# Run the text-cleaning function over every post body and keep the result
# next to the original in a new 'clean_text' column.
df['clean_text'] = df['Content'].map(clean)

# Show the first rows to compare the raw and cleaned text.
df.head()

Unnamed: 0_level_0,Title,Content,clean_text
ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
1,Let's Talk about the Importance of Foot Care,"Hey there health enthusiasts! Today, let's cha...",hey health enthusiast today let chat often neg...
2,Random Health Benefits of Practicing Tai Chi,"Hey guys, it's your favorite health and lifest...",hey guy favorite health lifestyle enthusiast t...
3,Get Your Heart Pumping with These Seasonal Fit...,"Hey there, fitness enthusiasts! Today, I'm her...",hey fitness enthusiast today talk importance s...
4,The Power of Herbal Teas for Optimal Health,"Hey health enthusiast friends! Today, I want t...",hey health enthusiast friend today want talk i...
5,The Ins and Outs of Proper Handwashing,"Yo yo yo, what up my peeps? Today I wanna talk...",yo yo yo peep today wan na talk somethin super...


In [15]:
from sklearn.decomposition import LatentDirichletAllocation
from sklearn.feature_extraction.text import CountVectorizer

# Tunable settings for the topic model.
N_TOPICS = 2
# LDA's fit is randomly initialised; a fixed seed makes the topics (and the
# printed weights) reproducible across Restart & Run All.
RANDOM_SEED = 42

# Turn the cleaned texts into a term-count matrix.
vectorizer = CountVectorizer()
data_vectorized = vectorizer.fit_transform(df['clean_text'])

lda_model = LatentDirichletAllocation(n_components=N_TOPICS, random_state=RANDOM_SEED)

# Fit the model on the counts and keep the transformed output for each text.
lda_vectors = lda_model.fit_transform(data_vectorized)

In [16]:
def print_topics(model, vectorizer, n_top_words=10):
    """Print the highest-weighted vocabulary terms of every topic in ``model``.

    For each row of ``model.components_`` this prints a ``Topic <idx>:``
    header followed by a list of ``(term, weight)`` tuples, ordered from the
    largest weight to the smallest.

    Parameters
    ----------
    model
        Fitted object exposing ``components_``, an iterable of 1-D arrays
        (one per topic) indexed by vocabulary position.
    vectorizer
        Fitted object exposing ``get_feature_names_out()``, which returns the
        vocabulary terms in the same order as the columns of ``components_``.
    n_top_words : int, optional
        Number of terms to print per topic (non-negative). Defaults to 10.

    Returns
    -------
    None
        The topics are written to standard output.
    """
    # Fetch the vocabulary once; it is the same for every topic and term.
    feature_names = vectorizer.get_feature_names_out()
    for idx, topic in enumerate(model.components_):
        print(f"Topic {idx}:")
        # argsort is ascending, so reverse it and keep the first n entries.
        top_indices = topic.argsort()[::-1][:n_top_words]
        # Cast to built-in str/float so the printed tuples look the same
        # regardless of how the installed NumPy version renders its scalars.
        print([(str(feature_names[i]), float(topic[i])) for i in top_indices])

In [17]:
# List the strongest terms of each fitted LDA topic.
print_topics(model=lda_model, vectorizer=vectorizer)

Topic 0:
[('health', 1571.9079284214429), ('let', 943.8808683084912), ('heart', 827.7921077544836), ('care', 810.1997981890638), ('level', 687.9097778169789), ('keep', 622.5145104056472), ('foot', 596.0127562914278), ('stress', 580.467724431417), ('stay', 574.0762165089029), ('help', 567.63692307989)]
Topic 1:
[('health', 2227.092071578522), ('help', 1449.363076920077), ('bone', 1102.391224668645), ('also', 1010.5159810447991), ('heart', 990.207892245484), ('improve', 907.221784510356), ('joint', 832.7284418103694), ('level', 788.0902221829889), ('way', 727.0540438457841), ('overall', 716.6390338664414)]
