In [None]:
import pandas as pd
import numpy as np
import re
import datetime
import seaborn as sns
import matplotlib.pyplot as plt

# Load the dataset
df = pd.read_csv("processed_reddit_posts.csv")

# Inspect dataset structure. Only the final expression of a cell is rendered
# automatically, so the preview is printed explicitly; a bare `df.head()` in
# the middle of the cell would be evaluated and silently thrown away.
df.info()
print(df.head())

# Convert timestamps to datetime (unit='s' treats the values as epoch seconds)
df['created_utc'] = pd.to_datetime(df['created_utc'], unit='s')

# Remove duplicates & NaN values
# NOTE(review): dropna() without `subset` discards a row when ANY column is
# missing — confirm that is intended, since only some columns are used below.
df = df.drop_duplicates().dropna()


In [None]:

# Daily post volume: index the post ids by creation time, bucket them into
# calendar days, and count how many ids land in each bucket.
ids_by_time = df.set_index("created_utc")['id']
daily_post_counts = ids_by_time.resample('D').count()
daily_post_counts.plot(figsize=(12,5), title="Post Frequency Over Time")
plt.show()


In [None]:
# Count posts per author and chart the first ten entries of that ranking.
author_post_counts = df['author'].value_counts()
top_contributors = author_post_counts.head(10)
top_contributors.plot(kind='bar', title="Top 10 Contributors")
plt.show()

In [None]:
import nltk
from nltk.corpus import stopwords
from wordcloud import WordCloud

# Fetch the NLTK stop-word corpus, then keep the English entries in a set
# so that the per-word membership checks in clean_text are cheap.
nltk.download('stopwords')
english_stopword_list = stopwords.words('english')
stop_words = set(english_stopword_list)

def clean_text(text, stopword_set=None):
    """Normalize a post body into space-separated lowercase words.

    Steps: strip anything starting with ``http`` up to the next whitespace,
    lowercase, drop every character that is neither an ASCII letter nor
    whitespace, then remove stop words.

    Args:
        text: The raw string to clean.
        stopword_set: Optional collection of words to drop. When omitted,
            the module-level ``stop_words`` set is used.

    Returns:
        The remaining words joined by single spaces ("" if none remain).
    """
    if stopword_set is None:
        stopword_set = stop_words

    text = re.sub(r"http\S+", "", text)  # Remove links
    # Keep ALL whitespace (newlines, tabs, ...) rather than only the space
    # character: deleting a line break would glue the neighbouring words
    # together ("end\nstart" -> "endstart"). split() below collapses it.
    text = re.sub(r"[^a-z\s]", "", text.lower())  # Remove special characters
    return " ".join(word for word in text.split() if word not in stopword_set)

# Coerce every body to str first, then run each one through clean_text.
body_as_text = df['body'].astype(str)
df['cleaned_text'] = body_as_text.apply(clean_text)


In [None]:
# Concatenate every cleaned post into one corpus string and render it as a
# word cloud image on a dedicated figure with the axes hidden.
all_cleaned_text = " ".join(df['cleaned_text'])
cloud_builder = WordCloud(width=800, height=400)
wordcloud = cloud_builder.generate(all_cleaned_text)

plt.figure(figsize=(10, 5))
plt.imshow(wordcloud, interpolation='bilinear')
plt.title("Most Used Words in r/Anarchism")
plt.axis("off")
plt.show()


In [None]:
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.decomposition import LatentDirichletAllocation

# Bag-of-words counts over the cleaned posts, limited to 500 features, with
# scikit-learn's built-in English stop-word list applied.
vectorizer = CountVectorizer(max_features=500, stop_words="english")
X = vectorizer.fit_transform(df['cleaned_text'])

# Fit a 5-topic LDA model; random_state is fixed so reruns are repeatable.
lda = LatentDirichletAllocation(n_components=5, random_state=42)
lda.fit(X)

# Display Topics
# The vocabulary is the same for every topic, so fetch it once instead of
# rebuilding the array for each word that gets printed.
feature_names = vectorizer.get_feature_names_out()
for idx, topic in enumerate(lda.components_):
    # argsort() is ascending, so the last ten indices are the ten
    # highest-weighted words, listed from weakest to strongest.
    top_word_indices = topic.argsort()[-10:]
    print(f"Topic {idx+1}: ", [feature_names[i] for i in top_word_indices])


In [None]:
from textblob import TextBlob

def get_sentiment(text):
    """Return the TextBlob sentiment polarity computed for ``text``."""
    blob = TextBlob(text)
    sentiment = blob.sentiment
    return sentiment.polarity

# Score every cleaned post and store the polarity next to it.
polarity_scores = df['cleaned_text'].apply(get_sentiment)
df['sentiment'] = polarity_scores

# Visualize Sentiment Distribution
# 30-bin histogram with a kernel density estimate drawn on top.
sentiment_ax = sns.histplot(df['sentiment'], bins=30, kde=True)
sentiment_ax.set_title("Sentiment Distribution in r/Anarchism")
plt.show()


In [None]:
from transformers import pipeline

classifier = pipeline("text-classification", model="mrm8488/bert-tiny-finetuned-fake-news")

# Classify all posts in one call instead of invoking the pipeline row by row.
# truncation/max_length cut over-long posts down so they cannot exceed the
# model's input limit and raise during inference.
# NOTE(review): 512 is assumed to be this model's maximum sequence length —
# confirm against the model config.
predictions = classifier(df['cleaned_text'].tolist(), truncation=True, max_length=512)

# NOTE(review): 'score' is the confidence of whichever label was predicted,
# not necessarily the probability of the "fake" class. The label is stored
# alongside it so the score can be interpreted; verify the model's label names.
df['fake_news_label'] = [prediction['label'] for prediction in predictions]
df['fake_news_prob'] = [prediction['score'] for prediction in predictions]
