In [1]:
import nltk
import pandas as pd
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
from nltk.stem import WordNetLemmatizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
import re


nltk.download('stopwords')
nltk.download('punkt')

df = pd.read_csv('./mining/reddit_clean.csv')
df['text'] = df['title'].fillna('') + ' ' + df['selftext'].fillna('')

df = df[df['created_utc'] >= 1677654056]

stop_words = set(stopwords.words('english'))
stemmer = PorterStemmer()
lemmatizer = WordNetLemmatizer()

# Removing stopwords, special characters and applying stemming & lemmatization
def preprocess_text(text):
    text = re.sub(r'[^\w\s]', '', text)
    words = text.split()
    words = [word.lower() for word in words if word.lower() not in stop_words]
    words = [stemmer.stem(word) for word in words]
    words = [lemmatizer.lemmatize(word) for word in words]
    return ' '.join(words)

new_df = df.copy()
new_df = new_df[new_df['subreddit'] != 'Vancouver4Friends']

new_df['clean_text'] = df['text'].apply(preprocess_text)
new_df['token_count'] = new_df['clean_text'].apply(lambda x: len(x.split()))
print(len(new_df))
new_df = new_df[new_df['token_count'] >= 10]
print(len(new_df))

# Compute similarity matrix and remove highly similar rows
vectorizer = TfidfVectorizer()
tfidf_matrix = vectorizer.fit_transform(new_df['clean_text'])
cosine_sim = cosine_similarity(tfidf_matrix)

threshold = 0.80  # You can adjust this value based on your preference
to_drop = set()

for i in range(len(new_df)):
    for j in range(i+1, len(new_df)):
        if cosine_sim[i][j] > threshold:
            to_drop.add(j)

new_df = new_df.drop(new_df.index[list(to_drop)])

print(len(new_df))

[nltk_data] Downloading package stopwords to
[nltk_data]     /home/alireza/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to /home/alireza/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


2529
1770
1573


In [3]:
# LDA

import pandas as pd
import matplotlib.pyplot as plt
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from gensim.corpora import Dictionary
from gensim.models import LdaModel, CoherenceModel

nltk.download('stopwords')  # Download stopwords
nltk.download('wordnet')  # Download wordnet for lemmatization
stop_words = set(stopwords.words('english'))
stemmer = PorterStemmer()
lemmatizer = WordNetLemmatizer()

# Removing stopwords, special characters and applying stemming & lemmatization
def preprocess_text(text):
    text = re.sub(r'[^\w\s]', '', text)
    words = text.split()
    words = [word.lower() for word in words if word.lower() not in stop_words]
    words = [stemmer.stem(word) for word in words]
    words = [lemmatizer.lemmatize(word) for word in words]
    return ' '.join(words)

new_df['lda_text'] = df['text'].apply(preprocess_text)
new_df['lda_token_count'] = new_df['lda_text'].apply(lambda x: len(x.split()))
print(len(new_df))
# new_df = new_df[new_df['lda_token_count'] >= 50]
print(len(new_df))
new_df['tokens'] = new_df['lda_text'].apply(lambda x: [word for word in word_tokenize(x.lower()) if word.isalnum() and word not in stop_words])



  "class": algorithms.Blowfish,
[nltk_data] Downloading package stopwords to
[nltk_data]     /home/alireza/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to /home/alireza/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


1573
1573


In [None]:
dictionary = Dictionary(new_df['tokens'])
corpus = [dictionary.doc2bow(text) for text in new_df['tokens']]

# Finding the optimal number of topics
coherence_values = []
model_list = []
topic_nums = range(2, 16)  # Adjust the range as needed

for num_topics in topic_nums:
    model = LdaModel(corpus, num_topics=num_topics, id2word=dictionary, passes=15)
    model_list.append(model)
    coherence_model = CoherenceModel(model=model, texts=new_df['tokens'], dictionary=dictionary, coherence='c_v')
    coherence_values.append(coherence_model.get_coherence())


In [10]:
coherence_values

[0.6017079223800166,
 0.519359911718126,
 0.4869570486843995,
 0.44717692735490244,
 0.4199659256472317,
 0.41830788306885186,
 0.39684964538502177,
 0.3920011432140865,
 0.3765014449085136,
 0.381613576990825,
 0.362341660998913,
 0.36546098750441275,
 0.41274037054343155,
 0.3804055311495296]

In [5]:
from sklearn.decomposition import NMF
import numpy as np

vectorizer = TfidfVectorizer(max_df=0.95, min_df=2, stop_words='english')
tfidf = vectorizer.fit_transform(new_df['lda_text'])

# NMF Model
n_topics = range(10, 11)  # Adjust the range as needed
for n in n_topics:
    nmf = NMF(n_components=n, random_state=1, init='nndsvd')
    W = nmf.fit_transform(tfidf)
    H = nmf.components_
    new_df['topic'] = np.argmax(W, axis=1)  # Assigns the topic with the highest weight for each document
    
    # Display topics and words (for demonstration)
    feature_names = vectorizer.get_feature_names_out()
    for topic_idx, topic in enumerate(H):
        print(f"Topic #{topic_idx}")
        print(" ".join([feature_names[i] for i in topic.argsort()[:-10 - 1:-1]]))

        # Get the top 3 documents for this topic
        top_doc_indices = np.argsort( W[:,topic_idx] )[::-1][0:3]  # Sort the weights for the given topic in descending order and get the indices for the top 3 documents
        print("Top 3 documents for this topic:")
        
        for doc_index in top_doc_indices:
            print(f"Document #{doc_index}:")
            print(new_df.iloc[doc_index]['text'])  # Replace 'text' with whatever the name of the column containing your text data is
        print("------")

Topic #0
william lake tribun wildfir depart near chilcotin countri respond mycariboonowcom
Top 3 documents for this topic:
Document #1361:
williams lake has a new firessmart coordinator - williams lake tribune - williams lake tribune  
Document #1497:
wildfire near williams lake being actioned by bc wildfire services - williams lake tribune - williams lake tribune  
Document #1532:
b.c. wildfire responding to wildfire north of clinton - williams lake tribune - williams lake tribune  
------
Topic #1
im like know look peopl thank anyon water time dont
Top 3 documents for this topic:
Document #419:
how to find safe/affordable housing for a senior who might resist? advice desperately needed, please. my father is a proud man in his mid-seventies in poor health. he has heart issues as well as some mobility issues (can walk, but has fallen and been unable to get up). he owns his own home on vancouver island in a smaller suburb, but it's in terrible shape as he is in no condition to keep the 

In [7]:
from sklearn.decomposition import NMF
import numpy as np

vectorizer = TfidfVectorizer(max_df=0.95, min_df=2, stop_words='english')
tfidf = vectorizer.fit_transform(new_df['lda_text'])

# NMF Model
n = 30

nmf = NMF(n_components=n, random_state=1, init='nndsvd')
W = nmf.fit_transform(tfidf)
H = nmf.components_
new_df['topic_30'] = np.argmax(W, axis=1)  # Assigns the topic with the highest weight for each document

# Display topics and words (for demonstration)
feature_names = vectorizer.get_feature_names_out()
for topic_idx, topic in enumerate(H):
    print(f"Topic #{topic_idx}")
    print(" ".join([feature_names[i] for i in topic.argsort()[:-10 - 1:-1]]))

    # Get the top 3 documents for this topic
    top_doc_indices = np.argsort( W[:,topic_idx] )[::-1][0:3]  # Sort the weights for the given topic in descending order and get the indices for the top 3 documents
    print("Top 3 documents for this topic:")
    
    for doc_index in top_doc_indices:
        print(f"Document #{doc_index}:")
        print(new_df.iloc[doc_index]['text'])  # Replace 'text' with whatever the name of the column containing your text data is
    print("------")

Topic #0
tribun william lake wildfir near chilcotin west hectar firesmart held
Top 3 documents for this topic:
Document #1361:
williams lake has a new firessmart coordinator - williams lake tribune - williams lake tribune  
Document #1497:
wildfire near williams lake being actioned by bc wildfire services - williams lake tribune - williams lake tribune  
Document #1532:
b.c. wildfire responding to wildfire north of clinton - williams lake tribune - williams lake tribune  
------
Topic #1
peopl help build need work hous dont use volunt live
Top 3 documents for this topic:
Document #193:
someone has been trying to burn my building down this evening (crosstown/chinatown)- a fire has been put out twice i live at the cross-section of crosstown and chinatown. this evening someone has tried to light my building on fire twice. fortunately, one of the building’s tenants was outside and noticed, but as you can see, the person is quite motivated. there is no good reason for wooden pallets and car



In [8]:
new_df.to_csv('./reddit_topic.csv', index=False)