In [1]:
# Import libraries
import pandas as pd
import string
import re
import numpy as np

import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize

import sklearn
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn import metrics
from sklearn.metrics import accuracy_score, f1_score, precision_recall_fscore_support, roc_auc_score
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import BernoulliNB, MultinomialNB, GaussianNB
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier

from imblearn.over_sampling import SMOTE

from xgboost import XGBClassifier

import gensim.utils
from gensim.models import Word2Vec

import pickle

import praw
from praw.models import MoreComments

In [2]:
# Import data
df = pd.read_csv('../data/reddits.csv')

df.head()

Unnamed: 0,title,post_text,id,score,total_comments,post_url,subreddit,post_type,title_&_text,title_text_stemmed,title_text_lemmatized,trending
0,Daily Fasting Check-in!,"* **Type** of fast (water, juice, smoking, etc...",16o7z6r,1,2,https://www.reddit.com/r/intermittentfasting/c...,intermittentfasting,hot,Daily Fasting Check-in! * **Type** of fast (wa...,"['daili', 'checkin', 'type', 'fast', 'water', ...","['daily', 'checkin', 'type', 'fast', 'water', ...",2
1,I decided who I wanted to be and I became her 💅🏽,"So a little background: I’m 39, have birthed t...",16ntqoy,1176,36,https://i.redd.it/fclkjnwhmgpb1.jpg,intermittentfasting,hot,I decided who I wanted to be and I became her ...,"['decid', 'want', 'becam', 'littl', 'backgroun...","['decided', 'wanted', 'became', 'little', 'bac...",42336
2,Some photos from a past vacation came up as a ...,I remember being miserable and insecure the en...,16ni914,1505,77,https://www.reddit.com/gallery/16ni914,intermittentfasting,hot,Some photos from a past vacation came up as a ...,"['photo', 'past', 'vacat', 'came', 'memori', '...","['photo', 'past', 'vacation', 'came', 'memory'...",115885
3,"Anybody find IF, lose weight, and then lose mo...",I know I am an idiot.,16nuqx9,198,78,https://www.reddit.com/r/intermittentfasting/c...,intermittentfasting,hot,"Anybody find IF, lose weight, and then lose mo...","['anybodi', 'find', 'lose', 'weight', 'lose', ...","['anybody', 'find', 'lose', 'weight', 'lose', ...",15444
4,2 and a half months of IF,From 234 to 211 in 2.5 months. It works! Once ...,16nuxqs,180,12,https://i.redd.it/30yqmtsdvgpb1.jpg,intermittentfasting,hot,2 and a half months of IF From 234 to 211 in 2...,"['2', 'half', 'month', '234', '211', '25', 'mo...","['2', 'half', 'month', '234', '211', '25', 'mo...",2160


In [143]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2489 entries, 0 to 2488
Data columns (total 13 columns):
 #   Column                 Non-Null Count  Dtype 
---  ------                 --------------  ----- 
 0   title                  2489 non-null   object
 1   post_text              2489 non-null   object
 2   id                     2489 non-null   object
 3   score                  2489 non-null   int64 
 4   total_comments         2489 non-null   int64 
 5   post_url               2489 non-null   object
 6   subreddit              2489 non-null   object
 7   post_type              2489 non-null   object
 8   title_&_text           2489 non-null   object
 9   title_text_stemmed     2489 non-null   object
 10  title_text_lemmatized  2489 non-null   object
 11  trending               2489 non-null   int64 
 12  subreddit_binarized    2489 non-null   int64 
dtypes: int64(4), object(9)
memory usage: 252.9+ KB


In [3]:
# Binarize 'subreddit' for modelling
# 'AnorexiaNervosa' = 0
# 'intermittentfasting' = 1
df['subreddit_binarized'] = df['subreddit'].map({'AnorexiaNervosa': 0, 'intermittentfasting': 1})
df['subreddit_binarized'].head()

0    1
1    1
2    1
3    1
4    1
Name: subreddit_binarized, dtype: int64

In [4]:
# Assemble features (X) and target (y)
X = df['title_text_stemmed'].tolist()
y = df['subreddit_binarized'].tolist()

In [91]:
# Vectorize data using TF-IDF then train model using Bernoulli

# Train-test split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

#Instantiate TfidfVectorizer
tfidf = TfidfVectorizer()
X_train_tfidf = tfidf.fit_transform(X_train)
X_test_tfidf = tfidf.transform(X_test)

# Apply SMOTE to the training data
smote = SMOTE(random_state=42)
X_train_resampled, y_train_resampled = smote.fit_resample(X_train_tfidf, y_train)

# Instantiate the Bernoulli model
BernNB = BernoulliNB(binarize=0.1)

# Fit the model
BernNB.fit(X_train_resampled, y_train_resampled)

# Make predictions on both training and test data
y_pred_train = BernNB.predict(X_train_resampled)
y_pred_test = BernNB.predict(X_test_tfidf)


In [92]:
# Print train and test scores
print(accuracy_score(y_test, y_pred_test))

0.9397590361445783


In [93]:
# Save the trained model to the pickle file
with open('../data/bernoulli_model.pkl', 'wb') as model_file:
    pickle.dump(BernNB, model_file)

In [94]:
# Save the vectorizer to the pickle file
with open('../data/tfidf_vectorizer.pkl', 'wb') as vectorizer_file:
    pickle.dump(tfidf, vectorizer_file)

In [8]:
# Define Reddit API credentials
reddit_client_id = "-rUx3v29zVVe7aMPZtnPCA"
reddit_client_secret = "rNQ7a89ilfDRLSAPEZ-3tmB9ZgwScA"
reddit_user_agent = "39 SIR Scraper"

# Initialize the Reddit API
reddit = praw.Reddit(
    client_id=reddit_client_id,
    client_secret=reddit_client_secret,
    user_agent=reddit_user_agent
)

In [29]:
# Function to scrape Reddit user's posts
def scrape_reddit_user_posts(username, num_posts=100):
    posts_dict = {
        "title": [],
        "post_text": [],
    }
    
    try:
        # Get the Reddit user instance
        user = reddit.redditor(username)

        # Iterate through the user's submissions (posts)
        for submission in user.submissions.top(limit=num_posts):
            # Append the title of each post to the list
            posts_dict['title'].append(submission.title)
            posts_dict['post_text'].append(submission.selftext)
        # Convert the dict to a dataframe
        posts_dict_df = pd.DataFrame(posts_dict)
        return posts_dict_df
    except Exception as e:
        print(f"An error occurred: {str(e)}")
        return None

In [45]:
# Run the scrapping function on the user
name = "Jon_Henderson_Music"
user_posts = scrape_reddit_user_posts(name, num_posts=100)
user_posts.head()

Scrapped 47 posts from Jon_Henderson_Music


Unnamed: 0,title,post_text
0,"IF, keto, and exercise. I'm taking my life back.",
1,The difference a year can make.,Intermittent fasting changed my life. This com...
2,1.5 pounds away from goal! So thankful for thi...,Could barely fit in those shorts in January. N...
3,Have Boomers become absentee grandparents?,"I get texts from my Mom from time to time, ""wh..."
4,M/34/5’7” [205lbs to 155lbs] (8 months) Taking...,Trying to get back to my high school tennis we...


In [61]:
# Function to clean data

def clean_data(text_df):
    # Merge title and post_text columns
    text_df['title_text'] = text_df['title'] + ' ' + text_df['post_text']

    # Remove rows with null values in the 'title_text' column
    text_df.dropna(subset=['title_text'], inplace=True)

    # Remove punctuations and tokenize using the built in cleaner in gensim
    text_df['title_text'] = text_df['title_text'].apply(lambda x: gensim.utils.simple_preprocess(x))

    # Spply stemming and stopwords exclusion within the same step
    stopwords = nltk.corpus.stopwords.words('english')
    ps = nltk.PorterStemmer()
    for idx in text_df.index:
        text_df['title_text'][idx] = [ps.stem(word) for word in text_df['title_text'][idx] if word not in stopwords]

    return text_df

In [95]:
# Clean the user posts using the cleaning function
clean_data(user_posts)
user_posts.head()

Unnamed: 0,title,post_text,title_text
0,"IF, keto, and exercise. I'm taking my life back.",,"[keto, exercis, take, life, back]"
1,The difference a year can make.,Intermittent fasting changed my life. This com...,"[differ, year, make, intermitt, fast, chang, l..."
2,1.5 pounds away from goal! So thankful for thi...,Could barely fit in those shorts in January. N...,"[pound, away, goal, thank, commun, motiv, gave..."
3,Have Boomers become absentee grandparents?,"I get texts from my Mom from time to time, ""wh...","[boomer, becom, absente, grandpar, get, text, ..."
4,M/34/5’7” [205lbs to 155lbs] (8 months) Taking...,Trying to get back to my high school tennis we...,"[lb, lb, month, take, life, back, tri, get, ba..."


In [109]:
# Make prediction using the pickle file

# Load the trained model and vectorizer from pickle files
with open('../data/bernoulli_model.pkl', 'rb') as model_file:
    loaded_model = pickle.load(model_file)

with open('../data/tfidf_vectorizer.pkl', 'rb') as vectorizer_file:
    loaded_vectorizer = pickle.load(vectorizer_file)

# Vectorize the new data using the same vectorizer used during training
X_new = loaded_vectorizer.transform(user_posts['title_text'].apply(lambda x: ', '.join(x)))

# Make predictions on the new data
predictions = loaded_model.predict(X_new)

# Get probability estimates for the predictions
confidence_scores = loaded_model.predict_proba(X_new)

# Count the occurrences of each class label in the predictions
counts = np.bincount(predictions)

# Calculate the weighted average of confidence scores
weighted_confidence_scores = np.zeros_like(confidence_scores[0])
for i, prediction in enumerate(predictions):
    weighted_confidence_scores += confidence_scores[i] * (1 / counts[prediction])

# Define the class labels
class_labels = ['r/AnorexiaNervosa', 'r/intermittentfasting']

# Determine the consolidated prediction label
consolidated_prediction_label = class_labels[np.argmax(counts)]

# Combine the consolidated prediction label and the weighted confidence scores
consolidated_prediction = {
    "label": consolidated_prediction_label,
    "confidence_scores": weighted_confidence_scores.tolist()
}

print(f"Predicted subreddit: {consolidated_prediction_label}")
print(f"Weighted confidence score: {weighted_confidence_scores.tolist()[0]}")


Predicted subreddit: r/intermittentfasting
Weighted confidence score: 0.8976315828739463


In [142]:
print(accuracy_score(predictions, y_test))

ValueError: Found input variables with inconsistent numbers of samples: [47, 498]