In [2]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report
from sklearn.model_selection import train_test_split
import json

### Jsonl Cleaning and Jsonl -> Dataframe -> CSV

In this code I created a method which utalizes the jsonl file cade pulled from pushshift and basically goes through all the different values and pulls out the necessary values we'll be using within our model and some other necessary information for data exploration

In [None]:
def parse_jsonl_file(file_path):
    data = []
    with open(file_path, 'r', encoding='utf-8') as file:
        for line in file:
            try:
                entry = json.loads(line.strip())
                title = entry.get('title')
                selftext = entry.get('selftext')
                link_flair_text = entry.get('link_flair_text')
                post_id = entry.get('id')
                url = entry.get('url')
                num_comments = entry.get('num_comments')
                score = entry.get('score')

                if link_flair_text == 'None':
                    continue
                
                data.append({
                    'title': title,
                    'selftext': selftext,
                    'link_flair_text': link_flair_text,
                    'id': post_id,
                    'url': url,
                    'num_comments': num_comments,
                    'score': score
                    
                })
            except json.JSONDecodeError:
                print("Error decoding JSON line:", line)
    return data

file_path = 'r_udub_posts.jsonl'
parsed_data = parse_jsonl_file(file_path)

df_jsonl = pd.DataFrame(parsed_data)

df_jsonl.head(5)


This Part of the code moves the now created dataframe and transforms it into a CSV so it's easier to handle and allows for us to all work with the same Data Set

In [3]:
csv_file_path = 'r_udub_posts.csv'
df_jsonl.to_csv(csv_file_path, index=False)
print("DataFrame saved as CSV:", csv_file_path) 

NameError: name 'df_jsonl' is not defined

### How accuracy is a multinomial Naive Bayes Model for predicting different flairs?

In this case we wanted to try a fairly standard model to kinda get a baseline on one how our data works and get a starting point in which we can iterate and look back on.

In [4]:
posts = pd.read_csv('r_udub_posts.csv')
posts.head(5)

Unnamed: 0,title,selftext,link_flair_text,id,url,num_comments,score
0,Any UW redditors want to meet up Thursday 10/2...,"We failed on 10/22, but I think with a week of...",,9y4hg,https://www.reddit.com/r/udub/comments/9y4hg/a...,6,4
1,We need a UW-ified logo.,If someone here has arcane skill in the graphi...,,9ywtc,https://www.reddit.com/r/udub/comments/9ywtc/w...,2,3
2,Thursday bowling success!,[deleted],,9z66c,https://www.reddit.com/r/udub/comments/9z66c/t...,2,3
3,"Next UW meetup: Thursday, 11/5 at 11:00am in t...",This time we will be playing ping pong followe...,,a0ail,https://www.reddit.com/r/udub/comments/a0ail/n...,2,4
4,Next meetup: December 3rd. Need ideas,"Alright, so who is up for a December 3rd meetu...",,a9lq8,https://www.reddit.com/r/udub/comments/a9lq8/n...,7,2


In [5]:
posts['link_flair_text'].unique()

array([nan, 'PSA', 'Rant', 'Random', 'Meme', 'Question', 'Discussion',
       'Academics', 'Student Life', 'Help', 'Event', 'Video',
       'Admissions', 'Advice', 'Poll', 'poll', 'No unrelated posts'],
      dtype=object)

This next code is pretty much our main filter to bring our data from raw data into data which we can be fed into a model. In this case we combined the title and body text into one column so we can have more text tokenizer and utalize.Alongside this we also got rid of any posts which had not body, no flair or if the post is removed/deleted. Finally we removed any flairs not currently in use then lowercased them all to combine redundant flairs. 

In [31]:
posts['combined_text'] = posts['title'] + " " + posts['selftext'].fillna("")
flair_categories = ["admissions", "academics", "student life", "advice", "discussion", "meme", "rant", "psa", "event", "poll"]

flairedNotSelf = posts[(posts['link_flair_text'].notnull()) & (posts['selftext'] != '[removed]') & (posts['selftext'] != '[deleted]') & posts['selftext'].notnull()]
ModelDataLower = flairedNotSelf.apply(lambda col: col.str.lower() if col.dtype == 'object' else col)
ModelDataFiltered = ModelDataLower[ModelDataLower['link_flair_text'].isin(flair_categories)][['combined_text', 'link_flair_text']]

ModelDataFiltered.head()

Unnamed: 0,combined_text,link_flair_text
28066,thoughts on madrona? i have an emotional suppo...,discussion
28077,soc 222 anyone has took or taking soc222(socio...,academics
28080,betsy evans - ling/anth 233 does anyone have a...,academics
28083,tell me what you want from remote teaching? th...,discussion
28088,efs experience/thoughts/opinions! i just regis...,discussion


In [32]:
print(len(ModelDataFiltered))

9031


Found the distribution of flairs so we can understand the data better.

In [35]:
ModelDataFiltered.groupby('link_flair_text').size().sort_values(ascending=False)

link_flair_text
advice          2360
academics       2100
student life    1551
admissions       977
discussion       865
poll             464
rant             388
psa              138
event            128
meme              60
dtype: int64

In the code below we just did a standard 80/20 split and kept random state 52 to get the same split.

In [37]:
X_train, X_test, y_train, y_test = train_test_split(ModelDataFiltered['combined_text'], ModelDataFiltered['link_flair_text'], test_size=0.2, random_state=52)

print(len(X_train))
print(len(X_test))
print(len(y_train))
print(len(y_test))

7224
1807
7224
1807


Finally we vectorized the data using TF-IDF in order to remove common words.

In [42]:
tfidf_vectorizer = TfidfVectorizer()
X_train = tfidf_vectorizer.fit_transform(X_train)
X_test = tfidf_vectorizer.transform(X_test)

Then we trained the model using the sklearn multinomial naive bayes model

In [43]:
clf = MultinomialNB()
clf.fit(X_train, y_train)

y_pred = clf.predict(X_test)

accuracy = accuracy_score(y_test, y_pred)

print("Accuracy:", accuracy)

Accuracy: 0.4043457723193198


In [45]:
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

   academics       0.67      0.52      0.59       470
  admissions       0.77      0.04      0.08       226
      advice       0.29      0.86      0.43       468
  discussion       0.00      0.00      0.00       202
       event       0.00      0.00      0.00        40
        meme       1.00      0.01      0.02       116
        poll       0.85      0.64      0.73        88
         psa       0.00      0.00      0.00        58
        rant       0.00      0.00      0.00        76
student life       0.55      0.38      0.45       373

    accuracy                           0.40      2117
   macro avg       0.41      0.24      0.23      2117
weighted avg       0.48      0.40      0.34      2117



  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


- One limitations of this model are primarily due to the useage of a TF-IDF vectorization as because it utalizes a bag of works approach it ignores the context and order of words and thus can limit out models abiltiy to extract relationships between words.

- A second limitation of this approach is we have a data imbalance as some flairs are more commonly utalized and thus our model may be weigted more towards classifying those flairs

Going forward we plan on trying to utalize Word2Vec which is able to interpret context more easily and also we plan to discuss if we want to put data mininimum on the amount of flairs necessary 