In [1]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here are several helpful packages to load

import os
import random
import re
import string

import nltk
import numpy as np
import pandas as pd

from gensim.models import Word2Vec

from nltk import word_tokenize
from nltk.corpus import stopwords

from sklearn.cluster import MiniBatchKMeans
from sklearn.metrics import silhouette_samples, silhouette_score

# Fetch the NLTK resources this notebook relies on: the stopword corpus and the
# "punkt" tokenizer data (presumably what word_tokenize needs — TODO confirm
# against the installed NLTK version).
nltk.download("stopwords")
nltk.download("punkt")

# Seed the stdlib and NumPy global RNGs so a fresh "Restart & Run All" repeats.
SEED = 42
random.seed(SEED)
# NOTE(review): Python reads PYTHONHASHSEED at interpreter start-up, so setting
# it here cannot change hashing in the already-running kernel; it is only
# inherited by child processes. Set it in the kernel's environment before
# launch if hash-order determinism is actually required.
os.environ["PYTHONHASHSEED"] = str(SEED)
np.random.seed(SEED)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

# `os` is already imported at the top of this cell; no second import is needed.
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

[nltk_data] Downloading package stopwords to /usr/share/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to /usr/share/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
/kaggle/input/alldatafiltered/AllDataFilteredNoBody.csv
/kaggle/input/alldatafiltered/AllDataFiltered.csv
/kaggle/input/notcombined/r_udub_posts_filtered.csv


In [2]:
def clean_text(text, tokenizer, stopwords):
    """Pre-process text and generate tokens

    The text is lowercased, stripped of bracketed spans, ellipses (with the
    word they truncate) and ASCII punctuation, then tokenized and filtered.

    Args:
        text: Text to tokenize. It is passed through ``str()`` first, so
            non-string values are accepted.
        tokenizer: Callable taking the cleaned string and returning an
            iterable of string tokens.
        stopwords: Collection of tokens to discard. Any iterable of strings
            works; it is converted to a set unless it already is one.
            (Inside this function the name shadows any module-level
            ``stopwords`` object.)

    Returns:
        List of tokens that are not stopwords, not purely numeric, and
        longer than one character.
    """
    # Membership is tested once per token, so use a hashed container instead
    # of scanning a list each time.
    if not isinstance(stopwords, (set, frozenset)):
        stopwords = set(stopwords)

    text = str(text).lower()  # Lowercase words
    text = re.sub(r"\[(.*?)\]", "", text)  # Remove [+XYZ chars] in content
    text = re.sub(r"\s+", " ", text)  # Remove multiple spaces in content
    text = re.sub(r"\w+…|…", "", text)  # Remove ellipsis (and last word)
    text = re.sub(r"(?<=\w)-(?=\w)", " ", text)  # Replace dash between words
    text = re.sub(
        f"[{re.escape(string.punctuation)}]", "", text
    )  # Remove punctuation

    # NOTE(review): punctuation is stripped before the stopword check, so a
    # stopword that itself contains punctuation (e.g. "don't") cannot match
    # its stripped form ("dont"). Confirm whether that is intended.
    return [
        t
        for t in tokenizer(text)
        if t not in stopwords  # Remove stopwords
        and not t.isdigit()  # Remove digits
        and len(t) > 1  # Remove short tokens
    ]


In [3]:
# Load the un-combined posts CSV and display it for a quick look.
# NOTE: `df_raw` and `df` are both reassigned from a different CSV in the next
# cell, so later cells operate on that data rather than on this frame.
df_raw = pd.read_csv("/kaggle/input/notcombined/r_udub_posts_filtered.csv")
df = df_raw.copy()

df

Unnamed: 0.1,Unnamed: 0,title,selftext,link_flair_text,id,url,num_comments,score,bodyTitle
0,28049,r/udub mods,,Meme,g3g8ab,https://i.redd.it/d4wj47suqht41.png,6,342,r/udub mods
1,28050,UW Nursing Frustration,If any one here is thinking of applying to the...,,g3g9sa,https://www.reddit.com/r/udub/comments/g3g9sa/...,16,61,UW Nursing Frustration If any one here is thin...
2,28051,Does anyone know if parking is free on campus ...,,,g3i0rg,https://www.reddit.com/r/udub/comments/g3i0rg/...,2,2,Does anyone know if parking is free on campus ...
3,28052,"Not the OP, but posting this again",,,g3i63f,https://i.redd.it/hkq6gklzgit41.png,29,163,"Not the OP, but posting this again"
4,28053,Help deciding between UW engineering and anoth...,Hey everybody! I was admitted as a DTC enginee...,,g3ievx,https://www.reddit.com/r/udub/comments/g3ievx/...,17,6,Help deciding between UW engineering and anoth...
...,...,...,...,...,...,...,...,...,...
47054,75103,Anyone going to Seattle International Film Fes...,Anyone planning on attending any screenings th...,Discussion,1c8362f,https://www.reddit.com/r/udub/comments/1c8362f...,0,1,Anyone going to Seattle International Film Fes...
47055,75104,Is taking Physics and ochem at the same time a...,i’m debating to take physics and ochem togethe...,,1c87cme,https://www.reddit.com/r/udub/comments/1c87cme...,0,1,Is taking Physics and ochem at the same time a...
47056,75105,Microbio and Weed-out classes,I know this subreddit is probably being floode...,Advice,1c8a9zh,https://www.reddit.com/r/udub/comments/1c8a9zh...,0,1,Microbio and Weed-out classes I know this subr...
47057,75106,Did I miss the window for a house rental for t...,My friend and I are month-to-month right now a...,,1c8aw8t,https://www.reddit.com/r/udub/comments/1c8aw8t...,0,1,Did I miss the window for a house rental for t...


In [4]:
text_columns = ["combined_text"]

df_raw = pd.read_csv("/kaggle/input/alldatafiltered/AllDataFilteredNoBody.csv")
df = df_raw.copy()

# Blank out missing values *before* casting to str (otherwise NaN would become
# the literal string "nan"), for every configured text column.
df[text_columns] = df[text_columns].fillna("").astype(str)

# Build the text column by joining the columns listed in text_columns
# (currently only "combined_text") with " | ".
df["text"] = df[text_columns].apply(lambda x: " | ".join(x), axis=1)

# Load the stopword list once instead of once per row, and keep it as a set
# so each per-token membership test is a hash lookup.
english_stopwords = set(stopwords.words("english"))
df["tokens"] = df["text"].map(lambda x: clean_text(x, word_tokenize, english_stopwords))

# Remove duplicates after preprocessing. np.unique keeps the first occurrence
# of each token list and returns indices in sorted-token order, so the rows
# are reordered here as well.
_, idx = np.unique(df["tokens"], return_index=True)
df = df.iloc[idx, :]

# Remove empty values and keep relevant columns
df = df.loc[df.tokens.map(lambda x: len(x) > 0), ["text", "tokens"]]

docs = df["text"].values
tokenized_docs = df["tokens"].values

print(f"Original dataframe: {df_raw.shape}")
print(f"Pre-processed dataframe: {df.shape}")

Original dataframe: (9031, 2)
Pre-processed dataframe: (9012, 2)


In [5]:
# Train 100-dimensional word embeddings on the tokenized documents.
# NOTE(review): workers=1 with a fixed seed is presumably there for
# run-to-run reproducibility — confirm against the gensim documentation.
model = Word2Vec(sentences=tokenized_docs, vector_size=100, workers=1, seed=SEED)

In [6]:
# Spot-check the embedding: list the vocabulary terms most similar to "math".
model.wv.most_similar("math")

[('series', 0.9823327660560608),
 ('physics', 0.9782159924507141),
 ('cse', 0.9718117713928223),
 ('phys', 0.9675642848014832),
 ('intro', 0.9665501713752747),
 ('bio', 0.9648613929748535),
 ('gen', 0.9615781903266907),
 ('stat', 0.956072986125946),
 ('econ', 0.9558734893798828),
 ('calc', 0.9534657001495361)]

In [7]:
def vectorize(list_of_docs, model):
    """Generate vectors for list of documents using a Word Embedding

    Each document is represented by the element-wise mean of the vectors of
    its tokens that exist in the embedding vocabulary. A document with no
    known tokens (including an empty one) gets an all-zero vector.

    Args:
        list_of_docs: List of documents, each an iterable of tokens.
        model: Gensim's Word Embedding. Must expose ``vector_size`` and a
            ``wv`` mapping supporting ``token in wv`` and ``wv[token]``.

    Returns:
        List of document vectors, one per input document, in input order.
    """
    keyed_vectors = model.wv
    features = []

    for tokens in list_of_docs:
        # The membership test already guarantees the lookup succeeds, so no
        # KeyError handling is needed around it.
        known_vectors = [
            keyed_vectors[token] for token in tokens if token in keyed_vectors
        ]
        if known_vectors:
            features.append(np.asarray(known_vectors).mean(axis=0))
        else:
            # Allocate a fresh zero vector only when it is actually needed,
            # so no two documents share the same array object.
            features.append(np.zeros(model.vector_size))
    return features
    
# Embed every tokenized document, then display
# (number of documents, length of the first document vector).
vectorized_docs = vectorize(tokenized_docs, model=model)
len(vectorized_docs), len(vectorized_docs[0])



(9012, 100)

In [8]:
def mbkmeans_clusters(
    X,
    k,
    mb,
    print_silhouette_values,
    random_state=None,
):
    """Generate clusters and print Silhouette metrics using MBKmeans

    Args:
        X: Matrix of features.
        k: Number of clusters.
        mb: Size of mini-batches.
        print_silhouette_values: Print silhouette values per cluster.
        random_state: Optional seed forwarded to MiniBatchKMeans so the
            clustering can be reproduced independently of global RNG state.
            Defaults to None, which keeps the estimator's default behaviour.

    Returns:
        Trained clustering model and labels based on X.
    """
    km = MiniBatchKMeans(
        n_clusters=k, batch_size=mb, random_state=random_state
    ).fit(X)
    labels = km.labels_

    # The overall silhouette coefficient is the mean of the per-sample values,
    # so compute the per-sample values once and reuse them for both the
    # summary line and the per-cluster breakdown.
    sample_silhouette_values = silhouette_samples(X, labels)

    print(f"For n_clusters = {k}")
    print(f"Silhouette coefficient: {sample_silhouette_values.mean():0.2f}")
    print(f"Inertia:{km.inertia_}")

    if print_silhouette_values:
        print("Silhouette values:")
        silhouette_values = []
        for i in range(k):
            cluster_silhouette_values = sample_silhouette_values[labels == i]
            if cluster_silhouette_values.size == 0:
                # A cluster that received no samples has no statistics to
                # report; min()/max() on an empty array would raise.
                continue
            silhouette_values.append(
                (
                    i,
                    cluster_silhouette_values.shape[0],
                    cluster_silhouette_values.mean(),
                    cluster_silhouette_values.min(),
                    cluster_silhouette_values.max(),
                )
            )
        # Best-separated clusters (highest average silhouette) first.
        silhouette_values = sorted(
            silhouette_values, key=lambda tup: tup[2], reverse=True
        )
        for s in silhouette_values:
            print(
                f"    Cluster {s[0]}: Size:{s[1]} | Avg:{s[2]:.2f} | Min:{s[3]:.2f} | Max: {s[4]:.2f}"
            )
    return km, labels

In [9]:
# Cluster the document vectors and print the silhouette diagnostics.
clustering, cluster_labels = mbkmeans_clusters(
    X=vectorized_docs, k=11, mb=500, print_silhouette_values=True
)

# One row per document: raw text, space-joined tokens, assigned cluster id.
joined_tokens = [" ".join(doc_tokens) for doc_tokens in tokenized_docs]
df_clusters = pd.DataFrame(
    {
        "text": docs,
        "tokens": joined_tokens,
        "cluster": cluster_labels,
    }
)



For n_clusters = 11
Silhouette coefficient: 0.11
Inertia:7243.462248700706
Silhouette values:
    Cluster 7: Size:1327 | Avg:0.16 | Min:0.00 | Max: 0.34
    Cluster 8: Size:1038 | Avg:0.15 | Min:-0.11 | Max: 0.40
    Cluster 2: Size:867 | Avg:0.15 | Min:-0.09 | Max: 0.37
    Cluster 9: Size:896 | Avg:0.13 | Min:-0.03 | Max: 0.33
    Cluster 0: Size:929 | Avg:0.12 | Min:-0.10 | Max: 0.35
    Cluster 5: Size:616 | Avg:0.11 | Min:-0.12 | Max: 0.35
    Cluster 4: Size:1005 | Avg:0.10 | Min:-0.05 | Max: 0.28
    Cluster 6: Size:899 | Avg:0.10 | Min:-0.14 | Max: 0.35
    Cluster 1: Size:503 | Avg:0.04 | Min:-0.17 | Max: 0.26
    Cluster 10: Size:510 | Avg:0.04 | Min:-0.14 | Max: 0.29
    Cluster 3: Size:422 | Avg:0.02 | Min:-0.19 | Max: 0.25


In [10]:
print("Most representative terms per cluster (based on centroids):")
# Iterate over the fitted centroids themselves rather than a hard-coded count,
# so this cell stays correct if the number of clusters changes.
for i, centroid in enumerate(clustering.cluster_centers_):
    # Vocabulary terms whose embeddings are most similar to this centroid.
    most_representative = model.wv.most_similar(positive=[centroid], topn=5)
    # Each term is followed by a single space, matching the original output.
    tokens_per_cluster = "".join(f"{term} " for term, _score in most_representative)
    print(f"Cluster {i}: {tokens_per_cluster}")

Most representative terms per cluster (based on centroids):
Cluster 0: wanting pmp internships accepting transfers 
Cluster 1: normal four til deans lockdown 
Cluster 2: teaching phys121 python heavy prepare 
Cluster 3: vlpa retaking ed engr fulfill 
Cluster 4: potentially frustrated anxiety hella wish 
Cluster 5: premed pursuing neuro ce psychology 
Cluster 6: udistrict commuting closer furnished cheapest 
Cluster 7: guess happy anxious knowing motivated 
Cluster 8: bioe polisci affect initially hopefully 
Cluster 9: retake core doable mgmt qsci 
Cluster 10: particular everybody suggestion leader opinions 


In [11]:
test_cluster = 5

# Rank every document by Euclidean distance to the chosen cluster's centroid
# (closest first) and show the three nearest ones.
target_centroid = clustering.cluster_centers_[test_cluster]
distances_to_centroid = np.linalg.norm(vectorized_docs - target_centroid, axis=1)
most_representative_docs = np.argsort(distances_to_centroid)

for doc_idx in most_representative_docs[:3]:
    print(docs[doc_idx])
    print("-------------")

human centered design and engineering and pre-pa track? i’m an incoming freshman ( i also have college credits from running start) and i’m still figuring stuff out. currently, hcde has caught my eye and i’m still interested in medicine. how hard is it to get into hcde? any advice? do people do engineering discipline and pre-health track?
-------------
informatics major admission? hi :) so i’m looking to apply for informatics in the upcoming cycle and i’m nervous because it’s getting competitive and this was my first quarter at uw so my gpa is down a bit in addition to mental health reasons. i also came in with about 99 credits from running start so i got most of the prerequisites covered besides cs and stats. what advice would you have to make my application standout for the admissions committee? any clubs? programs? general advice? if you applied and got in, what did you do for your application?
-------------
electrical engineering workload? i’m a high school senior applying to uw. i’