# Basic preprocessing and cleaning before ingesting data into Solr
## Load data

In [1]:
import pandas as pd

# Folder that holds the all-posts / all-comments CSV files
data_dir = 'data_backup'

# Read each CSV into its own DataFrame
posts_df = pd.read_csv(f'{data_dir}/all-posts.csv')
comments_df = pd.read_csv(f'{data_dir}/all-comments.csv')

In [2]:
# Preview the first five rows of the posts table
posts_df.head(n=5)

Unnamed: 0,author,author_flair_text,clicked,created_utc,distinguished,edited,id,is_original_content,is_self,link_flair_text,...,over_18,permalink,saved,score,selftext,spoiler,stickied,title,upvote_ratio,url
0,[deleted],,,1648494000.0,,,tqghan,,,,...,,/r/BMWi3/comments/tqghan/my_reward_for_finally...,,224.0,,,,My reward for finally finishing my bachelor’s ...,0.98,https://i.redd.it/qs3elzcr86q81.jpg
1,nguit98,,,1708648000.0,,,1axmm77,,,,...,,/r/BMWi3/comments/1axmm77/100k_miles/,,200.0,I finally passed the 100k mile mark on my 2015...,,,100k miles,0.98,https://i.redd.it/z4v99nrfb8kc1.jpeg
2,labdweller,i3 BEV,,1678789000.0,,,11r2bh2,,,,...,,/r/BMWi3/comments/11r2bh2/rabbit_stowage_area_...,,185.0,,,,Rabbit stowage area on the i3,0.99,https://i.redd.it/yewt745fjona1.jpg
3,toteratte21,,,1685004000.0,,,13rc663,,,,...,,/r/BMWi3/comments/13rc663/picked_her_up_straig...,,174.0,"2020 i3s, dark brown leather and light wood.\n...",,,Picked her up straight from the source in Munich,0.99,https://i.imgur.com/7jTu2dR.jpg
4,azscram9,,,1684690000.0,,,13o1009,,,,...,,/r/BMWi3/comments/13o1009/picked_up_this_littl...,,167.0,First time owner and first car I’ve owned in 1...,,,Picked up this little gem over the weekend,0.96,https://i.redd.it/w4ilz2ygh91b1.jpg


In [3]:
# Preview the first five rows of the comments table
comments_df.head(n=5)

Unnamed: 0,body,body_html,created_utc,distinguished,edited,id,is_submitter,link_id,parent_id,permalink,saved,score,stickied,subreddit_id
0,Congrats on both! Glad I'm not the only who w...,"<div class=""md""><p>Congrats on both! Glad I&#...",1648494000.0,,,i2h01oy,,t3_tqghan,t3_tqghan,/r/BMWi3/comments/tqghan/my_reward_for_finally...,,14,,t5_2y1ho
1,Congrats! That is one impressive reward! This ...,"<div class=""md""><p>Congrats! That is one impre...",1648494000.0,,,i2h052j,,t3_tqghan,t3_tqghan,/r/BMWi3/comments/tqghan/my_reward_for_finally...,,9,,t5_2y1ho
2,Congratulations!!! It’s beautiful !!!,"<div class=""md""><p>Congratulations!!! It’s bea...",1648504000.0,,,i2hnwdm,,t3_tqghan,t3_tqghan,/r/BMWi3/comments/tqghan/my_reward_for_finally...,,4,,t5_2y1ho
3,Congratulations! That's a fantastic car for a ...,"<div class=""md""><p>Congratulations! That&#39;s...",1648504000.0,,,i2hmqoo,,t3_tqghan,t3_tqghan,/r/BMWi3/comments/tqghan/my_reward_for_finally...,,3,,t5_2y1ho
4,Welcome to the family! Congratulations.,"<div class=""md""><p>Welcome to the family! Con...",1648509000.0,,,i2hy1od,,t3_tqghan,t3_tqghan,/r/BMWi3/comments/tqghan/my_reward_for_finally...,,3,,t5_2y1ho


# Check vocabulary size

In [4]:
def count_unique_words(df):
    """Count the distinct whitespace-separated words in a text column, ignoring case.

    Args:
        df: A pandas Series of text (despite the parameter name, callers pass
            a single column such as ``posts_df['title']``).

    Returns:
        int: Number of distinct lowercased tokens across all rows. Missing
        values contribute no tokens; an empty column yields 0.
    """
    # Missing cells come out of read_csv as float NaN, which str.join rejects
    # with a TypeError, so drop them and coerce whatever remains to text.
    texts = df.dropna().astype(str)

    vocabulary = set()
    for text in texts:
        # Split on any whitespace first, then lowercase each token.
        vocabulary.update(word.lower() for word in text.split())
    return len(vocabulary)

def count_total_words(df):
    """Count all whitespace-separated words in a text column (duplicates included).

    Args:
        df: A pandas Series of text (despite the parameter name, callers pass
            a single column such as ``comments_df['body']``).

    Returns:
        int: Total number of tokens across all rows. Missing values contribute
        no tokens; an empty column yields 0.
    """
    # Missing cells come out of read_csv as float NaN, which str.join rejects
    # with a TypeError, so drop them and coerce whatever remains to text.
    texts = df.dropna().astype(str)

    # Summing per-row token counts equals joining all rows with a space and
    # splitting once, without building one giant intermediate string.
    return sum(len(text.split()) for text in texts)

In [5]:
# Word statistics for post titles: total tokens, then distinct tokens
print(
    f"Number of total words: {count_total_words(posts_df['title'])}",
    f"Number of unique words: {count_unique_words(posts_df['title'])}",
    sep="\n",
)

Number of total words: 15147
Number of unique words: 4722


In [7]:
# Word statistics for comment bodies: total tokens, then distinct tokens
print(
    f"Number of total words: {count_total_words(comments_df['body'])}",
    f"Number of unique words: {count_unique_words(comments_df['body'])}",
    sep="\n",
)

Number of total words: 1601745
Number of unique words: 92203


# Basic data cleaning before indexing

## Domain-specific abbreviations for Reddit and general internet slang
https://www.reddit.com/r/TheoryOfReddit/wiki/glossary/

In [ ]:
# Lookup table from a Reddit / internet abbreviation or slang term (key) to a
# plain-English expansion (value).
# NOTE(review): the code that consumes this mapping is not visible here —
# presumably it expands these terms in post/comment text before indexing;
# confirm against the caller.
# NOTE(review): keys are stored in mixed case ('AMA', 'Karma', 'TL;DR') and a
# few contain spaces or punctuation ('Novelty account', '[FIXED]'), so an
# exact-match, token-by-token lookup on lowercased text would miss them —
# verify how the consumer normalises text before matching.
abbr_mapper = {
    # Reddit abbreviations & Slangs
    'Alt': 'Alternative Reddit account',
    'AMA': 'Ask me anything',
    'AMAA': 'Ask me almost anything',
    'Benned': 'Banned',
    'Brony': 'Male fan of My Little Pony',
    'Cakeday': 'Birthday',
    'Circlejerk': 'Elitist group',
    'DAE': 'Does anyone else',
    'Ent': 'Pot smoker',
    # NOTE(review): in car-related threads 'ETA' may instead mean
    # "estimated time of arrival" — confirm this expansion is the wanted one.
    'ETA': 'Edited to add',
    'F7U12': 'FU',
    'Fap': 'Masturbate',
    '[FIXED]': 'Remix of an original post',
    'FTA': 'From the article',
    'FTFY': 'Fixed That For You',
    'GW': 'Gone wild',
    'Hivemind': 'Collective',
    'IAMA': 'I Am A',
    'IMO': 'In My Opinion',
    'IMHO': 'In my honest opinion',
    'IIRC': 'If i recall correctly',
    'ITT': 'In this thread',
    'Karma': 'Reddit score',
    'Karmawhore': 'Desperate for reddit points',
    # Two spellings of the same term map to one expansion
    'Meta-sub': 'Subreddits talking about Reddit',
    'Meta-subreddits': 'Subreddits talking about Reddit',
    'MIC': 'More in comments',
    'Mod': 'Moderator',
    'MRA': 'Mens rights activist',
    'Neckbeard': 'Dirty reddit user',
    'Ninjaedit': 'sneaky edit',
    'Novelty account': 'joke account',
    'NSFW': 'Not safe for work',
    'NSFL': 'Not safe for life',
    'OP': 'Original Poster',
    'Orangered': 'Unread messages',
    'Power user': 'User with high reddit score',
    'Pun thread': 'Chain of punny comments',
    'Reddiquette': 'Rules of reddit',
    'RES': 'Reddit enhancement suite',
    'RTFA': 'Read the fucking article',
    'Shadow-ban': 'Silent ban',
    'Shitpost': 'Trash post',
    'Sockpuppet': 'Alternate reddit account',
    'SJW': 'Social Justice Warrior',
    'SRD': 'Subreddit drama',
    'SRS': 'Shit reddit says',
    'Sub': 'Subreddit',
    'TIL': 'Today I learned',
    # Both punctuated and unpunctuated spellings are listed
    'TL;DR': 'Too Long Didnt read',
    'TLDR': 'Too Long Didnt read',
    'WIP': 'Work in progress',
    # Hyphenated and unhyphenated spellings are listed
    'X-post': 'Crosspost',
    'Xpost': 'Crosspost',
}

# Regex-based matching: one pattern covers elongated spellings of a slang word
import re

# 'wh', then one or more 'o', then 'sh'
pattern = r'wh[o]+sh'
text = "whoosh whooooosh"

# The pattern has no capture groups, so each hit's full match text is collected
matches = [hit.group(0) for hit in re.finditer(pattern, text)]
print(matches)  # Output: ['whoosh', 'whooooosh']

# Electric-car domain-specific abbreviations & slang