In [67]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import emoji
import re
import spacy as sp
import textblob as tb
import time

In [None]:
!pip install textblob

In [None]:
!pip install spacy
!python -m spacy download en_core_web_sm

# Joke Scoring System:

## 1. Data acquisition

In [32]:
million_joke_df = pd.read_csv("hf://datasets/SocialGrep/one-million-reddit-jokes/one-million-reddit-jokes.csv")

## 2. Data preprocessing: 

In [33]:
million_joke_df.columns

Index(['type', 'id', 'subreddit.id', 'subreddit.name', 'subreddit.nsfw',
       'created_utc', 'permalink', 'domain', 'url', 'selftext', 'title',
       'score'],
      dtype='object')

In [34]:
million_joke_df['joke'] = million_joke_df.title + million_joke_df.selftext

In [35]:
million_joke_df.drop(['type', 'id', 'subreddit.id', 'subreddit.name', 'subreddit.nsfw',
       'created_utc', 'permalink', 'domain', 'url', 'selftext', 'title'], axis=1, inplace=True)

In [53]:
million_joke_df.sample(5)

Unnamed: 0,score,joke
277828,1,my job sucks. i even tried calling my boss via...
52540,1,did you hear about the new perfume for people ...
511491,233,vodka isn't a liquid.\nit’s a solution.\n
8103,63,buying cat treats at walmart todaytryoue story...
661876,7,there was a groundbreaking surgery performed h...


In [54]:
million_joke_df = million_joke_df.dropna(subset=['joke'])

In [38]:
million_joke_df = million_joke_df.drop_duplicates(subset=['joke'], keep='first')

In [39]:
slang_dict = {
    "AFAIK": "As Far As I Know",
    "AFK": "Away From Keyboard",
    "ASAP": "As Soon As Possible",
    "ATK": "At The Keyboard",
    "ATM": "At The Moment",
    "A3": "Anytime, Anywhere, Anyplace",
    "BAK": "Back At Keyboard",
    "BBL": "Be Back Later",
    "BBS": "Be Back Soon",
    "BFN": "Bye For Now",
    "B4N": "Bye For Now",
    "BRB": "Be Right Back",
    "BRT": "Be Right There",
    "BTW": "By The Way",
    "B4": "Before",
    "CU": "See You",
    "CUL8R": "See You Later",
    "CYA": "See You",
    "FAQ": "Frequently Asked Questions",
    "FC": "Fingers Crossed",
    "FWIW": "For What It's Worth",
    "FYI": "For Your Information",
    "GAL": "Get A Life",
    "GG": "Good Game",
    "GN": "Good Night",
    "GMTA": "Great Minds Think Alike",
    "GR8": "Great!",
    "G9": "Genius",
    "IC": "I See",
    "ICQ": "I Seek you",
    "ILU": "I Love You",
    "IMHO": "In My Honest/Humble Opinion",
    "IMO": "In My Opinion",
    "IOW": "In Other Words",
    "IRL": "In Real Life",
    "KISS": "Keep It Simple, Stupid",
    "LDR": "Long Distance Relationship",
    "LMAO": "Laughing My A.. Off",
    "LOL": "Laughing Out Loud",
    "LTNS": "Long Time No See",
    "L8R": "Later",
    "MTE": "My Thoughts Exactly",
    "M8": "Mate",
    "NRN": "No Reply Necessary",
    "OIC": "Oh I See",
    "PITA": "Pain In The A..",
    "PRT": "Party",
    "PRW": "Parents Are Watching",
    "QPSA?": "Que Pasa?",
    "ROFL": "Rolling On The Floor Laughing",
    "ROFLOL": "Rolling On The Floor Laughing Out Loud",
    "ROTFLMAO": "Rolling On The Floor Laughing My A.. Off",
    "SK8": "Skate",
    "STATS": "Your sex and age",
    "ASL": "Age, Sex, Location",
    "THX": "Thank You",
    "TTFN": "Ta-Ta For Now!",
    "TTYL": "Talk To You Later",
    "U": "You",
    "U2": "You Too",
    "U4E": "Yours For Ever",
    "WB": "Welcome Back",
    "WTF": "What The F...",
    "WTG": "Way To Go!",
    "WUF": "Where Are You From?",
    "W8": "Wait...",
    "7K": "Sick:-D Laugher",
    "TFW": "That feeling when",
    "MFW": "My face when",
    "MRW": "My reaction when",
    "IFYP": "I feel your pain",
    "TNTL": "Trying not to laugh",
    "JK": "Just kidding",
    "IDC": "I don’t care",
    "ILY": "I love you",
    "IMU": "I miss you",
    "ADIH": "Another day in hell",
    "ZZZ": "Sleeping, bored, tired",
    "WYWH": "Wish you were here",
    "TIME": "Tears in my eyes",
    "BAE": "Before anyone else",
    "FIMH": "Forever in my heart",
    "BSAAW": "Big smile and a wink",
    "BWL": "Bursting with laughter",
    "BFF": "Best friends forever",
    "CSL": "Can’t stop laughing"
}

million_joke_df['joke'] = million_joke_df['joke'].replace(slang_dict, regex=True)


In [40]:
million_joke_df['joke'] = million_joke_df['joke'].str.lower()

In [42]:
def convert_emojis_and_remove_colons(text):
    text_with_emojis = emoji.demojize(text)
    return re.sub(r':(\w+):', r'\1', text_with_emojis)

In [43]:
million_joke_df['joke'] = million_joke_df['joke'].apply(convert_emojis_and_remove_colons)

In [None]:
def correct_spelling(text):
    blob = TextBlob(text)
    corrected_text = blob.correct()
    return str(corrected_text)
million_joke_df['joke'] = million_joke_df['joke'].apply(correct_spelling)

In [69]:
nlp = sp.load("en_core_web_sm")
def tokenize_text(text):
    doc = nlp(text)
    return [token.text for token in doc]

In [74]:
tokenized_jokes = []
sz = len(million_joke_df)

for i in range(sz):
    joke = million_joke_df.iloc[i]['joke']  # Use .iloc to access the row by position
    tokens = tokenize_text(joke)           
    tokenized_jokes.append(tokens)         
    
    if i % 10000 == 0:
        print(f"Progress: {((i+1)/sz)*100:.2f}%")

million_joke_df['tokens'] = tokenized_jokes

Progress: 0.00%
Progress: 1.07%
Progress: 2.13%
Progress: 3.20%
Progress: 4.26%
Progress: 5.33%
Progress: 6.39%
Progress: 7.46%
Progress: 8.53%
Progress: 9.59%
Progress: 10.66%
Progress: 11.72%
Progress: 12.79%
Progress: 13.85%
Progress: 14.92%
Progress: 15.99%
Progress: 17.05%
Progress: 18.12%
Progress: 19.18%
Progress: 20.25%
Progress: 21.31%
Progress: 22.38%
Progress: 23.44%
Progress: 24.51%
Progress: 25.58%
Progress: 26.64%
Progress: 27.71%
Progress: 28.77%
Progress: 29.84%
Progress: 30.90%
Progress: 31.97%
Progress: 33.04%
Progress: 34.10%
Progress: 35.17%
Progress: 36.23%
Progress: 37.30%
Progress: 38.36%
Progress: 39.43%
Progress: 40.50%
Progress: 41.56%
Progress: 42.63%
Progress: 43.69%
Progress: 44.76%
Progress: 45.82%
Progress: 46.89%
Progress: 47.95%
Progress: 49.02%
Progress: 50.09%
Progress: 51.15%
Progress: 52.22%
Progress: 53.28%
Progress: 54.35%
Progress: 55.41%
Progress: 56.48%
Progress: 57.55%
Progress: 58.61%
Progress: 59.68%
Progress: 60.74%
Progress: 61.81%
Progres

In [75]:
million_joke_df.sample(5)

Unnamed: 0,score,joke,tokens
345332,10,if money doesn’t grow on trees..then why do ba...,"[if, money, does, n’t, grow, on, trees, .., th..."
714785,1,why is 6 afraid of seven?[removed],"[why, is, 6, afraid, of, seven?[removed, ]]"
747849,1,youpdate of an old one[removed],"[youpdate, of, an, old, one[removed, ]]"
307101,0,"a rapist, a bigot, and a pathological liar wal...","[a, rapist, ,, a, bigot, ,, and, a, pathologic..."
367415,5,i hate all this sex on the television[deleted],"[i, hate, all, this, sex, on, the, television[..."


In [76]:
million_joke_df.to_csv('million_jokes.csv', index=False)