In [1]:
import pandas as pd

In [3]:
posts_file = "data/rspct_autos.tsv"
posts_df = pd.read_csv(posts_file, sep="\t")

subred_file = "data/subreddit_info.csv.gz"
subred_df = pd.read_csv(subred_file).set_index(["subreddit"])

df = posts_df.join(subred_df, on="subreddit")

### Blueprint: Standardizing Attribute Names

In [9]:
df.columns

Index(['id', 'subreddit', 'title', 'selftext', 'category_1', 'category_2',
       'category_3', 'in_data', 'reason_for_exclusion'],
      dtype='object')

In [10]:
column_mapping = {
    "id": "id",
    "subreddit": "subreddit",
    "title": "title",
    "selftext": "text",
    "category_1": "category",
    "category_2": "subcategory",
    "category_3": None,  # No data.
    "in_data": None,  # Not needed.
    "reason_for_exclusion": None,  # Not needed.
}

# Define remaining columns.
columns = [c for c in column_mapping.keys() if column_mapping[c] != None]

# Select and rename those columns.
df = df[columns].rename(columns=column_mapping)

# Limit the data to autos category.
df = df[df["category"] == "autos"]
df.sample(1).T

Unnamed: 0,8020
id,63wxeo
subreddit,mazda3
title,Blew my motor :(
text,my 08 hatch spun a rod bearing the other day a...
category,autos
subcategory,mazda


**Saving and loading a Dataframe**

Storing in SQL has more advantages over pickle.

In [12]:
# df.to_pickle('reddit_dataframe.pkl')

import sqlite3

db_name = "reddit-selfposts.db"
con = sqlite3.connect(db_name)
df.to_sql("posts", con, index=False, if_exists="replace")
con.close()

In [13]:
con = sqlite3.connect(db_name)
df = pd.read_sql("select * from posts", con)
con.close()

## Cleaning Text Data

### Blueprint: Identify Noise with Regular Expressions

In [20]:
import re

RE_SUSPICIOUS = re.compile(r"[&#<>{}\[\]\\]")


def impurity(text, min_len=10):
    """Returns the share of suspicious characters in a text."""
    if text == None or len(text) < min_len:
        return 0
    else:
        return len(RE_SUSPICIOUS.findall(text)) / len(text)


text = """
After viewing the [PINKIEPOOL Trailer](https://www.youtu.be/watch?v=ieHRoHUg)     
it got me thinking about the best match ups.     
<lb>Here's my take:<lb><lb>[](/sp)[](/ppseesyou) Deadpool<lb>[](/sp)[](/ajsly)     
Captain America<lb>"""

impurity(text)

0.08438818565400844

In [21]:
# Add new column to data frame.
df["impurity"] = df["text"].apply(impurity, min_len=10)

# Get the top 3 records.
df[["text", "impurity"]].sort_values(by="impurity", ascending=False)

Unnamed: 0,text,impurity
19682,Looking at buying a 335i with 39k miles and 11...,0.214716
12357,I'm looking to lease an a4 premium plus automa...,0.165099
2730,Breakdown below:<lb><lb>Elantra GT<lb><lb>2.0L...,0.139130
12754,Bulbs Needed:<lb><lb><lb>**194 LED BULB x8**<l...,0.132411
10726,I currently have a deposit on a 2013 335is (CP...,0.129317
...,...,...
15867,All the wash places around me are very expensi...,0.000000
8311,I've recently been having some issues with my ...,0.000000
15865,My 2006 9-3 has gone into limp home mode and I...,0.000000
15864,I'm currently looking at cars (online) and I c...,0.000000


In [22]:
%run exploration.py

In [25]:
count_words(df, column="text", preprocess=lambda t: re.findall(r"<[\w/]*>", t))

Unnamed: 0_level_0,freq
token,Unnamed: 1_level_1
<lb>,100729
<tab>,642


### Blueprint: Removing Noise with Regular Expressions

In [26]:
import html


def clean(text):
    # convert html escapes like &amp; to characters.
    text = html.unescape(text)

    # tags like <tab>
    text = re.sub(r"<[^<>]*>", " ", text)

    # markdown URLs like [Some text](https://....)
    text = re.sub(r"\[([^\[\]]*)\]\([^\(\)]*\)", r"\1", text)

    # text or code in brackets like [0]
    text = re.sub(r"\[[^\[\]]*\]", " ", text)

    # standalone sequences of specials, matches &# but not #cool
    text = re.sub(r"(?:^|\s)[&#<>{}\[\]+|\\:-]{1,}(?:\s|$)", " ", text)

    # standalone sequences of hyphens like --- or ==
    text = re.sub(r"(?:^|\s)[\-=\+]{2,}(?:\s|$)", " ", text)

    # sequences of white spaces
    text = re.sub(r"\s+", " ", text)

    return text.strip()

In [27]:
clean_text = clean(text)
clean_text

"After viewing the PINKIEPOOL Trailer it got me thinking about the best match ups. Here's my take: Deadpool Captain America"

In [29]:
print("Impurity:", impurity(clean_text))

Impurity: 0.0


In [31]:
df["clean_text"] = df["text"].map(clean)
df["impurity"] = df["clean_text"].apply(impurity, min_len=20)
df[["clean_text", "impurity"]].sort_values(by="impurity", ascending=False).head(3)

Unnamed: 0,clean_text,impurity
14058,"Mustang 2018, 2019, or 2020? Must Haves!! 1. H...",0.030864
18934,"At the dealership, they offered an option for ...",0.026455
16505,"I am looking at four Caymans, all are in a sim...",0.024631


### Blueprint: Character Normalization with textacy

In [32]:
import textacy.preprocessing as tprep

In [37]:
def normalize(text):
    text = tprep.normalize.hyphenated_words(text)
    text = tprep.normalize.quotation_marks(text)
    text = tprep.normalize.unicode(text)
    text = tprep.remove.accents(text)
    return text

In [38]:
text = "The café “Saint-Raphaël” is loca-\nted on Côte dʼAzur."
normalize(text)

'The cafe "Saint-Raphael" is located on Cote d\'Azur.'

### Blueprint: Pattern-Based Data Masking with textacy

In [41]:
from textacy.preprocessing.resources import RE_URL

In [43]:
count_words(df, column="clean_text", preprocess=RE_URL.findall).head(3)

Unnamed: 0_level_0,freq
token,Unnamed: 1_level_1
www.getlowered.com,3
http://www.ecolamautomotive.com/#!2/kv7fq,2
https://www.reddit.com/r/Jeep/comments/4ux232/just_ordered_an_android_head_unit_joying_jeep/,2


In [49]:
from textacy.preprocessing.replace import urls as replace_urls

text = "Check out https://spacy.io/usage/spacy-101"
print(replace_urls(text))

Check out _URL_


In [50]:
df["clean_text"] = df["clean_text"].map(replace_urls)
df["clean_text"] = df["clean_text"].map(normalize)

In [51]:
df.rename(columns={"text": "raw_text", "clean_text": "text"}, inplace=True)
df.drop(columns=["impurity"], inplace=True)

con = sqlite3.connect(db_name)
df.to_sql("posts_cleaned", con, index=False, if_exists="replace")
con.close()

## Tokenization