# Create Out-of-domain File

In [1]:
import re
from typing import List

## Importing subreddit files

In [2]:
def read_txt(filepath: str) -> List[str]:
    """Read a text file and return its lines.

    Args:
        filepath: Path of the file to read.

    Returns:
        The lines of the file in order, exactly as produced by
        ``readlines`` (each line keeps its trailing newline, if any).
    """
    # The context manager closes the handle even if reading raises.
    with open(filepath, "r") as f:
        return f.readlines()
    
def write_txt(filepath: str, text: List[str]) -> None:
    """Write ``text`` to ``filepath``, one item per line.

    A newline is appended to every item, so items are expected to be
    passed without their own trailing newline. Any existing file at
    ``filepath`` is overwritten.

    Args:
        filepath: Path of the file to write.
        text: Lines to write, in order.
    """
    # The context manager closes (and flushes) the handle even if a write raises.
    with open(filepath, "w") as f:
        f.writelines(line + "\n" for line in text)
    

In [3]:
def clean_subreddit_file(domain: str, filename: str, regex: str) -> None:
    """Clean one raw subreddit file and write the result to the clean folder.

    Reads ``./text-classifier/<domain>-domain/raw/<filename>.txt``, removes
    every match of ``regex`` from each line, strips surrounding whitespace,
    and writes the surviving lines to
    ``./text-classifier/<domain>-domain/clean/<filename>.txt``.

    Duplicates are removed *after* cleaning, so lines that only differed by
    removed text or by whitespace collapse into one. The first occurrence
    wins and the original file order is kept, which makes the output
    reproducible from run to run. Lines that are empty after cleaning are
    dropped.

    Args:
        domain: Domain prefix used in the directory name (e.g. "in", "out").
        filename: File name without the ``.txt`` extension.
        regex: Pattern whose matches are deleted from every line.
    """
    raw_filepath = "./text-classifier/{}-domain/raw/{}.txt".format(domain, filename)
    clean_filepath = "./text-classifier/{}-domain/clean/{}.txt".format(domain, filename)

    pattern = re.compile(regex)

    seen = set()
    text = []
    for line in read_txt(raw_filepath):
        # regex cleaning first, then whitespace removal
        cleaned = pattern.sub("", line).strip()

        # skip lines with nothing left, and anything already emitted
        if not cleaned or cleaned in seen:
            continue

        seen.add(cleaned)
        text.append(cleaned)

    # writing list to file
    write_txt(clean_filepath, text)
    

In [4]:
def any_case_regex(word: str) -> str:
    """Build a regex fragment that matches ``word`` in any letter casing.

    Each cased character becomes a two-way character class, e.g. ``"who"``
    becomes ``"[Ww][Hh][Oo]"``. Characters without distinct upper/lower
    forms (digits, punctuation, whitespace) are emitted as escaped literals
    so that regex metacharacters such as ``^`` or ``]`` cannot change the
    meaning of the pattern.

    Args:
        word: The literal text to match case-insensitively.

    Returns:
        The regex fragment; an empty string for an empty ``word``.
    """
    parts = []

    for letter in word:
        lower = letter.lower()
        upper = letter.upper()

        if lower == upper:
            # Caseless character: a class would be redundant, and unsafe for
            # metacharacters, so match it literally.
            parts.append(re.escape(letter))
        else:
            parts.append("[{}{}]".format(upper, lower))

    return "".join(parts)

# --- Building blocks -------------------------------------------------------
RE_OPEN_BRACKET = r"[\[\(]?"         # optional "[" or "("
RE_CLOSE_BRACKET = r"[\]\)]?"        # optional "]" or ")"
RE_UNTIL_ALPHANUM = r"([\W]+)"       # run of non-word characters (separators)
RE_SECOND_WORD = r"([\w]+)([\W]+)"   # one word plus the separators after it
# Starts with "|" on purpose: appended to another pattern it becomes an
# extra alternative that matches a "?" at the end of the line.
RE_END_QUESTION_MARK = r"|\?$"

# Alternation of the question words, each matched in any casing.
_RE_QUESTION_WORDS = "({})".format(
    "|".join(
        any_case_regex(word)
        for word in ("who", "what", "where", "which", "how", "when", "why", "whose")
    )
)

# Question word anchored at the start of the line.
RE_REMOVE_QUESTIONS = r"^" + _RE_QUESTION_WORDS

# Non-greedy so that "[a] text [b]" loses only the two tags; a greedy ".+"
# would delete everything from the first "[" to the last "]".
RE_SQUARE_BRACKETS = r"\[.+?\]"
RE_ROUND_BRACKETS = r"\(.+?\)"


def _tag_prefix_regex(tag: str) -> str:
    """Pattern for an optionally bracketed subreddit tag and its separators."""
    # "\b" keeps the tag from matching inside a longer word; without it
    # "til" would match the tail of "until " and corrupt the title.
    return RE_OPEN_BRACKET + r"\b" + any_case_regex(tag) + RE_CLOSE_BRACKET + RE_UNTIL_ALPHANUM


RE_SERIOUS = any_case_regex("serious")
RE_NSFW = any_case_regex("nsfw")
# Leading question word (optionally bracketed) plus the word after it, or a
# trailing "?". The "^" must come before the optional bracket: placed after
# it, the bracket could never consume a character.
RE_QUESTIONS = (
    r"^"
    + RE_OPEN_BRACKET
    + _RE_QUESTION_WORDS
    + RE_CLOSE_BRACKET
    + RE_UNTIL_ALPHANUM
    + RE_SECOND_WORD
    + RE_END_QUESTION_MARK
)
RE_ELI5 = _tag_prefix_regex("eli5")
# Optional "by" after the tag; the trailing "\b" stops it from eating the
# start of words such as "bypassing".
RE_TIFU = _tag_prefix_regex("tifu") + "(" + any_case_regex("by") + r"\b)?"
RE_LPT = _tag_prefix_regex("lpt")
RE_TIL = _tag_prefix_regex("til")
RE_YSK = _tag_prefix_regex("ysk")

In [5]:
# (subreddit, cleaning pattern) pairs for the out-of-domain corpus.
# Every subreddit has its square-bracket tags removed; a few also get a
# subreddit-specific pattern added as a second alternative.
out_domain = [
    ("AskReddit", "|".join((RE_SQUARE_BRACKETS, RE_QUESTIONS))),
    ("CasualConversation", RE_SQUARE_BRACKETS),
    ("CrazyIdeas", RE_SQUARE_BRACKETS),
    ("explainlikeimfive", "|".join((RE_SQUARE_BRACKETS, RE_ELI5))),
    ("Jokes", RE_SQUARE_BRACKETS),
    ("lifehacks", RE_SQUARE_BRACKETS),
    ("LifeProTips", "|".join((RE_SQUARE_BRACKETS, RE_LPT))),
    ("mildlyinteresting", RE_SQUARE_BRACKETS),
    ("news", RE_SQUARE_BRACKETS),
    ("nosleep", RE_SQUARE_BRACKETS),
    ("pettyrevenge", RE_SQUARE_BRACKETS),
    ("ShowerThoughts", RE_SQUARE_BRACKETS),
    ("tifu", "|".join((RE_SQUARE_BRACKETS, RE_TIFU))),
    ("todayilearned", "|".join((RE_SQUARE_BRACKETS, RE_TIL))),
    ("worldnews", RE_SQUARE_BRACKETS),
    ("YouShouldKnow", "|".join((RE_SQUARE_BRACKETS, RE_YSK))),
]


# Clean each raw out-of-domain file into its "clean" counterpart.
for filename, regex in out_domain:
    clean_subreddit_file(domain="out", filename=filename, regex=regex)

In [11]:
def combine_files(domain: str, subreddits: List[str]) -> None:
    """Concatenate the cleaned subreddit files of a domain into one file.

    Reads ``./text-classifier/<domain>-domain/clean/<subreddit>.txt`` for
    each subreddit, in the given order, and writes all lines to
    ``./text-classifier/<domain>-domain/<domain>-domain.txt``, overwriting
    any existing file.

    Args:
        domain: Domain prefix used in the directory and output file names.
        subreddits: Names of the cleaned files (without ``.txt``) to combine.
    """
    combined = []

    # All inputs are read before the output is opened, so a missing input
    # file raises before the existing combined file is truncated.
    for subreddit in subreddits:
        path = "./text-classifier/{}-domain/clean/{}.txt".format(domain, subreddit)
        combined.extend(read_txt(path))

    # The context manager closes the handle even if a write raises.
    with open("./text-classifier/{0}-domain/{0}-domain.txt".format(domain), "w") as f:
        for line in combined:
            # Guard against an input whose last line has no newline; it
            # would otherwise fuse with the first line of the next file.
            f.write(line if line.endswith("\n") else line + "\n")

In [12]:
# Merge every cleaned out-of-domain subreddit file into one corpus file.
combine_files("out", [subreddit for subreddit, _pattern in out_domain])

# Create In-domain File

In [13]:
# (subreddit, cleaning pattern) pairs for the in-domain corpus; only the
# square-bracket tags are removed from these titles.
in_domain = [(subreddit, RE_SQUARE_BRACKETS) for subreddit in ("Food", "FoodPorn")]


# Clean each raw in-domain file into its "clean" counterpart.
for filename, regex in in_domain:
    clean_subreddit_file(domain="in", filename=filename, regex=regex)

In [14]:
# Merge every cleaned in-domain subreddit file into one corpus file.
combine_files("in", [subreddit for subreddit, _pattern in in_domain])