In [None]:
def clean(train_context):
    """Return a cleaned deep copy of ``train_context``.

    The input is not modified. The following steps run in order:

    1. Description columns: boilerplate or single-token descriptions are
       replaced with ``''`` and ``.org`` ``.mp4`` links are reduced to a few
       keywords (see ``useless_desc``).
    2. Description columns: any value containing ``"source_id"`` is blanked.
    3. All text columns: punctuation is removed, then space-prefixed digit
       runs are removed. Context (parent/child) columns keep ``[`` and ``]``.

    Non-string cells (NaN, ``None``, numbers) are passed through unchanged by
    every step.

    Args:
        train_context: Table indexed by column name whose columns support
            ``.apply`` (presumably a pandas DataFrame -- confirm with caller).
            It must contain every column listed in ``curr_text_cols`` and
            ``context_text_cols`` below.

    Returns:
        The cleaned deep copy of ``train_context``.
    """
    # Local import, as in the original: only this function needs it.
    from nltk.tokenize import word_tokenize

    curr_text_cols = [
        "topic_title",
        "topic_description",
        "content_title",
        "content_description",
        "content_text",
    ]

    context_text_cols = [
        "topic_parent_title",
        "topic_parent_description",
        "topic_child_title",
        "topic_child_description",
    ]

    description_cols = [
        col for col in curr_text_cols + context_text_cols
        if "description" in col
    ]

    # Work on a copy so the caller's data is left untouched.
    train_context = deepcopy(train_context)

    # ------------------------------------------------------------------
    # Replace uninformative descriptions with nothing.
    # ------------------------------------------------------------------
    unwanted = frozenset([
        "Additional resources and links.",
        "Test your knowledge.",
        "Entire lesson",
        "Teacher's Guide English version",
        "Test your knowledge!",
        "Written Transcript of this video lesson",
        "Read Teacher's Guide to this video lesson",
        "Written Transcript of this video lesson in English",
        "Written Transcript of this video lesson in Arabic",
        "Teacher's Guide Arabic version",
        "Abonnez vous pour recevoir nos nouveaux cours GRATUITS en HD",
        "Download Written Transcript of this video lesson",
    ])
    # Groups: 2 = domain name, 4 = second path segment, 5 = file stem.
    video_link_re = re.compile(
        r"https?://(www\.)?([\w-]+)\.org/([\w-]+)/([\w-]+)/([\w-]+)\.(mp4)"
    )

    def useless_desc(x):
        """Blank out boilerplate descriptions and summarise video links.

        The checks run on the text with ``source_url=`` removed and
        surrounding whitespace stripped:

        * a ``.org/.../.../<name>.mp4`` link -> ``"<domain> <segment> <name>"``
        * at most one whitespace-separated token, or a known boilerplate
          phrase -> ``''``
        * anything else -> the original value, unchanged
        """
        if not isinstance(x, str):
            # Missing values (NaN/None) carry no text to inspect.
            return x

        text = x.replace("source_url=", "").strip()

        match = video_link_re.search(text)
        if match:
            return match.group(2) + " " + match.group(4) + " " + match.group(5)

        # NOTE(review): the original comment said "1 letter descriptions" but
        # the code tested the token count; the token-count check is kept.
        if len(text.split()) <= 1 or text in unwanted:
            return ''
        return x

    for col in description_cols:
        train_context[col] = train_context[col].apply(useless_desc)

    # TODO: blank out duplicate titles (was an unfinished, commented-out stub).

    # ------------------------------------------------------------------
    # Stop-word removal (helper defined but not applied to any column yet).
    # ------------------------------------------------------------------
    stop_words_all = set().union(*(
        stopwords.words(language)
        for language in ("english", "spanish", "french", "arabic", "portuguese")
    ))

    def remove_stopwords(x):
        """Tokenise ``x`` and return the list of tokens that are not stop words."""
        if not isinstance(x, str):
            return x
        return [
            word for word in word_tokenize(x)
            if word.lower() not in stop_words_all
        ]

    # ------------------------------------------------------------------
    # Handle "source_id=" in descriptions.
    # ------------------------------------------------------------------
    def replace_sourceid(x):
        """Return ``''`` for strings containing ``"source_id"``; else ``x``."""
        if isinstance(x, str) and "source_id" in x:
            return ""
        return x

    for col in description_cols:
        train_context[col] = train_context[col].apply(replace_sourceid)

    # ------------------------------------------------------------------
    # Gibberish ids ('5ad59f7a6b9064043e263f03') and long links.
    # These helpers are defined but not applied to any column yet.
    # ------------------------------------------------------------------
    def remove_gibberish(x):
        """Drop any run of 20 or more word characters."""
        if not isinstance(x, str):
            return x
        return re.sub(r'\b\w{20,}\b', '', x)

    def link_shortener(x):
        """Replace each http(s) URL with just its ``name.tld`` host part."""
        if not isinstance(x, str):
            return x
        return re.sub(
            r'https?:\/\/(www\.)?([a-zA-Z0-9]+\.[a-zA-Z]{2,3})(\/\S*)?',
            r'\2',
            x,
        )

    # TODO: handle "Topic", "Section", "Chapter" text.

    # ------------------------------------------------------------------
    # Remove numbers and special characters.
    # ------------------------------------------------------------------
    def remove_numbers(x):
        """Remove every run of digits that is preceded by a space."""
        if not isinstance(x, str):
            return x
        return re.sub(r" \d+", "", x)

    # Translation tables are built once rather than once per cell.
    strip_all_punct = str.maketrans('', '', string.punctuation)
    strip_punct_keep_brackets = str.maketrans(
        '', '', string.punctuation.replace("[", "").replace("]", "")
    )

    def remove_chars(x, include_brackets=False):
        """Delete ASCII punctuation from ``x``.

        With ``include_brackets=True`` the characters ``[`` and ``]`` are
        kept in the output; all other punctuation is still removed.
        """
        if not isinstance(x, str):
            return x
        table = strip_punct_keep_brackets if include_brackets else strip_all_punct
        return x.translate(table)

    print("Removing special characters & numbers...")
    for col in tqdm(curr_text_cols, position=0, leave=True, total=len(curr_text_cols)):
        train_context[col] = train_context[col].apply(remove_chars)
        train_context[col] = train_context[col].apply(remove_numbers)

    for col in tqdm(context_text_cols, position=0, leave=True, total=len(context_text_cols)):
        # Context columns keep square brackets.
        train_context[col] = train_context[col].apply(remove_chars, include_brackets=True)
        train_context[col] = train_context[col].apply(remove_numbers)

    print("Finished")

    # TODO: remove leading or trailing whitespace (not implemented yet).

    return train_context