In [1]:
import pandas as pd
from bs4 import BeautifulSoup
from nltk.corpus import stopwords
import re
import string

In [2]:
# (topic name, CSV path) pairs, loaded in this order.
# Note that the "physics" frame is read from test.csv.
csv_sources = [
    ("cooking", "../../input/cooking.csv"),
    ("crypto", "../../input/crypto.csv"),
    ("robotics", "../../input/robotics.csv"),
    ("biology", "../../input/biology.csv"),
    ("travel", "../../input/travel.csv"),
    ("diy", "../../input/diy.csv"),
    ("physics", "../../input/test.csv"),
]

# One DataFrame per topic, keyed by topic name.
dataframes = {topic: pd.read_csv(csv_path) for topic, csv_path in csv_sources}

## Removing HTML tags and URIs from contents

In [3]:
# Case-insensitive pattern used to delete URIs from free text. A match starts
# with "http://" / "https://", a "www." prefix (optionally with up to three
# digits), or a domain-like token with a 2-4 letter suffix followed by "/".
# It may contain parenthesised groups and must not end on trailing
# punctuation such as ".", ",", quotes or brackets.
uri_re = r'(?i)\b((?:https?://|www\d{0,3}[.]|[a-z0-9.\-]+[.][a-z]{2,4}/)(?:[^\s()<>]+|\(([^\s()<>]+|(\([^\s()<>]+\)))*\))+(?:\(([^\s()<>]+|(\([^\s()<>]+\)))*\)|[^\s`!()\[\]{};:\'".,<>?«»“”‘’]))'

def stripTagsAndUris(x):
    """Return the plain text of an HTML fragment without code snippets or URIs.

    Args:
        x: HTML markup to clean. Falsy values (``None``, ``""``) yield ``""``.

    Returns:
        The text of ``x`` with every ``<code>`` element (tag and content)
        dropped and every match of ``uri_re`` deleted.
    """
    if not x:
        return ""

    soup = BeautifulSoup(x, "html.parser")

    # ``soup.code`` only ever returns the FIRST <code> tag in the tree, so a
    # single decompose() would leave any later code blocks in the text.
    # Keep removing the first remaining one until none is left; this also
    # copes with nested <code> tags, which vanish together with their parent.
    code_tag = soup.code
    while code_tag is not None:
        code_tag.decompose()
        code_tag = soup.code

    # Extract the remaining text, then strip out all URIs.
    return re.sub(uri_re, "", soup.get_text())

In [4]:
# Clean the "content" column of every frame in place.
# This could take a while.
for df in dataframes.values():
    raw_content = df["content"]
    df["content"] = raw_content.map(stripTagsAndUris)

## Removing punctuation from titles and contents

In [5]:
def removePunctuation(x):
    """Lowercase ``x`` and blank out non-ASCII and punctuation characters.

    Args:
        x: The text to normalise.

    Returns:
        ``x`` lowercased, with every non-ASCII character and every character
        of ``string.punctuation`` replaced by a single space. Runs of spaces
        are not collapsed.
    """
    # Lowercasing all words
    x = x.lower()
    # Replacing non-ASCII chars with spaces
    x = re.sub(r'[^\x00-\x7f]', r' ', x)
    # string.punctuation contains regex-significant characters; spliced
    # unescaped into a character class, its "\]" is read as an escaped "]"
    # and the backslash itself is never matched. Escaping it first makes
    # every punctuation character (backslash included) a literal set member.
    punctuation_class = "[" + re.escape(string.punctuation) + "]"
    # Replacing all punctuation with spaces
    return re.sub(punctuation_class, " ", x)

In [6]:
# Normalise both text columns of every frame in place (title first, then content).
for df in dataframes.values():
    for column in ("title", "content"):
        df[column] = df[column].map(removePunctuation)

## Removing stopwords from titles and contents

In [7]:
# English stopword list from NLTK, held in a set for fast membership tests.
stops = set(stopwords.words("english"))


def removeStopwords(x):
    """Return ``x`` with every whitespace-separated token found in ``stops`` removed.

    The surviving tokens are re-joined with single spaces, so any original
    runs of whitespace are collapsed.
    """
    kept_tokens = filter(lambda token: token not in stops, x.split())
    return " ".join(kept_tokens)

In [8]:
# Drop stopwords from both text columns of every frame in place (title first, then content).
for df in dataframes.values():
    for column in ("title", "content"):
        df[column] = df[column].map(removeStopwords)

## Splitting the tags string into a list of tags

In [11]:
for name, df in dataframes.items():
    # The 'physics' frame is left untouched; every other frame gets its
    # whitespace-separated tag string turned into a list of tags.
    if name == 'physics':
        continue
    df["tags"] = df["tags"].map(lambda tag_string: tag_string.split())

0        [ribosome, binding-sites, translation, synthet...
1                                      [rna, biochemistry]
2                   [immunology, cell-biology, hematology]
3                                           [cell-culture]
4            [splicing, mrna, spliceosome, introns, exons]
5                   [dna, biochemistry, molecular-biology]
6                                 [neuroscience, synapses]
7                                               [plasmids]
8        [molecular-genetics, gene-expression, experime...
9                  [evolution, mitochondria, chloroplasts]
10                           [high-throughput, cell-based]
11                  [molecular-biology, synthetic-biology]
12                              [bioinformatics, homework]
13                              [neuroscience, immunology]
14                                     [splicing, histone]
15                      [genomics, gene-annotation, exons]
16                     [microbiology, virology, influenz

## Saving preprocessed dataframes to csv

In [15]:
for name, df in dataframes.items():
    # Each frame is written to ../../input_light/<name>_light.csv without the index column.
    output_path = "../../input_light/" + name + "_light.csv"
    df.to_csv(output_path, index=False)