In [None]:
import pandas as pd
import numpy as np
import os
import re
from nltk import word_tokenize,sent_tokenize,pos_tag_sents,pos_tag
import time

## 1. Read data.

In [None]:
# Root directory for the input and output files, taken from the DATADIR
# environment variable.
DATADIR = os.getenv('DATADIR')
if DATADIR is None:
    # os.getenv returns None for an unset variable; os.path.join(None, ...)
    # would then fail with an opaque TypeError, so fail early and clearly.
    raise RuntimeError(
        "Environment variable DATADIR is not set; "
        "point it at the directory containing labelled_level2.csv.gz."
    )
# Full path of the gzip-compressed labelled CSV used as input.
DATAPATH = os.path.join(DATADIR, 'labelled_level2.csv.gz')

In [None]:
# Load the gzip-compressed CSV at DATAPATH into a DataFrame.
data = pd.read_csv(DATAPATH,compression='gzip')
# Show (rows, columns) as the cell output.
data.shape

In [None]:
# Preview the first rows of the loaded frame.
data.head()

### 1.1 UTILS.

In [None]:
# List the column names of the loaded frame.
data.columns

In [None]:
def build_index(x):
    """Map each element of ``x`` to its 1-based position.

    The key ``'index'`` is always present and starts at position 0; the
    elements of ``x`` follow as 1, 2, 3, ... in iteration order. When an
    element occurs more than once (or equals ``'index'``), its last position
    wins.

    Parameters
    ----------
    x : iterable of hashable
        Names to index, e.g. the columns of a DataFrame.

    Returns
    -------
    dict
        Mapping from name to integer position.
    """
    positions = {'index': 0}
    positions.update((name, pos) for pos, name in enumerate(x, start=1))
    return positions

In [None]:
# Map every column name to its 1-based position ('index' is reserved for 0).
ind = build_index(data.columns)
# Spot-check: position assigned to the 'combined_text' column.
print(ind['combined_text'])

## 2. Keep nouns, adjectives and non-auxiliary verbs; drop punctuation and all other tokens.


In [None]:
def is_junk(word):
    """Return True when ``word`` has fewer than two characters.

    The empty string and any single-character token count as junk.
    """
    return len(word) <= 1

def is_aux(word):
    """Return True when ``word`` starts with one of the listed auxiliary verbs.

    Recognised forms: is, am, are, be, been, have, has, was, were, will and
    would, each with a lower- or upper-case first letter. The pattern is
    matched at the start of ``word`` and must end on a word boundary, so
    "be" matches while "being" does not.
    """
    aux_pattern = "\\b[iI]s\\b|\\b[aA](m|re)\\b|\\b[bB](een|e)\\b|\\b[hH](ave|as)\\b|\\b[wW](as|ere|ill|ould)\\b"
    return re.match(aux_pattern, word) is not None

In [None]:
# Quick manual checks of is_aux and is_junk.
test = "been"
# "been" is one of the auxiliary forms in the pattern, so this prints True.
print(is_aux(test))
test2 = ""
# The empty string is shorter than two characters, so this prints True.
print(is_junk(test2))

In [None]:
def reduce_text(x):
    """Keep only the nouns, adjectives and non-auxiliary verbs of a text.

    The text is tokenized and POS-tagged with NLTK. A token is kept when it
    is not junk (``is_junk``) and its tag starts with ``NN``, with ``JJ``,
    or with ``VB`` while the token is not an auxiliary (``is_aux``).

    Parameters
    ----------
    x : str or missing value
        Raw text. Missing values (``None``, ``NaN``, ``pd.NA``) are treated
        as empty text so the function can be mapped over columns with gaps.

    Returns
    -------
    str
        The kept tokens in their original order, joined by single spaces;
        ``""`` when nothing is kept or the input is missing.
    """
    # Missing cells are not strings and cannot be tokenized; without this
    # guard a single empty field aborts the whole Series.map call.
    if not isinstance(x, str) and pd.isna(x):
        return ""
    kept = [
        word
        for word, tag in pos_tag(word_tokenize(x))
        if not is_junk(word)
        and (
            tag.startswith("NN")
            or tag.startswith("JJ")
            or (tag.startswith("VB") and not is_aux(word))
        )
    ]
    # Join once instead of growing a string inside the loop.
    return " ".join(kept)

### 2.1 Filter all text fields.

In [None]:
# Filter the 'title' column with reduce_text and store the result in a new 'reduced_title' column.
data['reduced_title'] = data['title'].map(reduce_text)

In [None]:
# Filter the 'description' column with reduce_text and store the result in a new 'reduced_desc' column.
data['reduced_desc'] = data['description'].map(reduce_text)

In [None]:
# Filter the 'body' column with reduce_text and store the result in a new 'reduced_body' column.
data['reduced_body'] = data['body'].map(reduce_text)

In [None]:
# Filter the 'combined_text' column with reduce_text and store the result in a new 'reduced_combined' column.
data['reduced_combined'] = data['combined_text'].map(reduce_text)

### 2.2 Overwrite the original text columns with their filtered versions and drop the temporary columns.

In [None]:
# Inspect the columns, which now include the four reduced_* columns.
data.columns

In [None]:
# Overwrite each raw text column with its filtered counterpart.
for raw_col, reduced_col in (
    ('combined_text', 'reduced_combined'),
    ('title', 'reduced_title'),
    ('body', 'reduced_body'),
    ('description', 'reduced_desc'),
):
    data[raw_col] = data[reduced_col]

In [None]:
# Remove the four reduced_* helper columns from the frame in place.
data.drop(
    columns=['reduced_combined', 'reduced_title', 'reduced_body', 'reduced_desc'],
    inplace=True,
)

In [None]:
# Preview the frame after the reduced_* columns have been dropped.
data.head()

## 3. Write to file.

In [None]:
# Name of the gzip-compressed CSV that receives the filtered data.
file = 'labelled_level2_filtered_all_beta.csv.gz'
# The output is written inside DATADIR, the same directory the input was read from.
OUTPUT= os.path.join(DATADIR, file)
# NOTE(review): no index= argument is passed, so pandas' default applies and
# the row index is written as an extra leading column — confirm downstream
# readers expect that.
data.to_csv(OUTPUT,compression='gzip')