# Text Processing steps
- Strip tags
- Strip punctuation
- Strip multiple whitespaces
- Strip numeric
- Spell correction
- Remove Stopwords
- Strip words with length < 3
- Strip words with length > 25
- Lemmatization over stemming
- Stack Overflow similar word injection
- Github similar word injection

In [118]:
# Mount Google Drive into the Colab filesystem; every data path used in this
# notebook is located under /content/drive.
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


## Imports

In [0]:
import pandas as pd
from gensim import utils
from gensim.parsing.preprocessing import preprocess_string, strip_tags, \
strip_punctuation, strip_multiple_whitespaces, strip_numeric, \
remove_stopwords, strip_short
import nltk
from nltk.stem import WordNetLemmatizer
import progressbar
import os

In [120]:
# Fetch the WordNet corpus, which the WordNetLemmatizer imported above relies on.
nltk.download('wordnet')

[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

## Data Paths

### input data paths

In [0]:
# Root of the dataset on the mounted Drive. Changing this one constant
# relocates every raw input file below.
DATA_ROOT = "/content/drive/My Drive/documents/projects/DeCaf/data/new"

# Raw (unprocessed) CSV for each split.
train_data_path = DATA_ROOT + "/train_data/raw/combined.csv"
test_data_path = DATA_ROOT + "/test_data/raw/test.csv"
validation_data_path = DATA_ROOT + "/validation_data/raw/validation.csv"

### Output data paths

In [0]:
# Root of the dataset on the mounted Drive. Changing this one constant
# relocates every processed output file below.
DATA_ROOT = "/content/drive/My Drive/documents/projects/DeCaf/data/new"

# Destination CSV for the processed version of each split.
processed_train_data_path = DATA_ROOT + "/train_data/processed/combined.csv"
processed_test_data_path = DATA_ROOT + "/test_data/processed/test.csv"
processed_validation_data_path = DATA_ROOT + "/validation_data/processed/validation.csv"

## Read data

In [0]:
# Load the three raw splits, in train / test / validation order.
train_data, test_data, validation_data = (
  pd.read_csv(csv_path)
  for csv_path in (train_data_path, test_data_path, validation_data_path)
)

## Text processing pipeline

In [0]:
def lemmitizer(s):
  """Lemmatize each whitespace-separated token of ``s`` with WordNet.

  The string is split with ``str.split()``, every token is passed to
  ``WordNetLemmatizer.lemmatize`` with its default arguments, and the
  results are joined back together with single spaces.
  """
  lemmatize = WordNetLemmatizer().lemmatize
  return " ".join(map(lemmatize, s.split()))

In [0]:
def strip_long(s, maxsize=25):
  """Drop every whitespace-separated token of ``s`` longer than ``maxsize``.

  ``s`` is first converted with gensim's ``utils.to_unicode``; the surviving
  tokens are returned joined by single spaces.
  """
  text = utils.to_unicode(s)
  kept = [token for token in text.split() if not len(token) > maxsize]
  return " ".join(kept)

In [0]:
def process(series):
  """Clean every text in ``series`` and return the results as a new Series.

  Each value is run through gensim's ``preprocess_string`` with the filters
  below, applied in the listed order, and the resulting tokens are re-joined
  with single spaces.

  Args:
    series: pandas Series of raw text.

  Returns:
    A new Series of cleaned strings carrying the index and name of
    ``series``. The input is left unmodified, so re-running the calling cell
    always starts from the raw text instead of re-processing earlier output.
  """
  # NOTE(review): the order below differs from the step list at the top of the
  # notebook (long-token removal and lemmatization run before tags and
  # punctuation are stripped). It is kept as-is to preserve existing outputs;
  # confirm whether that order is intended.
  custom_filters = [
    strip_long,
    lemmitizer,
    strip_tags,
    strip_punctuation,
    strip_multiple_whitespaces,
    strip_numeric,
    remove_stopwords,
    strip_short,
  ]
  total = series.shape[0]
  pbar = progressbar.ProgressBar(max_value=total)
  cleaned = []
  # Count rows by position, not by index label: labels need not be 0..n-1
  # (e.g. after filtering or shuffling), which made the previous
  # `series.iloc[label]` write hit the wrong row or raise.
  for done, sentence in enumerate(series, start=1):
    cleaned.append(" ".join(preprocess_string(sentence, custom_filters)))
    pbar.update(done)
  # Flush the bar so it ends at 100% instead of the last throttled update.
  pbar.finish()
  # dtype=object keeps an empty result from being inferred as float.
  return pd.Series(cleaned, index=series.index, name=series.name, dtype=object)

## Process train data

In [127]:
# Clean the raw training text, then assemble a frame from the cleaned text
# together with the untouched `tags` and `label` columns.
processed_series = process(train_data['text'])
processed_train_data = pd.DataFrame({
  'text': processed_series,
  'tags': train_data['tags'],
  'label': train_data['label'],
})

 99% (199996 of 200000) |############### | Elapsed Time: 0:11:15 ETA:   0:00:00

In [0]:
# Write the processed training split to the output path declared in the
# "Output data paths" cell (same location as the literal previously repeated
# here). An existing file is never overwritten, so delete it first when the
# pipeline has changed and the output must be regenerated.
if not os.path.exists(processed_train_data_path):
  # to_csv does not create missing directories.
  os.makedirs(os.path.dirname(processed_train_data_path), exist_ok=True)
  processed_train_data.to_csv(processed_train_data_path, index=False)

## Process test data

In [132]:
# Clean the raw test text, then assemble a frame from the cleaned text
# together with the untouched `tags` and `label` columns.
processed_series = process(test_data['text'])
processed_test_data = pd.DataFrame({
  'text': processed_series,
  'tags': test_data['tags'],
  'label': test_data['label'],
})

 99% (29900 of 30000) |################# | Elapsed Time: 0:00:25 ETA:   0:00:00

In [0]:
# NOTE(review): this file name ("test_data.csv") differs from the
# `processed_test_data_path` declared in the "Output data paths" cell
# (".../processed/test.csv"). The literal is kept so the file written here does
# not move; confirm which name downstream code expects.
test_output_path = "/content/drive/My Drive/documents/projects/DeCaf/data/new/test_data/processed/test_data.csv"

# An existing file is never overwritten, so delete it first when the pipeline
# has changed and the output must be regenerated.
if not os.path.exists(test_output_path):
  # to_csv does not create missing directories.
  os.makedirs(os.path.dirname(test_output_path), exist_ok=True)
  processed_test_data.to_csv(test_output_path, index=False)

## Process validation data

In [134]:
# Clean the raw validation text, then assemble a frame from the cleaned text
# together with the untouched `tags` and `label` columns.
processed_series = process(validation_data['text'])
processed_validation_data = pd.DataFrame({
  'text': processed_series,
  'tags': validation_data['tags'],
  'label': validation_data['label'],
})

 99% (29997 of 30000) |################# | Elapsed Time: 0:00:24 ETA:   0:00:00

In [0]:
# NOTE(review): this file name ("validation_data.csv") differs from the
# `processed_validation_data_path` declared in the "Output data paths" cell
# (".../processed/validation.csv"). The literal is kept so the file written
# here does not move; confirm which name downstream code expects.
validation_output_path = "/content/drive/My Drive/documents/projects/DeCaf/data/new/validation_data/processed/validation_data.csv"

# An existing file is never overwritten, so delete it first when the pipeline
# has changed and the output must be regenerated.
if not os.path.exists(validation_output_path):
  # to_csv does not create missing directories.
  os.makedirs(os.path.dirname(validation_output_path), exist_ok=True)
  processed_validation_data.to_csv(validation_output_path, index=False)