In [4]:
import pandas as pd
import numpy as np
import string
import re


import re, string, unicodedata
import nltk
import contractions
import inflect
from bs4 import BeautifulSoup
from nltk import word_tokenize, sent_tokenize
from nltk.corpus import stopwords
from nltk.stem import LancasterStemmer, WordNetLemmatizer

In [5]:
def load_data(filename):
    """Read a CSV and return a new frame with only its 'article' and 'tags' columns.

    ``filename`` is handed straight to ``pd.read_csv``. The returned
    DataFrame has the columns in the order 'article', 'tags'; a missing
    column raises ``KeyError``.
    """
    raw = pd.read_csv(filename)
    # .copy() so the result is independent of the frame read from disk.
    return raw[['article', 'tags']].copy()

In [6]:
# Location of the raw training CSV, relative to the notebook's working directory.
filename = './new_dataset/train.csv'
# load_data keeps only the 'article' and 'tags' columns of that file.
train_data = load_data(filename)

In [7]:
# Pull the two columns out of the loaded frame as separate Series.
train_X = train_data['article']
train_Y = train_data['tags']

<h1><i>Article Preprocessing</i></h1>

In [8]:
def strip_html(text):
    """Parse ``text`` with BeautifulSoup's "html.parser" and return its ``get_text()`` result."""
    return BeautifulSoup(text, "html.parser").get_text()

def remove_between_square_brackets(text):
    """Delete every ``[...]`` span (brackets included) from ``text``.

    A span runs from a ``[`` to the first following ``]``; text without
    brackets is returned unchanged.
    """
    # Raw string: in a plain literal '\[' and '\]' are invalid escape
    # sequences, which newer Python versions warn about.
    return re.sub(r'\[[^]]*\]', '', text)


def replace_contractions(text):
    """Return ``text`` after passing it through ``contractions.fix``."""
    expanded = contractions.fix(text)
    return expanded

def denoise_text(text):
    """Run ``text`` through the raw-text cleanup steps, in order.

    Steps: strip HTML, drop ``[...]`` spans, then expand contractions.
    """
    steps = (strip_html, remove_between_square_brackets, replace_contractions)
    for step in steps:
        text = step(text)
    return text

In [9]:
# train_X = train_X.apply(denoise_text)

In [10]:
def remove_non_ascii(words):
    """Return a new list with non-ASCII characters stripped from each token.

    Each token is NFKD-normalised first, so accented letters keep their
    ASCII base letter. Tokens that end up empty are kept as ''.
    """
    def _ascii_only(token):
        decomposed = unicodedata.normalize('NFKD', token)
        return decomposed.encode('ascii', 'ignore').decode('utf-8', 'ignore')

    return [_ascii_only(token) for token in words]

def to_lowercase(words):
    """Return a new list holding the lowercased form of each token in ``words``."""
    return [token.lower() for token in words]

def remove_punctuation(words):
    """Strip punctuation from each token and drop tokens that become empty.

    Every character that is neither a word character nor whitespace is
    removed; underscores count as word characters and are kept.
    """
    stripped = (re.sub(r'[^\w\s]', '', token) for token in words)
    return [token for token in stripped if token != '']

def replace_numbers(words):
    """Replace each all-digit token in ``words`` with its textual representation.

    Tokens for which ``str.isdigit()`` is false are passed through untouched.
    """
    engine = inflect.engine()
    return [
        engine.number_to_words(token) if token.isdigit() else token
        for token in words
    ]

def remove_stopwords(words):
    """Return the tokens of ``words`` that are not English stop words.

    Matching is exact and case-sensitive against NLTK's 'english'
    stop-word list. Token order is preserved.
    """
    # Fetch the stop-word list once and keep it in a set. Calling
    # stopwords.words('english') inside the loop rebuilt the list for
    # every token and then searched it linearly.
    english_stopwords = set(stopwords.words('english'))
    return [word for word in words if word not in english_stopwords]

def stem_words(words):
    """Return the Lancaster stem of each token in ``words``, in order."""
    lancaster = LancasterStemmer()
    return [lancaster.stem(token) for token in words]

def lemmatize_verbs(words):
    """Return the WordNet lemma of each token in ``words``, treating every token as a verb."""
    wordnet = WordNetLemmatizer()
    return [wordnet.lemmatize(token, pos='v') for token in words]

def normalize(words):
    """Apply the token-level cleanup steps to ``words`` and return the result.

    Order: strip non-ASCII, lowercase, strip punctuation, drop stop words.
    """
    # replace_numbers is currently disabled and therefore not in the pipeline.
    pipeline = (
        remove_non_ascii,
        to_lowercase,
        remove_punctuation,
        remove_stopwords,
    )
    for step in pipeline:
        words = step(words)
    return words

In [11]:
def tokenize_and_normalize(text):
    """Tokenize ``text`` with NLTK, normalize the tokens, and rejoin them with single spaces."""
    tokens = normalize(nltk.word_tokenize(text))
    return " ".join(tokens)

<i>Uncomment the following cells to rerun the preprocessing from scratch (estimated time: 6 hours).</i>

In [12]:
# train_X = train_X.apply(tokenize_and_normalize)

In [13]:
# import os
# Saving_train = pd.DataFrame(columns=['article'])
# Saving_train['article'] = train_X
# path = r'C:\Users\adilf\Deeplearning Projects\hackerearth#4\process-dataset'
# Saving_train.to_csv(os.path.join(path,r'train_X.csv'))

In [14]:
def clean_text(text):
    """Remove digits and one-character tokens from ``text``.

    Steps:
      1. Replace whitespace-delimited runs of digits with a single space.
      2. Delete every remaining ASCII digit (e.g. 'de4f' -> 'def').
      3. Split on whitespace, keep tokens of length >= 2, and rejoin
         them with single spaces.
    """
    text = re.sub(r'(\s\d+\s)', ' ', text)
    # '+' rather than '*': '[0-9]*' also matches the empty string at every
    # position, which is wasted work and does not change the output.
    text = re.sub(r'[0-9]+', '', text)

    kept = [token for token in text.split() if len(token) >= 2]
    return " ".join(kept)

In [15]:
# train_X = train_X.apply(clean_text)

In [16]:
# Saving_train = pd.DataFrame(columns=['article'])
# Saving_train['article'] = train_X
# path = r'C:\Users\adilf\Deeplearning Projects\hackerearth#4\process-dataset'
# Saving_train.to_csv(os.path.join(path,r'train_XClean.csv'))

<i>Load the preprocessed data</i>

In [17]:
# Read the previously saved preprocessing outputs from process-dataset/.
# dtype=object asks pandas to keep every column as generic objects rather
# than inferring numeric types.
data_X = pd.read_csv('process-dataset/train_XClean.csv',dtype=object)
data_Y = pd.read_csv('process-dataset/train_Y.csv',dtype=object)
data_tags = pd.read_csv('process-dataset/tags.csv',dtype=object)

In [18]:
# Select the working columns from the loaded frames.
# Note: train_X and train_Y rebind the names assigned from train_data earlier.
train_X = data_X['article']
train_Y = data_Y['tags']
tags = data_tags['tags']

In [19]:
# Cast every entry to str before the string-based processing that follows.
train_X = train_X.astype(str)
train_Y = train_Y.astype(str)
tags = tags.astype(str)

<h1><i>Tags Preprocessing</i></h1>

In [20]:
# NOTE(review): train_Y was already cast to str in the previous cell, so this repeat cast looks redundant.
train_Y = train_Y.astype(str)

In [21]:
def process_labels(text):
    """Return ``text`` with every '|' separator turned into a single space."""
    return " ".join(text.split("|"))

In [22]:
def process_tags(labels):
    """Collect the distinct whitespace-separated tags found in ``labels``.

    ``labels`` is a pandas Series of strings, each holding zero or more
    tags separated by whitespace. Returns a one-column DataFrame
    ('tags') with one row per distinct tag, sorted alphabetically.
    """
    # NOTE(review): a later cell defines another function that is also named
    # process_tags; once that cell runs, this definition is no longer reachable.
    unique_tags = set()
    for label in labels.values:
        unique_tags.update(label.split())

    # Sorted so the row order is the same on every run; iterating a set of
    # strings directly can yield a different order per interpreter session.
    return pd.DataFrame(sorted(unique_tags), columns=['tags'])

<i>Uncomment the following cells to process the tags and save them.</i>

In [23]:
# train_Y = train_Y.apply(process_labels)

In [24]:
# Saving_train = pd.DataFrame(columns=['tags'])
# Saving_train = tags
# path = r'C:\Users\adilf\Deeplearning Projects\hackerearth#4\process-dataset'
# Saving_train.to_csv(os.path.join(path,r'tags.csv'))

<i>Preprocess the tags</i>

In [32]:
def process_tags(tag):
    """Return ``tag`` with every hyphen and period removed.

    Example: 'cocos2d-x-2.x' -> 'cocos2dx2x'.
    """
    # One raw-string character class replaces the two separate patterns
    # '\-' and '\.', which are invalid escape sequences in plain literals.
    return re.sub(r'[-.]', '', tag)

In [33]:
# Run process_tags on every tag; the result goes into `temp`, and `tags` itself is not reassigned.
temp = tags.apply(process_tags)

In [37]:
# First 20 tags as loaded, for comparison with the processed version.
tags.head(20)

0                        tombstone
1                    cocos2d-x-2.x
2                         dart-rpc
3                      dart-editor
4                        wikimedia
5     terraform-provider-openstack
6                          ui-sref
7                     ubuntu-cloud
8          react-native-scrollview
9               nhibernate-mapping
10                 boost-date-time
11                   stacked-chart
12                          ebcdic
13                    user-defined
14                  yii2-extension
15       personal-software-process
16               python-decorators
17                   viewrendering
18                             zos
19                       escodegen
Name: tags, dtype: object

In [36]:
# First 20 tags after process_tags was applied.
temp.head(20)

0                      tombstone
1                     cocos2dx2x
2                        dartrpc
3                     darteditor
4                      wikimedia
5     terraformprovideropenstack
6                         uisref
7                    ubuntucloud
8          reactnativescrollview
9              nhibernatemapping
10                 boostdatetime
11                  stackedchart
12                        ebcdic
13                   userdefined
14                 yii2extension
15       personalsoftwareprocess
16              pythondecorators
17                 viewrendering
18                           zos
19                     escodegen
Name: tags, dtype: object