In [1]:
import pandas as pd
import numpy as np
from bs4 import BeautifulSoup
import re
from nltk.corpus import stopwords
from textblob import TextBlob
from textblob import Word
import warnings

In [2]:
_ENGLISH_STOPWORDS = None  # filled on first use by _english_stopwords()


def _english_stopwords():
    """Return the NLTK English stop words as a set, built once and then reused."""
    global _ENGLISH_STOPWORDS
    if _ENGLISH_STOPWORDS is None:
        _ENGLISH_STOPWORDS = frozenset(stopwords.words("english"))
    return _ENGLISH_STOPWORDS


def text_to_words(raw_text):
    """
    Convert a raw (possibly HTML) text into a single string of normalized words.

    Processing steps, in order:
      1. strip markup with BeautifulSoup (html5lib parser);
      2. replace every character that is not an ASCII letter with a space;
      3. lower-case and split on whitespace;
      4. drop English stop words;
      5. lemmatize each remaining word twice: once with ``lemmatize()``
         and its default arguments, then again as a verb (``'v'``).

    Parameters
    ----------
    raw_text : str
        The raw text. ``None`` and float NaN are treated as empty text.

    Returns
    -------
    str
        The preprocessed words joined by single spaces ("" when nothing is left).
    """
    # Missing values carry no text; return "" instead of handing a non-string
    # to the HTML parser. This mirrors the fillna("") the notebook applies.
    if raw_text is None or (isinstance(raw_text, float) and raw_text != raw_text):
        return ""

    text = BeautifulSoup(raw_text, "html5lib").get_text()
    letters_only = re.sub("[^a-zA-Z]", " ", text)
    words = letters_only.lower().split()

    # The stop-word set is identical for every call, so it is cached rather
    # than rebuilt once per row.
    stops = _english_stopwords()
    meaningful_words = " ".join(w for w in words if w not in stops)

    # Word(...) re-wraps the first lemma so that it can be lemmatized again
    # with the verb part-of-speech tag.
    lemma_words = [
        Word(w.lemmatize()).lemmatize('v')
        for w in TextBlob(meaningful_words).words
    ]
    return " ".join(lemma_words)


def cleaning(data, column):
    """
    Preprocess one or more text columns of a DataFrame with ``text_to_words``.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame holding the text columns. It is modified in place: every
        selected column is overwritten with its cleaned text.
    column : str or list of str
        A single column name, or a list of column names, to clean.

    Returns
    -------
    pandas.DataFrame
        The same ``data`` object that was passed in (not a copy). When
        ``column`` is neither a string nor a list, a message is printed and
        ``data`` is returned unchanged.
    """
    if isinstance(column, str):
        targets = [column]
    elif isinstance(column, list):
        targets = column
    else:
        print("Please input a string or a list of column name.")
        return data

    for name in targets:
        print("Cleaning %s..." % name)
        # Walk the values themselves instead of data[name][i]: looking up the
        # positions 0..len-1 as index labels is only correct on a default
        # RangeIndex. On a filtered frame it raises KeyError, and on a
        # permuted integer index it silently pairs rows with the wrong text.
        # Assigning a plain list writes the results back positionally.
        data[name] = [text_to_words(value) for value in data[name]]
    return data

## Raw Training Data

In [3]:
# Read the raw training set and preview its first five rows.
train = pd.read_csv(filepath_or_buffer="./data/train.csv")
train.head(n=5)

Unnamed: 0,id,query,product_title,product_description,median_relevance,relevance_variance
0,1,bridal shower decorations,Accent Pillow with Heart Design - Red/Black,Red satin accent pillow embroidered with a hea...,1,0.0
1,2,led christmas lights,Set of 10 Battery Operated Multi LED Train Chr...,Set of 10 Battery Operated Train Christmas Lig...,4,0.0
2,4,projector,ViewSonic Pro8200 DLP Multimedia Projector,,4,0.471
3,5,wine rack,Concept Housewares WR-44526 Solid-Wood Ceiling...,"Like a silent and sturdy tree, the Southern En...",4,0.0
4,7,light bulb,Wintergreen Lighting Christmas LED Light Bulb ...,"WTGR1011\nFeatures\nNickel base, 60,000 averag...",2,0.471


## Cleaned Training Data

In [4]:
# Suppress all warnings from this point on (global: later cells are affected too).
warnings.filterwarnings('ignore')
# Replace missing values with "" so every text cell is a string, and drop the
# id column. The drop is chained rather than done in place, and
# errors="ignore" lets the cell be re-run after `id` has already been removed
# (the in-place drop raised KeyError on a second run).
train = train.fillna("").drop(columns="id", errors="ignore")
# cleaning() overwrites the text columns of the frame it receives and returns
# that same frame, so `train` and `train_clean` refer to one object.
train_clean = cleaning(train, ["query", "product_title", "product_description"])
train_clean.head()

Cleaning query...
Cleaning product_title...
Cleaning product_description...


Unnamed: 0,query,product_title,product_description,median_relevance,relevance_variance
0,bridal shower decoration,accent pillow heart design red black,red satin accent pillow embroider heart black ...,1,0.0
1,lead christmas light,set battery operate multi lead train christmas...,set battery operate train christmas light item...,4,0.0
2,projector,viewsonic pro dlp multimedia projector,,4,0.471
3,wine rack,concept housewares wr solid wood ceiling wall ...,like silent sturdy tree southern enterprise bi...,4,0.0
4,light bulb,wintergreen light christmas lead light bulb pack,wtgr feature nickel base average hour acrylic ...,2,0.471


In [None]:
# Load the test set and blank out missing values in one step.
test = pd.read_csv("./data/test.csv").fillna("")
# Keep the ids aside before the column is removed from the frame.
idx = test.loc[:, "id"].values
test = test.drop(columns="id")
# Clean the same three text columns as for the training set.
test_clean = cleaning(
    data=test,
    column=["query", "product_title", "product_description"],
)

Cleaning query...
Cleaning product_title...


In [14]:
# Persist both cleaned frames as CSV, leaving the row index out of the files.
train_clean.to_csv(path_or_buf="train_clean.csv", index=False)
test_clean.to_csv(path_or_buf="test_clean.csv", index=False)