In [1]:
import nltk
import spacy
import unidecode
import contractions
import pandas as pd
from word2number import w2n
from bs4 import BeautifulSoup
from nltk.corpus import stopwords
from sklearn.feature_extraction.text import TfidfVectorizer

In [2]:
df = pd.read_csv("train.csv")

In [18]:
nlp = spacy.load('en_core_web_md')

#Delete HTML tags
def strip_html_tags(text):
    """remove html tags from text"""
    soup = BeautifulSoup(text, "html.parser")
    stripped_text = soup.get_text(separator=" ")
    return stripped_text

#Delete extra white spaces
def remove_whitespace(text):
    """remove extra whitespaces from text"""
    text = text.strip()
    return " ".join(text.split())

#Take away the accents
def remove_accented_chars(text):
    """remove accented characters from text, e.g. café"""
    text = unidecode.unidecode(text)
    return text


def expand_contractions(text):
    """expand shortened words, e.g. don't to do not"""
    text = contractions.fix(text)
    return text


def text_preprocessing(text, accented_chars=True, contractions=True, 
                       convert_num=True, extra_whitespace=True, 
                       lemmatization=True, lowercase=True, punctuations=True,
                       remove_html=True, remove_num=True, special_chars=True, 
                       stop_words=True):
    """preprocess text with default option set to true for all steps"""
    if remove_html == True: #remove html tags
        text = strip_html_tags(text)
    if extra_whitespace == True: #remove extra whitespaces
        text = remove_whitespace(text)
    if accented_chars == True: #remove accented characters
        text = remove_accented_chars(text)
    if contractions == True: #expand contractions
        text = expand_contractions(text)
    if lowercase == True: #convert all characters to lowercase
        text = text.lower()

    doc = nlp(text) #tokenise text

    clean_text = []
    
    for token in doc:
        flag = True
        edit = token.text
        # remove stop words
        if stop_words == True and token.is_stop and token.pos_ != 'NUM': 
            flag = False
        # remove punctuations
        if punctuations == True and token.pos_ == 'PUNCT' and flag == True: 
            flag = False
        # remove special characters
        if special_chars == True and token.pos_ == 'SYM' and flag == True: 
            flag = False
        # convert tokens to base form
        elif lemmatization == True and token.lemma_ != "-PRON-" and flag == True:
            edit = token.lemma_
        # append tokens edited and not removed to list 
        if edit != "" and flag == True:
            clean_text.append(edit) 
    clean_text=' '.join(token for token in clean_text)
    return clean_text

In [19]:
df['description_cleaned'] = df['description'].apply(text_preprocessing)

**Checking all stop words are removed and we don't need to add more to the list**

In [20]:
combination=df['description_cleaned'].str.cat(sep=' ')
words = combination.split()
wordfreq = {}
for word in words:
    if word not in wordfreq:
        wordfreq[word] = 0 
    wordfreq[word] += 1

In [21]:
sorted(wordfreq.items(), key=lambda item: item[1], reverse=True)

[('sleeve', 13023),
 ('pocket', 11833),
 ('feature', 11090),
 ('button', 10199),
 ('fastening', 9879),
 ('long', 9410),
 ('cm', 8386),
 ('height', 7226),
 ('zip', 6258),
 ('round', 6239),
 ('detail', 5903),
 ('elastic', 5839),
 ('print', 5362),
 ('neck', 5188),
 ('model', 5077),
 ('neckline', 4987),
 ('hem', 4888),
 ('short', 4873),
 ('shirt', 4702),
 ('177', 4130),
 ('high', 4116),
 ('adjustable', 3663),
 ('69.6', 3547),
 ('trim', 3518),
 ('contrast', 3482),
 ('strap', 3343),
 ('waistband', 3314),
 ('waist', 3229),
 ('collar', 3144),
 ('metal', 3122),
 ('design', 3049),
 ('patch', 2935),
 ('fabric', 2803),
 ('t', 2704),
 ('cuff', 2583),
 ('applique', 2572),
 ('dress', 2543),
 ('cotton', 2398),
 ('trouser', 2291),
 ('fit', 2141),
 ('note', 2132),
 ('shoulder', 1990),
 ('v', 1936),
 ('leather', 1860),
 ('snap', 1740),
 ('flap', 1722),
 ('heel', 1620),
 ('welt', 1619),
 ('fragrance', 1556),
 ('closure', 1532),
 ('chest', 1523),
 ('length', 1481),
 ('vent', 1473),
 ('ruffle', 1463),
 ('co

In [22]:
df.head()

Unnamed: 0,name,description,description_cleaned
0,CROPPED JACKET TRF,Jacket made of a technical fabric with texture...,jacket technical fabric texture high collar lo...
1,OVERSIZED SHIRT WITH POCKET TRF,Oversized long sleeve shirt with a round colla...,oversized long sleeve shirt round collar featu...
2,TECHNICAL TROUSERS TRF,High-waist trousers with a matching elastic wa...,high waist trouser matching elastic waistband ...
3,SHIRT DRESS,Collared dress featuring sleeves falling below...,collared dress feature sleeve fall elbow cuff ...
4,PUFF SLEEVE DRESS WITH PLEATS TRF,Loose-fitting midi dress with a round neckline...,loose fitting midi dress round neckline short ...


In [23]:
df.drop('description', axis=1, inplace=True)
df.head()

Unnamed: 0,name,description_cleaned
0,CROPPED JACKET TRF,jacket technical fabric texture high collar lo...
1,OVERSIZED SHIRT WITH POCKET TRF,oversized long sleeve shirt round collar featu...
2,TECHNICAL TROUSERS TRF,high waist trouser matching elastic waistband ...
3,SHIRT DRESS,collared dress feature sleeve fall elbow cuff ...
4,PUFF SLEEVE DRESS WITH PLEATS TRF,loose fitting midi dress round neckline short ...


In [24]:
df.to_csv('Cleaned_train.csv')