# Libraries

In [1]:
import numpy as np
import pandas as pd

import re
import string

from IPython.display import clear_output

import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer

In [4]:
# One-time NLTK resource downloads.
# The calls below are wrapped in a triple-quoted string, so running this cell
# downloads nothing: the cell simply evaluates to that string, which is why the
# string is echoed back as the cell output. To execute the downloads, remove
# the opening and closing triple-quote lines.
# NOTE(review): these look like the resources behind word_tokenize, stopwords
# and WordNetLemmatizer imported above -- confirm before a fresh-kernel run.

"""
nltk.download('punkt')
nltk.download('stopwords')
nltk.download('wordnet')

clear_output()
"""

"\nnltk.download('punkt')\nnltk.download('stopwords')\nnltk.download('wordnet')\n\nclear_output()\n"

In [5]:
# Read the cleaned dataset from disk and display it
data = pd.read_csv(filepath_or_buffer="../data/clean_data.csv")
data

Unnamed: 0,title,genre,summary,word_count
0,Drowned Wednesday,fantasy,Drowned Wednesday is the first Trustee among ...,803
1,The Lost Hero,fantasy,"As the book opens, Jason awakens on a school ...",563
2,The Eyes of the Overworld,fantasy,Cugel is easily persuaded by the merchant Fia...,334
3,Magic's Promise,fantasy,The book opens with Herald-Mage Vanyel return...,776
4,Taran Wanderer,fantasy,Taran and Gurgi have returned to Caer Dallben...,1190
...,...,...,...,...
3097,Wintersmith,fantasy,Tiffany Aching is a trainee witch — now workin...,132
3098,Fantastic Beasts and Where to Find Them: The O...,fantasy,J.K. Rowling's screenwriting debut is captured...,117
3099,Hounded,fantasy,"Atticus O’Sullivan, last of the Druids, lives ...",144
3100,Red Rising,fantasy,"""I live for the dream that my children will be...",244


# Basic Cleaning

In [7]:
# Clean unnecessary text from the string
def basic_clean(text):
    """Normalize raw text into lowercase, space-separated content words.

    Steps, in order:
      1. lowercase the text;
      2. replace HTML-like tags (``<...>``) with a space;
      3. replace every ASCII punctuation character with a space;
      4. tokenize with NLTK ``word_tokenize``;
      5. drop English stopwords;
      6. drop tokens of three characters or fewer.

    Parameters
    ----------
    text : str
        Raw text to clean. Any non-string value (for example the NaN that
        pandas uses for a missing cell, or None) is treated as empty text.

    Returns
    -------
    str
        The surviving tokens joined by single spaces, or "" if none survive.
    """
    # Missing cells have no .lower(); return empty text instead of raising.
    if not isinstance(text, str):
        return ""

    text = text.lower()

    # Tags must be stripped BEFORE punctuation: once '<' and '>' have been
    # replaced by spaces there is nothing left for this pattern to match.
    # Note the pattern also removes any same-line span between a literal
    # '<' and a later '>'.
    text = re.sub('<.*?>+', ' ', text)

    # Replace punctuation with spaces so adjacent words are not glued together.
    text = re.sub('[%s]' % re.escape(string.punctuation), ' ', text)

    # Load the stopword list once per call (not once per token) and use a set
    # for constant-time membership checks.
    stop_words = set(stopwords.words('english'))

    # Tokens contain no whitespace, so joining with single spaces already
    # yields normalized spacing; no further regex clean-up is needed.
    kept = [
        token for token in word_tokenize(text)
        if token not in stop_words and len(token) > 3
    ]
    return " ".join(kept)

In [8]:
# Run basic_clean over both text columns (title and summary), then preview
for text_column in ('title', 'summary'):
    data[text_column] = data[text_column].apply(basic_clean)
data.head()

Unnamed: 0,title,genre,summary,word_count
0,drowned wednesday,fantasy,drowned wednesday first trustee among morrow d...,803
1,lost hero,fantasy,book opens jason awakens school unable remembe...,563
2,eyes overworld,fantasy,cugel easily persuaded merchant fianosther att...,334
3,magic promise,fantasy,book opens herald mage vanyel returning countr...,776
4,taran wanderer,fantasy,taran gurgi returned caer dallben following ev...,1190


# Preprocessing

In [6]:
def data_preprocess(text):
    """Tokenize text and lemmatize every token.

    Parameters
    ----------
    text : str
        Text to process. Any non-string value (for example the NaN that
        pandas uses for a missing cell, or None) is treated as empty text.

    Returns
    -------
    str
        The lemmatized tokens joined by single spaces.
    """
    # Missing values cannot be tokenized; return empty text instead of raising.
    if not isinstance(text, str):
        return ""

    # One lemmatizer per call rather than a new instance for every token.
    lemmatizer = WordNetLemmatizer()

    # Tokenization followed by lemmatization (lemmatize() default settings).
    return " ".join(lemmatizer.lemmatize(token) for token in word_tokenize(text))

In [7]:
# Concatenate title and summary (space-separated), then tokenize and
# lemmatize the combined text into the new title_summary column
data['title_summary'] = (
    data['title']
    .str.cat(data['summary'], sep=" ")
    .apply(data_preprocess)
)
data.head()

Unnamed: 0,title,genre,summary,word_count,title_summary
0,drowned wednesday,fantasy,drowned wednesday first trustee among morrow d...,803,drowned wednesday drowned wednesday first trus...
1,lost hero,fantasy,book opens jason awakens school unable remembe...,563,lost hero book open jason awakens school unabl...
2,eyes overworld,fantasy,cugel easily persuaded merchant fianosther att...,334,eye overworld cugel easily persuaded merchant ...
3,magic promise,fantasy,book opens herald mage vanyel returning countr...,776,magic promise book open herald mage vanyel ret...
4,taran wanderer,fantasy,taran gurgi returned caer dallben following ev...,1190,taran wanderer taran gurgi returned caer dallb...


# Export preprocessed data

In [8]:
# Write the preprocessed frame to CSV without the index column;
# the utf-8-sig encoding prepends a byte-order mark to the file
data.to_csv(path_or_buf="../data/preprocess_data.csv", encoding='utf-8-sig', index=False)