In [1]:
import pandas as pd
import os
import pickle as pkl
from data_transforms import CondenseStoryTransforms, VocabularyTransforms, TextTransforms, SeriesTransforms
from data_tranform_pipeline import DataTransformPipeline

In [2]:
def get_genre_data(data_dir="data", cache_path="raw_genre_data.pkl"):
    """Load one DataFrame per genre file, caching the result as a pickle.

    If ``cache_path`` already exists it is unpickled and returned as-is;
    the data directory is not consulted in that case. Otherwise every
    regular, non-hidden file in ``data_dir`` is parsed as a pipe-separated
    table and the resulting mapping is written to ``cache_path``.

    Args:
        data_dir: Directory holding the pipe-separated genre files.
        cache_path: Location of the pickle cache to read from / write to.

    Returns:
        dict mapping the lower-cased file name up to its first "."
        (e.g. "Romance.csv" -> "romance") to the parsed DataFrame.
    """
    # NOTE(security): unpickling can execute arbitrary code. Only load a
    # cache file that this function wrote itself; never a downloaded one.
    if os.path.exists(cache_path):
        with open(cache_path, "rb") as f:
            return pkl.load(f)

    # Cache miss: read the raw files and persist them for the next run.
    genre_data = {}

    # sorted() keeps the dict's insertion order reproducible across
    # platforms, since os.listdir() returns entries in arbitrary order.
    for file in sorted(os.listdir(data_dir)):
        path = os.path.join(data_dir, file)
        # Skip hidden entries (.DS_Store, .ipynb_checkpoints, ...) and
        # sub-directories: read_csv would fail on them or store them
        # under a meaningless "" key.
        if file.startswith(".") or not os.path.isfile(path):
            continue
        df = pd.read_csv(path, sep="|", index_col=False)
        genre_data[file.lower().split(".")[0]] = df

    # Plain write mode is enough; the handle is never read back.
    with open(cache_path, "wb") as f:
        pkl.dump(genre_data, f, pkl.HIGHEST_PROTOCOL)

    return genre_data

In [7]:
# Load every genre table (served from the pickle cache when it exists),
# then keep only the "story" column of the romance table for the steps below.
genre_data = get_genre_data()
romances = genre_data["romance"]["story"]

In [18]:
# Preview the raw text: split the first story on "." and show the pieces at
# positions 2 and 3 (a naive sentence split, good enough for a quick look).
romances[0].split(".")[2:4]

[' The main pairing, as you probably already know, is Maven x Mare',
 ' I mean sure in the story they had a bit of problems, but I still ship them with my entire heart']

In [4]:
# Assemble the preprocessing pipeline over the romance stories. Each add()
# call registers one transform; they are listed in the order they were added.
pl = DataTransformPipeline(romances)
pl.add(SeriesTransforms.values)
# NOTE(review): args=(2, 2) presumably selects two sentences starting at
# sentence index 2 (it lines up with the manual split(".")[2:4] preview) --
# confirm against CondenseStoryTransforms.first_sentences.
pl.add(CondenseStoryTransforms.first_sentences, args = (2, 2))
pl.add(TextTransforms.to_strings)
pl.add(TextTransforms.tokenize)

# NOTE(review): the meaning of the "tokenize_test" argument is not visible in
# this notebook (possibly a run or cache label) -- confirm in
# DataTransformPipeline.apply.
tokenized_romances = pl.apply("tokenize_test")

Applying 'values'
Applying 'first_sentences'
Applying 'to_strings'
Applying 'tokenize'


In [16]:
" ".join(tokenized_romances[0])

'The main pairing , probably know , Maven x Mare . I mean sure story bit problems , I ship entire heart .'