In [1]:
import pandas as pd
import os
import pickle as pkl
from data_transforms import CondenseStoryTransforms, VocabularyTransforms, TextTransforms, SeriesTransforms
from data_tranform_pipeline import DataTransformPipeline

In [2]:
def get_genre_data(data_dir="data", cache_path="raw_genre_data.pkl"):
    """Load the per-genre story tables, caching them in a pickle file.

    If ``cache_path`` exists, its unpickled content is returned as-is and
    ``data_dir`` is not read. Otherwise every regular, non-hidden file in
    ``data_dir`` is parsed as a "|"-separated CSV, the resulting dict is
    pickled to ``cache_path`` and then returned.

    Parameters
    ----------
    data_dir : str
        Directory holding one "|"-separated file per genre.
    cache_path : str
        Pickle file used as the cache.

    Returns
    -------
    dict
        On a cache miss: maps the lower-cased file name, cut at its first
        ".", to the ``pandas.DataFrame`` read from that file. On a cache
        hit: whatever object the cache file contains.
    """
    # NOTE(security): unpickling can execute arbitrary code. Only point
    # cache_path at a file this notebook wrote itself.
    if os.path.exists(cache_path):
        with open(cache_path, "rb") as f:
            return pkl.load(f)

    genre_data = {}

    # Sorted so the dict order does not depend on the file system's listing order.
    for file in sorted(os.listdir(data_dir)):
        path = os.path.join(data_dir, file)
        # Skip hidden entries and directories (e.g. .ipynb_checkpoints,
        # .DS_Store): they are not genre tables and would either make
        # read_csv fail or end up stored under an empty key.
        if file.startswith(".") or not os.path.isfile(path):
            continue
        df = pd.read_csv(path, sep="|", index_col=False)
        genre_data[file.lower().split(".")[0]] = df

    # Plain "wb" is enough: the cache is only written here, never read back.
    with open(cache_path, "wb") as f:
        pkl.dump(genre_data, f, pkl.HIGHEST_PROTOCOL)

    return genre_data

In [3]:
# Load every genre table (served from the pickle cache when it exists) and
# keep the "story" column of the "romance" entry for the experiments below.
genre_data = get_genre_data()
romances = genre_data["romance"]["story"]

In [4]:
# Peek at the raw text: split story 0 on "." and show pieces 2 and 3.
# This is a naive sentence split, used here only to eyeball the data.
romances[0].split(".")[2:4]

[' The main pairing, as you probably already know, is Maven x Mare',
 ' I mean sure in the story they had a bit of problems, but I still ship them with my entire heart']

In [5]:
# Build the text-preprocessing pipeline for the romance stories. The
# "Applying ..." log below shows the transforms running in the order added.
pl = DataTransformPipeline(romances)
pl.add(SeriesTransforms.values)
# NOTE(review): args=(2, 2) presumably means "two sentences starting at
# sentence index 2" (the later output matches the [2:4] slice inspected
# above) - confirm against first_sentences.
pl.add(CondenseStoryTransforms.first_sentences, args = (2, 2))
pl.add(TextTransforms.to_strings)
pl.add(TextTransforms.tokenize)

# NOTE(review): the meaning of the string passed to apply() is not visible
# in this notebook (looks like a run/cache label) - confirm.
tokenized_romances = pl.apply("tokenize_test")

Applying 'values'
Applying 'first_sentences'
Applying 'to_strings'
Applying 'tokenize'


In [6]:
# Concatenate the tokens of the first story with no separator for a quick
# look at what survived tokenization.
# NOTE(review): compared with the raw sentences shown earlier, the output
# below is missing words such as "as", "you", "is" - presumably tokenize
# drops stop words; confirm.
"".join(tokenized_romances[0])

'Themainpairing,probablyknow,MavenxMare.Imeansurestorybitproblems,Ishipentireheart.'

In [7]:
# Second pipeline: feed the tokenized stories through to_word_embeddings.
# Its apply() result is unpacked into three values here.
pl2 = DataTransformPipeline(tokenized_romances)
pl2.add(VocabularyTransforms.to_word_embeddings)

# NOTE(review): this reuses the label "tokenize_test" from the tokenizing
# pipeline above. If apply() uses it as a cache/file key the two runs could
# collide - confirm and rename if so.
embeds, idx_word_map, embedding_vocab = pl2.apply("tokenize_test")

Applying 'to_word_embeddings'
Reading files...
opening embeds/10k/wiki-news-300d-1M-embedding-1.pkl
opening embeds/10k/wiki-news-300d-1M-embedding-2.pkl


In [8]:
# Inspect the first element of embeds; the output below shows a 1-D integer
# tensor.
# NOTE(review): the values are presumably vocabulary indices, one per token
# - confirm against to_word_embeddings.
embeds[0]

tensor([   20,  1197, 10000,     0,   419,   197,     0, 10000,  2256, 10000,
            2,    28,   625,   582,   413,  1303,   432,     0,    28,  1461,
         2264,  1342,     2])

In [9]:
# Turn the "genre" column of the romance table into labels via to_categorical.
# NOTE(review): only the romance table is used, so every label presumably
# belongs to a single class - confirm this is intended (e.g. a smoke test).
ply = DataTransformPipeline(genre_data["romance"]["genre"])
ply.add(TextTransforms.to_categorical)
y = ply.apply("labels")

Applying 'to_categorical'


In [11]:
from data import Data

# Wrap the embedded stories and their labels in the project's Data helper
# and pull a single batch from its loader for inspection.
g = Data(embeds, y)
l = g.get_loader()
# next(iter(...)) fetches only the first batch. Unlike a for/break loop it
# raises StopIteration when the loader is empty, instead of silently leaving
# `m` unset (or holding a stale value from an earlier run of this cell).
m = next(iter(l))

In [18]:
# Shape of the third element of the batch fetched above.
# NOTE(review): what m[2] holds (and why it is square in the output below)
# depends on Data's batching logic, which is not visible here - confirm.
m[2].shape

torch.Size([21, 21])