In [189]:
import pandas as pd
import os
import pickle as pkl
from data_transforms import CondenseStoryTransforms, VocabularyTransforms, TextTransforms, SeriesTransforms
from data_transform_pipeline import DataTransformPipeline
from combine_raw_data import get_combined_data

In [190]:
genre_data = get_combined_data()
genre_data.describe()

Unnamed: 0,author,book,genre,href,story
count,475,1032,1032,1032,1032
unique,318,4,115,1032,1032
top,ccmal,Princess-Diaries,Romance,/s/1657342/1/When-They-Were-Little,Laura\r\nanxiously sat on the stage. She was ...
freq,53,726,241,1,1


In [157]:
def split_genres(row):
    if "/" in row["genre"]:
        genres = row["genre"].split("/")
        for i, genre in enumerate(genres):
            row["genre{}".format(i+1)] = genre
        return row
    else:
        return row

splitted = genre_data.apply(split_genres, axis=1)

In [163]:
splitted.groupby("genre1")["story"].count()

genre1
Adventure        31
Angst            11
Crime             1
Drama            78
Family           15
Fantasy          12
Friendship        7
Horror            1
Humor            75
Hurt             18
Mystery           8
Parody            4
Poetry            8
Romance         369
Supernatural      5
Suspense          1
Tragedy           6
Western           2
Name: story, dtype: int64

In [159]:
splitted.groupby("genre2")["story"].count()

genre2
Adventure        21
Angst            29
Comfort          18
Drama           141
Family           16
Fantasy          18
Friendship       30
Horror            2
Humor           154
Hurt             23
Mystery           6
Parody            2
Poetry            5
Romance         155
Sci               4
Spiritual         1
Supernatural     10
Suspense          6
Tragedy           8
Western           3
Name: story, dtype: int64

In [164]:
splitted.groupby("genre3")["story"].count()

genre3
Comfort       23
Family         3
Friendship     4
Poetry         1
Romance        4
Name: story, dtype: int64

In [172]:
romances = genre_data[(splitted["genre1"] == "Romance") | (splitted["genre2"] == "Romance")]["story"]

In [179]:
romances.iloc[0].split(".")[2:4]

[" It's been a month since she left",
 " It's been a long and painful month of sitting in his new nest after his parents denied him as their son"]

In [180]:
pl = DataTransformPipeline(romances)
pl.add(SeriesTransforms.values)
pl.add(CondenseStoryTransforms.first_sentences, args = (2, 2))
pl.add(TextTransforms.to_strings)
pl.add(TextTransforms.tokenize)

tokenized_romances = pl.apply("tokenize_test")

Applying 'values'
Applying 'first_sentences'
Applying 'to_strings'
Applying 'tokenize'


In [181]:
" ".join(tokenized_romances[0])

"It 's month left . It 's long painful month sitting new nest parents denied son . Marus thrilled ."

In [182]:
pl2 = DataTransformPipeline(tokenized_romances)
pl2.add(VocabularyTransforms.to_word_embeddings)

embeds, idx_word_map, embedding_vocab = pl2.apply("tokenize_test")

Applying 'to_word_embeddings'
Reading files...
opening embeds/10k/wiki-news-300d-1M-embedding-1.pkl
opening embeds/10k/wiki-news-300d-1M-embedding-2.pkl


In [183]:
embeds[0]

tensor([  137,    24,   875,   214,     2,   137,    24,   387,  6373,   875,
         4386,   108,  8663,   856,  2806,  1066,     2, 10000, 10000,     2])

In [185]:
ply = DataTransformPipeline(romances)
ply.add(TextTransforms.to_categorical)
y = ply.apply("labels")

Applying 'to_categorical'
