In [1]:
import pandas as pd
import os
import pickle as pkl
from data_transforms import CondenseStoryTransforms, VocabularyTransforms, TextTransforms, SeriesTransforms
from data_transform_pipeline import DataTransformPipeline
from combine_raw_data import get_combined_data, split_genres

In [2]:
# Load the combined raw dataset through the project helper from combine_raw_data.
data = get_combined_data()
# Summary statistics as the cell's displayed value; the recorded output reports
# count / unique / top / freq per column (author, book, genre, genre1-3, href, story).
data.describe()

Unnamed: 0,author,book,genre,genre1,genre2,genre3,href,story
count,475,1032,1032,652,652,35,1032,1032
unique,318,4,115,18,20,5,1032,1032
top,ccmal,Princess-Diaries,Romance,Romance,Romance,Comfort,/s/533186/1/Not-another-Mia-Michael-story,"Hello everyone! This is my new story, so you g..."
freq,53,726,241,369,155,23,1,1


In [3]:
# Run split_genres once per row (axis="columns" is the named spelling of axis=1).
# Presumably this populates genre1..genre3 from the combined genre field -- confirm
# in combine_raw_data. NOTE: `data` is rebound here, so re-running this cell feeds
# already-processed rows back into split_genres.
data = data.apply(split_genres, axis="columns")

In [8]:
# Number of non-null stories for each distinct genre1 value.
data["story"].groupby(data["genre1"]).count()

genre1
Adventure        38
Angst            21
Crime             1
Drama           116
Family           20
Fantasy          14
Friendship       12
Horror            3
Humor           113
Hurt             18
Mystery          11
Parody            7
Poetry           17
Romance         610
Sci               7
Supernatural      7
Suspense          2
Tragedy          13
Western           2
Name: story, dtype: int64

In [9]:
# Number of non-null stories for each distinct genre2 value.
data["story"].groupby(data["genre2"]).count()

genre2
Adventure        28
Angst            39
Comfort          18
Drama           179
Family           21
Fantasy          20
Friendship       35
Horror            4
Humor           192
Hurt             23
Mystery           9
Parody            5
Poetry           14
Romance         396
Sci              11
Spiritual         1
Supernatural     12
Suspense          7
Tragedy          15
Western           3
Name: story, dtype: int64

In [10]:
# Number of non-null stories for each distinct genre3 value.
data["story"].groupby(data["genre3"]).count()

genre3
Adventure         7
Angst            10
Comfort          23
Drama            38
Family            8
Fantasy           2
Friendship        9
Horror            2
Humor            38
Mystery           3
Parody            3
Poetry           10
Romance         245
Sci               7
Supernatural      2
Suspense          1
Tragedy           7
Name: story, dtype: int64

In [6]:
# Keep the story text column; it is the input to the transform pipeline further down.
stories = data["story"]
# Split the first story on "." and show pieces 4 and 5 as a quick look at the raw text.
first_story = stories.values[0]
first_story.split(".")[4:6]

['" Ma said, sitting down in her own rocking chair',
 'They said like that for a while, just rocking gently back and forth, before Laura broke the silence']

In [7]:
# Build the pipeline for the story text. Steps are registered in order, and the
# recorded log shows them applied in that same order.
f_sents_pl = DataTransformPipeline()
f_sents_pl.add(SeriesTransforms.values)
# NOTE(review): the meaning of args=(2, 2) is not visible here -- confirm against
# CondenseStoryTransforms.first_sentence_tokens.
f_sents_pl.add(CondenseStoryTransforms.first_sentence_tokens, args = (2, 2))
# The recorded log shows this step opening pickled embedding shards under embeds/45k/.
f_sents_pl.add(VocabularyTransforms.to_word_embeddings)

# Run the pipeline on the stories under the name "first_sentences"; the result
# unpacks into three values.
embeds, idx_word_map, embedding_vocab = f_sents_pl.apply("first_sentences", stories)

****************
Starting 'first_sentences' pipeline
****************

Applying 'values'
Applying 'first_sentence_tokens'
Applying 'to_word_embeddings'
Reading files...
opening embeds/45k/wiki-news-300d-1M-embedding-1.pkl
opening embeds/45k/wiki-news-300d-1M-embedding-2.pkl
opening embeds/45k/wiki-news-300d-1M-embedding-3.pkl
opening embeds/45k/wiki-news-300d-1M-embedding-7.pkl
opening embeds/45k/wiki-news-300d-1M-embedding-6.pkl
opening embeds/45k/wiki-news-300d-1M-embedding-4.pkl
opening embeds/45k/wiki-news-300d-1M-embedding-5.pkl
opening embeds/45k/wiki-news-300d-1M-embedding-8.pkl


In [8]:
# Display the first entry of `embeds`; the recorded output is a 1-D tensor of integer indices.
embeds[0]

tensor([ 8250,    50,     0,  4386, 35755,  3408,     2,   354,    50,    66,
            0, 35755, 10498,  5725,     0,  6170,  2983, 45000,   651, 16484,
            0,  1882,    37,     8,   641,   824,     2])

In [10]:
# Look up every index of the first sequence in idx_word_map and join the words
# with single spaces to eyeball the tokenisation.
" ".join(idx_word_map[token_id] for token_id in embeds[0].tolist())

'Ma said , sitting rocking chair . They said like , rocking gently forth , Laura broke <unk> stop ma , pain ? " She asked .'

In [11]:
# One-step pipeline that runs TextTransforms.to_categorical on a genre column.
label_pipeline = DataTransformPipeline()
label_pipeline.add(TextTransforms.to_categorical)

# Keep what each run returns, keyed by source column. The original loop called
# apply() and dropped its return value, so the encoded labels were not available
# to later cells.
labels = {}
for label in ["genre1", "genre2"]:
    labels[label] = label_pipeline.apply("{}_labels".format(label), data[label])

****************
Starting 'genre1_labels' pipeline
****************

Applying 'to_categorical'
****************
Starting 'genre2_labels' pipeline
****************

Applying 'to_categorical'
