In [1]:
import pandas as pd
import os
import pickle as pkl
from data_transforms import CondenseStoryTransforms, VocabularyTransforms, TextTransforms, SeriesTransforms
from data_transform_pipeline import DataTransformPipeline
from combine_raw_data import get_combined_data

In [2]:
# Load the combined raw dataset through the project helper imported from
# combine_raw_data; every later cell reads from this `data` object.
data = get_combined_data()
# Last expression of the cell, so the summary table is rendered as the output.
data.describe()

Unnamed: 0,author,book,genre,genre1,genre2,genre3,href,story
count,6181,9618,9618,9618,9618,3743,9618,9617
unique,4420,107,323,20,21,20,9618,9586
top,hexterah,Twilight,Romance,Romance,Romance,Romance,/s/12440675/1/Everything-Comes-Back-To-You,A/N: So I've decided to pull my stories. I hav...
freq,42,337,1109,3453,2284,1207,1,9


In [3]:
# Number of non-null stories per primary genre, largest group first.
(
    data["story"]
    .groupby(data["genre1"])
    .count()
    .sort_values(ascending=False)
)

genre1
Romance         3453
Humor           1093
Adventure        931
Drama            794
Angst            596
Fantasy          425
Hurt             393
Friendship       369
Family           330
Poetry           235
Tragedy          204
Supernatural     179
Mystery          123
Horror           121
Sci              107
Suspense          94
Parody            81
Spiritual         45
Crime             35
Western            9
Name: story, dtype: int64

In [4]:
# Number of non-null stories per secondary genre, largest group first.
(
    data["story"]
    .groupby(data["genre2"])
    .count()
    .sort_values(ascending=False)
)

genre2
Romance         2283
Humor           1044
Drama            983
Adventure        732
Angst            696
Friendship       563
Hurt             555
Fantasy          450
Comfort          393
Family           336
Tragedy          295
Supernatural     217
Poetry           184
Suspense         177
Sci              167
Mystery          158
Parody           140
Horror           124
Spiritual         69
Crime             44
Western            7
Name: story, dtype: int64

In [5]:
# Number of non-null stories per tertiary genre, largest group first.
(
    data["story"]
    .groupby(data["genre3"])
    .count()
    .sort_values(ascending=False)
)

genre3
Romance         1207
Comfort          555
Humor            457
Drama            236
Angst            230
Adventure        203
Friendship       159
Family           136
Poetry           127
Fantasy          109
Sci              108
Tragedy           67
Supernatural      43
Suspense          24
Mystery           22
Spiritual         20
Horror            19
Parody            10
Crime              9
Western            2
Name: story, dtype: int64

In [8]:
# Keep only the rows that actually carry story text.
stories = data.loc[data["story"].notna(), "story"]
# Peek at the first three period-delimited fragments of the first story.
stories.iloc[0].split(".")[:3]

['AN: I do not own The Chronicles of Prydain, alas, for I am clearly not Lloyd Alexander',
 ' A big thanks to CompanionWanderer for beta-ing this for me so VERY long ago',
 " I'm finally posting it in hopes that feedback will get me moving on it again"]

In [7]:
# Build the pipeline that converts raw story text into vocabulary-index tensors.
# Steps are registered in order; the log printed by apply() lists them in the
# same order ('values', 'first_sentence_tokens', 'to_word_embeddings').
f_sents_pl = DataTransformPipeline()
f_sents_pl.add(SeriesTransforms.values)
# NOTE(review): the meaning of args=(2, 2) is not visible in this notebook —
# confirm against CondenseStoryTransforms.first_sentence_tokens and consider
# naming these values as constants.
f_sents_pl.add(CondenseStoryTransforms.first_sentence_tokens, args = (2, 2))
f_sents_pl.add(VocabularyTransforms.to_word_embeddings)

# Run the pipeline under the name "first_sentences" on the non-null stories.
# The result is unpacked into three values that are used by later cells.
embeds, idx_word_map, embedding_vocab = f_sents_pl.apply("first_sentences", stories)

****************
Starting 'first_sentences' pipeline
****************

Applying 'values'
Applying 'first_sentence_tokens'
Applying 'to_word_embeddings'
Reading files...
opening embeds/45k/wiki-news-300d-1M-embedding-1.pkl
opening embeds/45k/wiki-news-300d-1M-embedding-2.pkl
opening embeds/45k/wiki-news-300d-1M-embedding-3.pkl
opening embeds/45k/wiki-news-300d-1M-embedding-7.pkl
opening embeds/45k/wiki-news-300d-1M-embedding-6.pkl
opening embeds/45k/wiki-news-300d-1M-embedding-4.pkl
opening embeds/45k/wiki-news-300d-1M-embedding-5.pkl
opening embeds/45k/wiki-news-300d-1M-embedding-8.pkl


In [9]:
# Inspect the encoded first story; the rendered output is a tensor of integer
# indices (to be mapped back to words through idx_word_map).
embeds[0]

tensor([   20, 33335, 45000,     0, 29404,     0,    28,   865,  8031,  2598,
            2,   104,   740,  2001, 45000, 10494,    35, 31870, 32365,   387,
          779,     2,    28,   719,  1620,  3002,  3328,  3488,  1236,     2])

In [10]:
" ".join([idx_word_map[idx.item()] for idx in embeds[0]])

"The Chronicles <unk> , alas , I clearly Lloyd Alexander . A big thanks <unk> beta - ing VERY long ago . I 'm finally posting hopes feedback moving ."

In [3]:
# Encode the genre columns as categorical labels.
label_pipeline = DataTransformPipeline()
label_pipeline.add(TextTransforms.to_categorical)

# Only rows with a story are turned into features (`data["story"].dropna()`),
# and the raw frame has one row with a missing story. Apply the same filter here
# so the labels line up one-to-one with the encoded stories.
labelled_rows = data.dropna(subset=["story"])

# Keep the pipeline results, keyed by column name, instead of discarding them.
# NOTE(review): the return type of apply() for this pipeline is not visible
# here — confirm before consuming genre_labels downstream.
genre_labels = {}
for label in ["genre1", "genre2"]:
    genre_labels[label] = label_pipeline.apply(
        "{}_labels".format(label), labelled_rows[label]
    )

****************
Starting 'genre1_labels' pipeline
****************

Applying 'to_categorical'
****************
Starting 'genre2_labels' pipeline
****************

Applying 'to_categorical'
