In [1]:
import pandas as pd
import os
import pickle as pkl
from data_transforms import CondenseStoryTransforms, VocabularyTransforms, TextTransforms, SeriesTransforms
from data_transform_pipeline import DataTransformPipeline
from combine_raw_data import get_combined_data

In [2]:
# Load the combined story dataset through the project helper and bind it to `data`.
data = get_combined_data()
# Per-column summary (count / unique / top / freq), displayed as the cell output.
data.describe()

Unnamed: 0,author,book,genre,genre1,genre2,genre3,href,story
count,51799,77759,77759,77759,77759,30760,77759,77759
unique,33033,802,321,20,21,20,77759,77590
top,Laura Schiller,Hobbit,Romance,Romance,Romance,Romance,/s/2379583/1/Improbability-Kissing,I am excited to announce the start of my new w...
freq,208,461,9205,29516,18594,10256,1,12


In [3]:
# Non-null story count for each `genre1` value, most frequent genre first.
(
    data.groupby("genre1")["story"]
    .count()
    .sort_values(ascending=False)
)

genre1
Romance         29516
Humor            8187
Drama            6404
Adventure        5271
Angst            4698
Hurt             3885
Friendship       3374
Family           3089
Fantasy          2984
Tragedy          1799
Supernatural     1607
Horror           1540
Poetry           1161
Sci              1087
Mystery          1035
Suspense          716
Parody            632
Spiritual         477
Crime             269
Western            28
Name: story, dtype: int64

In [4]:
# Non-null story count for each `genre2` value, most frequent genre first.
(
    data.groupby("genre2")["story"]
    .count()
    .sort_values(ascending=False)
)

genre2
Romance         18594
Humor            8607
Drama            8201
Adventure        5198
Angst            5040
Hurt             4977
Friendship       4975
Comfort          3885
Family           3271
Fantasy          2494
Tragedy          2346
Supernatural     2130
Sci              1731
Horror           1319
Mystery          1291
Suspense         1214
Parody           1019
Spiritual         599
Poetry            481
Crime             346
Western            41
Name: story, dtype: int64

In [5]:
# Non-null story count for each `genre3` value, most frequent genre first.
(
    data.groupby("genre3")["story"]
    .count()
    .sort_values(ascending=False)
)

genre3
Romance         10256
Comfort          4977
Humor            3328
Drama            1919
Angst            1849
Friendship       1590
Family           1413
Adventure        1319
Sci              1101
Fantasy           855
Tragedy           640
Horror            349
Supernatural      343
Spiritual         203
Mystery           185
Parody            169
Suspense          164
Crime              53
Poetry             39
Western             8
Name: story, dtype: int64

In [8]:
# Keep only the rows that actually carry story text.
stories = data["story"].dropna()

# Peek at the first story: show its first three "."-separated pieces.
first_story = stories.values[0]
first_story.split(".")[0:3]

['AN: I do not own The Chronicles of Prydain, alas, for I am clearly not Lloyd Alexander',
 ' A big thanks to CompanionWanderer for beta-ing this for me so VERY long ago',
 " I'm finally posting it in hopes that feedback will get me moving on it again"]

In [3]:
# Pipeline that takes the story Series and produces word-embedding index tensors.
# Steps are registered in the order they are added below.
f_sents_pl = DataTransformPipeline()
# Step 1: SeriesTransforms.values.
f_sents_pl.add(SeriesTransforms.values)
# Step 2: first_sentence_tokens, called with extra args (2, 2).
# NOTE(review): the meaning of (2, 2) is defined in data_transforms — TODO confirm.
f_sents_pl.add(CondenseStoryTransforms.first_sentence_tokens, args = (2, 2))
# Step 3: VocabularyTransforms.to_word_embeddings.
f_sents_pl.add(VocabularyTransforms.to_word_embeddings)

<data_transform_pipeline.DataTransformPipeline at 0x11a193828>

In [None]:
# Run the first-sentences pipeline over all non-null stories under the run name
# "first_sentences" and unpack its three results.
# presumably idx_word_map maps a vocabulary index to its word (it is indexed by
# tensor items elsewhere in this notebook) — TODO confirm what embedding_vocab holds.
embeds, idx_word_map, embedding_vocab = f_sents_pl.apply("first_sentences", stories)

In [9]:
# Inspect the encoded form of the first story (displayed as the cell output).
embeds[0]

tensor([   20, 33335, 45000,     0, 29404,     0,    28,   865,  8031,  2598,
            2,   104,   740,  2001, 45000, 10494,    35, 31870, 32365,   387,
          779,     2,    28,   719,  1620,  3002,  3328,  3488,  1236,     2])

In [10]:
# Decode the first encoded story back to text: look every index up in
# idx_word_map and join the resulting words with single spaces.
" ".join(idx_word_map[token_id] for token_id in embeds[0].tolist())

"The Chronicles <unk> , alas , I clearly Lloyd Alexander . A big thanks <unk> beta - ing VERY long ago . I 'm finally posting hopes feedback moving ."

In [3]:
# Pipeline for the target labels; its single step is TextTransforms.to_categorical.
label_pipeline = DataTransformPipeline()
label_pipeline.add(TextTransforms.to_categorical)

<data_transform_pipeline.DataTransformPipeline at 0x11438b710>

In [None]:
# Run the label pipeline once per genre column, each under its own run name
# ("genre1_labels", "genre2_labels").
for genre_column in ("genre1", "genre2"):
    run_name = "{}_labels".format(genre_column)
    label_pipeline.apply(run_name, data[genre_column])

In [5]:
# Build a class-balanced subset over three `genre2` values: Drama, Romance, Humor.
romance = data[data["genre2"] == "Romance"]
humor = data[data["genre2"] == "Humor"]
drama = data[data["genre2"] == "Drama"]

# Drama is the smallest of the three groups in this dataset, so its row count
# caps how many Romance and Humor rows are taken (the first N of each).
n_per_genre = drama.shape[0]

# pd.concat replaces the chained DataFrame.append calls: append was deprecated
# in pandas 1.4 and removed in pandas 2.0, so the old form fails on current
# pandas. Row order (drama, romance, humor) and the original index labels are
# kept exactly as append produced them.
comb = pd.concat(
    [
        drama,
        romance.iloc[:n_per_genre],
        humor.iloc[:n_per_genre],
    ]
)

In [5]:
# Sanity check on the balanced subset: non-null counts per column for each
# `genre2` group; the `story` column should be equal across the three groups.
comb.groupby(by="genre2").count()

Unnamed: 0_level_0,author,book,genre,genre1,genre3,href,story
genre2,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
Drama,5053,8201,8201,8201,1715,8201,8201
Humor,5475,8201,8201,8201,3278,8201,8201
Romance,5940,8201,8201,8201,6827,8201,8201


In [6]:
# Run the first-sentences pipeline on the balanced subset's story column under
# the run name "f_sents_prod".
# NOTE(review): the result is not bound to a name, so the entire return value
# is echoed as the cell output; consider assigning it or ending the line with `;`.
f_sents_pl.apply("f_sents_prod", comb["story"])

****************
Starting 'f_sents_prod' pipeline
****************

Applying 'values'
Applying 'first_sentence_tokens'
Applying 'to_word_embeddings'
Reading files...
opening embeds/45k/wiki-news-300d-1M-embedding-1.pkl
opening embeds/45k/wiki-news-300d-1M-embedding-2.pkl
opening embeds/45k/wiki-news-300d-1M-embedding-3.pkl
opening embeds/45k/wiki-news-300d-1M-embedding-7.pkl
opening embeds/45k/wiki-news-300d-1M-embedding-6.pkl
opening embeds/45k/wiki-news-300d-1M-embedding-4.pkl
opening embeds/45k/wiki-news-300d-1M-embedding-5.pkl
opening embeds/45k/wiki-news-300d-1M-embedding-8.pkl


([tensor([ 1889,  1845, 43943,  3054, 10318,     0, 45000, 45000,    28,   719,
            582,     2, 11457,     9]),
  tensor([39455,  3130,  4427,  6106,     0,   419,   502, 45000,     0,    28,
          22569,     0, 31573,   951, 45000,   237,    35,    90,  2346,    90,
            383,    90,  1362,  2285,     2,    20, 45000,  2886,  3860,     0,
            419,   452,  2315,   689,  1540,     2]),
  tensor([ 5137,     9, 45000,  1845, 43943,  1027,     2, 17981,  1252,  5137,
            468,    65,  1057,     2]),
  tensor([  137,    66,  2284,    35, 21103,  2032,  1704,     2, 14814,  4848,
          40680,  2284,    35, 45000,  9192,  2243,    23,  2548,     2,  3220,
          45000,     0,  1286,   295,  1819, 45000,   640,    35,  8764,  1709,
              0, 45000,  6284,     2]),
  tensor([  137,    24, 45000,  1172,  1481,     2,    28,  1611,   302,     0,
             28, 45000,  1369,     2,   137, 10864, 17411,     2]),
  tensor([13241,    24,  3397,  3675, 

In [6]:
# Run the label pipeline on the balanced subset's `genre2` column.
# NOTE(review): the run name contains a space ("genre2_label _prod"), unlike
# "f_sents_prod" — this looks like a typo; confirm nothing depends on the exact
# name before renaming it.
# NOTE(review): the result is not bound to a name, so it is only shown as the cell output.
label_pipeline.apply("genre2_label _prod", comb["genre2"])

****************
Starting 'genre2_label _prod' pipeline
****************

Applying 'to_categorical'


(372      0
 373      0
 374      0
 375      0
 376      0
 377      0
 378      0
 379      0
 380      0
 381      0
 382      0
 383      0
 384      0
 385      0
 386      0
 387      0
 388      0
 389      0
 390      0
 391      0
 392      0
 393      0
 394      0
 395      0
 396      0
 397      0
 398      0
 399      0
 400      0
 401      0
         ..
 67535    1
 67536    1
 67537    1
 67538    1
 67539    1
 67540    1
 67541    1
 67542    1
 67543    1
 67544    1
 67545    1
 67546    1
 67547    1
 67548    1
 67549    1
 67550    1
 67551    1
 67552    1
 67553    1
 67554    1
 67555    1
 67556    1
 67557    1
 67558    1
 67559    1
 67560    1
 67561    1
 67562    1
 67563    1
 67564    1
 Length: 24603, dtype: int8, {0: 'Drama', 1: 'Humor', 2: 'Romance'})