In [1]:
import pandas as pd
import os
import pickle as pkl
from data_transforms import CondenseStoryTransforms, VocabularyTransforms, TextTransforms, SeriesTransforms
from data_transform_pipeline import DataTransformPipeline
from combine_raw_data import get_combined_data

In [2]:
# Load the combined dataset through the project helper and show its summary
# statistics as the cell output.
# NOTE(review): get_combined_data lives in combine_raw_data (not shown here);
# presumably it returns a pandas DataFrame, since describe() is called on it.
data = get_combined_data()
data.describe()

Unnamed: 0,author,book,genre,genre1,genre2,genre3,href,story
count,51799,77759,77759,77759,77759,30760,77759,77759
unique,33033,802,321,20,21,20,77759,77590
top,Laura Schiller,Hobbit,Romance,Romance,Romance,Romance,/s/2379583/1/Improbability-Kissing,I am excited to announce the start of my new w...
freq,208,461,9205,29516,18594,10256,1,12


In [3]:
# Non-null story count per primary genre, largest first.
(
    data.groupby("genre1")["story"]
    .count()
    .sort_values(ascending=False)
)

genre1
Romance         29516
Humor            8187
Drama            6404
Adventure        5271
Angst            4698
Hurt             3885
Friendship       3374
Family           3089
Fantasy          2984
Tragedy          1799
Supernatural     1607
Horror           1540
Poetry           1161
Sci              1087
Mystery          1035
Suspense          716
Parody            632
Spiritual         477
Crime             269
Western            28
Name: story, dtype: int64

In [4]:
# Non-null story count per secondary genre, largest first.
(
    data.groupby("genre2")["story"]
    .count()
    .sort_values(ascending=False)
)

genre2
Romance         18594
Humor            8607
Drama            8201
Adventure        5198
Angst            5040
Hurt             4977
Friendship       4975
Comfort          3885
Family           3271
Fantasy          2494
Tragedy          2346
Supernatural     2130
Sci              1731
Horror           1319
Mystery          1291
Suspense         1214
Parody           1019
Spiritual         599
Poetry            481
Crime             346
Western            41
Name: story, dtype: int64

In [14]:
# Of "Adventure" and "Romance", which genre3 label has fewer non-null stories?
(
    data.groupby("genre3")["story"]
    .count()[["Adventure", "Romance"]]
    .idxmin()
)

'Adventure'

In [8]:
# Keep only the non-null story texts; `stories` is what the first-sentence
# pipeline is applied to later in the notebook.
stories = data["story"].dropna()
# Peek at the first story: split it on "." and show the first three pieces.
stories.values[0].split(".")[:3]

['AN: I do not own The Chronicles of Prydain, alas, for I am clearly not Lloyd Alexander',
 ' A big thanks to CompanionWanderer for beta-ing this for me so VERY long ago',
 " I'm finally posting it in hopes that feedback will get me moving on it again"]

In [3]:
# Pipeline for the story text. Three steps are registered, in this order:
# SeriesTransforms.values, CondenseStoryTransforms.first_sentence_tokens and
# VocabularyTransforms.to_word_embeddings.
f_sents_pl = DataTransformPipeline()
f_sents_pl.add(SeriesTransforms.values)
# NOTE(review): the meaning of args=(2, 2) is defined in data_transforms, which
# is not visible here — confirm before changing these values.
f_sents_pl.add(CondenseStoryTransforms.first_sentence_tokens, args = (2, 2))
# The value of this last add() call is what the cell displays.
f_sents_pl.add(VocabularyTransforms.to_word_embeddings)

<data_transform_pipeline.DataTransformPipeline at 0x11a193828>

In [None]:
# Apply the story pipeline to the non-null stories and unpack its three results.
# NOTE(review): "first_sentences" looks like a name/key for this run — confirm
# what DataTransformPipeline.apply does with its first argument.
embeds, idx_word_map, embedding_vocab = f_sents_pl.apply("first_sentences", stories)

In [9]:
# Show the first entry of `embeds` (the pipeline result for the first story).
embeds[0]

tensor([   20, 33335, 45000,     0, 29404,     0,    28,   865,  8031,  2598,
            2,   104,   740,  2001, 45000, 10494,    35, 31870, 32365,   387,
          779,     2,    28,   719,  1620,  3002,  3328,  3488,  1236,     2])

In [10]:
# Map each index of the first entry back to its word and join with spaces,
# to sanity-check the encoding against the original text.
" ".join(idx_word_map[idx.item()] for idx in embeds[0])

"The Chronicles <unk> , alas , I clearly Lloyd Alexander . A big thanks <unk> beta - ing VERY long ago . I 'm finally posting hopes feedback moving ."

In [3]:
# Pipeline for the genre labels, with a single step: TextTransforms.to_categorical.
label_pipeline = DataTransformPipeline()
# The value of this add() call is what the cell displays.
label_pipeline.add(TextTransforms.to_categorical)

<data_transform_pipeline.DataTransformPipeline at 0x11438b710>

In [None]:
# Run the label pipeline once per genre column, naming each run "<column>_labels".
for label in ("genre1", "genre2"):
    label_pipeline.apply(label + "_labels", data[label])

In [29]:
def get_genres(genres, frame=None, label_col="genre2"):
    """Build a subset of the data balanced across the requested genres.

    The requested genre with the fewest non-null stories is kept in full.
    Every other requested genre contributes its first N rows, where N is the
    number of rows of that smallest genre. Rows keep their original index;
    the smallest genre comes first, followed by the remaining genres in the
    order they were given.

    Parameters
    ----------
    genres : sequence of str
        Genre labels to include. The sequence is copied, never modified.
    frame : pandas.DataFrame, optional
        Table holding a ``"story"`` column and the ``label_col`` column.
        Defaults to the notebook-level ``data`` frame.
    label_col : str, default "genre2"
        Column whose values are matched against ``genres``.

    Returns
    -------
    pandas.DataFrame
        The selected rows, concatenated in the order described above.

    Raises
    ------
    ValueError
        If ``genres`` is empty.
    """
    # Copy so the caller's list is left untouched (the earlier version popped
    # the smallest genre out of the argument it was given).
    genres = list(genres)
    if not genres:
        raise ValueError("genres must contain at least one label")

    # Fall back to the notebook-level frame only when none is supplied.
    source = data if frame is None else frame

    min_genre = source.groupby(label_col)["story"].count()[genres].idxmin()
    smallest = source[source[label_col] == min_genre]
    samples = smallest.shape[0]

    # Collect the pieces and concatenate once: DataFrame.append is removed in
    # pandas 2.0 and re-copies the frame on every loop iteration.
    # Note: the other genres are truncated to their *first* rows, not sampled.
    parts = [smallest]
    parts.extend(
        source[source[label_col] == genre][:samples]
        for genre in genres
        if genre != min_genre
    )
    return pd.concat(parts)

In [31]:
# Build the subset for Romance, Humor and Drama: get_genres keeps the smallest
# of the three (by genre2) and cuts the other two down to its size.
comb = get_genres(["Romance", "Humor", "Drama"])

In [None]:
# Run the story pipeline on the subset's stories under the name "f_sents_prod".
# The result is not assigned here; it is only shown as the cell output.
# NOTE(review): unlike `stories`, comb["story"] is not passed through dropna() —
# confirm the subset cannot contain null stories.
f_sents_pl.apply("f_sents_prod", comb["story"])

In [None]:
# Run the label pipeline on the subset's genre2 column under the name
# "genre2_labels_prod", so the labels match the rows used for "f_sents_prod".
label_pipeline.apply("genre2_labels_prod", comb["genre2"])