In [17]:
import pandas as pd

df = pd.read_csv('story_data.csv')

In [18]:
df.shape

(18248, 12)

We have 18,248 stories (samples) and 12 features

In [19]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 18248 entries, 0 to 18247
Data columns (total 12 columns):
 #   Column          Non-Null Count  Dtype 
---  ------          --------------  ----- 
 0   url             18248 non-null  object
 1   story_id        18248 non-null  object
 2   soup            18248 non-null  object
 3   title           18248 non-null  object
 4   author          18246 non-null  object
 5   contest_num     18248 non-null  object
 6   won_contest     236 non-null    object
 7   categories      18248 non-null  object
 8   num_likes       18248 non-null  int64 
 9   num_comments    18248 non-null  int64 
 10  story_html      18248 non-null  object
 11  date_published  18248 non-null  object
dtypes: int64(2), object(10)
memory usage: 1.7+ MB


'won_contest' only has a value if the story was a winner or shortlisted

In [20]:
df.head()

Unnamed: 0,url,story_id,soup,title,author,contest_num,won_contest,categories,num_likes,num_comments,story_html,date_published
0,https://blog.reedsy.com/short-story/n1pl7n/,n1pl7n,<!DOCTYPE html>\n\n<html>\n<head>\n<meta chars...,11:59,Krystal Pepper,"['Contest', '#137']",,"['Crime', 'Fiction', 'Thriller']",17,3,"<article class=""font-alt submission-content sp...","March 11, 2022 17:27"
1,https://blog.reedsy.com/short-story/yv1ahb/,yv1ahb,<!DOCTYPE html>\n\n<html>\n<head>\n<meta chars...,Want Ads,Nina Wishnat,"['Contest', '#30']",,"['Contemporary', 'Fiction']",1,0,"<article class=""font-alt submission-content sp...","February 28, 2020 15:27"
2,https://blog.reedsy.com/short-story/kv1cr7/,kv1cr7,<!DOCTYPE html>\n\n<html>\n<head>\n<meta chars...,To be wrong because of sincerity...,Lis Lovén,"['Contest', '#102']",,"['Black', 'Contemporary', 'Fiction']",12,0,"<article class=""font-alt submission-content sp...","July 10, 2021 14:21"
3,https://blog.reedsy.com/short-story/v2nqtq/,v2nqtq,<!DOCTYPE html>\n\n<html>\n<head>\n<meta chars...,"The Shiva, 1955",Tammy Kl,"['Contest', '#100']",,"['American', 'Coming', 'of', 'Age', 'Fiction']",6,0,"<article class=""font-alt submission-content sp...","July 01, 2021 18:13"
4,https://blog.reedsy.com/short-story/nptt18/,nptt18,<!DOCTYPE html>\n\n<html>\n<head>\n<meta chars...,The Monster of Greentree,Best Christopher,"['Contest', '#102']",,"['Coming', 'of', 'Age', 'Adventure', 'Fiction']",6,0,"<article class=""font-alt submission-content sp...","July 17, 2021 01:23"


In [21]:
df.describe()

Unnamed: 0,num_likes,num_comments
count,18248.0,18248.0
mean,14.1387,4.295758
std,18.742977,17.676338
min,0.0,0.0
25%,7.0,0.0
50%,10.0,1.0
75%,14.0,3.0
max,503.0,629.0


two numeric features: 'num_likes' is target feature for regression problem

features to extract:
tokenize words
word related features


extracting some word-related features:

In [22]:
from bs4 import BeautifulSoup
from nltk.tokenize import wordpunct_tokenize, sent_tokenize, word_tokenize
import numpy as np

import nltk
nltk.download('punkt')

In [88]:
def separate_words_and_punct(arr_tokenized_text):
    words = []
    punctuation = []
    for i, value in enumerate(arr_tokenized_text):
        if len(value) == 1 and not value.isalpha():
            punctuation.append(i)
        else:
            words.append(i)
    return arr_tokenized_text[words], arr_tokenized_text[punctuation]

In [89]:
def extract_word_feats(story_text):
    story_word_toks = word_tokenize(story_text)
    words, punctuation = separate_words_and_punct(np.array(story_word_toks, dtype='object'))
    num_words = len(words)
    unique_words = set(words)
    num_unique_words = len(unique_words)
    unique_words_percent = num_unique_words / num_words
    return story_word_toks, words, punctuation, num_words, unique_words, num_unique_words, unique_words_percent

In [90]:
def extract_sentence_feats(story_text):
    story_sent = sent_tokenize(story_text)
    num_sent = len(story_sent)
    words_per_sent = np.array([(len(word_tokenize(t)), word_tokenize(t)) for t in story_sent], dtype='object')
    sent_lengths = words_per_sent[:, 0].astype(int)
    sent_words = words_per_sent[:, 1]
    sent_avg_length = sent_lengths.mean()
    return story_sent, num_sent, sent_lengths, sent_avg_length, sent_words

In [91]:
def tokenize_story(row):
    # extracting story from html
    html = row['story_html']
    soup = BeautifulSoup(html, "html.parser")
    story = soup.get_text()
    # tokenizing by sentence
    story_sent, num_sent, sent_lengths, sent_avg_length, sent_words = extract_sentence_feats(story)
    # tokenizing by word
    story_word_toks, words, punctuation, num_words, unique_words, num_unique_words, unique_words_percent = extract_word_feats(story)
        
    feat_dict = {'story_sent': story_sent, 'num_sent': num_sent, 'sent_lengths': sent_lengths, 
                 'sent_avg_length': sent_avg_length, 'sent_words': sent_words, 'story_word_toks': story_word_toks, 
                 'words': words, 'punctuation': punctuation, 'num_words': num_words, 'unique_words': unique_words, 
                 'num_unique_words': num_unique_words, 'unique_words_percent': unique_words_percent}
    return pd.DataFrame.from_dict(feat_dict, orient='index').transpose()

In [92]:
new_feats = ('num_sent', 'words', 'num_words', 'unique_words', 'num_unique_words', 'unique_words_percent','punctuation')
results = df.apply(tokenize_story, axis=1)

TypeError: object of type 'int' has no len()

In [None]:
feats_df = pd.DataFrame()
for sub in results:
    feats_df = pd.concat([feats_df, sub])
feats_df = feats_df.reset_index()
feats_df.drop('index', axis=1, inplace=True)
feats_df.head()

In [None]:
feats_df.shape, df.shape

7 new features extracted

In [None]:
new_df = pd.concat([df, feats_df], axis=1)
new_df

extracting and encoding categories

In [None]:
def find_num_cats(row):
    return len(row['categories'].split())

In [None]:
new_df['num_cats'] = new_df.apply(find_num_cats, axis=1)

In [None]:
new_df.head()

In [None]:
new_df.shape

In [None]:
import re
cats = set()
for sample, num in new_df[['categories', 'num_cats']].values:
    samp_cats = re.findall('[A-Z][a-z]+', sample)
    cats.update(samp_cats)
print(f'we have {len(cats)} categories')

In [None]:
for cat in cats:
    new_df[f'cat_{cat}'] = (new_df['categories']).astype('category').str.contains(cat).astype(int)
new_df.shape

In [None]:
new_df['cat_Fiction'].sum() == len(new_df)

In [None]:
new_df.drop('categories', axis=1, inplace=True)
new_df.shape

encoding date as datetime

In [None]:
new_df['date_published'] = pd.to_datetime(new_df['date_published'])
new_df['date_published']

In [None]:
new_df.shape

In [None]:
new_df.info()

In [None]:
new_df.describe()

In [None]:
new_df.to_csv('story_data_new_feats.csv', index=False)

In [None]:
other = pd.read_csv('story_data_new_feats.csv')

In [None]:
other.shape