# Import file and useful modules

In [None]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
! pip install swifter
import swifter

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


In [None]:
import pandas as pd

df = pd.read_csv('drive/MyDrive/Final Project/story_data.csv')

# Light EDA

In [None]:
df.shape

(18248, 12)

We have 18,248 stories (samples) and 12 features

In [None]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 18248 entries, 0 to 18247
Data columns (total 12 columns):
 #   Column          Non-Null Count  Dtype 
---  ------          --------------  ----- 
 0   url             18248 non-null  object
 1   story_id        18248 non-null  object
 2   soup            18248 non-null  object
 3   title           18248 non-null  object
 4   author          18246 non-null  object
 5   contest_num     18248 non-null  object
 6   won_contest     236 non-null    object
 7   categories      18248 non-null  object
 8   num_likes       18248 non-null  int64 
 9   num_comments    18248 non-null  int64 
 10  story_html      18248 non-null  object
 11  date_published  18248 non-null  object
dtypes: int64(2), object(10)
memory usage: 1.7+ MB


'won_contest' only has a value if the story was a winner or shortlisted

In [None]:
df.head()

Unnamed: 0,url,story_id,soup,title,author,contest_num,won_contest,categories,num_likes,num_comments,story_html,date_published
0,https://blog.reedsy.com/short-story/n1pl7n/,n1pl7n,<!DOCTYPE html>\n\n<html>\n<head>\n<meta chars...,11:59,Krystal Pepper,"['Contest', '#137']",,"['Crime', 'Fiction', 'Thriller']",17,3,"<article class=""font-alt submission-content sp...","March 11, 2022 17:27"
1,https://blog.reedsy.com/short-story/yv1ahb/,yv1ahb,<!DOCTYPE html>\n\n<html>\n<head>\n<meta chars...,Want Ads,Nina Wishnat,"['Contest', '#30']",,"['Contemporary', 'Fiction']",1,0,"<article class=""font-alt submission-content sp...","February 28, 2020 15:27"
2,https://blog.reedsy.com/short-story/kv1cr7/,kv1cr7,<!DOCTYPE html>\n\n<html>\n<head>\n<meta chars...,To be wrong because of sincerity...,Lis Lovén,"['Contest', '#102']",,"['Black', 'Contemporary', 'Fiction']",12,0,"<article class=""font-alt submission-content sp...","July 10, 2021 14:21"
3,https://blog.reedsy.com/short-story/v2nqtq/,v2nqtq,<!DOCTYPE html>\n\n<html>\n<head>\n<meta chars...,"The Shiva, 1955",Tammy Kl,"['Contest', '#100']",,"['American', 'Coming', 'of', 'Age', 'Fiction']",6,0,"<article class=""font-alt submission-content sp...","July 01, 2021 18:13"
4,https://blog.reedsy.com/short-story/nptt18/,nptt18,<!DOCTYPE html>\n\n<html>\n<head>\n<meta chars...,The Monster of Greentree,Best Christopher,"['Contest', '#102']",,"['Coming', 'of', 'Age', 'Adventure', 'Fiction']",6,0,"<article class=""font-alt submission-content sp...","July 17, 2021 01:23"


In [None]:
df.describe()

Unnamed: 0,num_likes,num_comments
count,18248.0,18248.0
mean,14.1387,4.295758
std,18.742977,17.676338
min,0.0,0.0
25%,7.0,0.0
50%,10.0,1.0
75%,14.0,3.0
max,503.0,629.0




<code>'num_likes'</code> is the target variable for the regression problem.

# Feature extraction

Features to extract:

*   num_sent (number of sentences)
*   story_sent (list of sentences)
*   sent_lengths (list of lengths of sentences)
*   sent_avg_length (average length of sentences)
*   sent_words (list of lists of each sentence's words)
*   story_word_toks
*   words
*   punctuation
*   num_words
*   unique_words
*   num_unique_words
*   unique_words_percent

## extracting NLP features:

In [None]:
from bs4 import BeautifulSoup
from nltk.tokenize import wordpunct_tokenize, sent_tokenize, word_tokenize
import numpy as np
import nltk
nltk.download('punkt')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


True

In [None]:
def separate_words_and_punct(arr_tokenized_text):
    """Split a sequence of tokens into word tokens and punctuation tokens.

    A non-empty token is treated as punctuation when none of its characters
    is alphanumeric (``,``, ``...``, `` `` ``, ``’``). Every other token is
    treated as a word, including numbers such as ``5`` or ``11:59`` and
    clitics such as ``n't``.

    Parameters
    ----------
    arr_tokenized_text : array-like of str
        Tokens in document order. A numpy object array or a plain list.

    Returns
    -------
    tuple of (numpy.ndarray, numpy.ndarray)
        ``(words, punctuation)`` as object arrays, each preserving the
        original token order.
    """
    tokens = np.asarray(arr_tokenized_text, dtype=object)
    # Boolean mask instead of index lists: works unchanged for empty input
    # and keeps both outputs aligned with the original token order.
    is_punct = np.fromiter(
        (len(tok) > 0 and not any(ch.isalnum() for ch in tok) for tok in tokens),
        dtype=bool,
        count=len(tokens),
    )
    return tokens[~is_punct], tokens[is_punct]

In [None]:
def extract_word_feats(story_text):
    """Compute word-level features for one story.

    Parameters
    ----------
    story_text : str
        Plain text of the story.

    Returns
    -------
    tuple
        ``(story_word_toks, words, punctuation, num_words, unique_words,
        num_unique_words, unique_words_percent)`` where

        * ``story_word_toks`` is the list returned by ``word_tokenize``,
        * ``words`` / ``punctuation`` are the two arrays returned by
          ``separate_words_and_punct``,
        * ``unique_words`` is the set of distinct entries of ``words``
          (case-sensitive),
        * ``unique_words_percent`` is ``num_unique_words / num_words``,
          i.e. a ratio rather than a 0-100 percentage; it is ``0.0`` when
          the story has no word tokens.
    """
    story_word_toks = word_tokenize(story_text)
    words, punctuation = separate_words_and_punct(np.array(story_word_toks, dtype='object'))
    num_words = len(words)
    unique_words = set(words)
    num_unique_words = len(unique_words)
    # An empty or punctuation-only story has no words; avoid ZeroDivisionError.
    unique_words_percent = num_unique_words / num_words if num_words else 0.0
    return story_word_toks, words, punctuation, num_words, unique_words, num_unique_words, unique_words_percent

In [None]:
def extract_sentence_feats(story_text):
    """Compute sentence-level features for one story.

    Parameters
    ----------
    story_text : str
        Plain text of the story.

    Returns
    -------
    tuple
        ``(story_sent, num_sent, sent_lengths, sent_avg_length, sent_words)``
        where

        * ``story_sent`` is the list returned by ``sent_tokenize``,
        * ``num_sent`` is its length,
        * ``sent_lengths`` is an integer array holding the number of
          ``word_tokenize`` tokens in each sentence,
        * ``sent_avg_length`` is the mean of ``sent_lengths`` (``0.0`` when
          there are no sentences),
        * ``sent_words`` is an object array whose i-th entry is the token
          list of the i-th sentence.
    """
    story_sent = sent_tokenize(story_text)
    num_sent = len(story_sent)

    # Tokenize every sentence exactly once and derive both outputs from it.
    tokens_per_sent = [word_tokenize(sentence) for sentence in story_sent]
    sent_lengths = np.array([len(toks) for toks in tokens_per_sent], dtype=int)

    # Fill element by element so numpy never tries to broadcast the token
    # lists into a 2-D array when all sentences happen to have equal length.
    sent_words = np.empty(num_sent, dtype=object)
    for idx, toks in enumerate(tokens_per_sent):
        sent_words[idx] = toks

    # mean() of an empty array is NaN (with a RuntimeWarning); report 0.0.
    sent_avg_length = sent_lengths.mean() if num_sent else 0.0
    return story_sent, num_sent, sent_lengths, sent_avg_length, sent_words

In [None]:
def tokenize_story(row):
    """Extract the sentence- and word-level features of one story row.

    Reads the ``'story_html'`` entry of ``row``, strips the markup with
    BeautifulSoup, and runs ``extract_sentence_feats`` and
    ``extract_word_feats`` on the resulting text.

    Returns a DataFrame built from the feature dict with ``orient='index'``
    and then transposed, so the feature names end up as column labels.
    """
    # Reduce the article markup to its text content.
    story = BeautifulSoup(row['story_html'], "html.parser").get_text()

    # Sentence-level pass first, then the word-level pass.
    sentence_feats = extract_sentence_feats(story)
    word_feats = extract_word_feats(story)

    story_sent, num_sent, sent_lengths, sent_avg_length, sent_words = sentence_feats
    (story_word_toks, words, punctuation, num_words,
     unique_words, num_unique_words, unique_words_percent) = word_feats

    # Key order is kept as-is: it determines the column order of the result.
    feat_dict = dict(
        num_sent=num_sent,
        story_sent=story_sent,
        sent_lengths=sent_lengths,
        sent_avg_length=sent_avg_length,
        sent_words=sent_words,
        story_word_toks=story_word_toks,
        words=words,
        punctuation=punctuation,
        num_words=num_words,
        unique_words=unique_words,
        num_unique_words=num_unique_words,
        unique_words_percent=unique_words_percent,
    )
    return pd.DataFrame.from_dict(feat_dict, orient='index').transpose()

In [None]:
from tqdm.auto import tqdm
tqdm.pandas()

In [None]:
new_feats = ('num_sent', 'words', 'num_words', 'unique_words', 'num_unique_words', 'unique_words_percent','punctuation')
# results = df.progress_apply(tokenize_story, axis=1)

  0%|          | 0/18248 [00:00<?, ?it/s]

In [None]:
# Run tokenize_story on every row of df; each call returns a small DataFrame
# of features, and the per-row results are stitched together in the next cell.
# NOTE(review): confirm that the swifter accessor really exposes
# `progress_apply`; swifter's documented entry point is `.swifter.apply(...)`
# (optionally after `.progress_bar(...)`) — TODO verify on a fresh kernel.
results = df.swifter.progress_apply(tokenize_story, axis=1)

In [None]:
# Stack the per-story feature frames with one concat call. Growing a
# DataFrame with pd.concat inside a loop re-copies every accumulated row on
# each iteration (quadratic in the number of stories).
# ignore_index=True yields the same fresh 0..n-1 index that the former
# reset_index() + drop('index') produced.
feature_frames = list(results)
feats_df = pd.concat(feature_frames, ignore_index=True) if feature_frames else pd.DataFrame()
feats_df.head(2)

  0%|          | 0/18248 [00:00<?, ?it/s]

Unnamed: 0,num_sent,story_sent,sent_lengths,sent_avg_length,sent_words,story_word_toks,words,punctuation,num_words,unique_words,num_unique_words,unique_words_percent
0,125,[\n11:59 \n\tI shoot straight out of bed to a...,"[17, 10, 16, 12, 14, 21, 25, 23, 12, 11, 36, 5...",24.016,"[[11:59, I, shoot, straight, out, of, bed, to,...","[11:59, I, shoot, straight, out, of, bed, to, ...","[11:59, I, shoot, straight, out, of, bed, to, ...","[., ., ., ., ., ,, ,, ., ,, ,, ., ,, ., ,, ., ...",2649,"{key, practice, get, bullet, So, possibility, ...",775,0.292563
1,87,"[\nMia’s room was pretty much bare., She lay o...","[9, 19, 4, 23, 23, 9, 5, 6, 13, 12, 19, 8, 13,...",19.54023,"[[Mia, ’, s, room, was, pretty, much, bare, .]...","[Mia, ’, s, room, was, pretty, much, bare, ., ...","[Mia, s, room, was, pretty, much, bare, She, l...","[’, ., ,, ., ., ,, ., ., ., ., ., “, ?, ’, ,, ...",1453,"{firm, account, key, narrow, get, expectant, 2...",619,0.426015
2,150,"[\n \nLin had to figure out things, because ob...","[26, 27, 10, 23, 24, 27, 21, 2, 16, 18, 20, 25...",15.0,"[[Lin, had, to, figure, out, things, ,, becaus...","[Lin, had, to, figure, out, things, ,, because...","[Lin, had, to, figure, out, things, because, o...","[,, ,, ., ,, ,, ., ., ,, ., ,, ,, ., ,, ,, ., ...",2008,"{having, tone, Act, get, turn, affairs, everyw...",676,0.336653
3,153,"[\n The Shiva, 1955\n \n\tFor the rest of her ...","[24, 12, 36, 12, 23, 19, 49, 20, 28, 27, 26, 5...",21.346405,"[[The, Shiva, ,, 1955, For, the, rest, of, her...","[The, Shiva, ,, 1955, For, the, rest, of, her,...","[The, Shiva, 1955, For, the, rest, of, her, li...","[,, ,, ., ,, ., –, ,, ,, ,, ’, ., (, !, ), ,, ...",2735,"{bacon, get, So, turn, middle, couldn, beneath...",994,0.363437
4,186,[\nThe wind whipped through my hair as I sped ...,"[20, 17, 15, 17, 14, 26, 15, 27, 15, 21, 13, 7...",17.951613,"[[The, wind, whipped, through, my, hair, as, I...","[The, wind, whipped, through, my, hair, as, I,...","[The, wind, whipped, through, my, hair, as, I,...","[’, ,, ., ., ,, ,, ., ,, ,, ., ,, ., ,, ,, ’, ...",2773,"{danger, get, turn, expanded, run, let, middle...",863,0.311215


In [None]:
feats_df.shape, df.shape

((18248, 12), (18248, 12))

12 new features extracted

In [None]:
new_df = pd.concat([df, feats_df], axis=1)
new_df.head(2)

Unnamed: 0,url,story_id,soup,title,author,contest_num,won_contest,categories,num_likes,num_comments,...,sent_lengths,sent_avg_length,sent_words,story_word_toks,words,punctuation,num_words,unique_words,num_unique_words,unique_words_percent
0,https://blog.reedsy.com/short-story/n1pl7n/,n1pl7n,<!DOCTYPE html>\n\n<html>\n<head>\n<meta chars...,11:59,Krystal Pepper,"['Contest', '#137']",,"['Crime', 'Fiction', 'Thriller']",17,3,...,"[17, 10, 16, 12, 14, 21, 25, 23, 12, 11, 36, 5...",24.016,"[[11:59, I, shoot, straight, out, of, bed, to,...","[11:59, I, shoot, straight, out, of, bed, to, ...","[11:59, I, shoot, straight, out, of, bed, to, ...","[., ., ., ., ., ,, ,, ., ,, ,, ., ,, ., ,, ., ...",2649,"{key, practice, get, bullet, So, possibility, ...",775,0.292563
1,https://blog.reedsy.com/short-story/yv1ahb/,yv1ahb,<!DOCTYPE html>\n\n<html>\n<head>\n<meta chars...,Want Ads,Nina Wishnat,"['Contest', '#30']",,"['Contemporary', 'Fiction']",1,0,...,"[9, 19, 4, 23, 23, 9, 5, 6, 13, 12, 19, 8, 13,...",19.54023,"[[Mia, ’, s, room, was, pretty, much, bare, .]...","[Mia, ’, s, room, was, pretty, much, bare, ., ...","[Mia, s, room, was, pretty, much, bare, She, l...","[’, ., ,, ., ., ,, ., ., ., ., ., “, ?, ’, ,, ...",1453,"{firm, account, key, narrow, get, expectant, 2...",619,0.426015


## extracting age of stories (in days)

In [None]:
from datetime import date, datetime
def calculate_age(row, today=datetime(2022, 10, 6)):
  """Add an ``'age'`` entry: whole days between publication and ``today``.

  Parameters
  ----------
  row : mapping-like (e.g. a DataFrame row)
      Must hold a datetime-like value under ``'date_published'``.
  today : datetime, optional
      Reference date the age is measured against. Defaults to the fixed
      snapshot date 2022-10-06 so results stay reproducible; the default is
      built once at definition time instead of being re-parsed per row.

  Returns
  -------
  The same ``row`` object, with ``row['age']`` set to the ``.days``
  component of ``today - row['date_published']`` (partial days are dropped).
  """
  born = row['date_published']
  row['age'] = (today - born).days
  return row

In [None]:
new_df['date_published'] = pd.to_datetime(new_df['date_published'])

In [None]:
# Adds the 'age' column by passing every row through calculate_age.
# NOTE(review): this row-wise apply rebuilds the whole frame. The later
# new_df.info() output lists num_sent as int64, so the rebuild presumably
# also re-infers the dtypes of the feature columns — confirm before replacing
# this with a vectorised column assignment, which would not do that.
new_df = new_df.progress_apply(calculate_age, axis=1)

  0%|          | 0/18248 [00:00<?, ?it/s]

## Extracting the percentage of words found in the Brown English corpus

In [None]:
import nltk
nltk.download('brown')
from nltk.corpus import brown
# Keep one entry per distinct word form of the Brown corpus (case-sensitive),
# stored as a list.
brown_corpus = list(set(brown.words()))

In [None]:
def _unique_word_set(unique_words):
  """Normalise a ``unique_words`` cell to a set of lower-cased strings.

  Accepts the in-memory form (a set or other iterable of words) as well as
  the text form such a set takes after a CSV round trip ("{'a', 'b'}").
  """
  if isinstance(unique_words, str):
    import ast
    try:
      unique_words = ast.literal_eval(unique_words)
    except (ValueError, SyntaxError):
      # Not a clean literal: fall back to splitting on the separator and
      # strip the leftover braces/quotes from the first and last tokens.
      unique_words = [tok.strip("{}[]'\" ") for tok in unique_words.split("', '")]
  return {str(word).lower() for word in unique_words}


def words_exist_per_cent(row, corpus=None):
  """Add ``'words_exist_per_cent'``: share (0-100) of the story's unique
  words, lower-cased, that occur in the reference vocabulary.

  Parameters
  ----------
  row : mapping-like (e.g. a DataFrame row)
      Must hold the story vocabulary under ``'unique_words'``.
  corpus : iterable of str, optional
      Reference vocabulary. Story words are lower-cased before matching, so
      pass a lower-cased vocabulary for case-insensitive matching. Defaults
      to the module-level ``brown_corpus``; passing a prebuilt set avoids
      rebuilding it on every call.

  Returns
  -------
  The same ``row`` object with the new entry set (``0.0`` when the story has
  no unique words).
  """
  vocab = brown_corpus if corpus is None else corpus
  if not isinstance(vocab, (set, frozenset)):
    vocab = set(vocab)
  story_words = _unique_word_set(row['unique_words'])
  # Numerator and denominator are both taken over the lower-cased set, so
  # the result can never exceed 100.
  row['words_exist_per_cent'] = 100 * len(story_words & vocab) / len(story_words) if story_words else 0.0
  return row

# Build the lower-cased reference vocabulary once, not once per row.
brown_vocab = {word.lower() for word in brown_corpus}
# 'unique_words' lives in new_df (the raw df has no such column), so the
# feature is computed on, and stored in, new_df.
new_df = new_df.apply(words_exist_per_cent, axis=1, corpus=brown_vocab)

## extracting and encoding categories

In [None]:
def find_num_cats(row):
    """Count the whitespace-separated tokens in the row's 'categories' string."""
    category_tokens = row['categories'].split()
    return len(category_tokens)

In [None]:
new_df['num_cats'] = new_df.progress_apply(find_num_cats, axis=1)

  0%|          | 0/18248 [00:00<?, ?it/s]

In [None]:
new_df.shape

(18248, 26)

In [None]:
import re
# Collect every distinct capitalised token ("Crime", "Fiction", ...) that
# appears in any story's category string. Only the 'categories' column is
# needed, so the loop no longer depends on 'num_cats' being present.
cats = set()
for sample in new_df['categories']:
    cats.update(re.findall('[A-Z][a-z]+', sample))
print(f'we have {len(cats)} categories')

we have 49 categories


In [None]:
# One 0/1 indicator column per category token: 1 when the token occurs
# anywhere in the row's category string (substring match), 0 otherwise.
category_text = (new_df['categories']).astype('category')
for cat in tqdm(cats):
    has_cat = category_text.str.contains(cat)
    new_df[f'cat_{cat}'] = has_cat.astype(int)
new_df.shape

  0%|          | 0/49 [00:00<?, ?it/s]

(18248, 75)

In [None]:
new_df['cat_Fiction'].sum() == len(new_df)

True

In [None]:
new_df.drop('categories', axis=1, inplace=True)
new_df.shape

(18248, 74)

In [None]:
new_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 18248 entries, 0 to 18247
Data columns (total 74 columns):
 #   Column                Non-Null Count  Dtype         
---  ------                --------------  -----         
 0   url                   18248 non-null  object        
 1   story_id              18248 non-null  object        
 2   soup                  18248 non-null  object        
 3   title                 18248 non-null  object        
 4   author                18246 non-null  object        
 5   contest_num           18248 non-null  object        
 6   won_contest           236 non-null    object        
 7   num_likes             18248 non-null  int64         
 8   num_comments          18248 non-null  int64         
 9   story_html            18248 non-null  object        
 10  date_published        18248 non-null  datetime64[ns]
 11  num_sent              18248 non-null  int64         
 12  story_sent            18248 non-null  object        
 13  sent_lengths    

In [None]:
new_df.describe()

Unnamed: 0,num_likes,num_comments,num_sent,sent_avg_length,num_words,num_unique_words,unique_words_percent,age,num_cats,cat_Creative,...,cat_Drama,cat_School,cat_Science,cat_Happy,cat_Transgender,cat_Urban,cat_Kids,cat_Middle,cat_Historical,cat_People
count,18248.0,18248.0,18248.0,18248.0,18248.0,18248.0,18248.0,18248.0,18248.0,18248.0,...,18248.0,18248.0,18248.0,18248.0,18248.0,18248.0,18248.0,18248.0,18248.0,18248.0
mean,14.1387,4.295758,120.055677,19.2768,1772.405031,653.573707,0.381445,502.618972,2.95046,0.006247,...,0.157223,0.02066,0.042854,0.033483,0.001425,0.01244,0.014961,0.005425,0.016221,0.003781
std,18.742977,17.676338,54.727511,20.451749,663.541656,195.346379,0.053579,160.524603,1.175209,0.078794,...,0.364021,0.142247,0.202533,0.179899,0.037721,0.110841,0.121398,0.073458,0.126328,0.061377
min,0.0,0.0,1.0,4.197222,410.0,187.0,0.143804,124.0,1.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,7.0,0.0,80.0,14.826714,1191.0,494.0,0.343524,430.0,2.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
50%,10.0,1.0,108.0,17.5,1568.0,607.0,0.381038,518.0,3.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
75%,14.0,3.0,151.0,20.910654,2265.0,793.0,0.417994,621.0,3.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
max,503.0,629.0,414.0,1459.0,4620.0,1508.0,0.638215,1154.0,8.0,1.0,...,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0


# Output to csv

In [None]:
# Cells holding numpy arrays (e.g. 'words', 'punctuation') are written to CSV
# through their text form, and numpy summarises arrays longer than 1000
# elements as "[a b c ... x y z]" — silently dropping most of the tokens.
# Convert array cells to plain lists, whose text form is never truncated.
array_cols = [
    col for col in new_df.columns
    if new_df[col].dtype == object
    and new_df[col].map(lambda value: isinstance(value, np.ndarray)).any()
]
export_df = new_df.assign(**{
    col: new_df[col].map(lambda value: value.tolist() if isinstance(value, np.ndarray) else value)
    for col in array_cols
})
export_df.to_csv('drive/MyDrive/Final Project/story_data_new_feats.csv', index=False)

In [None]:
# other = pd.read_csv('story_data_new_feats.csv')

In [None]:
# other.shape

(18248, 73)