# Import file and useful modules

In [1]:
# Mount Google Drive at /content/drive so the project files can be read and written (Colab only).
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
! pip install swifter
import swifter

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


In [None]:
import pandas as pd

# Load the story dataset from the mounted Drive; the path is relative to the current working directory.
df = pd.read_csv('drive/MyDrive/Final Project/story_data.csv')

# Light EDA

In [None]:
df.shape

(18248, 12)

We have 18,248 stories (samples) and 12 features

In [None]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 18248 entries, 0 to 18247
Data columns (total 12 columns):
 #   Column          Non-Null Count  Dtype 
---  ------          --------------  ----- 
 0   url             18248 non-null  object
 1   story_id        18248 non-null  object
 2   soup            18248 non-null  object
 3   title           18248 non-null  object
 4   author          18246 non-null  object
 5   contest_num     18248 non-null  object
 6   won_contest     236 non-null    object
 7   categories      18248 non-null  object
 8   num_likes       18248 non-null  int64 
 9   num_comments    18248 non-null  int64 
 10  story_html      18248 non-null  object
 11  date_published  18248 non-null  object
dtypes: int64(2), object(10)
memory usage: 1.7+ MB


'won_contest' only has a value if the story was a winner or shortlisted

In [None]:
df.head()

Unnamed: 0,url,story_id,soup,title,author,contest_num,won_contest,categories,num_likes,num_comments,story_html,date_published
0,https://blog.reedsy.com/short-story/n1pl7n/,n1pl7n,<!DOCTYPE html>\n\n<html>\n<head>\n<meta chars...,11:59,Krystal Pepper,"['Contest', '#137']",,"['Crime', 'Fiction', 'Thriller']",17,3,"<article class=""font-alt submission-content sp...","March 11, 2022 17:27"
1,https://blog.reedsy.com/short-story/yv1ahb/,yv1ahb,<!DOCTYPE html>\n\n<html>\n<head>\n<meta chars...,Want Ads,Nina Wishnat,"['Contest', '#30']",,"['Contemporary', 'Fiction']",1,0,"<article class=""font-alt submission-content sp...","February 28, 2020 15:27"
2,https://blog.reedsy.com/short-story/kv1cr7/,kv1cr7,<!DOCTYPE html>\n\n<html>\n<head>\n<meta chars...,To be wrong because of sincerity...,Lis Lovén,"['Contest', '#102']",,"['Black', 'Contemporary', 'Fiction']",12,0,"<article class=""font-alt submission-content sp...","July 10, 2021 14:21"
3,https://blog.reedsy.com/short-story/v2nqtq/,v2nqtq,<!DOCTYPE html>\n\n<html>\n<head>\n<meta chars...,"The Shiva, 1955",Tammy Kl,"['Contest', '#100']",,"['American', 'Coming', 'of', 'Age', 'Fiction']",6,0,"<article class=""font-alt submission-content sp...","July 01, 2021 18:13"
4,https://blog.reedsy.com/short-story/nptt18/,nptt18,<!DOCTYPE html>\n\n<html>\n<head>\n<meta chars...,The Monster of Greentree,Best Christopher,"['Contest', '#102']",,"['Coming', 'of', 'Age', 'Adventure', 'Fiction']",6,0,"<article class=""font-alt submission-content sp...","July 17, 2021 01:23"


In [None]:
df.describe()

Unnamed: 0,num_likes,num_comments
count,18248.0,18248.0
mean,14.1387,4.295758
std,18.742977,17.676338
min,0.0,0.0
25%,7.0,0.0
50%,10.0,1.0
75%,14.0,3.0
max,503.0,629.0




<code>'num_likes'</code> is the target variable for the regression problem.

# Feature extraction

Features to extract:

*   num_sent (number of sentences)
*   story_sent (list of sentences)
*   sent_lengths (list of lengths of sentences)
*   sent_avg_length (average length of sentences)
*   sent_words (list of lists of each sentence's words)
*   story_word_toks
*   words
*   punctuation
*   num_words
*   unique_words
*   num_unique_words
*   unique_words_percent
*   age (in days)
*   words_exist_per_cent (percentage of words that exist in the Brown corpus)
*   encoded categories
*   text_str







## extracting NLP features:

In [None]:
from bs4 import BeautifulSoup
from nltk.tokenize import wordpunct_tokenize, sent_tokenize, word_tokenize
import numpy as np
import nltk
nltk.download('punkt')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


True

In [None]:
def separate_words_and_punct(arr_tokenized_text):
    """Split a sequence of tokens into word tokens and punctuation tokens.

    A token is treated as punctuation when it contains no alphanumeric
    character. This classifies multi-character marks produced by the
    tokenizer (for example "...", "--" or quote tokens) as punctuation
    instead of counting them as words, and keeps a single digit such as "5"
    on the word side instead of the punctuation side.

    Parameters
    ----------
    arr_tokenized_text : array-like of str
        Tokens of one text, in reading order.

    Returns
    -------
    tuple of numpy.ndarray
        ``(words, punctuation)`` as object arrays; both preserve the original
        token order. Either array may be empty.
    """
    tokens = np.asarray(arr_tokenized_text, dtype='object')
    # Boolean mask instead of index lists: True where the token has no letter or digit.
    is_punct = np.array(
        [not any(ch.isalnum() for ch in tok) for tok in tokens],
        dtype=bool,
    )
    return tokens[~is_punct], tokens[is_punct]

In [None]:
def extract_word_feats(story_text):
    """Compute word-level features for one story.

    Parameters
    ----------
    story_text : str
        Plain text of the story.

    Returns
    -------
    tuple
        ``(story_word_toks, words, punctuation, num_words, unique_words,
        num_unique_words, unique_words_percent)`` where

        * ``story_word_toks`` is the list of all tokens from ``word_tokenize``,
        * ``words`` / ``punctuation`` are the arrays returned by
          ``separate_words_and_punct``,
        * ``unique_words`` is the set of distinct word tokens (case-sensitive),
        * ``unique_words_percent`` is ``num_unique_words / num_words`` (a ratio
          between 0 and 1, despite the name), or ``0.0`` when the story has no
          word tokens.
    """
    story_word_toks = word_tokenize(story_text)
    words, punctuation = separate_words_and_punct(np.array(story_word_toks, dtype='object'))
    num_words = len(words)
    unique_words = set(words)
    num_unique_words = len(unique_words)
    # An empty (or punctuation-only) story would otherwise raise ZeroDivisionError
    # and abort the whole progress_apply run.
    unique_words_percent = num_unique_words / num_words if num_words else 0.0
    return story_word_toks, words, punctuation, num_words, unique_words, num_unique_words, unique_words_percent

In [None]:
def extract_sentence_feats(story_text):
    """Compute sentence-level features for one story.

    Parameters
    ----------
    story_text : str
        Plain text of the story.

    Returns
    -------
    tuple
        ``(story_sent, num_sent, sent_lengths, sent_avg_length, sent_words)``:

        * ``story_sent`` - list of sentences from ``sent_tokenize``,
        * ``num_sent`` - number of sentences,
        * ``sent_lengths`` - integer array with the token count of each sentence,
        * ``sent_avg_length`` - mean of ``sent_lengths`` (``0.0`` for a story
          with no sentences),
        * ``sent_words`` - 1-D object array holding one token list per sentence.
    """
    story_sent = sent_tokenize(story_text)
    num_sent = len(story_sent)
    # Tokenize every sentence exactly once (the token list gives both the words and the length).
    tokens_per_sentence = [word_tokenize(sentence) for sentence in story_sent]
    sent_lengths = np.array([len(toks) for toks in tokens_per_sentence], dtype=int)
    # Fill element by element so numpy never tries to turn equal-length token lists into a 2-D array.
    sent_words = np.empty(num_sent, dtype='object')
    for position, toks in enumerate(tokens_per_sentence):
        sent_words[position] = toks
    # Avoid the mean of an empty array (NaN plus a RuntimeWarning) for empty stories.
    sent_avg_length = sent_lengths.mean() if num_sent else 0.0
    return story_sent, num_sent, sent_lengths, sent_avg_length, sent_words

In [None]:
def tokenize_story(row):
    """Extract sentence- and word-level features from one story row.

    The story HTML in ``row['story_html']`` is reduced to plain text, passed to
    ``extract_sentence_feats`` and ``extract_word_feats``, and the twelve
    resulting values are returned as a one-row DataFrame (one column per
    feature).
    """
    # Strip the markup to get the plain story text.
    plain_text = BeautifulSoup(row['story_html'], "html.parser").get_text()

    # Names in the order each extractor returns its values.
    sentence_names = ('story_sent', 'num_sent', 'sent_lengths', 'sent_avg_length', 'sent_words')
    word_names = ('story_word_toks', 'words', 'punctuation', 'num_words',
                  'unique_words', 'num_unique_words', 'unique_words_percent')

    extracted = dict(zip(sentence_names, extract_sentence_feats(plain_text)))
    extracted.update(zip(word_names, extract_word_feats(plain_text)))

    # Column order of the output frame; the scalar 'num_sent' must stay first.
    column_order = ('num_sent', 'story_sent', 'sent_lengths', 'sent_avg_length',
                    'sent_words') + word_names
    feat_dict = {name: extracted[name] for name in column_order}
    return pd.DataFrame.from_dict(feat_dict, orient='index').transpose()

In [None]:
from tqdm.auto import tqdm
tqdm.pandas()

In [None]:
# NOTE(review): `new_feats` is not referenced anywhere else in the visible notebook — confirm before removing.
new_feats = ('num_sent', 'words', 'num_words', 'unique_words', 'num_unique_words', 'unique_words_percent','punctuation')
# Run tokenize_story on every row (with a tqdm progress bar); each call returns a one-row feature frame.
results = df.progress_apply(tokenize_story, axis=1)

  0%|          | 0/18248 [00:00<?, ?it/s]

In [None]:
# results = df.swifter.apply(tokenize_story, axis=1)

In [None]:
# Stack the per-story one-row feature frames in a single concat call.
# Concatenating inside a loop copies the growing frame on every iteration
# (quadratic in the number of stories); one call is linear.
# ignore_index=True gives the same fresh 0..n-1 index that reset_index + drop produced.
feats_df = pd.concat(list(results), ignore_index=True)

  0%|          | 0/18248 [00:00<?, ?it/s]

In [None]:
feats_df.shape, df.shape

((18248, 12), (18248, 12))

12 new features extracted

In [None]:
# Append the extracted feature columns to the story frame; concat with axis=1 aligns rows on the index.
df = pd.concat([df, feats_df], axis=1)

Extract text as string:

In [None]:
from bs4 import BeautifulSoup
def text_str(row):
    """Store the plain text of a story on ``row`` under ``'text_str'``.

    ``row`` may be any mapping-like object (a pandas row or a dict) that has a
    ``'story_html'`` entry. The HTML is parsed with BeautifulSoup and all
    markup is stripped. The same ``row`` object is returned.
    """
  
    row['text_str'] = BeautifulSoup(row['story_html'], "html.parser").get_text()
    return row

# Apply to the single 'story_html' column instead of row-wise over the whole frame:
# a row-wise apply rebuilds every column (including the large HTML ones) just to add one.
df['text_str'] = df['story_html'].progress_apply(lambda html: text_str({'story_html': html})['text_str'])

  0%|          | 0/18248 [00:00<?, ?it/s]

## extracting age of stories (in days)

In [None]:
from datetime import date, datetime
def calculate_age(row, today=datetime.strptime('10/6/2022', "%m/%d/%Y")):
    """Store the age of a story, in whole days, on ``row`` under ``'age'``.

    Parameters
    ----------
    row : mapping-like
        Must provide ``row['date_published']`` as a datetime-like value.
    today : datetime, optional
        Reference date the age is measured against. Defaults to the fixed
        date 6 October 2022; the default is parsed once, when the function is
        defined, rather than on every call.

    Returns
    -------
    The same ``row`` object, with ``row['age']`` set.
    """
    row['age'] = (today - row['date_published']).days
    return row

In [None]:
# Parse the publication timestamps so calculate_age can subtract them from a datetime.
df['date_published'] = pd.to_datetime(df['date_published'])

In [None]:
# Compute the age from the date column only, reusing calculate_age for the arithmetic.
# A row-wise apply over the whole frame would rebuild every column just to add 'age'.
df['age'] = df['date_published'].progress_apply(
    lambda published: calculate_age({'date_published': published})['age']
)

  0%|          | 0/18248 [00:00<?, ?it/s]

## Extracting the percentage of words found in the Brown corpus (English)

In [None]:
import nltk
nltk.download('brown')
from nltk.corpus import brown
# All word tokens of the Brown corpus.
brown_corpus = brown.words()
# NOTE(review): this bare expression is not the cell's last line, so its value is not displayed.
len(brown_corpus)
# De-duplicate the tokens; no case folding is applied here.
brown_corpus = list(set(brown_corpus))

[nltk_data] Downloading package brown to /root/nltk_data...
[nltk_data]   Unzipping corpora/brown.zip.


In [None]:
def _lowercased_brown_vocab():
    """Return the Brown vocabulary lowercased, building it only on the first call."""
    cached = getattr(_lowercased_brown_vocab, '_cache', None)
    if cached is None:
        cached = frozenset(word.lower() for word in brown_corpus)
        _lowercased_brown_vocab._cache = cached
    return cached


def words_exist_per_cent(row, vocab=None):
    """Store on ``row`` the percentage of its unique words found in a reference vocabulary.

    Parameters
    ----------
    row : mapping-like
        Must provide ``row['unique_words']`` as an iterable of strings.
    vocab : set or frozenset of str, optional
        Lowercase reference vocabulary. When omitted, the lowercased Brown
        corpus vocabulary (from the module-level ``brown_corpus``) is used; it
        is built once and cached instead of being rebuilt for every row.

    Returns
    -------
    The same ``row`` object, with ``row['words_exist_per_cent']`` set to a
    value between 0 and 100 (``0.0`` when the story has no words).

    Notes
    -----
    Story words are lowercased before the lookup, so the reference vocabulary
    is lowercased too; otherwise words that appear only capitalised in the
    corpus could never match.
    """
    if vocab is None:
        vocab = _lowercased_brown_vocab()
    story_vocab = {word.lower() for word in row['unique_words']}
    if story_vocab:
        row['words_exist_per_cent'] = 100 * len(story_vocab & vocab) / len(story_vocab)
    else:
        # No words at all: avoid ZeroDivisionError.
        row['words_exist_per_cent'] = 0.0
    return row

# Row-wise apply: each returned row carries the new 'words_exist_per_cent' value.
df = df.progress_apply(words_exist_per_cent, axis=1) #df['words_exist_per_cent']

  0%|          | 0/18248 [00:00<?, ?it/s]

Save the dataframe:

In [None]:
# Checkpoint the frame with the features extracted so far to Drive.
df.to_csv('drive/MyDrive/Final Project/story_data_new_feats.csv', index=False)

## Extracting and encoding categories

In [None]:
def find_num_cats(row):
    """Return how many whitespace-separated tokens ``row['categories']`` contains."""
    category_tokens = row['categories'].split()
    return len(category_tokens)

In [None]:
# Token count of each story's raw categories string (see find_num_cats).
df['num_cats'] = df.progress_apply(find_num_cats, axis=1)

  0%|          | 0/18248 [00:00<?, ?it/s]

In [None]:
import re
# Gather every distinct capitalised word (one uppercase letter followed by
# lowercase letters) that occurs in any story's raw categories string.
cats = {
    name
    for raw_categories, _unused_count in df[['categories', 'num_cats']].values
    for name in re.findall('[A-Z][a-z]+', raw_categories)
}
print(f'we have {len(cats)} categories')

we have 49 categories


In [None]:
# One 0/1 indicator column per category name. `str.contains` treats `cat` as a pattern and matches it
# anywhere in the raw categories string, so a name that is a substring of another name also matches.
for cat in tqdm(cats):
    df[f'cat_{cat}'] = (df['categories']).astype('category').str.contains(cat).astype(int)

  0%|          | 0/49 [00:00<?, ?it/s]

In [None]:
# Sanity check on the encoding: the 'Fiction' indicator must be 1 for every story.
assert df['cat_Fiction'].sum() == len(df)

In [None]:
# Remove the raw categories string now that the cat_* indicator columns exist.
# The drop is in place, so running this cell a second time raises KeyError.
df.drop('categories', axis=1, inplace=True)
df.shape

(18248, 76)

In [None]:
# Overwrite the checkpoint at the same path, now including the category columns.
df.to_csv('drive/MyDrive/Final Project/story_data_new_feats.csv', index=False)

## Extracting contest winners, shortlisted stories, and contest number

In [None]:
# Keep the raw contest-result text in a separate Series before overwriting the column.
# Previously 'won_contest' was replaced by 0/1 integers first and 'shortlisted' was then
# derived from it with `.str`, which fails on an integer column.
# Missing values (stories that were neither winners nor shortlisted) become '' and map to 0.
# Run once per loaded frame: a second run would see the 0/1 column, not the raw text.
contest_result = df['won_contest'].fillna('').astype(str)
df['shortlisted'] = contest_result.str.contains('shortlist', na=False).astype(int)
df['won_contest'] = contest_result.str.contains('winner', na=False).astype(int)

In [None]:
df['won_contest'].value_counts()

In [None]:
df['shortlisted'].value_counts()

In [None]:
# Reduce strings such as "['Contest', '#137']" to the integer contest number.
# NOTE(review): this cell and the won_contest/shortlisted cell run after the last to_csv checkpoint, and the
# lemmatization section reloads that CSV — confirm these columns are saved again before the runtime restart.
df['contest_num'] = df['contest_num'].str.removeprefix("['Contest', '#").str.removesuffix("']").astype(int)

## Lemmatization

Because the RAM is almost saturated, we drop all variables, restart the runtime, and reload the dataframe from the saved CSV.

In [None]:
import pandas as pd
from tqdm.auto import tqdm
tqdm.pandas()

In [None]:
# Reload the checkpoint written earlier in the notebook.
# NOTE(review): list/set/array-valued feature columns presumably come back as plain strings after the
# CSV round-trip — verify before using them as Python objects.
df = pd.read_csv('drive/MyDrive/Final Project/story_data_new_feats.csv')

In [None]:
# turn into lemmas
import spacy
# Load the 'en_core_web_sm' pipeline; the model package must already be installed in the runtime.
nlp = spacy.load('en_core_web_sm')

In [None]:
# Run each story through the spaCy pipeline and join the lemma of every token with single spaces.
df['text_str_lem'] = df.text_str.progress_apply(lambda x: " ".join([word.lemma_ for word in nlp(x)]))

  0%|          | 0/18248 [00:00<?, ?it/s]

In [None]:
# Split the lemmatized text on newline characters, giving one list of pieces per story.
# Skipped when the column already exists, so re-running the cell is harmless.
if 'lemmatized_parag' not in df.columns:
  df['lemmatized_parag']  = df['text_str_lem'].str.split('\n')

# Output to csv

In [None]:
# Save everything except 'url' and 'soup' to the feature checkpoint (overwrites the earlier file).
df.drop(['url', 'soup'], axis=1).to_csv('drive/MyDrive/Final Project/story_data_new_feats.csv', index=False)

In [None]:
# Save the two columns excluded from the feature checkpoint to their own file.
df[['url', 'soup']].to_csv('drive/MyDrive/Final Project/story_url_and_soup.csv', index=False)

In [None]:
# other.shape

(18248, 73)

# Load sentiment and emotions

In [3]:
import pandas as pd

In [4]:
# Reload the feature checkpoint and the two precomputed sentiment tables.
df = pd.read_csv("drive/MyDrive/Final Project/story_data_new_feats.csv")
sentiment_hf_parag_no_prepro = pd.read_csv("drive/MyDrive/Final Project/sentiment_features/sentiment_hf_parag_no_prepro.csv")
sentiment_hf_text = pd.read_csv("drive/MyDrive/Final Project/sentiment_features/sentiment_hf_text.csv")
# Column-wise concat; join='inner' keeps only the index labels present in every frame.
# NOTE(review): rows are matched by index label, not by a story key — confirm all files share the same row order.
df_sentiment_merged = pd.concat([sentiment_hf_parag_no_prepro, sentiment_hf_text], axis=1, join='inner')
df_merged = pd.concat([df, df_sentiment_merged], axis=1, join='inner')

In [16]:
# Overwrite the feature checkpoint with the sentiment columns included
# (the f-prefix is unnecessary: the path has no placeholders).
df_merged.to_csv(f'drive/MyDrive/Final Project/story_data_new_feats.csv', index=False)

New features:

* sentiment_POSITIVE_no_prepro
* sentiment_NEGATIVE_no_prepro
* sentiment_hf_text

In [18]:
# Load the precomputed emotion table.
emotion = pd.read_csv("drive/MyDrive/Final Project/df_emotions.csv")

In [19]:
# Column-wise concat on the index; join='inner' keeps only index labels present in both frames.
# NOTE(review): assumes `emotion` rows are in the same order as `df_merged` — confirm.
df = pd.concat([df_merged, emotion], axis=1, join='inner')

In [22]:
# Overwrite the feature checkpoint with the emotion columns included
# (the f-prefix is unnecessary: the path has no placeholders).
df.to_csv(f'drive/MyDrive/Final Project/story_data_new_feats.csv', index=False)

New features:

* emotion_all	
* emotion_anger	
* emotion_fear	
* emotion_joy	
* emotion_love	
* emotion_sadness	
* emotion_surprise