# Create the data

### 1.  Import packages and create global variables

In [107]:
import os
import re
import nltk
import textract
from textblob import TextBlob, Word
import pandas as pd
import numpy as np
from collections import Counter
from nltk.tokenize import RegexpTokenizer
from nltk.tokenize import PunktSentenceTokenizer

In [108]:
path = '/Users/katbishop/Desktop/DSI-SF2-bishopkd/projects/capstone/data/'
epub_path = "/Users/katbishop/Desktop/DSI-SF2-bishopkd/projects/capstone/data/_epub_working/"
txt_path = "/Users/katbishop/Desktop/DSI-SF2-bishopkd/projects/capstone/data/_txt/"
test_path = '/Users/katbishop/Desktop/DSI-SF2-bishopkd/projects/capstone/data/test_hold_out/'
folders = ['sci-fi_top','sci-fi_flop','romance_top','romance_flop']
folders2 = ['romance_flop']
check_words = ['acknowledgements','table of contents','about the author', 'appendix', 
               'copyright','isbn','by this author', 'chapter']

### 2. Load profanity file

In [109]:
curses = pd.read_csv(path + 'other/profanity.csv')
curses.drop('Unnamed: 1', inplace=True, axis=1)
bad_words = curses.word.T.tolist()

### 3. Extract text from epub files

#### Text file creation functions

In [45]:
# loop through files in directory, convert file, save file in new folder
def create_text_files(epub_path,txt_path):
    for epub in os.listdir(epub_path):
        try:
            convert_epub_to_text(epub_path, epub, txt_path)
        except:
            print epub, "failed"
            
# function to extract text from epub
def convert_epub_to_text(epub_path, epub_file, txt_path):
    clean_text = ''
    text_name = epub_file.replace(' ','_')[:-4]+'txt' #clean up filename and change file extention
    
    text = textract.process(epub_path+epub_file,encoding='utf_8') #extract text from epub
    clean_text = text.decode('ascii', 'ignore').replace('\n',' ') #trip out the unicode and return characters

    text_file = open(txt_path+text_name, 'w') #save as text file
    text_file.write(clean_text)
    text_file.close()
    


#### Create the text files

In [None]:
create_text_files(epub_path,txt_path)

### 4. Create the dataframes for the training and testing data

In [110]:
# master function that calls functions below
def create_data(path,folders,df):
    df_name = df
    df = create_df_from_files(path, folders) # create df and initial binary indicators
    create_metrics(df)                       # create metric columns (this runs forever)
    df.to_csv(path + df_name + '.csv')       # saves df as csv so we don't have to do the above again
    return df

In [114]:
# create training data  - note: this takes many moons to run
df = create_data(path,folders,'df')

In [111]:
# create testing data
df_test = create_data(test_path,folders,'df_test')

In [None]:
# check for front and back matter
validate_content(df,check_words)

#### Functions that perform the above magic


In [77]:
# load txt files into dataframe, 
# give each entry a best_selling 1/0 entry and a sci_fi 1/0 (0=romance) indicator
def create_df_from_files(path, folders):
    df = pd.DataFrame()

    for folder in folders:
        if folder[-3:]=='top':
            bs = 1
        else:
            bs = 0
        if folder[:3]=='sci':
            sf = 1
        else:
            sf = 0

        for text_file in os.listdir(path+folder+'/'):
            full_path = path + folder + '/' + text_file
            if text_file.endswith((".txt")):
                text  = open(full_path, 'r').read()
                temp = pd.DataFrame({
                        'best_seller': bs,
                        'sci_fi': sf,
                        'title': text_file[:-4].replace('_',' ').replace('-',' - '),
                        'body': text.decode('ascii', 'ignore').replace('\n',' ').replace('\r','')}, 
                                    index=[0])
                df = pd.concat([df, temp])

    df = df.reset_index() # because index=[0]
    del df['index']
    return df

# check for front and back matter in body
# remaining issues are intentional usage in the body
def validate_content(df, check_words):
    for i in range(0,len(df)):
        for word in check_words:
            if word in df.iloc[i,1].lower():
                print df.ix[i,3], ' : ', word 

In [106]:
# function bank for creating metrics

def avg_sentence_len(text):
    word_counts = []
    tokenizer = RegexpTokenizer(r'\w+')
    sent_detect = PunktSentenceTokenizer()
    sentences = sent_detect.sentences_from_text(text)
    for sentence in sentences:
        words = tokenizer.tokenize(sentence)
        word_counts.append(len(words))
    avg_word_count = sum(word_counts)/len(word_counts)  
    return avg_word_count

#--------------------------------

def get_token_words(text):
    tokenizer = RegexpTokenizer(r'\w+')
    words = tokenizer.tokenize(text)
    return words

def word_count(text):
    words = get_token_words(text)
    return len(words)

def avg_word_len(text):
    letter_counts = []
    words = get_token_words(text)
    for word in words:
        letter_counts.append(len(word))
    avg_word_len = sum(letter_counts)/len(letter_counts)
    return avg_word_len

def profanity_counter(text):
    i=0
    words = get_token_words(text)
    for word in words:
        if word in bad_words:
            i+=1       
    return i

def lex_div(text):
    words = get_token_words(text)
    lexical_diversity = 1.0 * len(set(words)) / len(words)
    return lexical_diversity

#--------------------------------

def to_blob(text):
    blob = TextBlob(text)
    return blob

def assign_polarity(text):
    blob = to_blob(text)
    return blob.sentiment.polarity

def assign_subjectivity(text):
    blob = to_blob(text)
    return blob.sentiment.subjectivity

#---------------------------------

def parse_pos(df,field):
    for i in range(0,len(df)):
        blob = TextBlob(df.ix[i,field])
        tags = blob.tags
        df_tags = pd.DataFrame(tags)
        df_tags = df_tags.groupby([1]).count().reset_index()
        for x in range(0,len(df_tags)):
                df.ix[i, df_tags.ix[x,1] ] = df_tags.ix[x,0]
        df.fillna(0,inplace=True)
        
def normalize_pos(df):
    for row in range(0,len(df)):
        for col in range(11,len(df.columns)):
            df.iloc[row, col] = df.iloc[row,col]/df.ix[row,'word_count']
            
#-----------------------------------

def clean_more(text):
    return text.replace('.','. ').replace('`',' ').replace('*','')

#### Create new columns of metrics and rename columns

In [21]:
def create_metrics(df):
    df['body'] = df['body'].map(clean_more)
    df['avg_sent_len'] = df['body'].map(avg_sentence_len)
    df['word_count'] = df['body'].map(word_count)
    df['avg_word_len'] = df['body'].map(avg_word_len)
    df['lex_diversity'] = df['body'].map(lex_div)
    df['polarity'] = df['body'].map(assign_polarity)
    df['subjectivity'] = df['body'].map(assign_subjectivity)
    df['profanity'] = df['body'].map(profanity_counter)
    df['profane'] = 1. * df['profanity']/df['word_count']
    parse_pos(df,'body')
    normalize_pos(df)
    
    df.rename(columns={'CC':'conj_coord', 'CD':'number', 'DT':'determiner', 'EX':'exist_there',
                  'FW':'foreign_word','IN':'conj_sub_prep','JJ':'adj','JJR':'adj_compare',
                 'JJS':'adj_sup','MD':'verb_aux',  'NN':'noun','NNP':'noun_prop',
                'NNPS':'noun_prop_pural',  'NNS':'noun_plural', 'PDT':'predeterm','PRP':'pronoun_pers',
                'PRP$':'pronoun_poss',  'RB':'adv','RBR':'adv_compare','RBS':'adv_sup',
                  'RP':'adv_part', 'TO':'inf_to',  'UH':'interject','VB':'verb_base',
                 'VBD':'verb_past','VBG':'verb_ger','VBN':'verb_pp','VBP':'verb_sing_pres',
                 'VBZ':'verb_3rd_sing_pres','WDT':'wh_determ','WP':'wh_pronoun','WP$':'wh_poss',
                 'WRB':'wh_adv','POS':'poss_ending','SYM':'symbol','LS':'list_marker'}, inplace=True)

### 5. Double check it

In [115]:
df.head(1)

Unnamed: 0,best_seller,body,sci_fi,title,avg_sent_len,word_count,avg_word_len,lex_diversity,polarity,subjectivity,...,verb_pp,verb_sing_pres,verb_3rd_sing_pres,wh_determ,wh_pronoun,wh_poss,wh_adv,poss_ending,symbol,list_marker
0,1,Prologue The sun is always just about to ris...,1,2312 - kim stanley robinson,14,166265,4,0.090717,0.076,0.465868,...,3916.0,3230.0,1780.0,703.0,779.0,4.0,922.0,0.0,0.0,0.0


In [113]:
df_test.head(1)

Unnamed: 0,best_seller,body,sci_fi,title,avg_sent_len,word_count,avg_word_len,lex_diversity,polarity,subjectivity,...,verb_ger,verb_pp,verb_sing_pres,verb_3rd_sing_pres,wh_determ,wh_pronoun,wh_poss,wh_adv,symbol,list_marker
0,1,Prologue: Mei Mei? Miss Carrie said. Please ...,1,caliban war - james corey,10,172918,4,0.069813,0.0316,0.453752,...,4970.0,3551.0,3233.0,1737.0,576.0,781.0,16.0,1055.0,0.0,0.0
