# Create the data

### 1.  Import packages and create global variables

In [196]:
import os
import re
import nltk
import textract
from textblob import TextBlob, Word
import pandas as pd
import numpy as np
from nltk.corpus import wordnet
from collections import Counter
from nltk.tokenize import RegexpTokenizer
from nltk.tokenize import PunktSentenceTokenizer

In [197]:
path = '/Users/katbishop/Desktop/DSI-SF2-bishopkd/projects/capstone/data/'
epub_path = "/Users/katbishop/Desktop/DSI-SF2-bishopkd/projects/capstone/data/_epub_working/"
txt_path = "/Users/katbishop/Desktop/DSI-SF2-bishopkd/projects/capstone/data/_txt/"
test_path = '/Users/katbishop/Desktop/DSI-SF2-bishopkd/projects/capstone/data/test_hold_out/'
folders = ['sci-fi_top','sci-fi_flop','romance_top','romance_flop']
folders2 = ['other']
check_words = ['acknowledgements','table of contents','about the author', 'appendix', 
               'copyright','isbn','by this author', 'chapter']

### 2. Load profanity file

In [198]:
curses = pd.read_csv(path + 'other/profanity.csv')
curses.drop('Unnamed: 1', inplace=True, axis=1)
bad_words = curses.word.T.tolist()

### 3. Extract text from epub files

#### Text file creation functions

In [199]:
# loop through files in directory, convert file, save file in new folder
def create_text_files(epub_path,txt_path):
    for epub in os.listdir(epub_path):
        try:
            convert_epub_to_text(epub_path, epub, txt_path)
        except:
            print epub, "failed"
            
# function to extract text from epub
def convert_epub_to_text(epub_path, epub_file, txt_path):
    clean_text = ''
    text_name = epub_file.replace(' ','_')[:-4]+'txt' #clean up filename and change file extention
    
    text = textract.process(epub_path+epub_file,encoding='utf_8') #extract text from epub
    clean_text = text.decode('ascii', 'ignore').replace('\n',' ') #trip out the unicode and return characters

    text_file = open(txt_path+text_name, 'w') #save as text file
    text_file.write(clean_text)
    text_file.close()
    


#### Create the text files

In [200]:
create_text_files(epub_path,txt_path)

#### Split each file into 10 sub-chunk files

In [210]:
def split_text_files(epub_path,txt_path):
    for txt_file in os.listdir(epub_path):
        if txt_file[:1]!='.':
            try:
                split(epub_path, txt_file, txt_path)
            except:
                print txt_file, "failed"

def split(in_path, txt_file, out_path):
    tokenizer = RegexpTokenizer(r'\w+')
    sent_detect = PunktSentenceTokenizer()
    text  = open(in_path + txt_file, 'r').read()
    utext = text.decode('ascii', 'ignore').replace('\n',' ').replace('\r','').replace('-',' ').replace('.','. ')
    sentences = sent_detect.sentences_from_text(utext)
    
    chunksize = len(sentences)/10
    fid = 1

    with open(in_path + txt_file) as infile:
        f = open(out_path + '%s%d.txt' % (txt_file[:-4],fid), 'w')
        for i,line in enumerate(sentences):
            f.write(line)
            if i%chunksize==0 and i!=0 and fid!=10:
                f.close()
                fid += 1
                f = open(out_path + '%s%d.txt' % (txt_file[:-4],fid), 'w')
        f.close()

In [218]:
split_text_files(epub_path,txt_path)

### 4. Create the dataframes for the training and testing data

In [201]:
# master function that calls functions below
def create_data(path,folders,df):
    df_name = df
    df = create_df_from_files(path, folders) # create df and initial binary indicators
    create_metrics(df)                       # create metric columns (this runs forever)
    df.to_csv(path + df_name + '.csv')       # saves df as csv so we don't have to do the above again
    return df

In [213]:
# create training data  - note: this takes many moons to run
df = create_data(path,folders,'df')

In [214]:
# create testing data
df_test = create_data(test_path,folders,'df_test')

In [192]:
# create hold-out data
df_other = create_data(test_path,folders2,'df_other')

In [None]:
# check for front and back matter
validate_content(df,check_words)

#### Functions that perform the above magic


In [212]:
# load txt files into dataframe, 
# give each entry a best_selling 1/0 entry and a sci_fi 1/0 (0=romance) indicator
def create_df_from_files(path, folders):
    df = pd.DataFrame()

    for folder in folders:
        if folder[-3:]=='top':
            bs = 1
        else:
            bs = 0
        if folder[:3]=='sci':
            sf = 1
        else:
            sf = 0

        for text_file in os.listdir(path+folder+'/'):
            full_path = path + folder + '/' + text_file
            if text_file.endswith((".txt")):
                text  = open(full_path, 'r').read()
                temp = pd.DataFrame({
                        'best_seller': bs,
                        'sci_fi': sf,
                        'title': text_file[:-4].replace('_',' ').replace('-',' - '),
                        'body': text.decode('ascii', 'ignore').replace('\n',' ').replace('\r','').replace('-',' ').replace('?','? ').replace('.','. ')}, 
                                    index=[0])
                df = pd.concat([df, temp])

    df = df.reset_index() # because index=[0]
    del df['index']
    return df

# check for front and back matter in body
# remaining issues are intentional usage in the body
def validate_content(df, check_words):
    for i in range(0,len(df)):
        for word in check_words:
            if word in df.iloc[i,1].lower():
                print df.ix[i,3], ' : ', word 

In [177]:
# function bank for creating metrics

def avg_sentence_len(text):
    word_counts = []
    tokenizer = RegexpTokenizer(r'\w+')
    sent_detect = PunktSentenceTokenizer()
    sentences = sent_detect.sentences_from_text(text)
    for sentence in sentences:
        words = tokenizer.tokenize(sentence)
        word_counts.append(len(words))
    avg_word_count = sum(word_counts)/len(word_counts)  
    return avg_word_count

#--------------------------------

def get_token_words(text):
    tokenizer = RegexpTokenizer(r'\w+')
    words = tokenizer.tokenize(text)
    return words

def word_count(text):
    words = get_token_words(text)
    return len(words)

def avg_word_len(text):
    letter_counts = []
    words = get_token_words(text)
    for word in words:
        letter_counts.append(len(word))
    avg_word_len = sum(letter_counts)/len(letter_counts)
    return avg_word_len

def profanity_counter(text):
    i=0
    words = get_token_words(text)
    for word in words:
        if word in bad_words:
            i+=1       
    return i

def lex_div(text):
    words = get_token_words(text)
    lexical_diversity = 1.0 * len(set(words)) / len(words)
    return lexical_diversity

#--------------------------------

def to_blob(text):
    blob = TextBlob(text)
    return blob

def assign_polarity(text):
    blob = to_blob(text)
    return blob.sentiment.polarity

def assign_subjectivity(text):
    blob = to_blob(text)
    return blob.sentiment.subjectivity

#---------------------------------

def parse_pos(df,field):
    for i in range(0,len(df)):
        blob = TextBlob(df.ix[i,field])
        tags = blob.tags
        df_tags = pd.DataFrame(tags)
        df_tags = df_tags.groupby([1]).count().reset_index() # create a count of POS tags 

        verbs=[] # create verb-only list
        for word,pos in tags:
            if pos[:2] == 'VB':
                try:
                    verbs.append((word, wordnet.synsets(word,'v')[0].lexname())) # get lexical name for each verb
                except:
                    continue
       
        df_verbs = pd.DataFrame(verbs)
        df_verbs = df_verbs.groupby([1]).count().reset_index() # create a count of verb subtype tags
                
        df_tags = pd.concat([df_tags,df_verbs], axis=0).reset_index()            # concat the pos df and the verb df
        del df_tags['index']

        df.ix[i,'verb_count'] = len(verbs)
        for x in range(0,len(df_tags)):                            # add these as new columns to df           
                df.ix[i, df_tags.ix[x,1] ] = df_tags.ix[x,0]   
        df.fillna(0,inplace=True)

            
#-----------------------------------
   
def normalize_pos(df):
    for row in range(0,len(df)):
        for col in range(11,len(df.columns)):
            if df.columns[col][:5]=='verb.':        # if one of the verb categories div by verb count
                df.iloc[row, col] = df.iloc[row,col]/df.ix[row,'verb_count']
            else:
                if df.columns[col][:5]!='verb_':    # if not div by total word count
                    df.iloc[row, col] = df.iloc[row,col]/df.ix[row,'word_count']
            
#-----------------------------------

def clean_more(text):
    return text.replace('.','. ').replace('`',' ').replace('*','')

#### Create new columns of metrics and rename columns

In [204]:
def create_metrics(df):
    df['body'] = df['body'].map(clean_more)
    df['avg_sent_len'] = df['body'].map(avg_sentence_len)
    df['word_count'] = df['body'].map(word_count)
    df['avg_word_len'] = df['body'].map(avg_word_len)
    df['lex_diversity'] = df['body'].map(lex_div)
    df['polarity'] = df['body'].map(assign_polarity)
    df['subjectivity'] = df['body'].map(assign_subjectivity)
    df['profanity'] = df['body'].map(profanity_counter)
    df['profane'] = 1. * df['profanity']/df['word_count']
    parse_pos(df,'body')
    normalize_pos(df)
    
    df.rename(columns={'CC':'conj_coord', 'CD':'number', 'DT':'determiner', 'EX':'exist_there',
                  'FW':'foreign_word','IN':'conj_sub_prep','JJ':'adj','JJR':'adj_compare',
                 'JJS':'adj_sup','MD':'verb_aux',  'NN':'noun','NNP':'noun_prop',
                'NNPS':'noun_prop_pural',  'NNS':'noun_plural', 'PDT':'predeterm','PRP':'pronoun_pers',
                'PRP$':'pronoun_poss',  'RB':'adv','RBR':'adv_compare','RBS':'adv_sup',
                  'RP':'adv_part', 'TO':'inf_to',  'UH':'interject','VB':'verb_base',
                 'VBD':'verb_past','VBG':'verb_ger','VBN':'verb_pp','VBP':'verb_sing_pres',
                 'VBZ':'verb_3rd_sing_pres','WDT':'wh_determ','WP':'wh_pronoun','WP$':'wh_poss',
                 'WRB':'wh_adv','POS':'poss_ending','SYM':'symbol','LS':'list_marker'}, inplace=True)

### 5. Double check it

In [215]:
df.head(2)

Unnamed: 0,best_seller,body,sci_fi,title,avg_sent_len,word_count,avg_word_len,lex_diversity,polarity,subjectivity,...,verb.emotion,verb.motion,verb.perception,verb.possession,verb.social,verb.stative,verb.weather,wh_poss,poss_ending,list_marker
0,1,Prologue The sun is always just about to ris...,1,2312 - kim stanley robinson1,13,14559,4,0.223092,0.089476,0.486843,...,0.031551,0.133513,0.065025,0.089265,0.086572,0.239708,0.0,0.0,0.0,0.0
1,1,"That wayis this what love was, this desire for...",1,2312 - kim stanley robinson10,18,18721,4,0.204102,0.101801,0.465005,...,0.031855,0.129503,0.054183,0.115808,0.093778,0.252754,0.000893,0.0,0.0,0.0


In [216]:
df_test.head(2)

Unnamed: 0,best_seller,body,sci_fi,title,avg_sent_len,word_count,avg_word_len,lex_diversity,polarity,subjectivity,...,verb.creation,verb.emotion,verb.motion,verb.perception,verb.possession,verb.social,verb.stative,verb.weather,poss_ending,list_marker
0,1,Prologue: Mei Mei?Miss Carrie said. Please ...,1,caliban war - james corey1,12,18326,4,0.204245,0.016294,0.449777,...,0.013251,0.020024,0.149882,0.058893,0.09629,0.0851,0.201708,0.000883,0.0,0.0
1,1,There was no way to know what would be on the ...,1,caliban war - james corey10,11,16870,4,0.189034,0.022211,0.436823,...,0.00869,0.022176,0.158825,0.059335,0.102787,0.086605,0.194486,0.001498,0.0,0.0


In [190]:
df_other.head(2)

Unnamed: 0,best_seller,body,sci_fi,title,avg_sent_len,word_count,avg_word_len,lex_diversity,polarity,subjectivity,...,verb.creation,verb.emotion,verb.motion,verb.perception,verb.possession,verb.social,verb.stative,verb.weather,poss_ending,wh_poss
0,0,Flirtation lasts the brief flutter of a butter...,0,aquarian awakenings - lisa shea,11,29514,4,0.152504,0.090507,0.464572,...,0.010716,0.021431,0.169815,0.070287,0.102979,0.078097,0.210498,0.006357,0.0,0.0
1,0,She got of the plane and the wall of damp w...,0,NaNoWriMojo - some chump,10,47843,4,0.134502,0.078923,0.498603,...,0.01119,0.050776,0.119286,0.055104,0.116542,0.097857,0.258841,0.001583,0.000314,8.4e-05
