In [84]:
import os
import csv
import pandas

In [85]:
# IPython line magic: show the kernel's current working directory.
%pwd

u'/Users/brian/code/bible/jupyter'

In [101]:
# Filesystem locations used throughout the notebook.
# The two root directories default to the original checkout location but can
# be redirected with the BIBLE_DATA_DIR / BIBLE_OUTPUT_DIR environment
# variables, so the notebook can run elsewhere without editing this cell.
DATA_DIR = os.environ.get("BIBLE_DATA_DIR", "/Users/brian/code/bible/data")
OUTPUT_DIR = os.environ.get("BIBLE_OUTPUT_DIR", "/Users/brian/code/bible/output")

# Input CSVs, all under DATA_DIR/bible-corpus.
TEXT_PATH = os.path.join(DATA_DIR, "bible-corpus/t_kjv.csv")
GENRE_PATH = os.path.join(DATA_DIR, "bible-corpus/key_genre_english.csv")
BOOK_PATH = os.path.join(DATA_DIR, "bible-corpus/key_english.csv")

# Destination of the per-book summary table written later in the notebook.
OUT_DF_PATH = os.path.join(OUTPUT_DIR, "books_summary.csv")

In [87]:
# Load the genre key and build a lookup from genre code (column "g") to
# genre name (column "n").
genre_df = pandas.read_csv(GENRE_PATH)
genre_dict = {code: label for code, label in zip(genre_df["g"], genre_df["n"])}

In [88]:
# Show the genre-code -> genre-name lookup built in the previous cell.
genre_dict

{1: 'Law',
 2: 'History',
 3: 'Wisdom',
 4: 'Prophets',
 5: 'Gospels',
 6: 'Acts',
 7: 'Epistles',
 8: 'Apocalyptic'}

In [89]:
# Load the book key and give its terse columns readable names.
# NOTE: index=str passes every row label through str(), so the frame ends up
# with a string index ("0", "1", ...) instead of an integer one.
book_df = pandas.read_csv(BOOK_PATH).rename(
    index=str,
    columns={"n": "name", "t": "testament"},
)

# Translate each genre code into its name; an unknown code raises KeyError.
book_df["genre"] = [genre_dict[code] for code in book_df["g"]]

# Lookups: book id -> book name, and book name -> genre *code* (not the name).
book_dict = {
    book_id: title for book_id, title in zip(book_df["b"], book_df["name"])
}
book_genre_dict = {
    title: genre_code for title, genre_code in zip(book_df["name"], book_df["g"])
}

In [90]:
# Display the book table with the renamed columns and the genre label.
book_df

Unnamed: 0,b,name,testament,g,genre
0,1,Genesis,OT,1,Law
1,2,Exodus,OT,1,Law
2,3,Leviticus,OT,1,Law
3,4,Numbers,OT,1,Law
4,5,Deuteronomy,OT,1,Law
5,6,Joshua,OT,2,History
6,7,Judges,OT,2,History
7,8,Ruth,OT,2,History
8,9,1 Samuel,OT,2,History
9,10,2 Samuel,OT,2,History


In [91]:
# Read the verse table from TEXT_PATH (one row per verse; text in column "t").
text_all = pandas.read_csv(filepath_or_buffer=TEXT_PATH)

In [92]:
# Word count per verse.
# split() with no argument treats any run of whitespace as a single separator
# and ignores leading/trailing whitespace, so stray double spaces in the text
# are not counted as extra (empty) words the way split(" ") would count them.
text_all["words"] = [len(verse.split()) for verse in text_all["t"]]

In [93]:
# Peek at the first rows of the verse table, including the new "words" column.
text_all.head()

Unnamed: 0,id,b,c,v,t,words
0,1001001,1,1,1,In the beginning God created the heaven and th...,10
1,1001002,1,1,2,"And the earth was without form, and void; and ...",29
2,1001003,1,1,3,"And God said, Let there be light: and there wa...",11
3,1001004,1,1,4,"And God saw the light, that it was good: and G...",17
4,1001005,1,1,5,"And God called the light Day, and the darkness...",22


In [94]:
# Per-book size metrics derived from the verse table.
# Every aggregate is a Series keyed by book id ("b"). Mapping it through
# book_df["b"] attaches the numbers by key rather than by row position, so the
# result stays correct even if book_df is not in ascending book-id order.
verses_by_book = text_all.groupby("b")
book_df["chapters"] = book_df["b"].map(verses_by_book["c"].nunique())
book_df["verses"] = book_df["b"].map(verses_by_book.size())
book_df["words_kjv"] = book_df["b"].map(verses_by_book["words"].sum())

# A book id with no verses in text_all would surface as NaN; fail loudly.
assert book_df[["chapters", "verses", "words_kjv"]].notnull().all().all(), \
    "every book in book_df must have verses in text_all"

In [95]:
# Display the book table again, now with the chapter/verse/word counts.
book_df

Unnamed: 0,b,name,testament,g,genre,chapters,verses,words_kjv
0,1,Genesis,OT,1,Law,50,1533,38265
1,2,Exodus,OT,1,Law,40,1213,32684
2,3,Leviticus,OT,1,Law,27,859,24543
3,4,Numbers,OT,1,Law,36,1288,32895
4,5,Deuteronomy,OT,1,Law,34,959,28352
5,6,Joshua,OT,2,History,24,658,18852
6,7,Judges,OT,2,History,21,618,18966
7,8,Ruth,OT,2,History,4,85,2574
8,9,1 Samuel,OT,2,History,31,810,25047
9,10,2 Samuel,OT,2,History,24,695,20599


In [100]:
# Corpus-wide totals.
# A single pre-formatted string is handed to print, so the cell emits the same
# line under both the Python 2 print statement and the Python 3 print function.
print("The KJV contains %d chapters, %d verses, and %d words" % (
    book_df["chapters"].sum(),
    book_df["verses"].sum(),
    book_df["words_kjv"].sum(),
))

The KJV contains 1189 chapters, 31103 verses, and 789635 words


In [104]:
# Persist the per-book summary; index=False keeps the row labels out of the file.
book_df.to_csv(path_or_buf=OUT_DF_PATH, index=False)

## Visualization time

In [108]:
# Record the installed plotly version next to the notebook output.
import plotly
plotly.__version__

'3.3.0'

In [114]:
import plotly.plotly as py
import plotly.graph_objs as go

In [165]:
# One row per length metric: (book_df column, axis label, marker colour).
_metric_rows = [
    ("chapters", "chapters", "red"),
    ("verses", "verses", "green"),
    ("words_kjv", "words (KJV)", "blue"),
]
# Transpose the rows into the three parallel lists the plotting cells index into.
criteria, pretty_criteria, colors = map(list, zip(*_metric_rows))
del _metric_rows

# Height of every figure, in pixels.
height_px = 800

In [166]:
# Dot plot of every book ranked by one length metric.
# `index` selects the metric, its axis label and its colour from the parallel
# lists criteria / pretty_criteria / colors.
index = 0
criterion = criteria[index]
pretty_criterion = pretty_criteria[index]

# argsort() yields row *positions* in ascending order of the metric. They are
# applied with .iloc so the lookup is explicitly positional: book_df carries a
# string index, and plain [] with integer keys only works on it through
# pandas' legacy positional fallback.
book_indices = book_df[criterion].argsort()
sorted_positions = book_indices.values

trace1 = {"x": book_df[criterion].iloc[sorted_positions],
          "y": book_df["name"].iloc[sorted_positions],
          "marker": {"color": colors[index], "size": 12},
          "mode": "markers",
          "name": pretty_criterion,
          "type": "scatter"
}

data = [trace1]
layout = {"title": "Bible Books by Length",
          "xaxis": {"title": "Number of " + pretty_criterion},
          "height": height_px
         }

fig = go.Figure(data=data, layout=layout)
py.iplot(fig, filename='bible_dot_plot_' + criterion)

In [167]:
# Dot plot of every book ranked by one length metric.
# `index` selects the metric, its axis label and its colour from the parallel
# lists criteria / pretty_criteria / colors.
index = 1
criterion = criteria[index]
pretty_criterion = pretty_criteria[index]

# argsort() yields row *positions* in ascending order of the metric. They are
# applied with .iloc so the lookup is explicitly positional: book_df carries a
# string index, and plain [] with integer keys only works on it through
# pandas' legacy positional fallback.
book_indices = book_df[criterion].argsort()
sorted_positions = book_indices.values

trace1 = {"x": book_df[criterion].iloc[sorted_positions],
          "y": book_df["name"].iloc[sorted_positions],
          "marker": {"color": colors[index], "size": 12},
          "mode": "markers",
          "name": pretty_criterion,
          "type": "scatter"
}

data = [trace1]
layout = {"title": "Bible Books by Length",
          "xaxis": {"title": "Number of " + pretty_criterion},
          "height": height_px
         }

fig = go.Figure(data=data, layout=layout)
py.iplot(fig, filename='bible_dot_plot_' + criterion)

In [168]:
# Dot plot of every book ranked by one length metric.
# `index` selects the metric, its axis label and its colour from the parallel
# lists criteria / pretty_criteria / colors.
index = 2
criterion = criteria[index]
pretty_criterion = pretty_criteria[index]

# argsort() yields row *positions* in ascending order of the metric. They are
# applied with .iloc so the lookup is explicitly positional: book_df carries a
# string index, and plain [] with integer keys only works on it through
# pandas' legacy positional fallback.
book_indices = book_df[criterion].argsort()
sorted_positions = book_indices.values

trace1 = {"x": book_df[criterion].iloc[sorted_positions],
          "y": book_df["name"].iloc[sorted_positions],
          "marker": {"color": colors[index], "size": 12},
          "mode": "markers",
          "name": pretty_criterion,
          "type": "scatter"
}

data = [trace1]
layout = {"title": "Bible Books by Length",
          "xaxis": {"title": "Number of " + pretty_criterion},
          "height": height_px
         }

fig = go.Figure(data=data, layout=layout)
py.iplot(fig, filename='bible_dot_plot_' + criterion)