In [1]:
import pandas as pd
import numpy as np

In [2]:
data_df = pd.read_csv('bible_data_set.csv')

print(data_df)

               citation        book  chapter  verse  \
0           Genesis 1:1     Genesis        1      1   
1           Genesis 1:2     Genesis        1      2   
2           Genesis 1:3     Genesis        1      3   
3           Genesis 1:4     Genesis        1      4   
4           Genesis 1:5     Genesis        1      5   
...                 ...         ...      ...    ...   
31097  Revelation 22:17  Revelation       22     17   
31098  Revelation 22:18  Revelation       22     18   
31099  Revelation 22:19  Revelation       22     19   
31100  Revelation 22:20  Revelation       22     20   
31101  Revelation 22:21  Revelation       22     21   

                                                    text  
0      In the beginning God created the heaven and th...  
1      And the earth was without form, and void; and ...  
2      And God said, Let there be light: and there wa...  
3      And God saw the light, that it was good: and G...  
4      And God called the light Day, and the

In [3]:
#1- Looking for missing values.
print(data_df.isnull())#Looks good- no missing values.

       citation   book  chapter  verse   text
0         False  False    False  False  False
1         False  False    False  False  False
2         False  False    False  False  False
3         False  False    False  False  False
4         False  False    False  False  False
...         ...    ...      ...    ...    ...
31097     False  False    False  False  False
31098     False  False    False  False  False
31099     False  False    False  False  False
31100     False  False    False  False  False
31101     False  False    False  False  False

[31102 rows x 5 columns]


In [4]:
#1.b - To be sure missing values aren't where we can't see them
print(data_df.isnull().sum())#Looks good!

citation    0
book        0
chapter     0
verse       0
text        0
dtype: int64


In [5]:
#2 - Identify duplicate records.
print(data_df.duplicated())#Looks good, but maybe they are duplicated where we can't see them.

0        False
1        False
2        False
3        False
4        False
         ...  
31097    False
31098    False
31099    False
31100    False
31101    False
Length: 31102, dtype: bool


In [6]:
#2.b - Count duplicate records
print(data_df.duplicated().sum)#Looks good- no duplicates.

<bound method NDFrame._add_numeric_operations.<locals>.sum of 0        False
1        False
2        False
3        False
4        False
         ...  
31097    False
31098    False
31099    False
31100    False
31101    False
Length: 31102, dtype: bool>


In [7]:
#3 - Trim whitespace.
data_df['text'] = data_df['text'].str.strip()

In [8]:
print(data_df)

               citation        book  chapter  verse  \
0           Genesis 1:1     Genesis        1      1   
1           Genesis 1:2     Genesis        1      2   
2           Genesis 1:3     Genesis        1      3   
3           Genesis 1:4     Genesis        1      4   
4           Genesis 1:5     Genesis        1      5   
...                 ...         ...      ...    ...   
31097  Revelation 22:17  Revelation       22     17   
31098  Revelation 22:18  Revelation       22     18   
31099  Revelation 22:19  Revelation       22     19   
31100  Revelation 22:20  Revelation       22     20   
31101  Revelation 22:21  Revelation       22     21   

                                                    text  
0      In the beginning God created the heaven and th...  
1      And the earth was without form, and void; and ...  
2      And God said, Let there be light: and there wa...  
3      And God saw the light, that it was good: and G...  
4      And God called the light Day, and the

In [9]:
data_df

Unnamed: 0,citation,book,chapter,verse,text
0,Genesis 1:1,Genesis,1,1,In the beginning God created the heaven and th...
1,Genesis 1:2,Genesis,1,2,"And the earth was without form, and void; and ..."
2,Genesis 1:3,Genesis,1,3,"And God said, Let there be light: and there wa..."
3,Genesis 1:4,Genesis,1,4,"And God saw the light, that it was good: and G..."
4,Genesis 1:5,Genesis,1,5,"And God called the light Day, and the darkness..."
...,...,...,...,...,...
31097,Revelation 22:17,Revelation,22,17,"And the Spirit and the bride say, Come. And le..."
31098,Revelation 22:18,Revelation,22,18,For I testify unto every man that heareth the ...
31099,Revelation 22:19,Revelation,22,19,And if any man shall take away from the words ...
31100,Revelation 22:20,Revelation,22,20,"He which testifieth these things saith, Surely..."


In [10]:
#4 - Column citation contains the book chapter and verse but is repeated in other columns and separated.  Delete the column citation.
del data_df['citation']

In [11]:
data_df#View the dataframe to review.

Unnamed: 0,book,chapter,verse,text
0,Genesis,1,1,In the beginning God created the heaven and th...
1,Genesis,1,2,"And the earth was without form, and void; and ..."
2,Genesis,1,3,"And God said, Let there be light: and there wa..."
3,Genesis,1,4,"And God saw the light, that it was good: and G..."
4,Genesis,1,5,"And God called the light Day, and the darkness..."
...,...,...,...,...
31097,Revelation,22,17,"And the Spirit and the bride say, Come. And le..."
31098,Revelation,22,18,For I testify unto every man that heareth the ...
31099,Revelation,22,19,And if any man shall take away from the words ...
31100,Revelation,22,20,"He which testifieth these things saith, Surely..."


In [12]:
#5 - Rename my columns with capitals.
data_df.rename(columns = {'book':'Book', 'chapter':'Chapters', 'verse':'Verse', 'text':'Text'}, inplace = True)
print(data_df.columns)

Index(['Book', 'Chapters', 'Verse', 'Text'], dtype='object')


In [13]:
data_df

Unnamed: 0,Book,Chapters,Verse,Text
0,Genesis,1,1,In the beginning God created the heaven and th...
1,Genesis,1,2,"And the earth was without form, and void; and ..."
2,Genesis,1,3,"And God said, Let there be light: and there wa..."
3,Genesis,1,4,"And God saw the light, that it was good: and G..."
4,Genesis,1,5,"And God called the light Day, and the darkness..."
...,...,...,...,...
31097,Revelation,22,17,"And the Spirit and the bride say, Come. And le..."
31098,Revelation,22,18,For I testify unto every man that heareth the ...
31099,Revelation,22,19,And if any man shall take away from the words ...
31100,Revelation,22,20,"He which testifieth these things saith, Surely..."


In [None]:
#The ethical implications of changing this dataset could occur when the
#person isn't familiar with the data and inadvertantly biases the data by removing
#columns or changing the data- like removing quotes where they may have
#a specific meaning.  I don't think there are any ethical considerations specifically
#with what I modified but the potential always exists when manipulating data.