In [74]:
import pandas as pd
import datetime
import sqlite3

# NYC Dog Licensing Dataset (ETL): Extract

In [75]:
#pull and save NYC Dog Licensing Dataset from NYC Open Data
dogs = pd.read_csv("NYC_Dog_Licensing_Dataset.csv")
print(dogs.shape)
dogs.head()

(121949, 15)


Unnamed: 0,RowNumber,AnimalName,AnimalGender,AnimalBirthMonth,BreedName,Borough,ZipCode,CommunityDistrict,CensusTract2010,NTA,CityCouncilDistrict,CongressionalDistrict,StateSenatorialDistrict,LicenseIssuedDate,LicenseExpiredDate
0,533,BONITA,F,05/01/2013 12:00:00 AM,Unknown,Queens,11435.0,412.0,208.0,QN61,28.0,5.0,10.0,10/24/2014,11/15/2017
1,548,ROCKY,M,05/01/2014 12:00:00 AM,Labrador Retriever Crossbreed,Queens,11691.0,414.0,100801.0,QN15,31.0,5.0,10.0,10/25/2014,10/25/2019
2,622,BULLY,M,07/01/2010 12:00:00 AM,American Pit Bull Terrier/Pit Bull,Queens,11419.0,410.0,98.0,QN55,28.0,5.0,10.0,10/28/2014,09/24/2016
3,633,COCO,M,02/01/2005 12:00:00 AM,Labrador Retriever,Queens,11692.0,414.0,964.0,QN12,31.0,5.0,10.0,10/29/2014,10/29/2017
4,655,SKI,F,09/01/2012 12:00:00 AM,American Pit Bull Terrier/Pit Bull,Queens,11691.0,414.0,100802.0,QN15,31.0,5.0,10.0,10/31/2014,10/31/2019


# NYC Dog Licensing Dataset (ETL): Transform

In [76]:
#drop extraneous columns of dataset
dogs = dogs.drop(['AnimalBirthMonth',
                  'CommunityDistrict',
                  'CensusTract2010',
                  'NTA',
                  'CityCouncilDistrict',
                  'CongressionalDistrict',
                  'StateSenatorialDistrict'], axis=1)

In [77]:
#drop records with missing data
dogs = dogs.dropna()
dogs.shape

(121713, 8)

In [78]:
#set integer types
dogs.RowNumber = dogs.RowNumber.astype(int)
dogs.ZipCode = dogs.ZipCode.astype(int)

#convert dates to datetime objects
dogs.LicenseIssuedDate = pd.to_datetime(dogs.LicenseIssuedDate)
print(dogs.LicenseIssuedDate.head())

dogs.LicenseExpiredDate = pd.to_datetime(dogs.LicenseExpiredDate)
print(dogs.LicenseExpiredDate.head())

0   2014-10-24
1   2014-10-25
2   2014-10-28
3   2014-10-29
4   2014-10-31
Name: LicenseIssuedDate, dtype: datetime64[ns]
0   2017-11-15
1   2019-10-25
2   2016-09-24
3   2017-10-29
4   2019-10-31
Name: LicenseExpiredDate, dtype: datetime64[ns]


In [79]:
#check data types of dataset
dogs.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 121713 entries, 0 to 121948
Data columns (total 8 columns):
RowNumber             121713 non-null int64
AnimalName            121713 non-null object
AnimalGender          121713 non-null object
BreedName             121713 non-null object
Borough               121713 non-null object
ZipCode               121713 non-null int64
LicenseIssuedDate     121713 non-null datetime64[ns]
LicenseExpiredDate    121713 non-null datetime64[ns]
dtypes: datetime64[ns](2), int64(2), object(4)
memory usage: 8.4+ MB


In [80]:
#review cleaned dataset
dogs.head()

Unnamed: 0,RowNumber,AnimalName,AnimalGender,BreedName,Borough,ZipCode,LicenseIssuedDate,LicenseExpiredDate
0,533,BONITA,F,Unknown,Queens,11435,2014-10-24,2017-11-15
1,548,ROCKY,M,Labrador Retriever Crossbreed,Queens,11691,2014-10-25,2019-10-25
2,622,BULLY,M,American Pit Bull Terrier/Pit Bull,Queens,11419,2014-10-28,2016-09-24
3,633,COCO,M,Labrador Retriever,Queens,11692,2014-10-29,2017-10-29
4,655,SKI,F,American Pit Bull Terrier/Pit Bull,Queens,11691,2014-10-31,2019-10-31


# NYC Dog Licensing Dataset (ETL): Load

In [81]:
#creating SQL connection
conn = sqlite3.connect('pet_care_industry.db')
c = conn.cursor()

#function to create table
def create_table(query):
    c.execute(query)

#function to close connection
def close_c_conn():
    c.close()
    conn.close()

In [82]:
#create dogs table
create_query = """CREATE TABLE IF NOT EXISTS dogs
                (RowNumber INTEGER PRIMARY KEY,
                 AnimalName TEXT,
                 AnimalGender TEXT,
                 BreedName TEXT,
                 Borough TEXT,
                 ZipCode INTEGER,
                 LicenseIssuedDate TEXT,
                 LicenseExpiredDate TEXT);"""

create_table(create_query)

In [88]:
#function to insert dogs into table
def insert_dogs(dogs):
    for dog in dogs:
        c.execute("""INSERT INTO dogs
                  (RowNumber,
                   AnimalName,
                   AnimalGender,
                   BreedName,
                   Borough,
                   ZipCode,
                   LicenseIssuedDate,
                   LicenseExpiredDate)
                   VALUES
                   (?,?,?,?,?,?,?,?);""",
                   (dog['RowNumber'],
                    dog['AnimalName'],
                    dog['AnimalGender'],
                    dog['BreedName'],
                    dog['Borough'],
                    dog['ZipCode'],
                    dog['LicenseIssuedDate'],
                    dog['LicenseExpiredDate']))
        
    conn.commit()
    
#insert dogs into table
insert_dogs(dogs)
    
pd.read_sql_query("SELECT * FROM dogs LIMIT 5;", conn)

TypeError: string indices must be integers

In [60]:
#close connection
close_c_conn()