In [63]:
import os
# Find the latest version of spark 3.0 from http://www.apache.org/dist/spark/ and enter as the spark version
# For example:
# spark_version = 'spark-3.0.3'
spark_version = 'spark-3.0.3'
os.environ['SPARK_VERSION']=spark_version

# Install Spark and Java
!apt-get update
!apt-get install openjdk-11-jdk-headless -qq > /dev/null
!wget -q http://www.apache.org/dist/spark/$SPARK_VERSION/$SPARK_VERSION-bin-hadoop2.7.tgz
!tar xf $SPARK_VERSION-bin-hadoop2.7.tgz
!pip install -q findspark

# Set Environment Variables
import os
os.environ["JAVA_HOME"] = "/usr/lib/jvm/java-11-openjdk-amd64"
os.environ["SPARK_HOME"] = f"/content/{spark_version}-bin-hadoop2.7"

# Start a SparkSession
import findspark
findspark.init()

0% [Working]            Get:1 http://security.ubuntu.com/ubuntu bionic-security InRelease [88.7 kB]
0% [Connecting to archive.ubuntu.com (91.189.88.152)] [1 InRelease 14.2 kB/88.7                                                                               Hit:2 https://cloud.r-project.org/bin/linux/ubuntu bionic-cran40/ InRelease
Ign:3 https://developer.download.nvidia.com/compute/cuda/repos/ubuntu1804/x86_64  InRelease
Hit:4 http://ppa.launchpad.net/c2d4u.team/c2d4u4.0+/ubuntu bionic InRelease
Ign:5 https://developer.download.nvidia.com/compute/machine-learning/repos/ubuntu1804/x86_64  InRelease
Hit:6 http://archive.ubuntu.com/ubuntu bionic InRelease
Hit:7 https://developer.download.nvidia.com/compute/cuda/repos/ubuntu1804/x86_64  Release
Hit:8 https://developer.download.nvidia.com/compute/machine-learning/repos/ubuntu1804/x86_64  Release
Get:9 http://archive.ubuntu.com/ubuntu bionic-updates InRelease [88.7 kB]
Hit:10 http://ppa.launchpad.net/cran/libgit2/ubuntu bionic InRelea

In [64]:
# Download the Postgres driver that will allow Spark to interact with Postgres.
!wget https://jdbc.postgresql.org/download/postgresql-42.2.16.jar

--2022-01-08 17:44:38--  https://jdbc.postgresql.org/download/postgresql-42.2.16.jar
Resolving jdbc.postgresql.org (jdbc.postgresql.org)... 72.32.157.228, 2001:4800:3e1:1::228
Connecting to jdbc.postgresql.org (jdbc.postgresql.org)|72.32.157.228|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 1002883 (979K) [application/java-archive]
Saving to: ‘postgresql-42.2.16.jar.2’


2022-01-08 17:44:38 (6.44 MB/s) - ‘postgresql-42.2.16.jar.2’ saved [1002883/1002883]



In [65]:
from pyspark.sql import SparkSession
spark = SparkSession.builder.appName("Animal_Shelter_Project").config("spark.driver.extraClassPath","/content/postgresql-42.2.16.jar").getOrCreate()

### Load Amazon Data into Spark DataFrame

In [66]:
# Read in data from S3 Buckets
from pyspark import SparkFiles
url = "https://lexierotto.s3.us-east-2.amazonaws.com/Breed_Info2.csv"
spark.sparkContext.addFile(url)
dog_breed_df = spark.read.csv(SparkFiles.get("Breed_Info2.csv"), sep=",", header=True, inferSchema=True)
dog_breed_df.show()

+--------------------+------+------------------------+
|               Breed|  Size|Average_life_expectancy |
+--------------------+------+------------------------+
|       Pit Bull Mix | large|                      11|
|Chihuahua Shortha...| small|                      15|
|Labrador Retrieve...|large |                      13|
|  German Sheperd Mix| large|                      10|
|Australian Cattle...|medium|                      13|
|       Dachshund Mix| small|                      14|
|          Boxer Mix | large|                       9|
|Miniature Poodle ...| small|                      14|
|   Border Collie Mix|medium|                      12|
|      Catahoula Mix | large|                      12|
|    Rat Terrier Mix | small|                      16|
|Australian Shephe...|medium|                      13|
|Yorkshire Terrier...| small|                      14|
|  Siberian Husky Mix|medium|                      13|
|Jack Russell Terr...| small|                      15|
+---------

In [67]:
#read in the uncleaned shelter info 
from pyspark import SparkFiles
url = "https://lexierotto.s3.us-east-2.amazonaws.com/aac_shelter_outcomes.csv"
spark.sparkContext.addFile(url)
uncleaned_shelter_df = spark.read.csv(SparkFiles.get("aac_shelter_outcomes.csv"), sep=",", header=True, inferSchema=True)
uncleaned_shelter_df.show()

+----------------+---------+-----------+--------------------+-------------------+-------------------+-------------------+-------------------+-----------+---------------+---------------+----------------+
|age_upon_outcome|animal_id|animal_type|               breed|              color|      date_of_birth|           datetime|          monthyear|       name|outcome_subtype|   outcome_type|sex_upon_outcome|
+----------------+---------+-----------+--------------------+-------------------+-------------------+-------------------+-------------------+-----------+---------------+---------------+----------------+
|         2 weeks|  A684346|        Cat|Domestic Shorthai...|       Orange Tabby|2014-07-07 00:00:00|2014-07-22 16:04:00|2014-07-22 16:04:00|       null|        Partner|       Transfer|     Intact Male|
|          1 year|  A666430|        Dog|          Beagle Mix|        White/Brown|2012-11-06 00:00:00|2013-11-07 11:47:00|2013-11-07 11:47:00|       Lucy|        Partner|       Transfer|   

###Turn Spark DataFrames into Pandas Dataframes

In [68]:
#turn this into a pandas dataframe 
import pandas as pd

In [69]:
df = uncleaned_shelter_df.toPandas()


In [70]:
df.head()

Unnamed: 0,age_upon_outcome,animal_id,animal_type,breed,color,date_of_birth,datetime,monthyear,name,outcome_subtype,outcome_type,sex_upon_outcome
0,2 weeks,A684346,Cat,Domestic Shorthair Mix,Orange Tabby,2014-07-07,2014-07-22 16:04:00,2014-07-22 16:04:00,,Partner,Transfer,Intact Male
1,1 year,A666430,Dog,Beagle Mix,White/Brown,2012-11-06,2013-11-07 11:47:00,2013-11-07 11:47:00,Lucy,Partner,Transfer,Spayed Female
2,1 year,A675708,Dog,Pit Bull,Blue/White,2013-03-31,2014-06-03 14:20:00,2014-06-03 14:20:00,*Johnny,,Adoption,Neutered Male
3,9 years,A680386,Dog,Miniature Schnauzer Mix,White,2005-06-02,2014-06-15 15:50:00,2014-06-15 15:50:00,Monday,Partner,Transfer,Neutered Male
4,5 months,A683115,Other,Bat Mix,Brown,2014-01-07,2014-07-07 14:04:00,2014-07-07 14:04:00,,Rabies Risk,Euthanasia,Unknown


In [71]:
breed_df = dog_breed_df.toPandas()

In [72]:
breed_df.head()

Unnamed: 0,Breed,Size,Average_life_expectancy
0,Pit Bull Mix,large,11
1,Chihuahua Shorthair mix,small,15
2,Labrador Retriever Mix,large,13
3,German Sheperd Mix,large,10
4,Australian Cattle Dog Mix,medium,13


###Clean the shelter dataframe

In [73]:
dfdogs = df[df["animal_type"] == "Dog"]

In [74]:
dfdogs

Unnamed: 0,age_upon_outcome,animal_id,animal_type,breed,color,date_of_birth,datetime,monthyear,name,outcome_subtype,outcome_type,sex_upon_outcome
1,1 year,A666430,Dog,Beagle Mix,White/Brown,2012-11-06,2013-11-07 11:47:00,2013-11-07 11:47:00,Lucy,Partner,Transfer,Spayed Female
2,1 year,A675708,Dog,Pit Bull,Blue/White,2013-03-31,2014-06-03 14:20:00,2014-06-03 14:20:00,*Johnny,,Adoption,Neutered Male
3,9 years,A680386,Dog,Miniature Schnauzer Mix,White,2005-06-02,2014-06-15 15:50:00,2014-06-15 15:50:00,Monday,Partner,Transfer,Neutered Male
5,4 months,A664462,Dog,Leonberger Mix,Brown/White,2013-06-03,2013-10-07 13:06:00,2013-10-07 13:06:00,*Edgar,Partner,Transfer,Intact Male
7,3 years,A692618,Dog,Chihuahua Shorthair Mix,Brown,2011-11-23,2014-12-08 15:55:00,2014-12-08 15:55:00,*Ella,Partner,Transfer,Spayed Female
...,...,...,...,...,...,...,...,...,...,...,...,...
78250,1 month,A764895,Dog,Golden Retriever/Labrador Retriever,Brown/White,2017-12-04,2018-02-01 18:40:00,2018-02-01 18:40:00,,Foster,Adoption,Neutered Male
78251,1 month,A764894,Dog,Golden Retriever/Labrador Retriever,Brown/White,2017-12-04,2018-02-01 18:26:00,2018-02-01 18:26:00,,Foster,Adoption,Spayed Female
78252,3 years,A764468,Dog,Mastiff Mix,Blue/White,2014-12-30,2018-02-01 18:06:00,2018-02-01 18:06:00,Max,,Adoption,Neutered Male
78254,2 months,A765858,Dog,Standard Schnauzer,Red,2017-11-13,2018-02-01 18:32:00,2018-02-01 18:32:00,,,Adoption,Spayed Female


In [75]:
dfdogs_cleaned = dfdogs.drop(columns = ["age_upon_outcome", "animal_type", "color", "name", "outcome_subtype", "monthyear"])

In [76]:
dfdogs_cleaned.head(15)

Unnamed: 0,animal_id,breed,date_of_birth,datetime,outcome_type,sex_upon_outcome
1,A666430,Beagle Mix,2012-11-06,2013-11-07 11:47:00,Transfer,Spayed Female
2,A675708,Pit Bull,2013-03-31,2014-06-03 14:20:00,Adoption,Neutered Male
3,A680386,Miniature Schnauzer Mix,2005-06-02,2014-06-15 15:50:00,Transfer,Neutered Male
5,A664462,Leonberger Mix,2013-06-03,2013-10-07 13:06:00,Transfer,Intact Male
7,A692618,Chihuahua Shorthair Mix,2011-11-23,2014-12-08 15:55:00,Transfer,Spayed Female
11,A673652,Papillon/Border Collie,2012-02-28,2014-03-28 14:39:00,Transfer,Neutered Male
12,A677679,Chihuahua Shorthair/Pomeranian,2014-03-07,2014-05-26 19:10:00,Adoption,Neutered Male
13,A640655,Miniature Schnauzer/Miniature Poodle,2009-04-27,2014-04-25 11:17:00,Return to Owner,Spayed Female
14,A690350,Labrador Retriever Mix,2006-10-18,2014-10-26 18:20:00,Return to Owner,Neutered Male
15,A680396,Rat Terrier Mix,2012-06-02,2014-06-15 15:11:00,Transfer,Neutered Male


In [77]:
dfdogs_cleaned["date_of_birth"] = pd.to_datetime(dfdogs_cleaned["date_of_birth"])
dfdogs_cleaned["datetime"] = pd.to_datetime(dfdogs_cleaned["datetime"])

In [78]:
dfdogs_cleaned.head(60)

Unnamed: 0,animal_id,breed,date_of_birth,datetime,outcome_type,sex_upon_outcome
1,A666430,Beagle Mix,2012-11-06,2013-11-07 11:47:00,Transfer,Spayed Female
2,A675708,Pit Bull,2013-03-31,2014-06-03 14:20:00,Adoption,Neutered Male
3,A680386,Miniature Schnauzer Mix,2005-06-02,2014-06-15 15:50:00,Transfer,Neutered Male
5,A664462,Leonberger Mix,2013-06-03,2013-10-07 13:06:00,Transfer,Intact Male
7,A692618,Chihuahua Shorthair Mix,2011-11-23,2014-12-08 15:55:00,Transfer,Spayed Female
11,A673652,Papillon/Border Collie,2012-02-28,2014-03-28 14:39:00,Transfer,Neutered Male
12,A677679,Chihuahua Shorthair/Pomeranian,2014-03-07,2014-05-26 19:10:00,Adoption,Neutered Male
13,A640655,Miniature Schnauzer/Miniature Poodle,2009-04-27,2014-04-25 11:17:00,Return to Owner,Spayed Female
14,A690350,Labrador Retriever Mix,2006-10-18,2014-10-26 18:20:00,Return to Owner,Neutered Male
15,A680396,Rat Terrier Mix,2012-06-02,2014-06-15 15:11:00,Transfer,Neutered Male


###Write the Cleaned Dataframes into tables in RDS

In [79]:
# Store environmental variable
from getpass import getpass
password = getpass('Whitedog24')
# Configure settings for RDS
mode = "append"
jdbc_url="jdbc:postgresql://dataviz.cbl71rrsnrzm.us-east-1.rds.amazonaws.com:5432/database_practice"
config = {"user":"postgres",
          "password": "Whitedog24",
          "driver":"org.postgresql.Driver"}

Whitedog24··········


In [85]:
# Write breed_df to table in RDS (The csv that lexie made)
breed_df.write.jdbc(url=jdbc_url, table='breed_info', mode=mode, properties=config)

AttributeError: ignored

In [None]:
# Write review_id_df to table in RDS (The data that is cleaned from kaggle)
#dfdogs_cleaned.write.jdbc(url=jdbc_url, table='shelter_info', mode=mode, properties=config)

###Join the Two Tables on the Dog Breed Column

In [None]:
#Concatenate the two dataframes