In [1]:
from pathlib import Path
from urllib.parse import quote_plus

import numpy as np
import pandas as pd

In [2]:
from sqlalchemy import create_engine
from config import db_password

# Build the connection URL for the local Animal_Shelter PostgreSQL database.
# The password is percent-encoded before being embedded in the URL so that
# reserved characters (e.g. "@", ":", "/", "#", "%") cannot corrupt the
# user:password@host portion when SQLAlchemy parses the string.
# NOTE: config.db_password must hold the raw (not already URL-encoded) password.
encoded_password = quote_plus(str(db_password))
db_string = f"postgresql://postgres:{encoded_password}@127.0.0.1:5432/Animal_Shelter"
engine = create_engine(db_string)

In [3]:
# Read the whole 'intakes_outcomes' table from the database into a DataFrame
intakes_outcomes_df = pd.read_sql_table('intakes_outcomes', con=engine)
# Bare last expression: renders a preview of the frame as the cell output
intakes_outcomes_df

Unnamed: 0,animal_id,intake_date,intake_type,intake_condition,animal_type,sex_upon_intake,age_upon_intake,outcome_date,outcome_type,age_upon_outcome
0,A720371,2016-02-08,Stray,Normal,Dog,Male,0.0,2016-02-13,Adoption or RTO,0.0
1,A659412,2018-06-13,Owner Surrender,Normal,Dog,Female,5.0,2020-10-05,Adoption or RTO,7.0
2,A814515,2020-03-01,Owner Surrender,Normal,Dog,Male,2.0,2020-05-06,Adoption or RTO,2.0
3,A689724,2014-10-08,Stray,Normal,Cat,Male,0.0,2014-10-18,Adoption or RTO,0.0
4,A680969,2014-06-10,Stray,Sick,Cat,Male,0.0,2014-08-05,Adoption or RTO,0.0
...,...,...,...,...,...,...,...,...,...,...
99196,A828252,2021-01-12,Stray,Normal,Cat,Female,0.0,NaT,,
99197,A828480,2021-01-18,Public Assist,Normal,Dog,Female,2.0,NaT,,
99198,A791070,2019-03-31,Public Assist,Normal,Dog,Male,1.0,NaT,,
99199,A820274,2020-07-16,Stray,Normal,Dog,Male,6.0,NaT,,


In [4]:
# Inspect the dtype pandas assigned to each column when loading the table
intakes_outcomes_df.dtypes

animal_id                   object
intake_date         datetime64[ns]
intake_type                 object
intake_condition            object
animal_type                 object
sex_upon_intake             object
age_upon_intake            float64
outcome_date        datetime64[ns]
outcome_type                object
age_upon_outcome           float64
dtype: object

In [5]:
# Tally the missing entries (NaN / NaT) in every column
intakes_outcomes_df.isna().sum()

animal_id             0
intake_date           0
intake_type           0
intake_condition      0
animal_type           0
sex_upon_intake       0
age_upon_intake       0
outcome_date        421
outcome_type        421
age_upon_outcome    421
dtype: int64

In [6]:
# Fill the missing outcome fields:
#   - outcome_type      -> explicit 'Still in center' label
#   - outcome_date      -> falls back to the row's intake_date
#   - age_upon_outcome  -> falls back to the row's age_upon_intake
intakes_outcomes_df = intakes_outcomes_df.assign(
    outcome_type=lambda df: df['outcome_type'].fillna('Still in center'),
    outcome_date=lambda df: df['outcome_date'].fillna(df['intake_date']),
    age_upon_outcome=lambda df: df['age_upon_outcome'].fillna(df['age_upon_intake']),
)

In [7]:
# Preview the frame again to check the outcome columns after the fill step
intakes_outcomes_df

Unnamed: 0,animal_id,intake_date,intake_type,intake_condition,animal_type,sex_upon_intake,age_upon_intake,outcome_date,outcome_type,age_upon_outcome
0,A720371,2016-02-08,Stray,Normal,Dog,Male,0.0,2016-02-13,Adoption or RTO,0.0
1,A659412,2018-06-13,Owner Surrender,Normal,Dog,Female,5.0,2020-10-05,Adoption or RTO,7.0
2,A814515,2020-03-01,Owner Surrender,Normal,Dog,Male,2.0,2020-05-06,Adoption or RTO,2.0
3,A689724,2014-10-08,Stray,Normal,Cat,Male,0.0,2014-10-18,Adoption or RTO,0.0
4,A680969,2014-06-10,Stray,Sick,Cat,Male,0.0,2014-08-05,Adoption or RTO,0.0
...,...,...,...,...,...,...,...,...,...,...
99196,A828252,2021-01-12,Stray,Normal,Cat,Female,0.0,2021-01-12,Still in center,0.0
99197,A828480,2021-01-18,Public Assist,Normal,Dog,Female,2.0,2021-01-18,Still in center,2.0
99198,A791070,2019-03-31,Public Assist,Normal,Dog,Male,1.0,2019-03-31,Still in center,1.0
99199,A820274,2020-07-16,Stray,Normal,Dog,Male,6.0,2020-07-16,Still in center,6.0


In [8]:
# Re-inspect column dtypes after the missing values were filled
intakes_outcomes_df.dtypes

animal_id                   object
intake_date         datetime64[ns]
intake_type                 object
intake_condition            object
animal_type                 object
sex_upon_intake             object
age_upon_intake            float64
outcome_date        datetime64[ns]
outcome_type                object
age_upon_outcome           float64
dtype: object

In [9]:
# Cast "age_upon_outcome" from float to integer (safe now that it has no NaN left)
intakes_outcomes_df = intakes_outcomes_df.astype({'age_upon_outcome': int})

In [10]:
# Make sure both date columns are datetime64. The dtypes output above already
# reports datetime64[ns] for them, so this acts as a defensive re-parse.
date_columns = ['intake_date', 'outcome_date']
intakes_outcomes_df[date_columns] = intakes_outcomes_df[date_columns].apply(pd.to_datetime)

In [11]:
# Add "days_in_center": whole days elapsed between intake and outcome for each animal
intakes_outcomes_df = intakes_outcomes_df.assign(
    days_in_center=lambda df: (df['outcome_date'] - df['intake_date']).dt.days
)

In [12]:
# List the distinct stay lengths to spot implausible values (e.g. negatives)
intakes_outcomes_df["days_in_center"].unique()

array([    5,   845,    66, ..., -1100,  -655, -2219])

In [13]:
# Discard records with a negative "days_in_center" (outcome dated before intake = invalid)
has_negative_stay = intakes_outcomes_df['days_in_center'] < 0
intakes_outcomes_df = intakes_outcomes_df.loc[~has_negative_stay]

In [14]:
# Sort data by intake date (ascending); the original row labels are kept,
# which is why the index in the preview below is no longer sequential
intakes_outcomes_df = intakes_outcomes_df.sort_values(by='intake_date')
# Bare last expression: preview the sorted frame
intakes_outcomes_df

Unnamed: 0,animal_id,intake_date,intake_type,intake_condition,animal_type,sex_upon_intake,age_upon_intake,outcome_date,outcome_type,age_upon_outcome,days_in_center
69267,A664321,2013-10-01,Stray,Normal,Dog,Female,1.0,2013-10-05,Transfer,1,4
29304,A664253,2013-10-01,Public Assist,Normal,Dog,Male,2.0,2013-10-07,Adoption or RTO,2,6
40170,A664293,2013-10-01,Stray,Normal,Cat,Male,0.0,2013-10-12,Transfer,0,11
53916,A664299,2013-10-01,Stray,Normal,Dog,Male,0.0,2013-10-05,Transfer,0,4
83252,A664309,2013-10-01,Stray,Normal,Dog,Male,1.0,2013-10-16,Transfer,1,15
...,...,...,...,...,...,...,...,...,...,...,...
99012,A830150,2021-03-03,Stray,Normal,Dog,Male,2.0,2021-03-03,Still in center,2,0
49474,A830183,2021-03-03,Stray,Sick,Dog,Female,0.0,2021-03-03,Still in center,0,0
49554,A830131,2021-03-03,Stray,Sick,Dog,Male,0.0,2021-03-03,Still in center,0,0
49486,A830158,2021-03-03,Stray,Normal,Dog,Male,2.0,2021-03-03,Still in center,2,0


In [15]:
# Keep only rows whose intake date AND outcome date both fall inside the
# 2014-01-01 .. 2020-12-31 window (both bounds inclusive)
first_day, last_day = '2014-01-01', '2020-12-31'
intake_in_window = intakes_outcomes_df['intake_date'].between(first_day, last_day)
outcome_in_window = intakes_outcomes_df['outcome_date'].between(first_day, last_day)
intakes_outcomes_df = intakes_outcomes_df.loc[intake_in_window & outcome_in_window]

In [16]:
# Export the cleaned data to CSV (without the DataFrame index).
# The output folder is created first: to_csv does not create missing parent
# directories, so on a fresh checkout without "Data/" the write would fail.
clean_csv_path = Path('Data/clean_data.csv')
clean_csv_path.parent.mkdir(parents=True, exist_ok=True)
intakes_outcomes_df.to_csv(clean_csv_path, index=False)

In [17]:
# Publish the cleaned data to the 'clean_data' table in PostgreSQL (index not written).
# if_exists='replace' keeps the notebook re-runnable: with the default ('fail'),
# to_sql raises ValueError as soon as the table exists from a previous run.
# The table is dropped and recreated on every run.
intakes_outcomes_df.to_sql(name='clean_data', con=engine, index=False, if_exists='replace')