In [46]:
import pandas as pd 

In [47]:
# Load the raw NYC dog licensing export.
# low_memory=False makes pandas infer each column's dtype from the whole file
# instead of chunk by chunk, which avoids the mixed-type DtypeWarning and keeps
# the values inside a column consistently typed.
df = pd.read_csv('NYC_Dog_Licensing_Dataset_.csv', low_memory=False)

  df = pd.read_csv('NYC_Dog_Licensing_Dataset_.csv')


In [48]:
# Show the dtype pandas inferred for each column of the freshly loaded data
df.dtypes

AnimalName             object
AnimalGender           object
AnimalBirthYear        object
BreedName              object
ZipCode               float64
LicenseIssuedDate      object
LicenseExpiredDate     object
Extract Year            int64
dtype: object

In [49]:
# Preview the first 100 rows of the raw data
df.head(100)

Unnamed: 0,AnimalName,AnimalGender,AnimalBirthYear,BreedName,ZipCode,LicenseIssuedDate,LicenseExpiredDate,Extract Year
0,PAIGE,F,2014,American Pit Bull Mix / Pit Bull Mix,10035.0,09/12/2014,09/12/2017,2016
1,YOGI,M,2010,Boxer,10465.0,09/12/2014,10/02/2017,2016
2,ALI,M,2014,Basenji,10013.0,09/12/2014,09/12/2019,2016
3,QUEEN,F,2013,Akita Crossbreed,10013.0,09/12/2014,09/12/2017,2016
4,LOLA,F,2009,Maltese,10028.0,09/12/2014,10/09/2017,2016
...,...,...,...,...,...,...,...,...
95,SPUDS,M,2010,Bulldog,10461.0,09/17/2014,09/17/2017,2016
96,BOO,F,2013,Labradoodle,11211.0,09/17/2014,09/17/2019,2016
97,SAMMY,M,2006,Shiba Inu,10011.0,09/17/2014,10/31/2016,2016
98,MAX,M,2005,Pug,10025.0,09/17/2014,10/30/2016,2016


In [50]:
# List the column labels of the DataFrame
df.columns

Index(['AnimalName', 'AnimalGender', 'AnimalBirthYear', 'BreedName', 'ZipCode',
       'LicenseIssuedDate', 'LicenseExpiredDate', 'Extract Year'],
      dtype='object')

### 🛠️ Step 1: Fix the Data Types for Each Column

Ensure all columns have the correct data types to support accurate analysis:
- Convert dates (e.g., `LicenseIssuedDate`) to `datetime`
- Convert numeric values (e.g., `AnimalBirthYear`) to a numeric dtype (`float64` here, because unparseable entries become `NaN`)
- Convert `ZipCode` to string to drop the float `.0` suffix and allow proper grouping (leading zeros are not recovered, because the column is loaded as a number)


In [51]:
# Parse both license date columns as datetimes;
# entries that cannot be parsed become NaT (errors='coerce')
for date_col in ('LicenseIssuedDate', 'LicenseExpiredDate'):
    df[date_col] = pd.to_datetime(df[date_col], errors='coerce')


In [52]:
# Parse birth years as numbers; entries that are not numeric become NaN
birth_year_raw = df['AnimalBirthYear']
df['AnimalBirthYear'] = pd.to_numeric(birth_year_raw, errors='coerce')


In [53]:
# Convert zip code to string so we can group by it without decimal places.
# Casting through the nullable 'Int64' dtype drops the float '.0' suffix.
# A plain astype(str) would turn missing ZIPs into the literal text '<NA>',
# hiding them from isnull(); .where() puts real missing values back.
zip_missing = df['ZipCode'].isna()
zip_text = df['ZipCode'].astype('Int64').astype(str)
df['ZipCode'] = zip_text.where(~zip_missing)

In [54]:
# Trim leading/trailing whitespace from the breed and gender values
for text_col in ('BreedName', 'AnimalGender'):
    df[text_col] = df[text_col].str.strip()


In [55]:
# Re-check the column dtypes after the type conversions
df.dtypes

AnimalName                    object
AnimalGender                  object
AnimalBirthYear              float64
BreedName                     object
ZipCode                       object
LicenseIssuedDate     datetime64[ns]
LicenseExpiredDate    datetime64[ns]
Extract Year                   int64
dtype: object

### Step 2: Check for Full Row Duplicates

Instead of checking for duplicates column by column (which might drop valid but similar entries like dogs with the same name), we checked for **exact duplicate rows**. 

For example, there might be two dogs named Luna, but they could have different birth years, breeds, or license dates, so we **only dropped rows that were completely identical across all columns**, ensuring no meaningful data was lost.


In [56]:
# Count rows that exactly repeat an earlier row (all columns equal)
df.duplicated().sum()

40545

In [57]:
# Remove exact duplicate rows, keeping the first occurrence of each
df = df.drop_duplicates(keep='first')

In [58]:
# Confirm that no exact duplicate rows remain
df.duplicated().sum()

0

### 🐾 Step 3: Create Age Groups for Dogs

We derived an `ApproxDogAge` column (license issue year minus birth year) and used it to categorize dogs into meaningful life stages. This helps support business questions about age distribution and trends.

We defined age bins as follows:
- `0–1` → Puppy  
- `2–3` → Young  
- `4–6` → Adult  
- `7–10` → Senior  
- `10+` → Elderly (ages 11–20; ages below 0 or above 20 fall outside the bins and are left blank)  

We also ensured proper display by replacing any special en dashes (–) with regular hyphens (-) to avoid formatting issues in Excel.


In [59]:
# Approximate age = year the license was issued minus the birth year
issued_year = df['LicenseIssuedDate'].dt.year
df['LicenseIssuedYear'] = issued_year
df['ApproxDogAge'] = issued_year - df['AnimalBirthYear']

In [60]:
# Bucket approximate ages into life-stage groups.
# Intervals are right-closed: [0, 1], (1, 3], (3, 6], (6, 10], (10, 20];
# ages outside 0-20 (or missing) match no bin and become NaN.
age_bin_edges = [0, 1, 3, 6, 10, 20]
age_bin_labels = ['0-1 (Puppy)', '2-3 (Young)', '4-6 (Adult)', '7-10 (Senior)', '10+ (Elderly)']
age_groups = pd.cut(
    df['ApproxDogAge'],
    bins=age_bin_edges,
    labels=age_bin_labels,
    include_lowest=True,
)
# Clean encoding for Excel display: swap any en dash for a plain hyphen
df['DogAgeGroup'] = age_groups.str.replace('–', '-', regex=False)


### Step 4: Identify and Handle Missing Values

We checked for missing data across all columns to understand where information was incomplete.

Key findings:
- `AnimalName` had 1,709 missing entries — we decided to keep these since a name doesn't affect license validity.
- `AnimalGender`, `AnimalBirthYear`, and `LicenseExpiredDate` had fewer missing values but were important for analysis.

To maintain data quality:
- We dropped rows where **any of these 3 key columns** were missing:  
  `AnimalBirthYear`, `LicenseExpiredDate`, `AnimalGender`.


In [61]:
# Count missing values in each column
df.isnull().sum()

AnimalName            1709
AnimalGender            21
AnimalBirthYear         30
BreedName                0
ZipCode                  0
LicenseIssuedDate        0
LicenseExpiredDate      79
Extract Year             0
LicenseIssuedYear        0
ApproxDogAge            30
DogAgeGroup            423
dtype: int64

In [62]:
# Number of rows missing at least one of the three key fields
key_cols = ['AnimalBirthYear', 'LicenseExpiredDate', 'AnimalGender']
has_missing_key = df[key_cols].isnull().any(axis=1)
df[has_missing_key].shape[0]


130

The next cell removes these 130 rows, so the data remains reliable for further analysis.


In [63]:
# Keep only rows where all three key fields are present;
# any row missing at least one of them is dropped.
required_cols = ['AnimalBirthYear', 'LicenseExpiredDate', 'AnimalGender']
df = df.dropna(subset=required_cols, how='any')


### Step 5: Clean Invalid ZIP Codes

To ensure accuracy in geographic analysis, we filtered the ZIP codes so that only valid entries remain.

- A valid NYC ZIP code must be **5 digits** long.
- Some rows had invalid or missing ZIP codes (like `0`, `100`, or empty), which we removed.


In [64]:
# Keep only rows whose ZIP code text is exactly five characters long
zip_length = df['ZipCode'].str.len()
df = df[zip_length == 5]


### Step 6: Validate Extract Year

We found rows where the `Extract Year` didn't make logical sense. For example, if a dog's license expired in 2021 but the extract year is 2022, the record appears in an extract taken after the license had already expired — which should not happen for an active license.

✅ Our fix:
- We kept only rows whose extract year falls between the license issue year and the license expiration year (inclusive)


In [65]:
# Pull the calendar year out of the issue and expiry dates
issue_year = df['LicenseIssuedDate'].dt.year
expire_year = df['LicenseExpiredDate'].dt.year
df['IssueYear'] = issue_year
df['ExpireYear'] = expire_year

# Retain rows whose extract year lies within [issue year, expire year]
extract_year = df['Extract Year']
within_license_window = (issue_year <= extract_year) & (extract_year <= expire_year)
df = df[within_license_window]


In [66]:
# 'LicenseIssuedYear' holds the same values as 'IssueYear', so remove it
df = df.drop(columns='LicenseIssuedYear')

In [68]:
# Write the cleaned data without the index column; 'utf-8-sig' adds a BOM
# to the start of the file
output_path = 'Cleaned_NYC_Dog_License.csv'
df.to_csv(output_path, index=False, encoding='utf-8-sig')
