## Debugging Clean_Data DateTime

In [1]:
import pandas as pd

df = pd.read_csv('../raw_data/data.csv', low_memory = False)

In [2]:
df.shape

(6983207, 22)

In [3]:
def drop_nan(df):
    '''
    Returns a dataframe without NaN
    '''
    df = df.copy()
    #drop precinct missing values
    not_unknown = df['precinct_number'] != -99.0
    not_nan = df['precinct_number'] == df['precinct_number']
    df = df[not_unknown & not_nan]
    #drop time and date missing values
    df = df[df['time'] == df['time']]
    df = df[df['date'] == df['date']]
    #drop offense_type and crime_completed
    df = df[df['offense_type'] == df['offense_type']]
    df = df[df['crime_completed'] == df['crime_completed']]
    return df

In [4]:
df = drop_nan(df)

In [5]:
from datetime import datetime

def to_date_format(df):
    '''
    Returns dataframe with 'date' as datetime dtype
    Filters dataframe to show only complaints dated 2007 onwards
    '''
    df = df.copy()
    df['date'] = df['date'].apply(lambda x: \
                                datetime.strptime(x, '%m/%d/%Y'))
    print(type(df['date'][345_687]))
    df = df[df['date'] > datetime(2006, 12, 31, 0, 0)]
    print(type(df['date'][345_687]))
    df['date'] = df['date'].apply(lambda x: x.date())
    print(type(df['date'][345_687]))
    return df

In [7]:
df = to_date_format(df)

<class 'datetime.datetime'>
<class 'datetime.datetime'>
<class 'datetime.date'>


In [8]:
type(df['date'][345_687])

datetime.date

In [9]:
df['date'][345_687]

datetime.date(2008, 10, 17)

In [10]:
df['date'].apply(type).value_counts()

<class 'datetime.date'>    6416137
Name: date, dtype: int64

*In jupyter 'Pickle test', merged 'date' and 'time' into period column so easier to manipulate and exported as pickle (so datetime format is preserved: dtype is timestamp now!)*

## Data Viz & Modelling: Time Columns

### Load DataFrame + Imports

In [54]:
import pickle
from datetime import datetime

In [51]:
with open('../raw_data/clean.pickle', 'rb') as f:
    df = pickle.load(f)

In [52]:
df.head()

Unnamed: 0,complaint_id,precinct_number,offense_type,crime_completed,offense_level,premise_desc,premise,jurisdiction,park_name,suspect_age,...,suspect_sex,latitude,longitude,metro,victim_age,victim_race,victim_sex,period,borough,patrol_borough
0,325341655,73,OFFENSES AGAINST PUBLIC ADMINI,COMPLETED,MISDEMEANOR,INSIDE,RESIDENCE - PUBLIC HOUSING,N.Y. HOUSING POLICE,NOT PARK,25-44,...,M,40.664239,-73.908425,NOT SUBWAY,<18,BLACK,M,2015-02-11 15:00:00,BROOKLYN,PATROL BORO BKLYN NORTH
1,393816841,69,ASSAULT 3 & RELATED OFFENSES,COMPLETED,MISDEMEANOR,INSIDE,RESIDENCE-HOUSE,N.Y. POLICE DEPT,NOT PARK,UNKNOWN,...,UNKNOWN,40.64459,-73.892672,NOT SUBWAY,45-64,BLACK,F,2012-03-17 10:30:00,BROOKLYN,PATROL BORO BKLYN SOUTH
2,802896158,71,HARRASSMENT 2,COMPLETED,VIOLATION,INSIDE,PUBLIC SCHOOL,N.Y. POLICE DEPT,NOT PARK,<18,...,M,40.658758,-73.942435,NOT SUBWAY,18-24,BLACK,M,2016-10-27 13:48:00,BROOKLYN,PATROL BORO BKLYN SOUTH
3,633812343,112,RAPE,COMPLETED,FELONY,INSIDE,RESIDENCE - APT. HOUSE,N.Y. POLICE DEPT,NOT PARK,25-44,...,M,40.722364,-73.851474,NOT SUBWAY,25-44,WHITE,F,2014-11-27 19:00:00,QUEENS,PATROL BORO QUEENS NORTH
4,300349533,24,GRAND LARCENY,COMPLETED,FELONY,INSIDE,DOCTOR/DENTIST OFFICE,N.Y. POLICE DEPT,NOT PARK,UNKNOWN,...,UNKNOWN,40.793465,-73.96895,NOT SUBWAY,45-64,WHITE,F,2013-12-11 13:30:00,MANHATTAN,PATROL BORO MAN NORTH


In [53]:
type(df['period'][0])

datetime.datetime

### Generate New Columns

#### Year Column

In [55]:
df['year'] = df['period'].apply(lambda x: x.year)

In [56]:
df['month'] = df['period'].apply(lambda x: x.month)

In [57]:
df['day_of_week'] = df['period'].apply(lambda x: x.isoweekday()) # 1 is Monday, 7 is Sunday

In [63]:
df['hour'] = df['period'].apply(lambda x: x.hour)

In [66]:
new_date = df['period'][2].replace(minute = 0, second = 0)