In [1]:
import numpy as np
import pandas as pd
import datetime
import matplotlib.pyplot as plt
%matplotlib inline
import IPython.core.pylabtools as pylabtools
import scipy.stats

import seaborn as sns

def mask_data(data, *args):
    """Return the entries of ``data`` for which every mask in ``args`` is true.

    Parameters
    ----------
    data : indexable (e.g. pandas DataFrame/Series or numpy array)
        Object that supports boolean-array indexing along its first axis.
    *args : array-like of bool
        Any number of masks, all of the same length. They are combined with
        a logical AND.

    Returns
    -------
    The result of ``data[combined_mask]``. When no mask is supplied, ``data``
    is returned unchanged.
    """
    # Without masks the old reduction collapsed to a scalar ``True``, and
    # ``data[True]`` is a KeyError for a DataFrame (or adds an axis for an
    # ndarray). "No condition" is treated as "keep everything".
    if not args:
        return data
    mask_and = np.asarray(args).all(axis=0)
    return data[mask_and]

# [Need only run the first time]


# Load Data from a csv version of the original data and convert the intervention date and time information to pandas format and then save

**This is time consuming, so saving the result once and for all is useful.**

**I could drop the original variables, but I want the stored data to stay compatible with others' code, so I keep them.**

In [None]:
# Read the original data, stored as a CSV file
original_data_file = "hackdata.csv"
data = pd.read_csv(original_data_file)

# Keep only the rows that carry the date of the call. A boolean filter is used
# instead of an in-place drop so that the cell does not mutate a frame that
# other cells may already hold a reference to.
data = data[data.FIPDATEINTERVENTIONYYYY.notnull()].copy()

# convert the date to pandas datetime format for easy manipulation
data['date'] = pd.to_datetime(data.FIPDATEINTERVENTION)

# Create a variable for the day of the week. It is present in the original data
# but is recomputed here with pandas to avoid any mistake the original data may
# contain: everything needed is completely contained in 'FIPDATEINTERVENTION'
# and 'FIPHEUREALARME'.
# The vectorised `.dt` accessor replaces `map(...)` + `pd.datetime`: `map`
# yields a lazy iterator on Python 3 and `pd.datetime` no longer exists in
# recent pandas. `weekday` follows the datetime convention (Monday=0).
data['day'] = data['date'].dt.weekday

# convert the time of call to pandas datetime format
times = pd.to_datetime(data.FIPHEUREALARME)

# convert the minutes part to fractional hours and record time in hours
data['time'] = times.dt.hour + times.dt.minute / 60.0

# Create a new variable for the month; again present in the original data but
# recomputed from the date.
data['month'] = data['date'].dt.month

# store the data
data.to_csv("processed_data.csv", index=False)

# If not running for the first time, this is the place to start

In [2]:
# Reload the cached, pre-processed table written by the first-run cell
data = pd.read_csv("processed_data.csv")

# `date` does not come back from the CSV as a datetime column, so parse it again
data['date'] = pd.to_datetime(data['date'])

  interactivity=interactivity, compiler=compiler, result=result)


# Exploration

## Check if calls are uniformly spread over the week 

We perform a chi square test for goodness of fit after plotting the data

In [None]:
# Number of calls per day of the week, as a two-column frame (day, total)
nd = data.groupby(['day']).size().reset_index(name='total')


In [None]:
# Scatter of call counts against day of week, drawn on the current axes
ax = plt.gca()
ax.scatter(x="day", y="total", data=nd)
ax.set_xlabel("day of week")
ax.set_ylabel("number of calls")

In [None]:
# Goodness-of-fit test; with no expected frequencies given, scipy tests the
# observed per-weekday counts against equal frequencies.
scipy.stats.chisquare(nd['total'])

Since the p-value is essentially zero, we can **reject the null hypothesis** that calls are spread uniformly over the week. From the graph **it seems weekends get fewer calls.** **Friday also seems to stand out as getting an excess of calls.** It would be useful to check the time of day to see whether those extra calls come from after-office partying.

## What about number of calls per day over the entire period?

In [None]:
# Number of calls per calendar date, as a two-column frame (date, total)
nd = data.groupby(['date']).size().reset_index(name='total')

In [None]:
# Integer number of days elapsed since the first date in `nd`. An explicit day
# offset is used (rather than the row position) because some dates are missing.
# `.dt.days` replaces `map(lambda x: x.days, ...)`, which is a lazy iterator on
# Python 3 and cannot be assigned to a column.
nd['int_date'] = (nd['date'] - nd['date'].iloc[0]).dt.days

In [None]:
# Daily call volume over the whole period; axis labels added so that the
# figure can be read on its own.
plt.plot(nd.int_date, nd.total)
plt.xlabel("days since first recorded date")
plt.ylabel("number of calls")

In [None]:
# Goodness-of-fit test of the per-date call counts against equal frequencies.
# NOTE(review): dates absent from `nd` are not counted as zero-call days here.
scipy.stats.chisquare(nd['total'])

**We can reject the null hypothesis that the number of calls per day has been constant over the duration.**

## See the effect of time of day

In [None]:
# Distribution of calls over the time of day (`time` is in fractional hours)
data['time'].hist(bins=50)

## See the effect of trauma status and time of day

In [None]:
# `time_bin` is not created anywhere in the visible notebook, so on a fresh
# "Restart & Run All" this cell raised a KeyError. Fall back to whole-hour bins
# derived from `time` when the column is absent.
# NOTE(review): the binning originally used for `time_bin` is not visible here;
# confirm that hourly bins match the intended analysis.
if 'time_bin' not in data.columns:
    data['time_bin'] = np.floor(data['time'])

# Number of calls for every (trauma status, time bin) combination
nd = pd.DataFrame({'total': data.groupby(['FIPTRAUMA', 'time_bin']).size()}).reset_index()
nd.head()

In [None]:
# Overlay the per-bin call counts of the two trauma statuses on the same axes
trauma_2 = mask_data(nd, nd.FIPTRAUMA == 2)
trauma_1 = mask_data(nd, nd.FIPTRAUMA == 1)
sns.barplot(x='time_bin', y='total', data=trauma_2, color='blue', label="trauma_status=2")
sns.barplot(x='time_bin', y='total', data=trauma_1, color='green', label="trauma_status=1")
plt.ylabel('')
plt.xlabel('time')
plt.legend()

In [None]:
# Number of calls for every (trauma status, day of week) combination
nd = data.groupby(['FIPTRAUMA', 'day']).size().reset_index(name='total')
nd.head()

In [None]:
# Overlay the per-weekday call counts of the two trauma statuses on the same axes
trauma_2 = mask_data(nd, nd.FIPTRAUMA == 2)
trauma_1 = mask_data(nd, nd.FIPTRAUMA == 1)
sns.barplot(x='day', y='total', data=trauma_2, color='blue', label="trauma_status=2")
sns.barplot(x='day', y='total', data=trauma_1, color='green', label="trauma_status=1")
plt.ylabel('')
plt.xlabel('day')
plt.legend()

## See the effect of region and  time of day

In [None]:
# Replace missing locality codes with 6621.
# NOTE(review): the meaning of code 6621 is not visible in this notebook.
data['NOLOCALITEPRISE'] = data['NOLOCALITEPRISE'].fillna(6621)

In [None]:
# Distinct locality codes present in the data
regions = data['NOLOCALITEPRISE'].unique()

In [None]:
# Time-of-day distribution of the calls whose locality code is 6621
in_locality = data.NOLOCALITEPRISE == 6621
mask_data(data, in_locality)['time'].hist()

In [None]:
# Time-of-day distribution of the calls whose locality code is 6643
in_locality = data.NOLOCALITEPRISE == 6643
mask_data(data, in_locality)['time'].hist()

In [None]:
# Time-of-day histogram for each distinct SECT_GEO_PEC value, one per grid panel
sect_geo_pec = data.SECT_GEO_PEC.unique()

pylabtools.figsize(15, 9)
number_columns = 3
number_rows = 2
# Only as many values as there are panels are plotted. Slicing replaces the
# previous bare `except: break`, which stopped at the end of the array but also
# silently hid any genuine plotting error.
for i, cond in enumerate(sect_geo_pec[:number_columns * number_rows]):
    # divmod gives an integer (row, column); plain `/` yields a float row on
    # Python 3, which subplot2grid cannot use as a grid position.
    plt.subplot2grid((number_rows, number_columns), divmod(i, number_columns))
    mask_data(data, data.SECT_GEO_PEC == cond).time.hist()
    # The title identifies the panel; the former plt.legend() call is dropped
    # because the histogram is drawn without a label.
    plt.title(cond)

In [None]:

# Time-of-day histogram for each distinct SECT_GEO_2_PEC value, one per grid panel
sect_geo_pec_2 = data.SECT_GEO_2_PEC.unique()

pylabtools.figsize(15, 9)
number_columns = 3
number_rows = 2
# Only as many values as there are panels are plotted. Slicing replaces the
# previous bare `except: break`, which stopped at the end of the array but also
# silently hid any genuine plotting error.
for i, cond in enumerate(sect_geo_pec_2[:number_columns * number_rows]):
    # divmod gives an integer (row, column); plain `/` yields a float row on
    # Python 3, which subplot2grid cannot use as a grid position.
    plt.subplot2grid((number_rows, number_columns), divmod(i, number_columns))
    mask_data(data, data.SECT_GEO_2_PEC == cond).time.hist()
    # Title with the value actually plotted. The earlier code titled the panel
    # with `sect_geo_pec[i]`, i.e. a value of the *other* column. The former
    # plt.legend() call is dropped because the histogram is drawn without a label.
    plt.title(cond)

## Priority and time

In [None]:


# Time-of-day histogram for each priority level 1..3, side by side
pylabtools.figsize(15, 5)
number_columns = 3
number_rows = 1
for i in range(number_columns * number_rows):
    priority = i + 1
    # divmod gives an integer (row, column); plain `/` yields a float row on
    # Python 3, which subplot2grid cannot use as a grid position.
    plt.subplot2grid((number_rows, number_columns), divmod(i, number_columns))
    mask_data(data, data.NOPRIORITE == priority).time.hist()
    # The title identifies the panel; the former plt.legend() call is dropped
    # because the histogram is drawn without a label.
    plt.title("Priority = %d" % priority)

This gives some indication that non-urgent calls peak at around 10 o'clock, unlike the other priorities.

In [None]:

# Impute missing departure delays with the column mean. Only the
# FIPMINDELAIDEPART cells are written: the previous row-wise assignment
# (`data[mask] = mean`) replaced *every* column of the affected rows
# (date, time, priority, ...) with that mean.
missing_delay = data.FIPMINDELAIDEPART.isnull()
data.loc[missing_delay, 'FIPMINDELAIDEPART'] = data.FIPMINDELAIDEPART.mean()

# Departure-delay histogram for each priority level 1..3, side by side
pylabtools.figsize(15, 5)
number_columns = 3
number_rows = 1
for i in range(number_columns * number_rows):
    priority = i + 1
    # divmod gives an integer (row, column); plain `/` yields a float row on
    # Python 3, which subplot2grid cannot use as a grid position.
    plt.subplot2grid((number_rows, number_columns), divmod(i, number_columns))
    mask_data(data, data.NOPRIORITE == priority).FIPMINDELAIDEPART.hist()
    # The title identifies the panel; the former plt.legend() call is dropped
    # because the histogram is drawn without a label.
    plt.title("Priority = %d" % priority)

In [None]:
# Largest departure delay for each priority 1..3, divided by 60
# (presumably minutes -> hours; confirm the unit of FIPMINDELAIDEPART)
max_delay = []
for priority in range(1, 4):
    delays = mask_data(data, data.NOPRIORITE == priority).FIPMINDELAIDEPART
    max_delay.append(delays.max() / 60.0)
max_delay

### Sometimes they leave in 1 day??

## Score by gravity

In [None]:
# Impute missing NONACA scores with the column mean. Only the NONACA cells are
# written: the previous row-wise assignment (`data[mask] = mean`) replaced
# *every* column of the affected rows with that mean.
missing_score = data.NONACA.isnull()
data.loc[missing_score, 'NONACA'] = data.NONACA.mean()
# Truncate to integers so the scores can be compared with the panel index.
# `astype(int)` replaces `map(int, ...)`, which is a lazy iterator on Python 3.
data['NONACA'] = data['NONACA'].astype(int)

# Time-of-day histogram for each NONACA value 0..11, one per grid panel
pylabtools.figsize(15, 10)
number_columns = 3
number_rows = 4
for i in range(number_columns * number_rows):
    # divmod gives an integer (row, column); plain `/` yields a float row on
    # Python 3, which subplot2grid cannot use as a grid position.
    plt.subplot2grid((number_rows, number_columns), divmod(i, number_columns))
    mask_data(data, data.NONACA == i).time.hist()
    # The panels are split by NONACA, not by priority, so label them as such.
    # The former plt.legend() call is dropped because the histogram is drawn
    # without a label.
    plt.title("NONACA = %d" % i)

In [None]:
# Distinct NONACA values present in the data
data['NONACA'].unique()