### 1.4: Some statistics with pandas

In [40]:
# import modules
import pandas as pd
import numpy as np
import scipy.stats as stats
import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline

#### 1. Load in the SF crimes dataset:

In [2]:
# Read the SF crime CSV into a DataFrame. No index column is specified, so the
# file's unlabeled first column is loaded as a regular column ('Unnamed: 0').
crime = pd.read_csv(filepath_or_buffer='../../assets/datasets/sf_crime.csv')

#### 2. Print the head

In [9]:
# Preview the first five rows (the default for head) to check the columns.
crime.head(n=5)

Unnamed: 0,Dates,Category,Descript,DayOfWeek,PdDistrict,Resolution,Address,X,Y
860542,2003-03-23 23:27:00,ARSON,ARSON OF A VEHICLE,Sunday,BAYVIEW,NONE,0 Block of HUNTERS PT EXPWY EX,-122.376945,37.733018
648096,2006-03-07 06:45:00,LARCENY/THEFT,PETTY THEFT FROM LOCKED AUTO,Tuesday,NORTHERN,NONE,0 Block of MARINA BL,-122.432952,37.805052
790167,2004-03-06 03:00:00,NON-CRIMINAL,LOST PROPERTY,Saturday,SOUTHERN,NONE,800 Block of BRYANT ST,-122.403405,37.775421
254945,2011-12-03 12:10:00,BURGLARY,"BURGLARY OF STORE, UNLAWFUL ENTRY",Saturday,TARAVAL,"ARREST, BOOKED",3200 Block of 20TH AV,-122.475647,37.728528
876275,2003-01-10 00:15:00,LARCENY/THEFT,PETTY THEFT OF PROPERTY,Friday,NORTHERN,NONE,POLK ST / BROADWAY ST,-122.421772,37.795946


#### 3. Print the unique categories and how many there are

In [12]:
# np.unique returns the distinct category labels in sorted order;
# the second line reports how many distinct labels there are.
print(np.unique(crime['Category'].values))
print(np.unique(crime['Category'].values).size)


['ARSON' 'ASSAULT' 'BAD CHECKS' 'BRIBERY' 'BURGLARY' 'DISORDERLY CONDUCT'
 'DRIVING UNDER THE INFLUENCE' 'DRUG/NARCOTIC' 'DRUNKENNESS' 'EMBEZZLEMENT'
 'EXTORTION' 'FAMILY OFFENSES' 'FORGERY/COUNTERFEITING' 'FRAUD' 'GAMBLING'
 'KIDNAPPING' 'LARCENY/THEFT' 'LIQUOR LAWS' 'LOITERING' 'MISSING PERSON'
 'NON-CRIMINAL' 'OTHER OFFENSES' 'PROSTITUTION' 'RECOVERED VEHICLE'
 'ROBBERY' 'RUNAWAY' 'SECONDARY CODES' 'SEX OFFENSES FORCIBLE'
 'SEX OFFENSES NON FORCIBLE' 'STOLEN PROPERTY' 'SUICIDE' 'SUSPICIOUS OCC'
 'TREA' 'TRESPASS' 'VANDALISM' 'VEHICLE THEFT' 'WARRANTS' 'WEAPON LAWS']
38


#### 4. Print the unique districts and how many there are

In [13]:
# np.unique returns the distinct police-district labels in sorted order;
# the second line reports how many distinct labels there are.
print(np.unique(crime['PdDistrict'].values))
print(np.unique(crime['PdDistrict'].values).size)

['BAYVIEW' 'CENTRAL' 'INGLESIDE' 'MISSION' 'NORTHERN' 'PARK' 'RICHMOND'
 'SOUTHERN' 'TARAVAL' 'TENDERLOIN']
10


#### 5. Make a DataFrame of the crime categories and the number of crimes per category

In [27]:
# Number of crimes per category, one row per category.
# A single groupby pass replaces re-filtering the whole frame once per
# category; groupby sorts the labels, so rows stay in the same alphabetical
# order that np.unique produced. Rows with a missing Category are skipped.
categories = (
    crime.groupby('Category')
    .size()
    .reset_index(name='crimes')
    .rename(columns={'Category': 'crime_category'})
)
# Bare last expression so the notebook renders the frame as a table.
categories.head()

  crime_category  crimes
0          ARSON      52
1        ASSAULT    2164
2     BAD CHECKS      14
3        BRIBERY       8
4       BURGLARY    1023


#### 6. Make a DataFrame of the districts and crime counts per district

In [28]:
# Number of crimes per police district, one row per district.
# A single groupby pass replaces re-filtering the whole frame once per
# district; groupby sorts the labels, so rows stay in the same alphabetical
# order that np.unique produced. Rows with a missing PdDistrict are skipped.
districts = (
    crime.groupby('PdDistrict')
    .size()
    .reset_index(name='crimes')
    .rename(columns={'PdDistrict': 'district'})
)
# Bare last expression so the notebook renders the frame as a table.
districts.head()

   crimes   district
0    2555    BAYVIEW
1    2424    CENTRAL
2    2256  INGLESIDE
3    3416    MISSION
4    3076   NORTHERN


#### 7. Make a DataFrame of the day of week and crime counts per day

In [30]:
# Number of crimes per day of the week, one row per day.
# A single groupby pass replaces re-filtering the whole frame once per day;
# groupby sorts the labels, so rows stay in the same alphabetical (not
# calendar) order that np.unique produced. Rows with a missing DayOfWeek
# are skipped.
days = (
    crime.groupby('DayOfWeek')
    .size()
    .reset_index(name='crimes')
    .rename(columns={'DayOfWeek': 'day_of_week'})
)
# Bare last expression so the notebook renders the frame as a table.
days.head()

   crimes day_of_week
0    3883      Friday
1    3524      Monday
2    3496    Saturday
3    3313      Sunday
4    3579    Thursday


#### 8. Describe one of the datasets

In [31]:
# Summary statistics for the per-day counts; the quartiles requested here are
# describe's defaults, spelled out explicitly.
days.describe(percentiles=[0.25, 0.5, 0.75])

Unnamed: 0,crimes
count,7.0
mean,3571.428571
std,173.074991
min,3313.0
25%,3510.0
50%,3548.0
75%,3618.0
max,3883.0


#### 9. Calculate the mean difference between the experimental and control groups defined below

In [38]:
# Control-group observations for the exercises below, kept as a plain Python
# list of floats (wrap in np.array / pd.Series before doing vectorized math).
control = [4.8,17.3,7.0,10.8,9.4,2.6,9.0,9.6,12.7,8.5,15.8,11.0,9.7,13.1,6.3,4.4,7.8,
           9.9,7.8,7.7,12.8,13.2,6.1,7.1,6.4,11.7,11.8,6.3,14.4,9.8,14.9,11.4,9.1,10.4,
           13.2,10.0,13.5,10.1,15.0,8.8,5.3,15.1,7.6,9.9,4.9,9.2,12.3,12.1,6.9,8.3,7.0,
           6.1,13.4,11.5,6.0,12.3,5.6,14.7,3.7,7.6,10.9,10.3,10.3,10.2,7.1,6.3,13.2,9.9,
           10.6,9.8,4.9,16.9,9.1,6.7,12.2,8.2,10.1,15.5,11.9,9.7,7.9,13.4,10.6,7.9,13.1,
           11.9,11.6,2.9,13.9,14.4,8.6,7.6,8.3,12.0,5.2,7.1,13.6,6.5,9.9,14.8]

In [39]:
# Experimental-group observations for the exercises below, kept as a plain
# Python list of floats (wrap in np.array / pd.Series before doing vectorized math).
experimental = [15.4,14.7,14.0,25.3,15.1,14.6,0.0,20.5,16.9,18.9,20.1,16.5,13.1,16.4,
                18.3,21.3,6.3,9.0,12.1,7.4,25.6,10.8,11.4,12.1,22.7,20.1,18.3,9.4,11.0,
                14.8,12.5,12.3,16.4,13.3,14.1,15.7,12.2,15.2,16.9,1.5,18.6,4.2,12.2,8.2,
                15.3,9.3,12.0,22.8,19.7,10.7,17.8,15.4,14.5,22.2,16.7,19.3,17.1,17.4,5.4,
                10.9,13.7,17.1,11.2,15.0,18.2,14.2,11.0,19.6,10.9,13.9,17.3,11.1,19.9,
                11.2,25.1,23.7,16.5,17.9,20.1,15.8,26.4,17.7,18.9,5.6,25.6,10.4,15.5,17.7,
                22.9,18.9,21.2,5.5,13.2,13.4,21.9,11.1,13.7,16.9,16.4,18.8 ]

#### 10. Calculate the t-statistic for the difference between means by hand (if you forgot it, look it up online or in the notes!)

The standard deviation component is the standard deviation across groups.

#### 11. Plot the control and experimental distributions using seaborn on the same plot (with different colors for each)

#### 12. [BONUS] Write a function to bootstrap the 99% confidence interval for the t-statistic.

Look up: np.percentile!