### Un-pickle the data 

In [1]:
import pickle

# NOTE: pickle.load can execute arbitrary code while deserialising, so only
# unpickle files that come from a trusted source.
with open('homeless_data.pkl', 'rb') as f:
    # The loaded object is used as a pandas DataFrame by the cells below
    # (.head(), column selection, .groupby()).
    data = pickle.load(f)

### Data exploration 

In [2]:
# Preview the first five rows to see the columns and a sample of values.
data.head(n=5)

Unnamed: 0,region,state,individuals,family_members,state_pop
0,East South Central,Alabama,2570.0,864.0,4887681
1,Pacific,Alaska,1434.0,582.0,735139
2,Mountain,Arizona,7259.0,2606.0,7158024
3,West South Central,Arkansas,2280.0,432.0,3009733
4,Pacific,California,109008.0,20964.0,39461588


### Data Summarization 

In [3]:
# Mean of the state_pop column, rounded to two decimal places for display.
round(data.loc[:, 'state_pop'].mean(), 2)

6405637.27

In [4]:
# Median of the state_pop column.
data.loc[:, 'state_pop'].median()

4461153.0

In [5]:
# Smallest value in the state_pop column.
data.loc[:, 'state_pop'].min()

577601

In [6]:
# Largest value in the state_pop column.
data.loc[:, 'state_pop'].max()

39461588

In [7]:
# Select every row whose state_pop equals the column maximum
# (ties, if any, are all returned).
data.loc[data['state_pop'].eq(data['state_pop'].max())]

Unnamed: 0,region,state,individuals,family_members,state_pop
4,Pacific,California,109008.0,20964.0,39461588


### .agg() method 

In [8]:
def pct90(column):
    """Return the 90th percentile of ``column``.

    ``column`` must expose a ``quantile`` method (for example a pandas
    Series); the value of ``column.quantile(0.9)`` is returned unchanged.
    Written as a named function so it can be handed to ``.agg()``.
    """
    ninetieth = column.quantile(q=0.9)
    return ninetieth

data['state_pop'].agg(pct90)  # 90th percentile of the state_pop column, shown as the cell output

12723071.0

In [9]:
def pct95(column):
    """Return the 95th percentile of ``column``.

    ``column`` must expose a ``quantile`` method (for example a pandas
    Series); the value of ``column.quantile(0.95)`` is returned unchanged.
    Written as a named function so it can be handed to ``.agg()``.
    """
    ninety_fifth = column.quantile(q=0.95)
    return ninety_fifth

data['state_pop'].agg([pct90,pct95])  # apply both helpers at once; the result is labelled by function name (pct90, pct95)

pct90    12723071.0
pct95    20387334.0
Name: state_pop, dtype: float64

## Walmart dataset 

In [10]:
import pickle

# NOTE: pickle.load can execute arbitrary code while deserialising, so only
# unpickle files that come from a trusted source.
with open('walmart_sales.pkl', 'rb') as f:
    # The loaded object is used as a pandas DataFrame by the cells below
    # (.head(), .drop_duplicates(), column selection).
    data1 = pickle.load(f)

In [11]:
# Preview the first five rows to see the columns and a sample of values.
data1.head(n=5)

Unnamed: 0,store,type,department,date,weekly_sales,is_holiday,temperature_c,fuel_price_usd_per_l,unemployment
0,1,A,1,2010-02-05,24924.5,False,5.727778,0.679451,8.106
1,1,A,2,2010-02-05,50605.27,False,5.727778,0.679451,8.106
2,1,A,3,2010-02-05,13740.12,False,5.727778,0.679451,8.106
3,1,A,4,2010-02-05,39954.04,False,5.727778,0.679451,8.106
4,1,A,5,2010-02-05,32229.38,False,5.727778,0.679451,8.106


### Dropping duplicates on a single column (`date`)

In [12]:
# Keep only the first row encountered for each distinct value of `date`.
data1.drop_duplicates(subset=['date'])

Unnamed: 0,store,type,department,date,weekly_sales,is_holiday,temperature_c,fuel_price_usd_per_l,unemployment
0,1,A,1,2010-02-05,24924.50,False,5.727778,0.679451,8.106
73,1,A,1,2010-02-12,46039.49,True,3.616667,0.673111,8.106
145,1,A,1,2010-02-19,41595.55,False,4.405556,0.664129,8.106
218,1,A,1,2010-02-26,19403.54,False,8.127778,0.676545,8.106
290,1,A,1,2010-03-05,21827.90,False,8.055556,0.693452,8.106
...,...,...,...,...,...,...,...,...,...
9883,1,A,1,2012-09-28,18947.81,False,24.488889,0.968455,6.908
9956,1,A,1,2012-10-05,21904.47,False,20.305556,0.955511,6.573
10028,1,A,1,2012-10-12,22764.01,False,17.216667,0.951284,6.573
10101,1,A,1,2012-10-19,24185.27,False,19.983333,0.949435,6.573


### Dropping duplicate pairs (`date`, `department`)

In [13]:
# A row counts as a duplicate only when BOTH `date` and `department` repeat;
# the first occurrence of each pair is kept (pandas' default, spelled out).
data1.drop_duplicates(subset=['date', 'department'], keep='first')

Unnamed: 0,store,type,department,date,weekly_sales,is_holiday,temperature_c,fuel_price_usd_per_l,unemployment
0,1,A,1,2010-02-05,24924.50,False,5.727778,0.679451,8.106
1,1,A,2,2010-02-05,50605.27,False,5.727778,0.679451,8.106
2,1,A,3,2010-02-05,13740.12,False,5.727778,0.679451,8.106
3,1,A,4,2010-02-05,39954.04,False,5.727778,0.679451,8.106
4,1,A,5,2010-02-05,32229.38,False,5.727778,0.679451,8.106
...,...,...,...,...,...,...,...,...,...
339803,35,B,78,2012-09-07,3.00,True,24.444444,1.033177,8.839
361905,39,A,99,2010-04-23,0.01,False,19.800000,0.738361,8.464
385146,41,A,78,2011-03-25,0.00,False,5.061111,0.901884,7.241
389539,41,A,77,2012-06-01,8.44,False,14.050000,0.994344,6.547


### Counting

In [14]:
# Number of rows for each value of `type`, most frequent first.
data1['type'].value_counts(sort=True, ascending=False)

A    215478
B    155044
C     42597
Name: type, dtype: int64

In [15]:
# Number of rows for each value of `region`, most frequent first.
data['region'].value_counts(sort=True, ascending=False)

South Atlantic        9
Mountain              8
West North Central    7
New England           6
Pacific               5
East North Central    5
West South Central    4
East South Central    4
Mid-Atlantic          3
Name: region, dtype: int64

### Calculating proportion of total 

In [16]:
# Share of rows for each value of `type` (fractions that sum to 1),
# largest share first.
data1['type'].value_counts(normalize=True, sort=True)

A    0.521588
B    0.375301
C    0.103111
Name: type, dtype: float64

## Grouped Summary Stats 

In [17]:
# Mean state_pop over the rows whose region is 'South Atlantic'
# (row mask and column label selected in a single .loc call).
data.loc[data['region'] == 'South Atlantic', 'state_pop'].mean()

7247736.0

In [18]:
# Mean state_pop over the rows whose region is 'Mountain'
# (row mask and column label selected in a single .loc call).
data.loc[data['region'] == 'Mountain', 'state_pop'].mean()

3063968.125

In [19]:
# Mean state_pop over the rows whose region is 'New England'
# (row mask and column label selected in a single .loc call).
data.loc[data['region'] == 'New England', 'state_pop'].mean()

2471553.6666666665

### .groupby() 

In [20]:
# One pass over the frame replaces the per-region filters above:
# mean state_pop for every region, indexed by region.
data.groupby(by='region')['state_pop'].mean()

region
East North Central    9.377277e+06
East South Central    4.775371e+06
Mid-Atlantic          1.373910e+07
Mountain              3.063968e+06
New England           2.471554e+06
Pacific               1.066462e+07
South Atlantic        7.247736e+06
West North Central    3.050034e+06
West South Central    1.005958e+07
Name: state_pop, dtype: float64

In [21]:
# Smallest and largest state_pop within each region.
# The aggregations are named as strings: passing the Python builtins min/max
# to .agg() is deprecated in recent pandas releases (it emits a FutureWarning),
# while 'min'/'max' select the same reductions and yield the same column
# labels.
data.groupby('region')['state_pop'].agg(['min', 'max'])

Unnamed: 0_level_0,min,max
region,Unnamed: 1_level_1,Unnamed: 2_level_1
East North Central,5807406,12723071
East South Central,2981020,6771631
Mid-Atlantic,8886025,19530351
Mountain,577601,7158024
New England,624358,6882635
Pacific,735139,39461588
South Atlantic,701547,21244317
West North Central,758080,6121623
West South Central,3009733,28628666


### Grouping by multiple variables 

In [22]:
# Group on two keys; the result is indexed by (region, state).
data.groupby(by=['region', 'state'])['state_pop'].mean()

region              state               
East North Central  Illinois                12723071
                    Indiana                  6695497
                    Michigan                 9984072
                    Ohio                    11676341
                    Wisconsin                5807406
East South Central  Alabama                  4887681
                    Kentucky                 4461153
                    Mississippi              2981020
                    Tennessee                6771631
Mid-Atlantic        New Jersey               8886025
                    New York                19530351
                    Pennsylvania            12800922
Mountain            Arizona                  7158024
                    Colorado                 5691287
                    Idaho                    1750536
                    Montana                  1060665
                    Nevada                   3027341
                    New Mexico               2092741
     