In [87]:
import pandas as pd
import numpy as np

# Data Aggregation and Group Operations

## DATASET

In [88]:
apps = pd.read_csv('googleplaystore.csv')
apps.head()

Unnamed: 0,App,Category,Rating,Reviews,Size,Installs,Type,Price,Content Rating,Genres,Last Updated,Current Ver,Android Ver
0,Photo Editor & Candy Camera & Grid & ScrapBook,ART_AND_DESIGN,4.1,159,19M,"10,000+",Free,0,Everyone,Art & Design,"January 7, 2018",1.0.0,4.0.3 and up
1,Coloring book moana,ART_AND_DESIGN,3.9,967,14M,"500,000+",Free,0,Everyone,Art & Design;Pretend Play,"January 15, 2018",2.0.0,4.0.3 and up
2,"U Launcher Lite – FREE Live Cool Themes, Hide ...",ART_AND_DESIGN,4.7,87510,8.7M,"5,000,000+",Free,0,Everyone,Art & Design,"August 1, 2018",1.2.4,4.0.3 and up
3,Sketch - Draw & Paint,ART_AND_DESIGN,4.5,215644,25M,"50,000,000+",Free,0,Teen,Art & Design,"June 8, 2018",Varies with device,4.2 and up
4,Pixel Draw - Number Art Coloring Book,ART_AND_DESIGN,4.3,967,2.8M,"100,000+",Free,0,Everyone,Art & Design;Creativity,"June 20, 2018",1.1,4.4 and up


## GroupBy Mechanics

In [89]:
## We are asking for the rating mean **BY** Category
grouped = apps.groupby('Category') 
pd.DataFrame(grouped.Rating.mean())

Unnamed: 0_level_0,Rating
Category,Unnamed: 1_level_1
1.9,19.0
ART_AND_DESIGN,4.358065
AUTO_AND_VEHICLES,4.190411
BEAUTY,4.278571
BOOKS_AND_REFERENCE,4.346067
BUSINESS,4.121452
COMICS,4.155172
COMMUNICATION,4.158537
DATING,3.970769
EDUCATION,4.389032


In [90]:
## We are asking for the rating mean **BY** Category and LastUpdate
grouped = apps.groupby(['Category','Last Updated']) 
by2Filters = pd.DataFrame(grouped.Rating.mean())
by2Filters

Unnamed: 0_level_0,Unnamed: 1_level_0,Rating
Category,Last Updated,Unnamed: 2_level_1
1.9,1.0.19,19.000000
ART_AND_DESIGN,"April 15, 2018",3.800000
ART_AND_DESIGN,"April 18, 2018",5.000000
ART_AND_DESIGN,"April 2, 2018",4.350000
ART_AND_DESIGN,"April 20, 2018",4.000000
ART_AND_DESIGN,"April 25, 2018",4.800000
ART_AND_DESIGN,"April 26, 2018",3.800000
ART_AND_DESIGN,"April 27, 2018",4.700000
ART_AND_DESIGN,"August 1, 2018",4.700000
ART_AND_DESIGN,"August 2, 2018",4.700000


In [127]:
#by2Filters has a multiindex
by2Filters.index;

## Grouping using a dictionary

In [92]:
people = pd.DataFrame(np.random.randn(5, 5),
                      columns=['a', 'b', 'c', 'd', 'e'],
                      index=['Joe', 'Steve', 'Wes', 'Jim', 'Travis'])
people.iloc[2:3, [1, 2]] = np.nan # Add a few NA values
people

Unnamed: 0,a,b,c,d,e
Joe,-0.562252,0.078472,0.093803,-1.193015,-0.12591
Steve,-0.906517,-1.555484,-0.753413,1.371429,0.7902
Wes,-1.092129,,,-2.016845,-0.508921
Jim,1.474689,-0.071412,-0.840946,1.211097,-2.433369
Travis,-1.065591,0.519519,-0.2202,0.777803,1.13364


In [93]:
mapping = {'a': 'red', 'b': 'red', 'c': 'blue',
           'd': 'blue', 'e': 'red', 'f' : 'orange'}

In [94]:
by_column = people.groupby(mapping, axis=1)
by_column.sum()

Unnamed: 0,blue,red
Joe,-1.099212,-0.60969
Steve,0.618016,-1.671801
Wes,-2.016845,-1.60105
Jim,0.370151,-1.030092
Travis,0.557603,0.587568


## Grouping using a dictionary

In [95]:
map_series = pd.Series(mapping)
map_series
people.groupby(map_series, axis=1).count()

Unnamed: 0,blue,red
Joe,2,3
Steve,2,3
Wes,1,2
Jim,2,3
Travis,2,3


## Grouping by functions

In [135]:
apps.groupby(lambda x: 'Good App' if apps['Rating'].loc[x] > 3  else 'Bad App').ngroups

2

In [96]:
for dtype, group in apps.groupby(lambda x: 'Good App' if apps['Rating'].loc[x] > 3  else 'Bad App'):
    print(dtype)
    print(group)

Bad App
                                                     App             Category  \
23                                Mcqueen Coloring pages       ART_AND_DESIGN   
113                            Wrinkles and rejuvenation               BEAUTY   
123                               Manicure - nail design               BEAUTY   
126                         Skin Care and Natural Beauty               BEAUTY   
129                  Secrets of beauty, youth and health               BEAUTY   
130                   Recipes and tips for losing weight               BEAUTY   
134                        Lady adviser (beauty, health)               BEAUTY   
163                           Anonymous caller detection  BOOKS_AND_REFERENCE   
180                  SH-02J Owner's Manual (Android 8.0)  BOOKS_AND_REFERENCE   
185                       URBANO V 02 instruction manual  BOOKS_AND_REFERENCE   
227                                       Y! Mobile menu             BUSINESS   
321    【Ranobbe comp

In [97]:
# describe() performs a set of aggregation fuctions to all numeric columns
grouped = apps.groupby('Category') 
grouped.describe()

Unnamed: 0_level_0,Rating,Rating,Rating,Rating,Rating,Rating,Rating,Rating
Unnamed: 0_level_1,count,mean,std,min,25%,50%,75%,max
Category,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2
1.9,1.0,19.0,,19.0,19.0,19.0,19.0,19.0
ART_AND_DESIGN,62.0,4.358065,0.358297,3.2,4.1,4.4,4.7,5.0
AUTO_AND_VEHICLES,73.0,4.190411,0.543692,2.1,4.0,4.3,4.6,4.9
BEAUTY,42.0,4.278571,0.362603,3.1,4.0,4.3,4.575,4.9
BOOKS_AND_REFERENCE,178.0,4.346067,0.429046,2.7,4.1,4.5,4.6,5.0
BUSINESS,303.0,4.121452,0.624422,1.0,3.9,4.3,4.5,5.0
COMICS,58.0,4.155172,0.537758,2.8,3.825,4.4,4.5,5.0
COMMUNICATION,328.0,4.158537,0.426192,1.0,4.0,4.3,4.4,5.0
DATING,195.0,3.970769,0.63051,1.0,3.7,4.1,4.4,5.0
EDUCATION,155.0,4.389032,0.251894,3.5,4.2,4.4,4.6,4.9


# Time series

## Date and Time Data Types and Tools

In [98]:
from datetime import datetime
now = datetime.now()
display(now)
now.year, now.month, now.day

datetime.datetime(2019, 5, 16, 14, 44, 3, 579796)

(2019, 5, 16)

In [99]:
delta = datetime(2019, 6, 1) - datetime(2018, 6, 1)
delta

datetime.timedelta(days=365)

In [100]:
from datetime import timedelta
start = datetime(2019, 1, 7)
start + timedelta(12)


datetime.datetime(2019, 1, 19, 0, 0)

In [144]:
stamp = datetime(year=2011, month=1, day=3)
stamp

datetime.datetime(2011, 1, 3, 0, 0)

In [None]:
stamp = datetime(2011, 1, 3)
str(stamp)
stamp.strftime('%Y-%m-%d')

In [102]:
from dateutil.parser import parse
parse('2011-01-03')

datetime.datetime(2011, 1, 3, 0, 0)

In [103]:
parse('Jan 31, 1997 10:45 PM')

datetime.datetime(1997, 1, 31, 22, 45)

In [104]:
dates = [datetime(2011, 1, 2), datetime(2011, 1, 5),
         datetime(2011, 1, 7), datetime(2011, 1, 8),
         datetime(2011, 1, 10), datetime(2011, 1, 12)]
ts = pd.Series(np.random.randn(6), index=dates)
ts

2011-01-02    1.379887
2011-01-05   -1.497370
2011-01-07    0.136831
2011-01-08    0.131512
2011-01-10    1.011412
2011-01-12   -1.174474
dtype: float64

In [105]:
dates = pd.DatetimeIndex(['1/1/2000', '1/2/2000', '1/2/2000',
                          '1/2/2000', '1/3/2000'])
dup_ts = pd.Series(np.arange(5), index=dates)
dup_ts

2000-01-01    0
2000-01-02    1
2000-01-02    2
2000-01-02    3
2000-01-03    4
dtype: int32

In [106]:
dup_ts.index.is_unique

False

In [107]:
pd.date_range('2019-01-01', '2019-02-2')

DatetimeIndex(['2019-01-01', '2019-01-02', '2019-01-03', '2019-01-04',
               '2019-01-05', '2019-01-06', '2019-01-07', '2019-01-08',
               '2019-01-09', '2019-01-10', '2019-01-11', '2019-01-12',
               '2019-01-13', '2019-01-14', '2019-01-15', '2019-01-16',
               '2019-01-17', '2019-01-18', '2019-01-19', '2019-01-20',
               '2019-01-21', '2019-01-22', '2019-01-23', '2019-01-24',
               '2019-01-25', '2019-01-26', '2019-01-27', '2019-01-28',
               '2019-01-29', '2019-01-30', '2019-01-31', '2019-02-01',
               '2019-02-02'],
              dtype='datetime64[ns]', freq='D')

In [108]:
pd.date_range(start='2012-04-01', periods=2)

DatetimeIndex(['2012-04-01', '2012-04-02'], dtype='datetime64[ns]', freq='D')

In [109]:
pd.date_range('2000-01-01', '2000-01-02 23:59', freq='4h')

DatetimeIndex(['2000-01-01 00:00:00', '2000-01-01 04:00:00',
               '2000-01-01 08:00:00', '2000-01-01 12:00:00',
               '2000-01-01 16:00:00', '2000-01-01 20:00:00',
               '2000-01-02 00:00:00', '2000-01-02 04:00:00',
               '2000-01-02 08:00:00', '2000-01-02 12:00:00',
               '2000-01-02 16:00:00', '2000-01-02 20:00:00'],
              dtype='datetime64[ns]', freq='4H')

## Timestamp

Pandas replacement for datetime.datetime  
https://pandas.pydata.org/pandas-docs/stable/user_guide/timeseries.html

In [137]:
pd.Timestamp(2017, 1, 1, 12)

Timestamp('2017-01-01 12:00:00')

In [111]:
pd.Timestamp(year=2017, month=1, day=1, hour=12)

Timestamp('2017-01-01 12:00:00')

In [112]:
delta = pd.Timestamp(2019, 1, 1) - pd.Timestamp(2019, 1, 2)
display(delta.days)
delta

-1

Timedelta('-1 days +00:00:00')

In [113]:
dti = pd.to_datetime(['1/1/2018', np.datetime64('2018-01-01'),datetime(2018, 1, 1)])
dti   

DatetimeIndex(['2018-01-01', '2018-01-01', '2018-01-01'], dtype='datetime64[ns]', freq=None)

In [114]:
today = pd.Timestamp('2019-05-17')
display(today)
today.day_name()

Timestamp('2019-05-17 00:00:00')

'Friday'

In [115]:
tomorrow = today + pd.Timedelta('1 day')
tomorrow.day_name()

'Saturday'

In [116]:
monday = pd.Timestamp('2019-05-17') + pd.offsets.BDay()
monday.day_name()

'Monday'

In [117]:
pd.Timestamp('2019-05-17') + pd.DateOffset(1)

Timestamp('2019-05-18 00:00:00')

In [118]:
df = pd.DataFrame({'year': [2015, 2016],
                   'month': [2, 3],'day': [4, 5],'hour': [2, 3]})
pd.to_datetime(df)

0   2015-02-04 02:00:00
1   2016-03-05 03:00:00
dtype: datetime64[ns]

In [119]:
cr = pd.Timestamp.now(tz='America/Costa_Rica')
ny = pd.Timestamp.now(tz='America/New_York')
display(cr.strftime('%Y-%m-%d %H:%M'))
display(ny.strftime('%Y-%m-%d %H:%M'))

'2019-05-16 14:44'

'2019-05-16 16:44'

In [120]:
from pandas.tseries.holiday import USFederalHolidayCalendar as calendar
cal = calendar()
cal.holidays()

DatetimeIndex(['1970-01-01', '1970-02-16', '1970-05-25', '1970-07-03',
               '1970-09-07', '1970-10-12', '1970-11-11', '1970-11-26',
               '1970-12-25', '1971-01-01',
               ...
               '2030-01-01', '2030-01-21', '2030-02-18', '2030-05-27',
               '2030-07-04', '2030-09-02', '2030-10-14', '2030-11-11',
               '2030-11-28', '2030-12-25'],
              dtype='datetime64[ns]', length=594, freq=None)

In [121]:
#Boolean comp
pd.Timestamp.now() < pd.Timestamp('1996-06-01')

False

# Challenges

## Day of my birthday

In [122]:
pd.Timestamp('1996-06-01').day_name() 

'Saturday'

## Days from birthday

In [123]:
days = pd.Timestamp.now() - pd.Timestamp('1996-06-01')
days

Timedelta('8384 days 14:44:04.723386')

## How many days have you been working at GFT (Monday-Friday)

In [124]:
from pandas.tseries.offsets import *
pd.date_range('2019-01-01', '2019-01-31', freq=BDay()).day.size

23

In [141]:
pd.Timestamp.now() 

Timestamp('2019-05-16 16:58:27.286543')