In [83]:
import numpy as np
import pandas as pd

# Series

In [84]:
# Raw Python lists that back the two example Series used throughout this section.
colors = [
    'red', 'green', 'blue', 'orange', 'yellow', 'white',
    'black', 'pink', 'orange', 'yellow', 'blue',
]
numbers = [1, 2, 3, 4, 5, 6, 7, 8, 9, 3, 2, 1, 4, 8, 9]

# A Series built from a list gets a default integer index (0..n-1).
ser_color = pd.Series(data=colors)
ser_numbers = pd.Series(data=numbers)

In [85]:
ser_color

0        red
1      green
2       blue
3     orange
4     yellow
5      white
6      black
7       pink
8     orange
9     yellow
10      blue
dtype: object

In [86]:
ser_numbers

0     1
1     2
2     3
3     4
4     5
5     6
6     7
7     8
8     9
9     3
10    2
11    1
12    4
13    8
14    9
dtype: int64

In [87]:
ser_color.shape

(11,)

In [88]:
ser_color.ndim

1

In [89]:
  ser_color.isnull()

0     False
1     False
2     False
3     False
4     False
5     False
6     False
7     False
8     False
9     False
10    False
dtype: bool

In [90]:
type(ser_color.apply(lambda x : x[0]))

pandas.core.series.Series

In [91]:
type(ser_color.transform(lambda x : x[0]))

pandas.core.series.Series

In [92]:
# Re-label the Series by the first letter of each colour name.
# The index is replaced in place, and the new labels are not unique
# (e.g. 'blue' and 'black' both become 'b').
index = ser_color.apply(lambda x: x[0])
ser_color.index = index

In [93]:
ser_color

r       red
g     green
b      blue
o    orange
y    yellow
w     white
b     black
p      pink
o    orange
y    yellow
b      blue
dtype: object

In [94]:
ser_color.reset_index(name='Color') # Note: this returns a DataFrame in which the old index becomes a regular column

Unnamed: 0,index,Color
0,r,red
1,g,green
2,b,blue
3,o,orange
4,y,yellow
5,w,white
6,b,black
7,p,pink
8,o,orange
9,y,yellow


In [95]:
# Positional (NumPy-style) slice: the elements at positions 1 and 2.
# `ser_color[1:3]` gives the same result here; `.iloc` states the positional intent explicitly.
ser_color.iloc[1:3]

g    green
b     blue
dtype: object

In [96]:
# index label access
ser_color['r']

'red'

In [97]:
# multiple
ser_color[['r', 'o']]

r       red
o    orange
o    orange
dtype: object

In [98]:
ser_color.head()

r       red
g     green
b      blue
o    orange
y    yellow
dtype: object

In [99]:
ser_color.tail()

b     black
p      pink
o    orange
y    yellow
b      blue
dtype: object

In [100]:
ser_color.take([1,3,6])

g     green
o    orange
b     black
dtype: object

  <i>Series support broadcasting: a binary operation with a scalar is applied to every element, and a binary operation with another Series is applied element-wise.</i>

In [101]:
ser_numbers * 2

0      2
1      4
2      6
3      8
4     10
5     12
6     14
7     16
8     18
9      6
10     4
11     2
12     8
13    16
14    18
dtype: int64

In [102]:
# filter elements using conditions
ser_numbers[ser_numbers > 5]

5     6
6     7
7     8
8     9
13    8
14    9
dtype: int64

In [103]:
#unique elements
ser_color.unique()

array(['red', 'green', 'blue', 'orange', 'yellow', 'white', 'black', 'pink'], dtype=object)

In [104]:
# Group and do aggregate count on each value
ser_color.value_counts()

yellow    2
orange    2
blue      2
black     1
red       1
white     1
pink      1
green     1
dtype: int64

In [105]:
# number of non-null elements (equal to the length here because nothing is missing)
ser_color.count()

11

In [106]:
# Drop certain labels: every element labelled 'y' is removed from the returned Series
ser_color.drop('y')

r       red
g     green
b      blue
o    orange
w     white
b     black
p      pink
o    orange
b      blue
dtype: object

In [107]:
ser_numbers.describe()

count    15.000000
mean      4.800000
std       2.858571
min       1.000000
25%       2.500000
50%       4.000000
75%       7.500000
max       9.000000
dtype: float64

In [108]:
# fill missing values by interpolating between the neighbouring values (linear by default)
pd.Series([1, 2, np.nan, 8]).interpolate()

0    1.0
1    2.0
2    5.0
3    8.0
dtype: float64

In [109]:
# Replace each missing value with the mean of the values that are present.
ser1 = pd.Series([1, 4, 7, np.nan, 10, np.nan])
mean_of_present = ser1.mean()  # NaN entries are skipped: (1 + 4 + 7 + 10) / 4 = 5.5
ser1.fillna(mean_of_present)

0     1.0
1     4.0
2     7.0
3     5.5
4    10.0
5     5.5
dtype: float64

In [110]:
# Aggregate over entire series to give one value
ser_numbers.sum()

72

In [111]:
ser_numbers.max()

9

In [112]:
# Cumulative aggregation: returns another Series holding the running total
ser_numbers.cumsum()

0      1
1      3
2      6
3     10
4     15
5     21
6     28
7     36
8     45
9     48
10    50
11    51
12    55
13    63
14    72
dtype: int64

In [113]:
ser_numbers.cummax()

0     1
1     2
2     3
3     4
4     5
5     6
6     7
7     8
8     9
9     9
10    9
11    9
12    9
13    9
14    9
dtype: int64

In [114]:
#Sorting by value
ser_numbers.sort_values()

0     1
11    1
1     2
10    2
2     3
9     3
3     4
12    4
4     5
5     6
6     7
7     8
13    8
8     9
14    9
dtype: int64

In [115]:
# sort by index
ser_color.sort_index()

b      blue
b     black
b      blue
g     green
o    orange
o    orange
p      pink
r       red
w     white
y    yellow
y    yellow
dtype: object

In [116]:
# Add extra elements to the end of the Series (a new Series is returned).
# pd.concat is used because Series.append was deprecated and then removed in pandas 2.0.
pd.concat([ser_color, pd.Series(['black', 'white'], index=['b', 'w'])])

r       red
g     green
b      blue
o    orange
y    yellow
w     white
b     black
p      pink
o    orange
y    yellow
b      blue
b     black
w     white
dtype: object

In [117]:
# --- TIME SERIES ---
# Create a DatetimeIndex covering a specific period at a specific frequency.
# `freq` takes a pandas offset alias, optionally with a multiplier (e.g. 'D', '5D', '2W').
# ISO 8601 dates (YYYY-MM-DD) are used because '1/1/2017' and '31/1/2017' cannot both be
# parsed with the same day/month order, which makes the intended range ambiguous.
pd.date_range(start='2017-01-01', end='2017-01-31', freq='D')

DatetimeIndex(['2017-01-01', '2017-01-02', '2017-01-03', '2017-01-04',
               '2017-01-05', '2017-01-06', '2017-01-07', '2017-01-08',
               '2017-01-09', '2017-01-10', '2017-01-11', '2017-01-12',
               '2017-01-13', '2017-01-14', '2017-01-15', '2017-01-16',
               '2017-01-17', '2017-01-18', '2017-01-19', '2017-01-20',
               '2017-01-21', '2017-01-22', '2017-01-23', '2017-01-24',
               '2017-01-25', '2017-01-26', '2017-01-27', '2017-01-28',
               '2017-01-29', '2017-01-30', '2017-01-31'],
              dtype='datetime64[ns]', freq='D')

In [118]:
# Create a fixed number of samples from a start time and a frequency.
# pd.Timestamp.now() replaces pd.datetime.now(): the `pd.datetime` alias was deprecated
# in pandas 1.0 and is no longer available in current releases.
# The lowercase 'h' alias is used because the uppercase 'H' spelling is deprecated.
pd.date_range(start=pd.Timestamp.now(), periods=10, freq='h')

DatetimeIndex(['2017-08-25 17:12:29.427459', '2017-08-25 18:12:29.427459',
               '2017-08-25 19:12:29.427459', '2017-08-25 20:12:29.427459',
               '2017-08-25 21:12:29.427459', '2017-08-25 22:12:29.427459',
               '2017-08-25 23:12:29.427459', '2017-08-26 00:12:29.427459',
               '2017-08-26 01:12:29.427459', '2017-08-26 02:12:29.427459'],
              dtype='datetime64[ns]', freq='H')

In [119]:
ser_numbers.count()

15

In [120]:
# One daily timestamp per element of ser_numbers.
# len() is used instead of .count(): .count() skips NaN values, so it would produce too few
# labels (and the index assignment would fail) if the Series ever contained missing data.
# pd.Timestamp.now() replaces the removed `pd.datetime` alias.
time_index = pd.date_range(start=pd.Timestamp.now(), periods=len(ser_numbers), freq='D')

In [121]:
ser_numbers.index = time_index
ser_numbers

2017-08-25 17:12:29.702514    1
2017-08-26 17:12:29.702514    2
2017-08-27 17:12:29.702514    3
2017-08-28 17:12:29.702514    4
2017-08-29 17:12:29.702514    5
2017-08-30 17:12:29.702514    6
2017-08-31 17:12:29.702514    7
2017-09-01 17:12:29.702514    8
2017-09-02 17:12:29.702514    9
2017-09-03 17:12:29.702514    3
2017-09-04 17:12:29.702514    2
2017-09-05 17:12:29.702514    1
2017-09-06 17:12:29.702514    4
2017-09-07 17:12:29.702514    8
2017-09-08 17:12:29.702514    9
Freq: D, dtype: int64

# DataFrame

In [122]:
# use an existing datasets
import seaborn as sns

In [123]:
# 'football.csv' is a relative path, so the file must sit in the notebook's working directory.
# NOTE(review): sns.load_dataset presumably fetches 'tips' from seaborn's online example-data
# repository on first use — confirm network access (or a local cache) is available.
football = pd.read_csv('football.csv')
tips = sns.load_dataset('tips')

In [124]:
football

Unnamed: 0,game_id,year,team,wins,losses
0,100.0,2010.0,Bears,11.0,5.0
1,,2010.0,Rockets,13.0,6.0
2,114.0,2011.0,Bears,8.0,8.0
3,145.0,2012.0,NODATA,9.0,3.0
4,128.0,2012.0,Bears,10.0,6.0
5,167.0,,Rockets,12.0,3.0
6,142.0,2011.0,Packers,15.0,1.0
7,187.0,2012.0,Rockets,13.0,
8,157.0,2012.0,Packers,11.0,5.0
9,187.0,2010.0,Packers,,5.0


In [125]:
tips.head()

Unnamed: 0,total_bill,tip,sex,smoker,day,time,size
0,16.99,1.01,Female,No,Sun,Dinner,2
1,10.34,1.66,Male,No,Sun,Dinner,3
2,21.01,3.5,Male,No,Sun,Dinner,3
3,23.68,3.31,Male,No,Sun,Dinner,2
4,24.59,3.61,Female,No,Sun,Dinner,4


In [126]:
# Setting index
# set_index returns a new DataFrame rather than modifying the caller, so the result is
# assigned back to `football` (passing inplace=True would be the alternative).
# NOTE(review): 'game_id' stops being a column afterwards, so re-running this cell without
# reloading the CSV presumably fails with a KeyError.
football = football.set_index('game_id')
football.head()

Unnamed: 0_level_0,year,team,wins,losses
game_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
100.0,2010.0,Bears,11.0,5.0
,2010.0,Rockets,13.0,6.0
114.0,2011.0,Bears,8.0,8.0
145.0,2012.0,NODATA,9.0,3.0
128.0,2012.0,Bears,10.0,6.0


In [127]:
# row labels
football.index

Float64Index([100.0,   nan, 114.0, 145.0, 128.0, 167.0, 142.0, 187.0, 157.0,
              187.0, 171.0, 185.0, 200.0],
             dtype='float64', name='game_id')

In [129]:
# Column labels
football.columns

Index(['year', 'team', 'wins', 'losses'], dtype='object')

In [130]:
football.shape

(13, 4)

In [131]:
football.ndim

2

In [132]:
football.size

52

In [133]:
football.isnull()

Unnamed: 0_level_0,year,team,wins,losses
game_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
100.0,False,False,False,False
,False,False,False,False
114.0,False,False,False,False
145.0,False,False,False,False
128.0,False,False,False,False
167.0,True,False,False,False
142.0,False,False,False,False
187.0,False,False,False,True
157.0,False,False,False,False
187.0,False,False,True,False


In [137]:
# Dataframes support broadcast with scalars and with other dataframes (element wise ) 
# but columns must be numeric for math ops
tips[['tip', 'total_bill']][:5] * 2

Unnamed: 0,tip,total_bill
0,2.02,33.98
1,3.32,20.68
2,7.0,42.02
3,6.62,47.36
4,7.22,49.18


In [138]:
# NOTE: some operations only apply to numeric columns.
# numeric_only=True restricts the mean to those columns explicitly; since pandas 2.0 the
# default no longer drops non-numeric columns silently and the call raises a TypeError.
tips.mean(numeric_only=True)

total_bill    19.785943
tip            2.998279
size           2.569672
dtype: float64

In [139]:
# NOTE: sum also applies to string columns, in which case it concatenates the values.
# NOTE: NaN values are skipped when summing the numeric columns.
# The index is not included in the operation.
football.sum()

year                                                  24133
team      BearsRocketsBearsNODATABearsRocketsPackersRock...
wins                                                    122
losses                                                   70
dtype: object

In [141]:
# numeric_only=True skips the non-numeric 'team' column explicitly; since pandas 2.0 it is
# no longer dropped automatically and the call raises a TypeError without this flag.
football.median(numeric_only=True)

year      2011.0
wins        10.5
losses       5.5
dtype: float64

In [144]:
football.max()

year         2012
team      Rockets
wins           15
losses         12
dtype: object

In [145]:
football.cummax()

Unnamed: 0_level_0,year,team,wins,losses
game_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
100.0,2010.0,Bears,11.0,5.0
,2010.0,Rockets,13.0,6.0
114.0,2011.0,Rockets,13.0,8.0
145.0,2012.0,Rockets,13.0,8.0
128.0,2012.0,Rockets,13.0,8.0
167.0,,Rockets,13.0,8.0
142.0,2012.0,Rockets,15.0,8.0
187.0,2012.0,Rockets,15.0,
157.0,2012.0,Rockets,15.0,8.0
187.0,2012.0,Rockets,,8.0


In [146]:
# count() reports the number of non-null entries per column, so NaN cells are not counted
football.count()

year      12
team      13
wins      12
losses    12
dtype: int64

In [147]:
# get index from column name
football.columns.get_loc('team')

1

In [148]:
# take
football.take([0,1,3])

Unnamed: 0_level_0,year,team,wins,losses
game_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
100.0,2010.0,Bears,11.0,5.0
,2010.0,Rockets,13.0,6.0
145.0,2012.0,NODATA,9.0,3.0


In [151]:
# Apply a custom function along an axis. With the default axis=0 the function receives
# each column as a Series, so `x * 2` doubles the numbers and repeats the strings.
football.apply(lambda x : x * 2)

Unnamed: 0_level_0,year,team,wins,losses
game_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
100.0,4020.0,BearsBears,22.0,10.0
,4020.0,RocketsRockets,26.0,12.0
114.0,4022.0,BearsBears,16.0,16.0
145.0,4024.0,NODATANODATA,18.0,6.0
128.0,4024.0,BearsBears,20.0,12.0
167.0,,RocketsRockets,24.0,6.0
142.0,4022.0,PackersPackers,30.0,2.0
187.0,4024.0,RocketsRockets,26.0,
157.0,4024.0,PackersPackers,22.0,10.0
187.0,4020.0,PackersPackers,,10.0


In [152]:
# on select columns
football.loc[:, ['wins', 'losses']].apply(lambda x : x * 2)

Unnamed: 0_level_0,wins,losses
game_id,Unnamed: 1_level_1,Unnamed: 2_level_1
100.0,22.0,10.0
,26.0,12.0
114.0,16.0,16.0
145.0,18.0,6.0
128.0,20.0,12.0
167.0,24.0,6.0
142.0,30.0,2.0
187.0,26.0,
157.0,22.0,10.0
187.0,,10.0


In [156]:
# Applies a function to every element individually.
# NOTE: DataFrame.applymap is deprecated since pandas 2.1 in favour of DataFrame.map.
tips.iloc[:5, :2].applymap(lambda x : x * 2)

Unnamed: 0,total_bill,tip
0,33.98,2.02
1,20.68,3.32
2,42.02,7.0
3,47.36,6.62
4,49.18,7.22


In [157]:
tips[:5]

Unnamed: 0,total_bill,tip,sex,smoker,day,time,size
0,16.99,1.01,Female,No,Sun,Dinner,2
1,10.34,1.66,Male,No,Sun,Dinner,3
2,21.01,3.5,Male,No,Sun,Dinner,3
3,23.68,3.31,Male,No,Sun,Dinner,2
4,24.59,3.61,Female,No,Sun,Dinner,4


In [158]:
# Common statistics for numeric columns
football.describe()

Unnamed: 0,year,wins,losses
count,12.0,12.0,12.0
mean,2011.083333,10.166667,5.833333
std,0.900337,3.099365,3.040136
min,2010.0,4.0,1.0
25%,2010.0,8.75,4.5
50%,2011.0,10.5,5.5
75%,2012.0,12.25,6.5
max,2012.0,15.0,12.0


In [159]:
# information of column labels and types. Useful for dataframes with large number of columns
football.info()

<class 'pandas.core.frame.DataFrame'>
Float64Index: 13 entries, 100.0 to 200.0
Data columns (total 4 columns):
year      12 non-null float64
team      13 non-null object
wins      12 non-null float64
losses    12 non-null float64
dtypes: float64(3), object(1)
memory usage: 840.0+ bytes


In [163]:
# Sorting

football.sort_values('wins',ascending=False).head(5) # top 5 wins

Unnamed: 0_level_0,year,team,wins,losses
game_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
142.0,2011.0,Packers,15.0,1.0
,2010.0,Rockets,13.0,6.0
187.0,2012.0,Rockets,13.0,
167.0,,Rockets,12.0,3.0
100.0,2010.0,Bears,11.0,5.0


In [167]:
tips.iloc[:5,:]

Unnamed: 0,total_bill,tip,sex,smoker,day,time,size
0,16.99,1.01,Female,No,Sun,Dinner,2
1,10.34,1.66,Male,No,Sun,Dinner,3
2,21.01,3.5,Male,No,Sun,Dinner,3
3,23.68,3.31,Male,No,Sun,Dinner,2
4,24.59,3.61,Female,No,Sun,Dinner,4


In [175]:
# Creating a pivot table
# NOTE: each (index, columns) combination must be unique, otherwise pivot raises a ValueError.
# pivot does no aggregation: it lays the index and column values out as a matrix and puts the
# corresponding value in each cell where one exists (NaN elsewhere).
tips.iloc[:5,:].pivot( index='total_bill', columns='tip', values='sex')

tip,1.01,1.66,3.31,3.5,3.61
total_bill,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
10.34,,Male,,,
16.99,Female,,,,
21.01,,,,Male,
23.68,,,Male,,
24.59,,,,,Female


In [176]:
# Creates a pivot table by aggregating the values (with the given function)
# for each unique combination of index and column.
# The aggregation is named with the string alias 'median' instead of passing np.median:
# recent pandas versions emit a FutureWarning when a NumPy callable is passed here.
pd.pivot_table(data=tips, index='sex', columns='smoker', values='tip', aggfunc='median')

smoker,Yes,No
sex,Unnamed: 1_level_1,Unnamed: 2_level_1
Male,3.0,2.74
Female,2.88,2.68


In [None]:
''' Time Series'''