In [1]:
import pandas as pd

# Pandas Series

### Creating a series using specified index

In [2]:
# Build a five-element Series whose labels are the letters 'a'..'e'
# (supplied through `index`) rather than the default 0..4 positions.
series = pd.Series(data=[1, 2, 3, 4, 5], index=list('abcde'))
series

a    1
b    2
c    3
d    4
e    5
dtype: int64

### Accessing elements in a series

In [4]:
# Accessing the 3rd element of the series (position 2, counting from 0).
# A bare integer key such as `series[2]` on a string-labelled Series relies on a
# positional fallback that pandas has deprecated (FutureWarning since 2.1);
# `.iloc` states the positional intent explicitly and works on every version.
series.iloc[2]

3

In [5]:
# Position-based lookup with the explicit .iloc accessor: position 2 is the third element
series.iloc[2]

3

In [7]:
# Accessing an element by its index label ('d' is a label, not a position)
series['d']

4

In [9]:
# Label-based lookup with the explicit .loc accessor
series.loc['d']

4

In [10]:
# Slicing a series: the integer slice selects by position, from position 2 to the end
series[2:]

c    3
d    4
e    5
dtype: int64

In [11]:
# The same positional slice written with the explicit .iloc accessor
series.iloc[2:]

c    3
d    4
e    5
dtype: int64

### Specifying a Datetime Range as the Index of a Series

In [12]:
# Twelve consecutive timestamps beginning on 2019-05-25; with no `freq`
# given, date_range steps one calendar day at a time.
dates1 = pd.date_range(start='20190525', periods=12)
dates1

DatetimeIndex(['2019-05-25', '2019-05-26', '2019-05-27', '2019-05-28',
               '2019-05-29', '2019-05-30', '2019-05-31', '2019-06-01',
               '2019-06-02', '2019-06-03', '2019-06-04', '2019-06-05'],
              dtype='datetime64[ns]', freq='D')

In [13]:
# Rebind `series` to the values 1..12, labelled by the twelve dates in `dates1`.
# The index is handed to the constructor directly instead of being assigned afterwards.
series = pd.Series(list(range(1, 13)), index=dates1)
series

2019-05-25     1
2019-05-26     2
2019-05-27     3
2019-05-28     4
2019-05-29     5
2019-05-30     6
2019-05-31     7
2019-06-01     8
2019-06-02     9
2019-06-03    10
2019-06-04    11
2019-06-05    12
Freq: D, dtype: int64

### Date Ranges

In [14]:
# Month-end frequency: every timestamp falls on the last calendar day of its
# month, starting with the first month end on or after the start date.
# The offset object is used instead of the string alias 'M' because that alias
# is deprecated (pandas 2.2 renamed it to 'ME'); MonthEnd() yields the same
# dates on both older and newer pandas versions.
dates2 = pd.date_range('2019-05-25', periods=12, freq=pd.offsets.MonthEnd())
dates2

DatetimeIndex(['2019-05-31', '2019-06-30', '2019-07-31', '2019-08-31',
               '2019-09-30', '2019-10-31', '2019-11-30', '2019-12-31',
               '2020-01-31', '2020-02-29', '2020-03-31', '2020-04-30'],
              dtype='datetime64[ns]', freq='M')

In [15]:
# Hourly frequency: eight timestamps one hour apart, starting at 09:00.
# The offset object is used instead of the string alias 'H' because that alias
# is deprecated (pandas 2.2 renamed it to 'h'); Hour() yields the same
# timestamps on both older and newer pandas versions.
dates3 = pd.date_range('2019/05/17 09:00:00', periods=8, freq=pd.offsets.Hour())
dates3

DatetimeIndex(['2019-05-17 09:00:00', '2019-05-17 10:00:00',
               '2019-05-17 11:00:00', '2019-05-17 12:00:00',
               '2019-05-17 13:00:00', '2019-05-17 14:00:00',
               '2019-05-17 15:00:00', '2019-05-17 16:00:00'],
              dtype='datetime64[ns]', freq='H')

# Pandas DataFrame

In [16]:
import numpy as np

### Creating a DataFrame

In [17]:
# 10 x 4 frame of standard-normal samples with columns A-D.
# A seeded generator is used so that Restart Kernel -> Run All reproduces the
# same numbers every time; an unseeded draw gives different values on each run,
# so saved outputs can never be matched again.
rng = np.random.default_rng(42)
df1 = pd.DataFrame(rng.standard_normal((10, 4)),
                   columns = list('ABCD'))
df1

Unnamed: 0,A,B,C,D
0,1.65891,0.036481,-0.930187,-0.794702
1,1.518128,1.145521,-1.137798,0.012993
2,0.498187,1.4516,-0.240187,0.620057
3,-0.237613,1.317595,0.501615,-0.951008
4,0.461311,-1.593641,1.186713,-0.082356
5,-1.96464,1.424528,0.099544,-0.734051
6,0.01404,-0.302484,0.881812,0.273333
7,0.042931,-0.05421,-0.979875,-0.243054
8,-0.457548,-0.130179,-0.137647,-0.692998
9,0.237225,0.591064,-0.784836,-1.685372


### Specifying the Index in a DataFrame

In [18]:
# Build ten consecutive daily timestamps starting on 2019-05-25 and use them as row labels.
# NOTE: the assignment replaces df1's index in place, so every later cell sees the date index.
days = pd.date_range('20190525', periods=10)
df1.index = days
df1

Unnamed: 0,A,B,C,D
2019-05-25,1.65891,0.036481,-0.930187,-0.794702
2019-05-26,1.518128,1.145521,-1.137798,0.012993
2019-05-27,0.498187,1.4516,-0.240187,0.620057
2019-05-28,-0.237613,1.317595,0.501615,-0.951008
2019-05-29,0.461311,-1.593641,1.186713,-0.082356
2019-05-30,-1.96464,1.424528,0.099544,-0.734051
2019-05-31,0.01404,-0.302484,0.881812,0.273333
2019-06-01,0.042931,-0.05421,-0.979875,-0.243054
2019-06-02,-0.457548,-0.130179,-0.137647,-0.692998
2019-06-03,0.237225,0.591064,-0.784836,-1.685372


In [19]:
# Accessing the index (row labels) of the dataframe
df1.index

DatetimeIndex(['2019-05-25', '2019-05-26', '2019-05-27', '2019-05-28',
               '2019-05-29', '2019-05-30', '2019-05-31', '2019-06-01',
               '2019-06-02', '2019-06-03'],
              dtype='datetime64[ns]', freq='D')

In [20]:
# Accessing the values of the dataframe as a 2-D NumPy array
# (row labels and column names are not part of the result)
df1.values

array([[ 1.65891039,  0.03648109, -0.93018729, -0.79470238],
       [ 1.51812786,  1.14552129, -1.1377979 ,  0.01299322],
       [ 0.49818663,  1.45159954, -0.24018683,  0.62005706],
       [-0.23761287,  1.31759451,  0.50161464, -0.95100819],
       [ 0.46131107, -1.59364074,  1.1867134 , -0.08235619],
       [-1.96463994,  1.42452814,  0.09954448, -0.73405079],
       [ 0.01404033, -0.30248417,  0.88181181,  0.27333342],
       [ 0.04293137, -0.05421045, -0.97987456, -0.24305423],
       [-0.45754751, -0.13017948, -0.13764675, -0.69299754],
       [ 0.23722532,  0.59106402, -0.78483609, -1.68537163]])

### Generating descriptive statistics on a DataFrame

In [22]:
# Per-column summary statistics: count, mean, std, min, quartiles and max
df1.describe()

Unnamed: 0,A,B,C,D
count,10.0,10.0,10.0,10.0
mean,0.177093,0.388627,-0.154085,-0.427716
std,1.021329,0.981843,0.818031,0.674234
min,-1.96464,-1.593641,-1.137798,-1.685372
25%,-0.1747,-0.111187,-0.893849,-0.779539
50%,0.140078,0.313773,-0.188917,-0.468026
75%,0.488968,1.274576,0.401097,-0.010844
max,1.65891,1.4516,1.186713,0.620057


In [23]:
# Mean of each column (the default reduces down the rows)
df1.mean()

A    0.177093
B    0.388627
C   -0.154085
D   -0.427716
dtype: float64

In [26]:
# Row-wise mean: average across the columns, giving one value per row label
df1.mean(axis=1)

2019-05-25   -0.007375
2019-05-26    0.384711
2019-05-27    0.582414
2019-05-28    0.157647
2019-05-29   -0.006993
2019-05-30   -0.293655
2019-05-31    0.216675
2019-06-01   -0.308552
2019-06-02   -0.354593
2019-06-03   -0.410480
Freq: D, dtype: float64

In [27]:
# Minimum of each column
df1.min()

A   -1.964640
B   -1.593641
C   -1.137798
D   -1.685372
dtype: float64

### Adding & Removing Rows & Columns in a DataFrame

In [28]:
# Column-oriented input: each key becomes a column, each list holds that column's values.
data = dict(
    name=['Sara', 'Ali', 'Fatima', 'Ahmed', 'Hamza'],
    year=[2012, 2012, 2013, 2014, 2014],
    reports=[6, 13, 14, 1, 7],
)

# Label the five rows with country names instead of the default 0..4.
df2 = pd.DataFrame(
    data=data,
    index=['Singapore', 'China', 'Japan', 'Sweden', 'Norway'],
)
df2

Unnamed: 0,name,year,reports
Singapore,Sara,2012,6
China,Ali,2012,13
Japan,Fatima,2013,14
Sweden,Ahmed,2014,1
Norway,Hamza,2014,7


In [29]:
# Adding a column: assigning to a new column name appends it to the frame.
# The array supplies one value per row, in row order.
# NOTE: this modifies df2 in place, so every later cell sees the extra 'schools' column.
schools = np.array(["Cambridge", "Oxford", "Oxford", "Cambridge", "Oxford"])
df2['schools'] = schools
df2

Unnamed: 0,name,year,reports,schools
Singapore,Sara,2012,6,Cambridge
China,Ali,2012,13,Oxford
Japan,Fatima,2013,14,Oxford
Sweden,Ahmed,2014,1,Cambridge
Norway,Hamza,2014,7,Oxford


In [32]:
# Removing a row by its label. drop() returns a new frame and leaves df2
# itself untouched (the result is only displayed, not assigned).
df2.drop(index=['China'])

Unnamed: 0,name,year,reports,schools
Singapore,Sara,2012,6,Cambridge
Japan,Fatima,2013,14,Oxford
Sweden,Ahmed,2014,1,Cambridge
Norway,Hamza,2014,7,Oxford


In [33]:
# Filter rows by a column value: keep every row whose name is not 'Sara'
not_sara = df2['name'].ne('Sara')
df2.loc[not_sara]

Unnamed: 0,name,year,reports,schools
China,Ali,2012,13,Oxford
Japan,Fatima,2013,14,Oxford
Sweden,Ahmed,2014,1,Cambridge
Norway,Hamza,2014,7,Oxford


In [36]:
# Remove rows by position: translate positions 1 and 2 into their labels
# through df2.index, then drop those labels
df2.drop(index=df2.index[[1, 2]])

Unnamed: 0,name,year,reports,schools
Singapore,Sara,2012,6,Cambridge
Sweden,Ahmed,2014,1,Cambridge
Norway,Hamza,2014,7,Oxford


In [37]:
# Remove the second-to-last row: a negative position counts from the end of the index
df2.drop(index=df2.index[-2])

Unnamed: 0,name,year,reports,schools
Singapore,Sara,2012,6,Cambridge
China,Ali,2012,13,Oxford
Japan,Fatima,2013,14,Oxford
Norway,Hamza,2014,7,Oxford


In [38]:
# Removing a column by name; the result is a new frame and df2 keeps 'reports'
df2.drop(columns='reports')

Unnamed: 0,name,year,schools
Singapore,Sara,2012,Cambridge
China,Ali,2012,Oxford
Japan,Fatima,2013,Oxford
Sweden,Ahmed,2014,Cambridge
Norway,Hamza,2014,Oxford


In [39]:
# Summary statistics; by default only the numeric columns (year, reports) are summarised
df2.describe()

Unnamed: 0,year,reports
count,5.0,5.0
mean,2013.0,8.2
std,1.0,5.357238
min,2012.0,1.0
25%,2012.0,6.0
50%,2013.0,7.0
75%,2014.0,13.0
max,2014.0,14.0


In [43]:
# Summary of the object-dtype (string) columns instead:
# count, number of unique values, most frequent value and its frequency
df2.describe(include=['object'])

Unnamed: 0,name,schools
count,5,5
unique,5,2
top,Fatima,Oxford
freq,1,3


In [44]:
# Concise overview of the frame: index range, per-column non-null counts and dtypes, memory usage
df2.info()

<class 'pandas.core.frame.DataFrame'>
Index: 5 entries, Singapore to Norway
Data columns (total 4 columns):
 #   Column   Non-Null Count  Dtype 
---  ------   --------------  ----- 
 0   name     5 non-null      object
 1   year     5 non-null      int64 
 2   reports  5 non-null      int64 
 3   schools  5 non-null      object
dtypes: int64(2), object(2)
memory usage: 360.0+ bytes
