# 10 minutes to pandas

In [1]:
import numpy as np

import pandas as pd

## Object Creation

* Creating a `Series` by passing a list of values

In [3]:
# A Series built from a plain list; no index is passed, so pandas numbers the rows 0..5.
s = pd.Series(data=[1, 3, 5, np.nan, 6, 8])
s

0    1.0
1    3.0
2    5.0
3    NaN
4    6.0
5    8.0
dtype: float64

## Creating a DataFrame by passing a NumPy array, with a datetime index and labeled columns

In [4]:
# Six consecutive daily timestamps beginning 2019-12-20; used as the row index of df.
dates = pd.date_range(start='20191220', periods=6)
dates

DatetimeIndex(['2019-12-20', '2019-12-21', '2019-12-22', '2019-12-23',
               '2019-12-24', '2019-12-25'],
              dtype='datetime64[ns]', freq='D')

In [5]:
# 6x4 frame of standard-normal draws: rows labelled by `dates`, columns 'A'..'D'.
# No seed is set in this cell, so the values change on every run.
# NOTE(review): consider seeding NumPy in a config cell if reproducible output is wanted.
df = pd.DataFrame(
    data=np.random.randn(6, 4),
    index=dates,
    columns=['A', 'B', 'C', 'D'],
)
df

Unnamed: 0,A,B,C,D
2019-12-20,0.557917,-1.47783,-0.29527,0.368942
2019-12-21,-1.103909,-0.664318,2.827123,0.358238
2019-12-22,-0.347208,-1.47169,0.419071,0.536951
2019-12-23,1.250362,-0.609198,0.480425,-0.491609
2019-12-24,-0.178046,-0.235816,1.651153,-0.789384
2019-12-25,-0.508748,-0.715985,-1.202888,-0.920789


### Creating a DataFrame by passing a dict of objects that can be converted to a series-like structure

In [7]:
# Each dict key becomes a column. Scalars ('A', 'B', 'F') are broadcast to the
# length of the array-like values, which all have four entries.
df2 = pd.DataFrame(
    {
        'A': 1.0,                                                     # float scalar
        'B': pd.Timestamp('20191220'),                                # timestamp scalar
        'C': pd.Series(1, index=list(range(4)), dtype='float32'),     # Series indexed 0..3
        'D': np.array([3] * 4, dtype='int32'),                        # NumPy integer array
        'E': pd.Categorical(['test', 'train', 'test', 'train']),      # categorical values
        'F': 'foo',                                                   # string scalar
    }
)

In [8]:
# Leaving df2 as the cell's last expression makes the notebook render it as a table.
df2

Unnamed: 0,A,B,C,D,E,F
0,1.0,2019-12-20,1.0,3,test,foo
1,1.0,2019-12-20,1.0,3,train,foo
2,1.0,2019-12-20,1.0,3,test,foo
3,1.0,2019-12-20,1.0,3,train,foo


##### The columns of the resulting DataFrame have different dtypes.

In [9]:
# One dtype per column, reflecting the different kinds of objects passed in the dict.
df2.dtypes

A           float64
B    datetime64[ns]
C           float32
D             int32
E          category
F            object
dtype: object

In [12]:
# Attribute-style access to column 'A'; the result is that column as a Series.
df2.A

0    1.0
1    1.0
2    1.0
3    1.0
Name: A, dtype: float64

In [11]:
# No parentheses: this evaluates to the bound method object itself, not a computed result.
# NOTE(review): looks like a leftover from exploring tab completion — confirm it is intentional.
df2.abs

<bound method NDFrame.abs of      A          B    C  D      E    F
0  1.0 2019-12-20  1.0  3   test  foo
1  1.0 2019-12-20  1.0  3  train  foo
2  1.0 2019-12-20  1.0  3   test  foo
3  1.0 2019-12-20  1.0  3  train  foo>

In [13]:
# No parentheses: this evaluates to the bound method object itself; nothing is added.
# NOTE(review): looks like a leftover from exploring tab completion — confirm it is intentional.
df2.add

<bound method _arith_method_FRAME.<locals>.f of      A          B    C  D      E    F
0  1.0 2019-12-20  1.0  3   test  foo
1  1.0 2019-12-20  1.0  3  train  foo
2  1.0 2019-12-20  1.0  3   test  foo
3  1.0 2019-12-20  1.0  3  train  foo>

## Viewing data

In [14]:
# First five rows of the frame (five is also what head() returns when no count is given).
df.head(n=5)

Unnamed: 0,A,B,C,D
2019-12-20,0.557917,-1.47783,-0.29527,0.368942
2019-12-21,-1.103909,-0.664318,2.827123,0.358238
2019-12-22,-0.347208,-1.47169,0.419071,0.536951
2019-12-23,1.250362,-0.609198,0.480425,-0.491609
2019-12-24,-0.178046,-0.235816,1.651153,-0.789384


In [15]:
# Last five rows of the frame (five is also what tail() returns when no count is given).
df.tail(n=5)

Unnamed: 0,A,B,C,D
2019-12-21,-1.103909,-0.664318,2.827123,0.358238
2019-12-22,-0.347208,-1.47169,0.419071,0.536951
2019-12-23,1.250362,-0.609198,0.480425,-0.491609
2019-12-24,-0.178046,-0.235816,1.651153,-0.789384
2019-12-25,-0.508748,-0.715985,-1.202888,-0.920789


In [16]:
# Row labels of df: the DatetimeIndex it was constructed with.
df.index

DatetimeIndex(['2019-12-20', '2019-12-21', '2019-12-22', '2019-12-23',
               '2019-12-24', '2019-12-25'],
              dtype='datetime64[ns]', freq='D')

In [17]:
# Column labels of df.
df.columns

Index(['A', 'B', 'C', 'D'], dtype='object')

For `df`, our DataFrame of all floating-point values, `DataFrame.to_numpy()` is fast and doesn't require copying data.

In [18]:
# Returns the values as a 2-D NumPy array; every column is float, so a single float dtype fits.
df.to_numpy()

array([[ 0.55791657, -1.47782965, -0.2952703 ,  0.36894162],
       [-1.10390945, -0.66431824,  2.82712347,  0.35823796],
       [-0.34720751, -1.47168992,  0.41907056,  0.53695081],
       [ 1.25036179, -0.60919783,  0.48042548, -0.49160905],
       [-0.17804631, -0.23581635,  1.65115339, -0.78938354],
       [-0.50874806, -0.71598461, -1.20288782, -0.92078889]])

For `df2`, the DataFrame with multiple dtypes, `DataFrame.to_numpy()` is relatively expensive.

In [19]:
# The columns have different dtypes, so the resulting array falls back to dtype=object.
df2.to_numpy()

array([[1.0, Timestamp('2019-12-20 00:00:00'), 1.0, 3, 'test', 'foo'],
       [1.0, Timestamp('2019-12-20 00:00:00'), 1.0, 3, 'train', 'foo'],
       [1.0, Timestamp('2019-12-20 00:00:00'), 1.0, 3, 'test', 'foo'],
       [1.0, Timestamp('2019-12-20 00:00:00'), 1.0, 3, 'train', 'foo']],
      dtype=object)

**Note:** `DataFrame.to_numpy()` does not include the index or column labels in the output.

## `describe()` shows a quick statistical summary of your data

In [20]:
# Per-column count, mean, standard deviation, min, quartiles and max.
df.describe()

Unnamed: 0,A,B,C,D
count,6.0,6.0,6.0,6.0
mean,-0.054939,-0.862473,0.646602,-0.156275
std,0.835016,0.503464,1.424845,0.650983
min,-1.103909,-1.47783,-1.202888,-0.920789
25%,-0.468363,-1.282764,-0.116685,-0.71494
50%,-0.262627,-0.690151,0.449748,-0.066686
75%,0.373926,-0.622978,1.358471,0.366266
max,1.250362,-0.235816,2.827123,0.536951


## Transposing your data:

In [21]:
# Swap rows and columns: the dates become the column labels and A-D become the row labels.
df.T

Unnamed: 0,2019-12-20,2019-12-21,2019-12-22,2019-12-23,2019-12-24,2019-12-25
A,0.557917,-1.103909,-0.347208,1.250362,-0.178046,-0.508748
B,-1.47783,-0.664318,-1.47169,-0.609198,-0.235816,-0.715985
C,-0.29527,2.827123,0.419071,0.480425,1.651153,-1.202888
D,0.368942,0.358238,0.536951,-0.491609,-0.789384,-0.920789


### Sorting by an axis

In [22]:
# Sort along the column axis in descending label order (D, C, B, A); row order is unchanged.
df.sort_index(axis='columns', ascending=False)

Unnamed: 0,D,C,B,A
2019-12-20,0.368942,-0.29527,-1.47783,0.557917
2019-12-21,0.358238,2.827123,-0.664318,-1.103909
2019-12-22,0.536951,0.419071,-1.47169,-0.347208
2019-12-23,-0.491609,0.480425,-0.609198,1.250362
2019-12-24,-0.789384,1.651153,-0.235816,-0.178046
2019-12-25,-0.920789,-1.202888,-0.715985,-0.508748


## Sorting by values:

In [23]:
# Reorder the rows by the values in column 'B', smallest first.
df.sort_values(by='B')

Unnamed: 0,A,B,C,D
2019-12-20,0.557917,-1.47783,-0.29527,0.368942
2019-12-22,-0.347208,-1.47169,0.419071,0.536951
2019-12-25,-0.508748,-0.715985,-1.202888,-0.920789
2019-12-21,-1.103909,-0.664318,2.827123,0.358238
2019-12-23,1.250362,-0.609198,0.480425,-0.491609
2019-12-24,-0.178046,-0.235816,1.651153,-0.789384
