# 10 minutes to pandas

In [1]:
import numpy as np
import pandas as pd

## Object Creation

Create a pandas Series

In [7]:
# np.nan marks the missing entry at position 3.
s = pd.Series(data=[4, 2, 6, np.nan, 3, 42])
s

0     4.0
1     2.0
2     6.0
3     NaN
4     3.0
5    42.0
dtype: float64

Create a DataFrame with a numpy array

In [8]:
# Six consecutive calendar days beginning 2020-01-01; used below as the
# row index of `df`.
dates = pd.date_range(start="20200101", periods=6)
dates

DatetimeIndex(['2020-01-01', '2020-01-02', '2020-01-03', '2020-01-04',
               '2020-01-05', '2020-01-06'],
              dtype='datetime64[ns]', freq='D')

In [5]:
# Draw from a seeded local generator so the table is identical on every
# Restart & Run All; an unseeded draw produces different numbers each run.
# NOTE: a previously saved output will not match until this cell is re-run.
rng = np.random.default_rng(42)
# 6 rows (one per entry in `dates`) x 4 columns labelled A-D.
df = pd.DataFrame(rng.standard_normal((6, 4)), index=dates, columns=list("ABCD"))
df

Unnamed: 0,A,B,C,D
2020-01-01,1.262603,-0.558691,-0.018945,0.858592
2020-01-02,0.612334,1.923736,0.605236,0.690957
2020-01-03,-0.184938,0.57534,2.495507,0.146377
2020-01-04,-1.427069,0.748504,-0.259419,-0.953936
2020-01-05,-0.378069,1.372641,0.796222,-1.014413
2020-01-06,0.419307,0.166089,0.458951,-1.157991


Create a DataFrame with a dictionary

In [25]:
# One entry per column, in display order. The scalar entries ("A" and "F")
# are repeated on every one of the four rows.
df2 = pd.DataFrame(
    dict(
        A=1.0,
        B=pd.date_range("20200101", periods=4),
        C=pd.Series(1, index=list(range(4)), dtype="float32"),
        D=np.full(4, 3, dtype="int32"),
        E=pd.Categorical(["test", "train", "test", "train"]),
        F="foo",
    )
)
df2

Unnamed: 0,A,B,C,D,E,F
0,1.0,2020-01-01,1.0,3,test,foo
1,1.0,2020-01-02,1.0,3,train,foo
2,1.0,2020-01-03,1.0,3,test,foo
3,1.0,2020-01-04,1.0,3,train,foo


Each series (column) can have a different datatype

In [26]:
df2.dtypes

A           float64
B    datetime64[ns]
C           float32
D             int32
E          category
F            object
dtype: object

## View and inspect data with .head, .tail, .index, .columns, .info, and .describe

In [27]:
# No row count is passed; df2 has only four rows, so all of them are shown.
df2.head()

Unnamed: 0,A,B,C,D,E,F
0,1.0,2020-01-01,1.0,3,test,foo
1,1.0,2020-01-02,1.0,3,train,foo
2,1.0,2020-01-03,1.0,3,test,foo
3,1.0,2020-01-04,1.0,3,train,foo


In [28]:
# Show only the last two rows (index labels 2 and 3).
df2.tail(2)

Unnamed: 0,A,B,C,D,E,F
2,1.0,2020-01-03,1.0,3,test,foo
3,1.0,2020-01-04,1.0,3,train,foo


In [29]:
df2.index

Int64Index([0, 1, 2, 3], dtype='int64')

In [30]:
df2.columns

Index(['A', 'B', 'C', 'D', 'E', 'F'], dtype='object')

In [31]:
# Only columns A, C and D appear in the summary below; B, E and F are not
# included in this default output.
df2.describe()

Unnamed: 0,A,C,D
count,4.0,4.0,4.0
mean,1.0,1.0,3.0
std,0.0,0.0,0.0
min,1.0,1.0,3.0
25%,1.0,1.0,3.0
50%,1.0,1.0,3.0
75%,1.0,1.0,3.0
max,1.0,1.0,3.0


In [32]:
df2.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 4 entries, 0 to 3
Data columns (total 6 columns):
 #   Column  Non-Null Count  Dtype         
---  ------  --------------  -----         
 0   A       4 non-null      float64       
 1   B       4 non-null      datetime64[ns]
 2   C       4 non-null      float32       
 3   D       4 non-null      int32         
 4   E       4 non-null      category      
 5   F       4 non-null      object        
dtypes: category(1), datetime64[ns](1), float32(1), float64(1), int32(1), object(1)
memory usage: 288.0+ bytes


`.to_numpy()` converts all the data in the DataFrame to a single NumPy array. An ndarray has one data type for the whole array, whereas a DataFrame only needs a single data type within each column, so type conversion will take place as needed. Additionally, `.to_numpy()` does not preserve the column labels or the index.

In [33]:
# df2's columns have different dtypes, so the values end up in one
# object-dtype array (see dtype=object in the output below).
nparray = df2.to_numpy()
nparray

array([[1.0, Timestamp('2020-01-01 00:00:00'), 1.0, 3, 'test', 'foo'],
       [1.0, Timestamp('2020-01-02 00:00:00'), 1.0, 3, 'train', 'foo'],
       [1.0, Timestamp('2020-01-03 00:00:00'), 1.0, 3, 'test', 'foo'],
       [1.0, Timestamp('2020-01-04 00:00:00'), 1.0, 3, 'train', 'foo']],
      dtype=object)

## Transposing and sorting

- .T = transposes rows and columns
- .sort_values = sorts the DataFrame by the column(s) given in the `by` argument (ascending by default; pass `ascending=False` for descending order)

In [34]:
df2.T

Unnamed: 0,0,1,2,3
A,1.0,1.0,1.0,1.0
B,2020-01-01 00:00:00,2020-01-02 00:00:00,2020-01-03 00:00:00,2020-01-04 00:00:00
C,1.0,1.0,1.0,1.0
D,3,3,3,3
E,test,train,test,train
F,foo,foo,foo,foo


In [35]:
# Reorders the rows by column E; each row keeps its original index label
# (0, 2, 1, 3 in the output below).
df2.sort_values(by="E")

Unnamed: 0,A,B,C,D,E,F
0,1.0,2020-01-01,1.0,3,test,foo
2,1.0,2020-01-03,1.0,3,test,foo
1,1.0,2020-01-02,1.0,3,train,foo
3,1.0,2020-01-04,1.0,3,train,foo


In [37]:
# Pass a list of column names to sort by several columns: rows are ordered
# by E first, then by B among rows with the same E value.
df2.sort_values(by=["E", "B"])

Unnamed: 0,A,B,C,D,E,F
0,1.0,2020-01-01,1.0,3,test,foo
2,1.0,2020-01-03,1.0,3,test,foo
1,1.0,2020-01-02,1.0,3,train,foo
3,1.0,2020-01-04,1.0,3,train,foo
