# Pandas
- pandas is built on top of NumPy
- pandas provides labeled data structures that make it easier to analyze and manipulate datasets

## How to generate data using Pandas

In [1]:
import numpy as np
import pandas as pd

### Series
**pd.Series(sequence data)**

In [2]:
# Build a Series from a plain Python list; pandas assigns a default 0..4 integer index.
a1 = pd.Series(data=[10, 20, 30, 40, 50])

In [3]:
a1

0    10
1    20
2    30
3    40
4    50
dtype: int64

![image.png](attachment:image.png)

![image.png](attachment:image.png)

In [4]:
a1.index

RangeIndex(start=0, stop=5, step=1)

In [5]:
a1.values

array([10, 20, 30, 40, 50], dtype=int64)

In [6]:
# Mixing strings and integers in one Series makes pandas store it with the generic object dtype.
a2 = pd.Series(data=["a", "b", "c", 1, 2, 3])

In [7]:
a2

0    a
1    b
2    c
3    1
4    2
5    3
dtype: object

![image.png](attachment:image.png)

In [8]:
a2.index

RangeIndex(start=0, stop=6, step=1)

In [9]:
a2.values

array(['a', 'b', 'c', 1, 2, 3], dtype=object)

## Missing values

In [10]:
# np.nan marks a missing entry. NaN is a float, so the integer values 10 and 30
# are stored as 10.0 and 30.0 and the Series dtype becomes float64.
s3 = pd.Series([np.nan, 10, 30])

In [11]:
s3

0     NaN
1    10.0
2    30.0
dtype: float64

![image.png](attachment:image.png)

## DataFrame (2-dimensional data)
**df = pd.DataFrame(data, index=index_data, columns=columns_data)**  
**index, column, values**

In [12]:
# Three rows of three integers; with no labels supplied, rows and columns are both numbered 0..2.
df = pd.DataFrame(
    data=[
        [1, 2, 3],
        [4, 5, 6],
        [7, 8, 9],
    ]
)

In [13]:
df.values

array([[1, 2, 3],
       [4, 5, 6],
       [7, 8, 9]], dtype=int64)

![image.png](attachment:image.png)

![image.png](attachment:image.png)

In [14]:
df.index

RangeIndex(start=0, stop=3, step=1)

In [15]:
df.columns

RangeIndex(start=0, stop=3, step=1)

In [16]:
pd.DataFrame(np.arange(0,9).reshape(3,3))

Unnamed: 0,0,1,2
0,0,1,2
1,3,4,5
2,6,7,8


In [17]:
# ndarray: the integers 0..8 arranged as a 3x3 NumPy array (no row or column labels)
np.arange(9).reshape(3,3)

array([[0, 1, 2],
       [3, 4, 5],
       [6, 7, 8]])

In [18]:
type(np.arange(9).reshape(3,3))

numpy.ndarray

In [19]:
# pandas: a 3x3 array of the integers 0..8 wrapped in a DataFrame,
# which adds default 0..2 labels for both rows and columns
pd.DataFrame(np.arange(9).reshape(3,3))

Unnamed: 0,0,1,2
0,0,1,2
1,3,4,5
2,6,7,8


In [20]:
type(pd.DataFrame(np.arange(9).reshape(3,3)))

pandas.core.frame.DataFrame

## change the name of index and columns

In [21]:
# index= supplies the row labels and columns= supplies the column labels,
# replacing the default 0..2 numbering
pd.DataFrame(np.arange(9).reshape(3,3), index = ['a','b','c'], columns = ['x','y','z'])

Unnamed: 0,x,y,z
a,0,1,2
b,3,4,5
c,6,7,8


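- A list or an ndarray can be converted to a pandas DataFrame (as shown above)
- A dictionary can also be converted to a pandas DataFrame: each key becomes a column name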

In [22]:
# Column-oriented data: each key is a column name and its list holds that column's values.
table = {
    "year": [2015, 2016, 2017, 2018],
    "sales": [200, 250, 450, 300],
}

In [23]:
type(table)

dict

In [24]:
pd.DataFrame(table)

Unnamed: 0,year,sales
0,2015,200
1,2016,250
2,2017,450
3,2018,300


![image.png](attachment:image.png)

## Arithmetic operations

In [25]:
# Two equal-length Series that share the default 0..4 index,
# so arithmetic between them pairs the values position by position.
s1 = pd.Series(data=[1, 2, 3, 4, 5])
s2 = pd.Series(data=[10, 20, 30, 40, 50])

In [26]:
s1+s2

0    11
1    22
2    33
3    44
4    55
dtype: int64

In [27]:
s1-s2

0    -9
1   -18
2   -27
3   -36
4   -45
dtype: int64

In [28]:
# Element-wise division; the result is float64 even though both inputs hold integers
s1/s2

0    0.1
1    0.1
2    0.1
3    0.1
4    0.1
dtype: float64

In [29]:
# Three columns (A, B, C) with three values each.
table_2 = {
    "A": [1, 2, 3],
    "B": [10, 20, 30],
    "C": [100, 200, 300],
}

In [30]:
pd.DataFrame(table_2)

Unnamed: 0,A,B,C
0,1,10,100
1,2,20,200
2,3,30,300


In [31]:
# Same column names as table_2 (A, B, C) but only two values per column.
table_3 = {
    "A": [6, 7],
    "B": [60, 70],
    "C": [600, 700],
}

In [32]:
pd.DataFrame(table_3)

Unnamed: 0,A,B,C
0,6,60,600
1,7,70,700


In [33]:
# Wrap both dictionaries as DataFrames so they can be combined with arithmetic operators.
df2 = pd.DataFrame(data=table_2)
df3 = pd.DataFrame(data=table_3)

In [34]:
df2.shape

(3, 3)

In [35]:
df3.shape

(2, 3)

In [36]:
# Addition aligns on row and column labels: row 2 exists only in df2, so that row
# becomes NaN and the whole result is shown as floats
df2+df3

Unnamed: 0,A,B,C
0,7.0,70.0,700.0
1,9.0,90.0,900.0
2,,,


In [37]:
df2-df3

Unnamed: 0,A,B,C
0,-5.0,-50.0,-500.0
1,-5.0,-50.0,-500.0
2,,,


In [38]:
df2*df3

Unnamed: 0,A,B,C
0,6.0,600.0,60000.0
1,14.0,1400.0,140000.0
2,,,


In [39]:
df2/df3

Unnamed: 0,A,B,C
0,0.166667,0.166667,0.166667
1,0.285714,0.285714,0.285714
2,,,


## descriptive statistics

In [40]:
# One column per season, four values per column.
table_4 = {
    "Spring": [256, 264, 215, 312],
    "Summer": [770, 567, 599, 387],
    "Fall": [363, 231, 293, 247],
    "Winter": [139, 59, 76, 109],
}

In [41]:
pd.DataFrame(table_4, index=['2012','2013','2014','2015'], columns=['Spring','Summer','Fall','Winter'])

Unnamed: 0,Spring,Summer,Fall,Winter
2012,256,770,363,139
2013,264,567,231,59
2014,215,599,293,76
2015,312,387,247,109


In [42]:
# Keep the seasonal table in df_5, labelling the four rows with the years (as strings)
# and listing the columns in season order.
df_5 = pd.DataFrame(
    data=table_4,
    index=["2012", "2013", "2014", "2015"],
    columns=["Spring", "Summer", "Fall", "Winter"],
)

### sum for each column

In [43]:
df_5.sum()

Spring    1047
Summer    2323
Fall      1134
Winter     383
dtype: int64

### sum for each row (index label)

In [44]:
# axis=1 adds across the columns, giving one total per row (one per year label)
df_5.sum(axis=1)

2012    1528
2013    1121
2014    1183
2015    1055
dtype: int64

### mean for each column

In [45]:
df_5.mean()

Spring    261.75
Summer    580.75
Fall      283.50
Winter     95.75
dtype: float64

### mean for each row (index label)

In [46]:
df_5.mean(axis=1)

2012    382.00
2013    280.25
2014    295.75
2015    263.75
dtype: float64

### Standard deviation for each column

In [47]:
df_5.std()

Spring     39.785885
Summer    156.925407
Fall       59.157981
Winter     35.528158
dtype: float64

### Standard deviation for each row (index label)

In [48]:
df_5.std(axis=1)

2012    274.365936
2013    211.239793
2014    221.192789
2015    117.964331
dtype: float64

### max/min for each column

In [49]:
df_5.mean()

Spring    261.75
Summer    580.75
Fall      283.50
Winter     95.75
dtype: float64

In [50]:
df_5.max()

Spring    312
Summer    770
Fall      363
Winter    139
dtype: int64

## Summary statistics with `describe()`

In [51]:
df_5.describe()

Unnamed: 0,Spring,Summer,Fall,Winter
count,4.0,4.0,4.0,4.0
mean,261.75,580.75,283.5,95.75
std,39.785885,156.925407,59.157981,35.528158
min,215.0,387.0,231.0,59.0
25%,245.75,522.0,243.0,71.75
50%,260.0,583.0,270.0,92.5
75%,276.0,641.75,310.5,116.5
max,312.0,770.0,363.0,139.0


In [52]:
type(df_5.describe())

pandas.core.frame.DataFrame

In [53]:
df_5.describe().shape

(8, 4)