# **Pandas Lecture Scrapnote Part 1: Series and DataFrame(1-2)**
___

Documentation: [Pandas](https://pandas.pydata.org/docs/)

___

## **1. Series**

In [2]:
import numpy as np
import pandas as pd

In [5]:
# The same three values in three containers: plain list, NumPy array, and dict
my_labels = ["a", "b", "c"]
my_data = [100, 200, 300]
arr = np.array(my_data)
d = dict(zip(my_labels, my_data))  # {'a': 100, 'b': 200, 'c': 300}

In [7]:
pd.Series(data = my_data)

0    100
1    200
2    300
dtype: int64

In [9]:
pd.Series(data=my_data,index=my_labels) # positional order is pd.Series(data, index), so the keywords are optional

a    100
b    200
c    300
dtype: int64

In [11]:
# Same call with positional arguments and a NumPy array as data.
# The Series keeps the array's dtype; the int32 shown below (vs. int64 for the list input)
# presumably reflects NumPy's default integer on this platform -- confirm on your machine.
pd.Series(arr,my_labels)

a    100
b    200
c    300
dtype: int32

In [12]:
pd.Series(d)

a    100
b    200
c    300
dtype: int64

The key to using a Series is understanding its index. Example:

In [16]:
# Series with string labels as its index
ser1 = pd.Series(
    data=[1, 2, 3, 4],
    index=['USA', 'Indonesia', 'Japan', 'England'],
)
ser1

USA          1
Indonesia    2
Japan        3
England      4
dtype: int64

In [20]:
ser1['Japan']

3

In [18]:
# Series with integer labels as its index (the labels are 12-15, not positions)
ser2 = pd.Series(
    data=[1, 2, 3, 4],
    index=[12, 13, 14, 15],
)
ser2

12    1
13    2
14    3
15    4
dtype: int64

In [21]:
ser2[13]

2

In [22]:
# Second Series with the same string labels as ser1, used for the addition example
ser3 = pd.Series(
    data=[10, 12, 13, 14],
    index=['USA', 'Indonesia', 'Japan', 'England'],
)
ser3

USA          10
Indonesia    12
Japan        13
England      14
dtype: int64

In [23]:
# Series arithmetic aligns on index labels: values that share a label are added together
ser4 = ser1+ser3

In [24]:
ser4

USA          11
Indonesia    14
Japan        16
England      18
dtype: int64

In [25]:
# ser1 and ser2 share no index labels, so the result holds the union of both indexes
# with every value NaN (and the dtype becomes float64 to represent NaN)
ser1+ser2

12          NaN
13          NaN
14          NaN
15          NaN
England     NaN
Indonesia   NaN
Japan       NaN
USA         NaN
dtype: float64

## **2. DataFrames Part 1 - 2**

In [36]:
# randn is imported only to generate sample data for the demonstration;
# it is a NumPy function and not part of the pandas DataFrame API

from numpy.random import randn

np.random.seed(101)  # fix the seed so the random values shown below are reproducible

In [37]:
# 5x4 frame of random samples: rows labelled A-E, columns labelled W-Z
df = pd.DataFrame(
    data=randn(5, 4),
    index=['A', 'B', 'C', 'D', 'E'],
    columns=['W', 'X', 'Y', 'Z'],
)

In [38]:
df

Unnamed: 0,W,X,Y,Z
A,2.70685,0.628133,0.907969,0.503826
B,0.651118,-0.319318,-0.848077,0.605965
C,-2.018168,0.740122,0.528813,-0.589001
D,0.188695,-0.758872,-0.933237,0.955057
E,0.190794,1.978757,2.605967,0.683509


### Grab Column(s)

##### Most Common Way

In [30]:
df['W']

A    2.706850
B    0.651118
C   -2.018168
D    0.188695
E    0.190794
Name: W, dtype: float64

In [39]:
type(df['W'])  # confirming the type: series

pandas.core.series.Series

##### Alternative Way (not recommended: attribute access can be confused with, or shadowed by, DataFrame methods and attributes)

In [41]:
df.W

A    2.706850
B    0.651118
C   -2.018168
D    0.188695
E    0.190794
Name: W, dtype: float64

##### **Grab Multiple Columns**

In [43]:
df[['W','X']] # pass a list of column names; the result is a DataFrame

Unnamed: 0,W,X
A,2.70685,0.628133
B,0.651118,-0.319318
C,-2.018168,0.740122
D,0.188695,-0.758872
E,0.190794,1.978757


### Create a New Column

In [44]:
df['new'] = df['W'] + df['Z']  # data value of 'new' column is the summation of data in W and Z columns

In [45]:
df

Unnamed: 0,W,X,Y,Z,new
A,2.70685,0.628133,0.907969,0.503826,3.210676
B,0.651118,-0.319318,-0.848077,0.605965,1.257083
C,-2.018168,0.740122,0.528813,-0.589001,-2.607169
D,0.188695,-0.758872,-0.933237,0.955057,1.143752
E,0.190794,1.978757,2.605967,0.683509,0.874303


### Dropping a Certain Row or Column

Use the `drop()` method. By default it returns a new DataFrame with the row or column removed and leaves the original DataFrame unchanged.

##### ***Column Series***

In [46]:
# Drop the column named 'new' (axis=1 selects columns)

df.drop('new',axis=1) # returns a new DataFrame; df itself is unchanged

Unnamed: 0,W,X,Y,Z
A,2.70685,0.628133,0.907969,0.503826
B,0.651118,-0.319318,-0.848077,0.605965
C,-2.018168,0.740122,0.528813,-0.589001
D,0.188695,-0.758872,-0.933237,0.955057
E,0.190794,1.978757,2.605967,0.683509


In [47]:
df

Unnamed: 0,W,X,Y,Z,new
A,2.70685,0.628133,0.907969,0.503826,3.210676
B,0.651118,-0.319318,-0.848077,0.605965,1.257083
C,-2.018168,0.740122,0.528813,-0.589001,-2.607169
D,0.188695,-0.758872,-0.933237,0.955057,1.143752
E,0.190794,1.978757,2.605967,0.683509,0.874303


In [48]:
# Permanently drop a column: inplace=True modifies df itself and returns None.
# NOTE: re-running this cell raises KeyError because 'new' has already been removed.
# Equivalent without inplace: df = df.drop('new', axis=1)

df.drop('new',axis=1,inplace=True)

In [49]:
df

Unnamed: 0,W,X,Y,Z
A,2.70685,0.628133,0.907969,0.503826
B,0.651118,-0.319318,-0.848077,0.605965
C,-2.018168,0.740122,0.528813,-0.589001
D,0.188695,-0.758872,-0.933237,0.955057
E,0.190794,1.978757,2.605967,0.683509


In [50]:
df.drop('E') # axis defaults to 0 (rows); returns a new DataFrame, df itself is unchanged

Unnamed: 0,W,X,Y,Z
A,2.70685,0.628133,0.907969,0.503826
B,0.651118,-0.319318,-0.848077,0.605965
C,-2.018168,0.740122,0.528813,-0.589001
D,0.188695,-0.758872,-0.933237,0.955057


In [51]:
# Check dataframe dimension

df.shape # (row, column)

(5, 4)

### Locating Data on Certain Row(s) and Column(s)

In [52]:
df

Unnamed: 0,W,X,Y,Z
A,2.70685,0.628133,0.907969,0.503826
B,0.651118,-0.319318,-0.848077,0.605965
C,-2.018168,0.740122,0.528813,-0.589001
D,0.188695,-0.758872,-0.933237,0.955057
E,0.190794,1.978757,2.605967,0.683509


##### ***Row Series***

In [None]:
# label-based selection: .loc looks a row up by its index label

In [55]:
df.loc['C']

W   -2.018168
X    0.740122
Y    0.528813
Z   -0.589001
Name: C, dtype: float64

In [None]:
# position-based selection: .iloc looks a row up by its integer position (0-based)

In [56]:
df.iloc[2]

W   -2.018168
X    0.740122
Y    0.528813
Z   -0.589001
Name: C, dtype: float64

In [61]:
# return the single value at a given (row label, column label)

In [62]:
df.loc['B','Y']

-0.8480769834036315

In [None]:
# return a sub-DataFrame for a list of row labels and a list of column labels

In [59]:
df.loc[['A','B'],['W','Y']]

Unnamed: 0,W,Y
A,2.70685,0.907969
B,0.651118,-0.848077


### Conditional Selection on DataFrame

In [64]:
# Element-wise comparison: a DataFrame of booleans, True where the value is positive
booldf = df > 0

In [65]:
booldf

Unnamed: 0,W,X,Y,Z
A,True,True,True,True
B,True,False,False,True
C,False,True,True,False
D,True,False,False,True
E,True,True,True,True


In [66]:
# Masking with a boolean DataFrame keeps the shape; entries where the mask is False become NaN
df[booldf]

Unnamed: 0,W,X,Y,Z
A,2.70685,0.628133,0.907969,0.503826
B,0.651118,,,0.605965
C,,0.740122,0.528813,
D,0.188695,,,0.955057
E,0.190794,1.978757,2.605967,0.683509


In [67]:
df[df>0]

Unnamed: 0,W,X,Y,Z
A,2.70685,0.628133,0.907969,0.503826
B,0.651118,,,0.605965
C,,0.740122,0.528813,
D,0.188695,,,0.955057
E,0.190794,1.978757,2.605967,0.683509


##### Keep only the row(s) that meet a condition on a selected column

In [69]:
df['W']>0

A     True
B     True
C    False
D     True
E     True
Name: W, dtype: bool

In [74]:
df[df['X']>0]

Unnamed: 0,W,X,Y,Z
A,2.70685,0.628133,0.907969,0.503826
C,-2.018168,0.740122,0.528813,-0.589001
E,0.190794,1.978757,2.605967,0.683509


In [75]:
df[df['Z']<0]

Unnamed: 0,W,X,Y,Z
C,-2.018168,0.740122,0.528813,-0.589001


##### Return certain column(s) from the rows that meet the condition

In [76]:
# Two steps: filter the rows where X > 0, then pick column 'X' from the filtered frame.
# Single-step equivalent: df.loc[df['X']>0, 'X'] (prefer .loc when assigning values)
df[df['X']>0]['X']

A    0.628133
C    0.740122
E    1.978757
Name: X, dtype: float64

In [77]:
# Same row filter, then a list of columns; single-step equivalent: df.loc[df['X']>0, ['X','Z']]
df[df['X']>0][['X','Z']]

Unnamed: 0,X,Z
A,0.628133,0.503826
C,0.740122,-0.589001
E,1.978757,0.683509


###### ***Step-by-Step Breakdown of the Above Method***

In [84]:
# Step 1: build a boolean Series (one True/False per row) from the condition on column W
boolser = df['W']>0
# Step 2: use it as a row mask; only rows where it is True are kept
result = df[boolser]

In [85]:
result

Unnamed: 0,W,X,Y,Z
A,2.70685,0.628133,0.907969,0.503826
B,0.651118,-0.319318,-0.848077,0.605965
D,0.188695,-0.758872,-0.933237,0.955057
E,0.190794,1.978757,2.605967,0.683509


In [86]:
result[['W','Z']]

Unnamed: 0,W,Z
A,2.70685,0.503826
B,0.651118,0.605965
D,0.188695,0.955057
E,0.190794,0.683509


##### Use multiple conditions

Python's `and` / `or` keywords cannot be used to combine boolean Series. Use `&` (and) and `|` (or) instead, and wrap each condition in parentheses.

In [89]:
df[(df['W']>0) & (df['Y']>1)]

Unnamed: 0,W,X,Y,Z
E,0.190794,1.978757,2.605967,0.683509


In [90]:
df[(df['Y']>0) | (df['Z']<0)]

Unnamed: 0,W,X,Y,Z
A,2.70685,0.628133,0.907969,0.503826
C,-2.018168,0.740122,0.528813,-0.589001
E,0.190794,1.978757,2.605967,0.683509


#### Resetting the index and setting it to something else

In [92]:
df

Unnamed: 0,W,X,Y,Z
A,2.70685,0.628133,0.907969,0.503826
B,0.651118,-0.319318,-0.848077,0.605965
C,-2.018168,0.740122,0.528813,-0.589001
D,0.188695,-0.758872,-0.933237,0.955057
E,0.190794,1.978757,2.605967,0.683509


##### Reset Index to a numerical index

In [93]:
df.reset_index()  # returns a new DataFrame whose old index becomes an 'index' column; pass inplace=True to modify df itself

Unnamed: 0,index,W,X,Y,Z
0,A,2.70685,0.628133,0.907969,0.503826
1,B,0.651118,-0.319318,-0.848077,0.605965
2,C,-2.018168,0.740122,0.528813,-0.589001
3,D,0.188695,-0.758872,-0.933237,0.955057
4,E,0.190794,1.978757,2.605967,0.683509


##### Replace index with the new one (from a list)

In [94]:
# One country code per row of df, written out as an explicit list
newind = ['ID', 'JP', 'US', 'UK', 'KR']
newind

['ID', 'JP', 'US', 'UK', 'KR']

###### ***Add the new index values as a column***

In [95]:
df['Country'] = newind

In [96]:
df

Unnamed: 0,W,X,Y,Z,Country
A,2.70685,0.628133,0.907969,0.503826,ID
B,0.651118,-0.319318,-0.848077,0.605965,JP
C,-2.018168,0.740122,0.528813,-0.589001,US
D,0.188695,-0.758872,-0.933237,0.955057,UK
E,0.190794,1.978757,2.605967,0.683509,KR


###### ***Use the newly created column as the DataFrame's index***

In [97]:
# The 'Country' column replaces the current index; the previous A-E labels are not kept in the result
df.set_index('Country')  # returns a new DataFrame; pass inplace=True to set the index on df itself

Unnamed: 0_level_0,W,X,Y,Z
Country,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
ID,2.70685,0.628133,0.907969,0.503826
JP,0.651118,-0.319318,-0.848077,0.605965
US,-2.018168,0.740122,0.528813,-0.589001
UK,0.188695,-0.758872,-0.933237,0.955057
KR,0.190794,1.978757,2.605967,0.683509


***Finished!***