# Series

## Making series

In [1]:
import pandas as pd

# Build a Series from a dict: the keys become the index labels and the
# values become the data.
dict_data = dict(a=1, b=2, c=3)
series = pd.Series(data=dict_data)

print(series)

a    1
b    2
c    3
dtype: int64


In [2]:
# Check the Python type of the object returned by pd.Series(...).
type(series)

pandas.core.series.Series

In [3]:
# The index holds the labels of the Series (here, the keys of the source dict).
series.index

Index(['a', 'b', 'c'], dtype='object')

In [4]:
# The data itself, exposed as a NumPy array (labels are not included).
series.values

array([1, 2, 3], dtype=int64)

딕셔너리가 아닌 리스트를 통해 시리즈를 만들 수도 있다.

In [5]:
# A plain list carries no labels, so pandas assigns the default 0..n-1 index.
list_data = list('abc')
series_2 = pd.Series(data=list_data)

print(series_2)

0    a
1    b
2    c
dtype: object


In [6]:
# Same values as series_2, but with explicit labels replacing the default index.
series_3 = pd.Series(
    data=list_data,
    index=[f'index{n}' for n in range(1, 4)],
)
print(series_3)

index1    a
index2    b
index3    c
dtype: object


## Select from a series

In [7]:
# Country -> capital lookup table; the country names become the index labels.
capital = pd.Series(dict(
    Korea='Seoul',
    Japan='Tokyo',
    China='Beijing',
    India='New Delhi',
    Taiwan='Taipei',
    Singapore='Singapore',
))

print(capital)

Korea            Seoul
Japan            Tokyo
China          Beijing
India        New Delhi
Taiwan          Taipei
Singapore    Singapore
dtype: object


In [9]:
# Look up a single value by its index label.
capital['Korea']

'Seoul'

In [12]:
# Passing a list of labels returns a Series containing just those entries.
capital[['Korea', 'Taiwan']]

Korea      Seoul
Taiwan    Taipei
dtype: object

In [13]:
# Select by integer position. Use .iloc explicitly: with a string index, a bare
# capital[0] relies on the positional fallback of Series.__getitem__, which is
# deprecated (FutureWarning since pandas 2.1).
capital.iloc[0]

'Seoul'

In [16]:
# Select several entries by integer position. .iloc is required for positional
# access on a string-labelled Series; capital[[0, 3]] uses the deprecated
# positional fallback (FutureWarning since pandas 2.1).
capital.iloc[[0, 3]]

Korea        Seoul
India    New Delhi
dtype: object

In [19]:
# Positional slice of the first three entries (the stop position is excluded).
capital.iloc[0:3]

Korea      Seoul
Japan      Tokyo
China    Beijing
dtype: object

## Calculating with Series

In [20]:
# Two Series that share the default 0..2 index, so addition pairs them up
# element by element.
series_1 = pd.Series(data=[1, 2, 3])
series_2 = pd.Series(data=[4, 5, 6])

series_1.add(series_2)

0    5
1    7
2    9
dtype: int64

In [22]:
# Arithmetic with a scalar is applied to every element of the Series.
series_1 * 2

0    2
1    4
2    6
dtype: int64

# DataFrame

## Making DataFrames

In [24]:
# Dict of lists -> DataFrame: each key becomes a column name and each list
# becomes that column's values.
dict_data = dict(
    col1=[1, 2, 3],
    col2=[4, 5, 6],
    col3=[7, 8, 9],
)
df = pd.DataFrame(data=dict_data)

df

Unnamed: 0,col1,col2,col3
0,1,4,7
1,2,5,8
2,3,6,9


In [25]:
# Check the Python type of the object returned by pd.DataFrame(...).
type(df)

pandas.core.frame.DataFrame

In [26]:
# List of lists -> DataFrame: each inner list is one ROW; with no labels given,
# both the rows and the columns get the default 0..n-1 labels.
df2 = pd.DataFrame(data=[
    [1, 2, 3],
    [4, 5, 6],
    [7, 8, 9],
])

df2

Unnamed: 0,0,1,2
0,1,2,3
1,4,5,6
2,7,8,9


In [28]:
# Same row data as df2, now with explicit row labels and column names.
df3 = pd.DataFrame(
    data=[
        [1, 2, 3],
        [4, 5, 6],
        [7, 8, 9],
    ],
    index=['index1', 'index2', 'index3'],
    columns=['col1', 'col2', 'col3'],
)
df3

Unnamed: 0,col1,col2,col3
index1,1,2,3
index2,4,5,6
index3,7,8,9


In [29]:
# Delete a row and a column from the DataFrame.
# drop() accepts index= and columns= together, so one call removes both.
# The result is re-bound to df3 instead of using inplace=True, which pandas
# discourages (no performance benefit and it prevents method chaining).
df3 = df3.drop(index='index3', columns='col1')

df3

Unnamed: 0,col2,col3
index1,2,3
index2,5,6


## Select rows and columns

In [31]:
# 4x4 example frame for the selection demos: one list per column, plus
# explicit row labels.
dict_data = dict(
    col1=[1, 2, 3, 4],
    col2=[5, 6, 7, 8],
    col3=[9, 10, 11, 12],
    col4=[13, 14, 15, 16],
)
df = pd.DataFrame(
    data=dict_data,
    index=['index1', 'index2', 'index3', 'index4'],
)

df

Unnamed: 0,col1,col2,col3,col4
index1,1,5,9,13
index2,2,6,10,14
index3,3,7,11,15
index4,4,8,12,16


In [32]:
# Bracket selection with a single column name returns that column as a Series.
df['col1']

index1    1
index2    2
index3    3
index4    4
Name: col1, dtype: int64

In [33]:
# The same column selected through attribute access; the result is identical to df['col1'].
df.col1

index1    1
index2    2
index3    3
index4    4
Name: col1, dtype: int64

In [34]:
# Passing a LIST of column names (note the double brackets) keeps the result
# a DataFrame, even when the list has only one name.
df[['col1']]

Unnamed: 0,col1
index1,1
index2,2
index3,3
index4,4


In [35]:
# Select several columns at once; the result is a DataFrame.
df[['col1', 'col2']]

Unnamed: 0,col1,col2
index1,1,5
index2,2,6
index3,3,7
index4,4,8


In [36]:
# Row selection:
#   DataFrame.loc['row label']        -> select by index label
#   DataFrame.iloc[integer position]  -> select by 0-based position

# A single label returns the row as a Series whose index is the column names.
df.loc['index1']

col1     1
col2     5
col3     9
col4    13
Name: index1, dtype: int64

In [37]:
# Position 0 is the first row, so this returns the same row as df.loc['index1'].
df.iloc[0]

col1     1
col2     5
col3     9
col4    13
Name: index1, dtype: int64

In [38]:
# Wrapping the label in a list returns a one-row DataFrame instead of a Series.
df.loc[['index1']]

Unnamed: 0,col1,col2,col3,col4
index1,1,5,9,13


In [39]:
# A scalar position returns the row as a Series (compare with the list form df.iloc[[0]]).
df.iloc[0]

col1     1
col2     5
col3     9
col4    13
Name: index1, dtype: int64

In [40]:
# A list of positions returns a DataFrame, here with a single row.
df.iloc[[0]]

Unnamed: 0,col1,col2,col3,col4
index1,1,5,9,13


In [42]:
# Label slices with .loc include BOTH endpoints: 'index3' is part of the result.
df.loc['index1':'index3']

Unnamed: 0,col1,col2,col3,col4
index1,1,5,9,13
index2,2,6,10,14
index3,3,7,11,15


In [43]:
# Position slices with .iloc exclude the stop position: rows 0 and 1 only.
df.iloc[0:2]

Unnamed: 0,col1,col2,col3,col4
index1,1,5,9,13
index2,2,6,10,14


In [46]:
# .loc[row label, column label] returns the single value at that cell.
df.loc['index1', 'col1']

1

In [47]:
# Lists of row labels and column labels select the matching sub-table.
df.loc[['index1', 'index3'], ['col1', 'col4']]

Unnamed: 0,col1,col4
index1,1,13
index3,3,15


In [48]:
# Label slices work on both axes, and both are endpoint-inclusive
# ('index2' and 'col3' are part of the result).
df.loc['index1':'index2', 'col1':'col3']

Unnamed: 0,col1,col2,col3
index1,1,5,9
index2,2,6,10


In [49]:
# Positional equivalent of df.loc[['index1', 'index3'], ['col1', 'col4']].
df.iloc[[0, 2], [0, 3]]

Unnamed: 0,col1,col4
index1,1,13
index3,3,15


# Explore and analyze data

In [50]:
import seaborn as sns

# Load the 'titanic' example dataset through seaborn. Note that this re-binds
# `df`, replacing the small demo frame used in the selection examples.
df = sns.load_dataset('titanic')
# Preview the first five rows.
df.head()

Unnamed: 0,survived,pclass,sex,age,sibsp,parch,fare,embarked,class,who,adult_male,deck,embark_town,alive,alone
0,0,3,male,22.0,1,0,7.25,S,Third,man,True,,Southampton,no,False
1,1,1,female,38.0,1,0,71.2833,C,First,woman,False,C,Cherbourg,yes,False
2,1,3,female,26.0,0,0,7.925,S,Third,woman,False,,Southampton,yes,True
3,1,1,female,35.0,1,0,53.1,S,First,woman,False,C,Southampton,yes,False
4,0,3,male,35.0,0,0,8.05,S,Third,man,True,,Southampton,no,True


In [51]:
# Preview the last five rows.
df.tail()

Unnamed: 0,survived,pclass,sex,age,sibsp,parch,fare,embarked,class,who,adult_male,deck,embark_town,alive,alone
886,0,2,male,27.0,0,0,13.0,S,Second,man,True,,Southampton,no,True
887,1,1,female,19.0,0,0,30.0,S,First,woman,False,B,Southampton,yes,True
888,0,3,female,,1,2,23.45,S,Third,woman,False,,Southampton,no,False
889,1,1,male,26.0,0,0,30.0,C,First,man,True,C,Cherbourg,yes,True
890,0,3,male,32.0,0,0,7.75,Q,Third,man,True,,Queenstown,no,True


In [52]:
# (number of rows, number of columns)
df.shape

(891, 15)

In [53]:
# Per-column summary: non-null count and dtype, plus overall memory usage.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 15 columns):
 #   Column       Non-Null Count  Dtype   
---  ------       --------------  -----   
 0   survived     891 non-null    int64   
 1   pclass       891 non-null    int64   
 2   sex          891 non-null    object  
 3   age          714 non-null    float64 
 4   sibsp        891 non-null    int64   
 5   parch        891 non-null    int64   
 6   fare         891 non-null    float64 
 7   embarked     889 non-null    object  
 8   class        891 non-null    category
 9   who          891 non-null    object  
 10  adult_male   891 non-null    bool    
 11  deck         203 non-null    category
 12  embark_town  889 non-null    object  
 13  alive        891 non-null    object  
 14  alone        891 non-null    bool    
dtypes: bool(2), category(2), float64(2), int64(4), object(5)
memory usage: 80.7+ KB


In [54]:
# Frequency of each distinct value in the column, most frequent first.
df['sex'].value_counts()

sex
male      577
female    314
Name: count, dtype: int64

In [55]:
# On a DataFrame, value_counts() counts each distinct (sex, survived) combination.
df[['sex', 'survived']].value_counts()

sex     survived
male    0           468
female  1           233
male    1           109
female  0            81
Name: count, dtype: int64

In [57]:
# normalize=True returns each combination's share of all rows instead of the
# raw count; sort_index() orders the result by (sex, survived) rather than by size.
df[['sex', 'survived']].value_counts(normalize=True).sort_index()

sex     survived
female  0           0.090909
        1           0.261504
male    0           0.525253
        1           0.122334
Name: proportion, dtype: float64

In [60]:
# 'survived' is coded 0/1, so its mean is the overall survival rate.
df['survived'].mean()

0.3838383838383838

In [62]:
# Column-wise mean of several columns at once; the result is a Series keyed by column name.
df[['survived', 'age']].mean()

survived     0.383838
age         29.699118
dtype: float64

In [63]:
# Smallest fare in the dataset.
df['fare'].min()

0.0

In [64]:
# Largest fare in the dataset.
df['fare'].max()

512.3292

In [65]:
# Average fare.
df['fare'].mean()

32.204207968574636

In [66]:
# Median fare; it is well below the mean, so the fare distribution is right-skewed.
df['fare'].median()

14.4542

# Manage missing data

In [67]:
# Use the non-null counts to spot missing data: any column with fewer than
# 891 non-null entries (age, embarked, deck, embark_town) has missing values.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 15 columns):
 #   Column       Non-Null Count  Dtype   
---  ------       --------------  -----   
 0   survived     891 non-null    int64   
 1   pclass       891 non-null    int64   
 2   sex          891 non-null    object  
 3   age          714 non-null    float64 
 4   sibsp        891 non-null    int64   
 5   parch        891 non-null    int64   
 6   fare         891 non-null    float64 
 7   embarked     889 non-null    object  
 8   class        891 non-null    category
 9   who          891 non-null    object  
 10  adult_male   891 non-null    bool    
 11  deck         203 non-null    category
 12  embark_town  889 non-null    object  
 13  alive        891 non-null    object  
 14  alone        891 non-null    bool    
dtypes: bool(2), category(2), float64(2), int64(4), object(5)
memory usage: 80.7+ KB


In [68]:
# isnull() marks every cell True where the value is missing (first five rows only).
df.head().isnull()

Unnamed: 0,survived,pclass,sex,age,sibsp,parch,fare,embarked,class,who,adult_male,deck,embark_town,alive,alone
0,False,False,False,False,False,False,False,False,False,False,False,True,False,False,False
1,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False
2,False,False,False,False,False,False,False,False,False,False,False,True,False,False,False
3,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False
4,False,False,False,False,False,False,False,False,False,False,False,True,False,False,False


## Drop the missing values

In [69]:
# Drop every row that contains at least one missing value.
# dropna() returns a new DataFrame; df itself is not modified.
df.dropna()

Unnamed: 0,survived,pclass,sex,age,sibsp,parch,fare,embarked,class,who,adult_male,deck,embark_town,alive,alone
1,1,1,female,38.0,1,0,71.2833,C,First,woman,False,C,Cherbourg,yes,False
3,1,1,female,35.0,1,0,53.1000,S,First,woman,False,C,Southampton,yes,False
6,0,1,male,54.0,0,0,51.8625,S,First,man,True,E,Southampton,no,True
10,1,3,female,4.0,1,1,16.7000,S,Third,child,False,G,Southampton,yes,False
11,1,1,female,58.0,0,0,26.5500,S,First,woman,False,C,Southampton,yes,True
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
871,1,1,female,47.0,1,1,52.5542,S,First,woman,False,D,Southampton,yes,False
872,0,1,male,33.0,0,0,5.0000,S,First,man,True,B,Southampton,no,True
879,1,1,female,56.0,0,1,83.1583,C,First,woman,False,C,Cherbourg,yes,False
887,1,1,female,19.0,0,0,30.0000,S,First,woman,False,B,Southampton,yes,True


In [70]:
# With subset=[...], only the listed columns are checked: a row is dropped
# when it has a missing value in one of those columns.
# axis=0 means the operation removes rows.
df.dropna(subset= ['age'], axis=0)

Unnamed: 0,survived,pclass,sex,age,sibsp,parch,fare,embarked,class,who,adult_male,deck,embark_town,alive,alone
0,0,3,male,22.0,1,0,7.2500,S,Third,man,True,,Southampton,no,False
1,1,1,female,38.0,1,0,71.2833,C,First,woman,False,C,Cherbourg,yes,False
2,1,3,female,26.0,0,0,7.9250,S,Third,woman,False,,Southampton,yes,True
3,1,1,female,35.0,1,0,53.1000,S,First,woman,False,C,Southampton,yes,False
4,0,3,male,35.0,0,0,8.0500,S,Third,man,True,,Southampton,no,True
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
885,0,3,female,39.0,0,5,29.1250,Q,Third,woman,False,,Queenstown,no,False
886,0,2,male,27.0,0,0,13.0000,S,Second,man,True,,Southampton,no,True
887,1,1,female,19.0,0,0,30.0000,S,First,woman,False,B,Southampton,yes,True
889,1,1,male,26.0,0,0,30.0000,C,First,man,True,C,Cherbourg,yes,True


In [71]:
# axis=1: drop every COLUMN that contains at least one missing value.
df.dropna(axis=1)

Unnamed: 0,survived,pclass,sex,sibsp,parch,fare,class,who,adult_male,alive,alone
0,0,3,male,1,0,7.2500,Third,man,True,no,False
1,1,1,female,1,0,71.2833,First,woman,False,yes,False
2,1,3,female,0,0,7.9250,Third,woman,False,yes,True
3,1,1,female,1,0,53.1000,First,woman,False,yes,False
4,0,3,male,0,0,8.0500,Third,man,True,no,True
...,...,...,...,...,...,...,...,...,...,...,...
886,0,2,male,0,0,13.0000,Second,man,True,no,True
887,1,1,female,0,0,30.0000,First,woman,False,yes,True
888,0,3,female,1,2,23.4500,Third,woman,False,no,False
889,1,1,male,0,0,30.0000,First,man,True,yes,True


In [72]:
# thresh=300: keep only the columns that have AT LEAST 300 non-missing values.
# thresh counts non-null values, not missing ones, so 'deck' (203 non-null)
# is dropped while 'age' (714 non-null) is kept.
df.dropna(axis=1, thresh=300)

Unnamed: 0,survived,pclass,sex,age,sibsp,parch,fare,embarked,class,who,adult_male,embark_town,alive,alone
0,0,3,male,22.0,1,0,7.2500,S,Third,man,True,Southampton,no,False
1,1,1,female,38.0,1,0,71.2833,C,First,woman,False,Cherbourg,yes,False
2,1,3,female,26.0,0,0,7.9250,S,Third,woman,False,Southampton,yes,True
3,1,1,female,35.0,1,0,53.1000,S,First,woman,False,Southampton,yes,False
4,0,3,male,35.0,0,0,8.0500,S,Third,man,True,Southampton,no,True
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
886,0,2,male,27.0,0,0,13.0000,S,Second,man,True,Southampton,no,True
887,1,1,female,19.0,0,0,30.0000,S,First,woman,False,Southampton,yes,True
888,0,3,female,,1,2,23.4500,S,Third,woman,False,Southampton,no,False
889,1,1,male,26.0,0,0,30.0000,C,First,man,True,Cherbourg,yes,True


## Replace the missing values

In [74]:
# Work on a copy so that filling values does not alter the original df.
df_2 = df.copy()
# Six rows are shown because the row at index 5 is the first with a missing age.
df_2.head(6)

Unnamed: 0,survived,pclass,sex,age,sibsp,parch,fare,embarked,class,who,adult_male,deck,embark_town,alive,alone
0,0,3,male,22.0,1,0,7.25,S,Third,man,True,,Southampton,no,False
1,1,1,female,38.0,1,0,71.2833,C,First,woman,False,C,Cherbourg,yes,False
2,1,3,female,26.0,0,0,7.925,S,Third,woman,False,,Southampton,yes,True
3,1,1,female,35.0,1,0,53.1,S,First,woman,False,C,Southampton,yes,False
4,0,3,male,35.0,0,0,8.05,S,Third,man,True,,Southampton,no,True
5,0,3,male,,0,0,8.4583,Q,Third,man,True,,Queenstown,no,True


In [75]:
# Mean of the 'age' column, used below as the replacement for missing ages.
mean_age = df_2['age'].mean()
print(mean_age)

29.69911764705882


In [76]:
# Replace missing ages with the mean age.
# Assign the filled column back instead of calling fillna(inplace=True) on
# df_2['age']: that is chained assignment through an in-place method, which
# raises a FutureWarning in pandas 2.2 and does not update df_2 at all once
# Copy-on-Write is enabled.
df_2['age'] = df_2['age'].fillna(mean_age)

In [78]:
# Verify the fill: the row at index 5, previously missing, now holds the mean age.
df_2['age'].head(6)

0    22.000000
1    38.000000
2    26.000000
3    35.000000
4    35.000000
5    29.699118
Name: age, dtype: float64

In [79]:
# Fill the missing embarkation towns with the constant 'Southampton'.
# The column is assigned back explicitly; fillna(inplace=True) on a selected
# column is chained assignment and does not update df_2 under Copy-on-Write.
df_2['embark_town'] = df_2['embark_town'].fillna('Southampton')

In [80]:
# Forward fill and backward fill.
# ffill() propagates the last valid value downward into the gaps; bfill()
# pulls the next valid value upward. The dedicated methods are used because
# fillna(method=...) is deprecated since pandas 2.1.
df_2['deck_ffill'] = df_2['deck'].ffill()
df_2['deck_bfill'] = df_2['deck'].bfill()

# Row 0 stays missing under ffill because there is no earlier value to copy.
df_2[['deck', 'deck_ffill', 'deck_bfill']].head(12)

Unnamed: 0,deck,deck_ffill,deck_bfill
0,,,C
1,C,C,C
2,,C,C
3,C,C,C
4,,C,E
5,,C,E
6,E,E,E
7,,E,G
8,,E,G
9,,E,G


# Indexing