In [2]:
import numpy as np
import pandas as pd

# 1. Series

* A data structure composed of an index and values
* A Series object contains elements of single data type
* Series is the data structure for a single column of a DataFrame

### 1.1 Create Series

In [3]:
# When only values are supplied, pandas assigns a default integer index starting at 0
data = pd.Series(data=np.random.randint(0, 10, size=5))
data

0    7
1    1
2    6
3    3
4    3
dtype: int64

In [9]:
# Supply explicit labels through the index argument
data = pd.Series(
    data=np.random.randint(0, 10, size=5),
    index=["A", "B", "C", "D", "E"],
)
data

A    9
B    1
C    8
D    6
E    2
dtype: int64

In [10]:
# .index holds the labels, .values the underlying NumPy array
data.index, data.values

(Index(['A', 'B', 'C', 'D', 'E'], dtype='object'), array([9, 1, 8, 6, 2]))

### 1.2 Offset Index

* You can use index names to access specific elements in a Series object
* Works very similarly to NumPy arrays

In [11]:
# Look up one element by its label: bracket access and attribute access return the same value here
data["B"], data.B

(1, 1)

In [12]:
# Overwrite the value stored under label "C"
data.loc["C"] = 10
data

A     9
B     1
C    10
D     6
E     2
dtype: int64

In [13]:
# Broadcasting: the scalar is applied to every element and a new Series is returned
# (data itself is left unchanged)
data * 10

A     90
B     10
C    100
D     60
E     20
dtype: int64

In [14]:
# Pass a list of labels to pick several elements at once
data.loc[["B", "E"]]

B    1
E    2
dtype: int64

In [15]:
# Positional slice: every second element, starting at position 2
data.iloc[2::2]

C    10
E     2
dtype: int64

In [16]:
# Positional slice with a negative step: the elements in reverse order
data.iloc[::-1]

E     2
D     6
C    10
B     1
A     9
dtype: int64

### 1.3 Series Operation

* Series operations are applied element-wise between elements of the two Series objects that share the same index label
* The index is preserved as the union of both indexes: for any label that does not exist in BOTH Series objects, the result is NaN

In [17]:
# Current contents of data, shown again for comparison with data2 below
data

A     9
B     1
C    10
D     6
E     2
dtype: int64

In [18]:
# A second Series with labels D, E and F
data2 = pd.Series([3, 5, 7], index=["D", "E", "F"])
data2

D    3
E    5
F    7
dtype: int64

In [19]:
# Addition aligns on labels; a label missing from either operand yields NaN in the result
result = data.add(data2)
result

A    NaN
B    NaN
C    NaN
D    9.0
E    7.0
F    NaN
dtype: float64

# 2. DataFrame

* Comprised of multiple Series
* Values in same columns share identical data types

### 2.1 Creating Dataframes

In [22]:
# Column-wise construction: each dictionary key becomes a column name
# and its list supplies that column's values
datas = dict(
    name=["dss", "fcamp"],
    email=["dss@gmail.com", "fcamp@daum.net"],
)

df = pd.DataFrame(data=datas)
df

Unnamed: 0,name,email
0,dss,dss@gmail.com
1,fcamp,fcamp@daum.net


In [24]:
# Row-wise construction from a list of dictionaries:
# each dictionary becomes one row and its keys become the column names
datas = [
    dict(name="dss", email="dss@gmail.com"),
    dict(name="fcamp", email="fcamp@daum.net"),
]

df = pd.DataFrame(data=datas)
df

Unnamed: 0,name,email
0,dss,dss@gmail.com
1,fcamp,fcamp@daum.net


### 2.2 Adding Index

In [25]:
# Give the rows explicit labels through the index argument
df = pd.DataFrame(data=datas, index=["one", "two"])
df

Unnamed: 0,name,email
one,dss,dss@gmail.com
two,fcamp,fcamp@daum.net


### 2.3 Handling DataFrames

In [26]:
# Row labels of the frame
df.index

Index(['one', 'two'], dtype='object')

In [27]:
# Column labels of the frame
df.columns

Index(['name', 'email'], dtype='object')

In [28]:
# Cell values as a 2-D NumPy array
df.values

array([['dss', 'dss@gmail.com'],
       ['fcamp', 'fcamp@daum.net']], dtype=object)

### 2.4 Accessing Elements

In [29]:
# Rebuild df from datas without custom labels, so the rows are labelled 0, 1 again
df = pd.DataFrame(datas)
df

Unnamed: 0,name,email
0,dss,dss@gmail.com
1,fcamp,fcamp@daum.net


In [31]:
# Select a single row by its index label with .loc
# The row comes back as a Series whose index is the column names
df.loc[1]

name              fcamp
email    fcamp@daum.net
Name: 1, dtype: object

In [32]:
# Read a single cell by giving .loc the row label and the column label in one call.
# This avoids chained indexing (df.loc[1]["email"]), which builds the whole row first
# and then indexes into that intermediate Series.
df.loc[1, "email"]

'fcamp@daum.net'

In [33]:
# Assigning to .loc with a row label writes a whole row:
#   - if the label already exists, that row's values are overwritten
#   - if the label does not exist, a new row is inserted
df.loc[2] = dict(name="andy", email="andy@naver.com")
df

Unnamed: 0,name,email
0,dss,dss@gmail.com
1,fcamp,fcamp@daum.net
2,andy,andy@naver.com


In [34]:
# Select one column by name; the result is a Series
df["name"]

0      dss
1    fcamp
2     andy
Name: name, dtype: object

In [35]:
# Assigning to a column name that does not exist yet adds a new column
# (the list supplies one value per row)
df["id"] = [1, 2, 3]
df

Unnamed: 0,name,email,id
0,dss,dss@gmail.com,1
1,fcamp,fcamp@daum.net,2
2,andy,andy@naver.com,3


In [37]:
# Select rows and columns together: .loc[row_labels, column_labels]
df.loc[[0, 2], ["email", "id"]]

Unnamed: 0,email,id
0,dss@gmail.com,1
2,andy@naver.com,3


### 2.5 Apply Function

* To every element of a Series object, you can apply a function

In [40]:
def domain(email):
    """Return the first dot-separated label that follows the first "@" in email.

    Example: "dss@gmail.com" -> "gmail".
    Raises IndexError when email contains no "@".
    """
    after_at = email.split("@")[1]
    first_label, _, _ = after_at.partition(".")
    return first_label

# Try the function on one address; .loc[row, column] reads the cell directly
# instead of chaining two lookups (df.loc[0]["email"])
domain(df.loc[0, "email"])

'gmail'

In [41]:
# Series.apply calls domain once per element of the email column;
# the returned values are stored as the new "domain" column
df["domain"] = df["email"].apply(domain)
df

Unnamed: 0,name,email,id,domain
0,dss,dss@gmail.com,1,gmail
1,fcamp,fcamp@daum.net,2,daum
2,andy,andy@naver.com,3,naver


In [42]:
# The same extraction written inline as an anonymous function
df["domain_new"] = df["email"].apply(
    lambda addr: addr.split("@")[1].split(".")[0]
)
df

Unnamed: 0,name,email,id,domain,domain_new
0,dss,dss@gmail.com,1,gmail,gmail
1,fcamp,fcamp@daum.net,2,daum,daum
2,andy,andy@naver.com,3,naver,naver


# 3. DataFrame Practice

In [118]:
# Import the helpers by name instead of using a wildcard, so it is explicit where
# make_data, get_name and get_age come from.
# NOTE(review): these are the only makedata names used in the visible cells —
# confirm no other cell relies on further names from the wildcard import.
from makedata import get_age, get_name, make_data

# Two independently generated sample frames (df2 is displayed below)
df1 = pd.DataFrame(make_data(5))
df2 = pd.DataFrame(make_data(5))
df2

Unnamed: 0,Age,Name
0,20,Arnold
1,30,Alvin
2,24,Anchal
3,25,Billy
4,24,Anchal


### 3.1 Append Dataframes

* Append vertically
* The resulting dataframe keeps the original index labels of both inputs, so labels can repeat until the index is reset

In [55]:
# DataFrame.append was removed in pandas 2.0; pd.concat stacks the frames row-wise instead.
# The original row labels of df1 and df2 are kept in df3.
df3 = pd.concat([df1, df2])
df3.head()

Unnamed: 0,Age,Name
0,36,Jin
1,32,Alvin
2,38,Alvin
3,34,Andrew
4,39,Billy


In [54]:
# Replace the inherited row labels with a fresh 0..n-1 index; drop=True discards the old labels
df3 = df3.reset_index(drop=True)
df3.head()

Unnamed: 0,Age,Name
0,36,Jin
1,32,Alvin
2,38,Alvin
3,34,Andrew
4,39,Billy


In [53]:
# Alternatively, renumber the rows while combining: ignore_index=True discards the
# original labels. pd.concat is used because DataFrame.append was removed in pandas 2.0.
df3 = pd.concat([df1, df2], ignore_index=True)
df3.head()

Unnamed: 0,Age,Name
0,36,Jin
1,32,Alvin
2,38,Alvin
3,34,Andrew
4,39,Billy


### 3.2 Concatenate Dataframes

* Allows you to combine dataframes both vertically and horizontally

In [57]:
# First rows of df1, one of the two inputs for the concat examples
df1.head()

Unnamed: 0,Age,Name
0,36,Jin
1,32,Alvin
2,38,Alvin
3,34,Andrew
4,39,Billy


In [58]:
# First rows of df2, the other input for the concat examples
df2.head()

Unnamed: 0,Age,Name
0,31,Jin
1,27,Anthony
2,37,Anthony
3,27,Jin
4,30,Anchal


In [61]:
# pd.concat takes the frames as a list; the default axis=0 stacks them row-wise.
# ignore_index=True renumbers the combined rows from 0.
df3 = pd.concat([df1, df2], ignore_index=True)
df3

Unnamed: 0,Age,Name
0,36,Jin
1,32,Alvin
2,38,Alvin
3,34,Andrew
4,39,Billy
5,31,Jin
6,27,Anthony
7,37,Anthony
8,27,Jin
9,30,Anchal


In [62]:
# axis=1 places the frames side by side (horizontally); the default join is outer,
# which keeps every row label, so rows that exist only in df3 get NaN in df1's columns
pd.concat([df3, df1], axis=1)

Unnamed: 0,Age,Name,Age.1,Name.1
0,36,Jin,36.0,Jin
1,32,Alvin,32.0,Alvin
2,38,Alvin,38.0,Alvin
3,34,Andrew,34.0,Andrew
4,39,Billy,39.0,Billy
5,31,Jin,,
6,27,Anthony,,
7,37,Anthony,,
8,27,Jin,,
9,30,Anchal,,


In [63]:
# join='inner' keeps only the row labels present in both frames, so the rows of df3
# that have no counterpart in df1 are dropped
pd.concat([df3, df1], axis=1, join='inner')

Unnamed: 0,Age,Name,Age.1,Name.1
0,36,Jin,36,Jin
1,32,Alvin,32,Alvin
2,38,Alvin,38,Alvin
3,34,Andrew,34,Andrew
4,39,Billy,39,Billy


# 4. Group By

In [64]:
# Fresh sample frame for the group-by examples (make_data called with its default arguments)
df = pd.DataFrame(make_data())
df

Unnamed: 0,Age,Name
0,21,Alan
1,29,Jin
2,37,Anthony
3,38,Anthony
4,33,Anthony
5,22,Alex
6,39,Alvin
7,40,Adam
8,37,Billy
9,38,Alex


In [68]:
# Count the rows per Name; reset_index turns the counts into a regular
# column called "count" next to a Name column
result_df = (
    df.groupby("Name")
    .size()
    .reset_index(name="count")
)
result_df

Unnamed: 0,Name,count
0,Adam,1
1,Alan,1
2,Alex,2
3,Alvin,1
4,Anthony,3
5,Billy,1
6,Jin,1


In [73]:
# Order by count, then by Name, both descending, and renumber the rows afterwards
result_df = (
    result_df
    .sort_values(by=["count", "Name"], ascending=False)
    .reset_index(drop=True)
)
result_df

Unnamed: 0,Name,count
0,Anthony,3
1,Alex,2
2,Jin,1
3,Billy,1
4,Alvin,1
5,Alan,1
6,Adam,1


In [81]:
# Minimum and maximum Age per Name; the dict maps a column to the list of
# aggregations applied to it
(
    df.groupby("Name")
    .agg({"Age": ["min", "max"]})
    .reset_index()
)

Unnamed: 0_level_0,Name,Age,Age
Unnamed: 0_level_1,Unnamed: 1_level_1,min,max
0,Adam,40,40
1,Alan,21,21
2,Alex,22,38
3,Alvin,39,39
4,Anthony,33,38
5,Billy,37,37
6,Jin,29,29


# 5. Practice

* New concept: fill NA

In [119]:
# Create a random users table: 8 users with IDs 1-8, distinct names and an age each.
# The rows are gathered in a plain list and the DataFrame is built once, instead of
# growing the frame one row at a time.
user_records = []
taken_names = set()

for idx in range(1, 9):

    # Draw names until one comes up that no earlier user has.
    # NOTE(review): this never terminates if get_name() cannot produce 8 distinct names.
    name = get_name()
    while name in taken_names:
        name = get_name()
    taken_names.add(name)

    user_records.append({"UserID": idx, "Name": name, "Age": get_age()})

user_df = pd.DataFrame(user_records, columns=["UserID", "Name", "Age"])

# rename column (rename returns a new frame, which is assigned back)
user_df = user_df.rename(columns={"UserID": "ID"})


# Create a random payments table: 15 rows, each with a user ID in 1-8 and an
# amount between 1,000 and 20,000 in steps of 1,000
money_records = [
    {
        "ID": np.random.randint(1, 9),
        "Money": np.random.randint(1, 21) * 1000,
    }
    for _ in range(15)
]
money_df = pd.DataFrame(money_records, columns=["ID", "Money"])

In [120]:
# Inner-join every payment with its user on the ID column the two frames share
result_df = money_df.merge(user_df, on="ID")
result_df.tail()

Unnamed: 0,ID,Money,Name,Age
10,8,11000,Billy,28
11,8,14000,Billy,28
12,7,6000,Jin,32
13,7,17000,Jin,32
14,7,3000,Jin,32


In [121]:
# Total Money per Name, returned as a two-column frame
money_list = (
    result_df.groupby("Name")["Money"]
    .sum()
    .reset_index()
)
money_list.head()

Unnamed: 0,Name,Money
0,Adam,23000
1,Alan,10000
2,Alex,17000
3,Anchal,8000
4,Anthony,9000


In [122]:
# Outer-join the totals back onto the users by Name, so users without any payment are kept too
result = user_df.merge(money_list, on="Name", how="outer")
result

Unnamed: 0,ID,Name,Age,Money
0,1,Alex,25,17000.0
1,2,Adam,21,23000.0
2,3,Andrew,22,
3,4,Anchal,34,8000.0
4,5,Anthony,23,9000.0
5,6,Alan,31,10000.0
6,7,Jin,32,26000.0
7,8,Billy,28,46000.0


In [123]:
# fillna replaces every NaN in the frame with the given value (0 here)
result = result.fillna(0)
result

Unnamed: 0,ID,Name,Age,Money
0,1,Alex,25,17000.0
1,2,Adam,21,23000.0
2,3,Andrew,22,0.0
3,4,Anchal,34,8000.0
4,5,Anthony,23,9000.0
5,6,Alan,31,10000.0
6,7,Jin,32,26000.0
7,8,Billy,28,46000.0


In [124]:
# Inspect the column dtypes before changing the Money column's type in the next cell
result.dtypes

ID         int64
Name      object
Age        int64
Money    float64
dtype: object

In [125]:
# Cast the Money column to an integer dtype; the other columns are left as they are
result = result.astype({"Money": "int"})
result.head()

Unnamed: 0,ID,Name,Age,Money
0,1,Alex,25,17000
1,2,Adam,21,23000
2,3,Andrew,22,0
3,4,Anchal,34,8000
4,5,Anthony,23,9000


# 6. Pandas Input/Output

In [126]:
# Load the CSV file at datas/train.csv into a DataFrame
titanic = pd.read_csv("datas/train.csv")
# Show the last two rows
titanic.tail(2)

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
889,890,1,1,"Behr, Mr. Karl Howell",male,26.0,0,0,111369,30.0,C148,C
890,891,0,3,"Dooley, Mr. Patrick",male,32.0,0,0,370376,7.75,,Q
