In [1]:
# Recode one text column into another text column.
import pandas as pd

numbers = [9, 23, 33, 91, 13]
players = ["Ron Harper", "Michael Jrdan", "Scottie Pippen", "Dennis Rodman", "Luc Longley"]

# Build the roster in a single constructor call; the dict order fixes the column order.
df = pd.DataFrame({"number": numbers, "player": players})

# Lookup table: player name -> position abbreviation.
position_dict = {
    "Ron Harper": "PG",
    "Michael Jrdan": "SG",
    "Scottie Pippen": "SF",
    "Dennis Rodman": "PF",
    "Luc Longley": "C",
}

# Passing a dict to .map() looks up every value of the "player" column as a key
# and stores the matching dict value in the new "position" column.
player_positions = df["player"].map(position_dict)
df["position"] = player_positions
df

Unnamed: 0,number,player,position
0,9,Ron Harper,PG
1,23,Michael Jrdan,SG
2,33,Scottie Pippen,SF
3,91,Dennis Rodman,PF
4,13,Luc Longley,C


In [2]:
# For comparison with the frame above: the same roster without a position column.
import pandas as pd

numbers = [9, 23, 33, 91, 13]
players = ["Ron Harper", "Michael Jrdan", "Scottie Pippen", "Dennis Rodman", "Luc Longley"]

# Two columns only: jersey number and player name.
df = pd.DataFrame({"number": numbers, "player": players})
df

Unnamed: 0,number,player
0,9,Ron Harper
1,23,Michael Jrdan
2,33,Scottie Pippen
3,91,Dennis Rodman
4,13,Luc Longley


In [3]:
# Add one more derived column.
import pandas as pd

numbers = [9, 23, 33, 91, 13]
players = ["Ron Harper", "Michael Jrdan", "Scottie Pippen", "Dennis Rodman", "Luc Longley"]
df = pd.DataFrame({"number": numbers, "player": players})

# First lookup: player name -> position abbreviation.
position_dict = {
    "Ron Harper": "PG",
    "Michael Jrdan": "SG",
    "Scottie Pippen": "SF",
    "Dennis Rodman": "PF",
    "Luc Longley": "C",
}
# Second lookup: position abbreviation -> court area.
court_dict = {
    "PG": "Back",
    "SG": "Back",
    "SF": "Front",
    "PF": "Front",
    "C": "Front",
}

# Chain the two lookups: player -> position, then position -> court.
df["position"] = df["player"].map(position_dict)
df["court"] = df["position"].map(court_dict)
df

Unnamed: 0,number,player,position,court
0,9,Ron Harper,PG,Back
1,23,Michael Jrdan,SG,Back
2,33,Scottie Pippen,SF,Front
3,91,Dennis Rodman,PF,Front
4,13,Luc Longley,C,Front


In [4]:
# An alternative way to derive the court column: a lambda instead of a second dict.
import pandas as pd

numbers = [9, 23, 33, 91, 13]
players = ["Ron Harper", "Michael Jrdan", "Scottie Pippen", "Dennis Rodman", "Luc Longley"]
df = pd.DataFrame({"number": numbers, "player": players})

# Lookup table: player name -> position abbreviation.
position_dict = {
    "Ron Harper": "PG",
    "Michael Jrdan": "SG",
    "Scottie Pippen": "SF",
    "Dennis Rodman": "PF",
    "Luc Longley": "C",
}
df["position"] = df["player"].map(position_dict)

# "PG" and "SG" become "Back"; every other value becomes "Front".
df["court"] = df["position"].map(lambda pos: "Front" if pos not in ["PG", "SG"] else "Back")
df

Unnamed: 0,number,player,position,court
0,9,Ron Harper,PG,Back
1,23,Michael Jrdan,SG,Back
2,33,Scottie Pippen,SF,Front
3,91,Dennis Rodman,PF,Front
4,13,Luc Longley,C,Front


In [6]:
# Regroup a numeric column into a text (dummy / categorical) variable.
import pandas as pd

numbers = [9, 23, 33, 91, 13]
players = ["Ron Harper", "Michael Jrdan", "Scottie Pippen", "Dennis Rodman", "Luc Longley"]
weights = [185, 195, 210, 210, 265]

# Starting point: the roster with the raw numeric weight column.
df = pd.DataFrame({"number": numbers, "player": players, "weight": weights})
df

Unnamed: 0,number,player,weight
0,9,Ron Harper,185
1,23,Michael Jrdan,195
2,33,Scottie Pippen,210
3,91,Dennis Rodman,210
4,13,Luc Longley,265


In [8]:
import pandas as pd

numbers = [9, 23, 33, 91, 13]
players = ["Ron Harper", "Michael Jrdan", "Scottie Pippen", "Dennis Rodman", "Luc Longley"]
weights = [185, 195, 210, 210, 265]

# Rebuild the roster with its numeric weight column so this cell runs on its own.
df = pd.DataFrame({"number": numbers, "player": players, "weight": weights})

# `def` starts a function definition, much like `function` in R; the parameter
# (named `wt` here) is conventionally just called `x`.
def get_weight_category(wt):
    """Classify a weight as "Light", "Medium" or "Heavy".

    Args:
        wt: A numeric weight, or a missing value (None / NaN / pd.NA).

    Returns:
        "Light" when wt < 200, "Medium" when 200 <= wt < 250, "Heavy" when
        wt >= 250, and None when wt is missing.
    """
    # NaN fails every comparison below and would otherwise be labelled "Heavy";
    # None would raise TypeError. Report missing input as missing instead.
    if pd.isna(wt):
        return None
    if wt < 200:
        return "Light"
    # Reaching this line already implies wt >= 200, so only the upper bound is checked.
    if wt < 250:
        return "Medium"
    return "Heavy"

# Run the classifier on every weight and store the labels as a new column.
weight_labels = df["weight"].map(get_weight_category)
df["weight_category"] = weight_labels
df

Unnamed: 0,number,player,weight,weight_category
0,9,Ron Harper,185,Light
1,23,Michael Jrdan,195,Light
2,33,Scottie Pippen,210,Medium
3,91,Dennis Rodman,210,Medium
4,13,Luc Longley,265,Heavy


In [11]:
# Handling missing values (this matters a lot).
import pandas as pd
import numpy as np

numbers = [9, 23, 33, 91, 13, 7]
players = ["Ron Harper", "Michael Jrdan", "Scottie Pippen", "Dennis Rodman", "Luc Longley", "Toni Kukoc"]
# The last entry is None, which gives the frame one missing college.
colleges = [
    "Miami University",
    "University of North Carolina",
    "University of Central Arkansas",
    "Southeastern Oklahoma State University",
    "University of New Mexico",
    None,
]

df = pd.DataFrame({"number": numbers, "player": players, "college": colleges})
df

Unnamed: 0,number,player,college
0,9,Ron Harper,Miami University
1,23,Michael Jrdan,University of North Carolina
2,33,Scottie Pippen,University of Central Arkansas
3,91,Dennis Rodman,Southeastern Oklahoma State University
4,13,Luc Longley,University of New Mexico
5,7,Toni Kukoc,


In [12]:
# Detect missing values.
# Boolean mask: True on the rows whose college entry is missing.
college_is_missing = df["college"].isna()
print(college_is_missing)

0    False
1    False
2    False
3    False
4    False
5     True
Name: college, dtype: bool


In [13]:
# Keep only the rows whose college entry is missing.
df.loc[df["college"].isna()]

Unnamed: 0,number,player,college
5,7,Toni Kukoc,


In [14]:
# The reverse operation: flag the entries that are NOT missing.
college_is_present = df["college"].notna()
print(college_is_present)

0     True
1     True
2     True
3     True
4     True
5    False
Name: college, dtype: bool


In [15]:
# Keep only the rows whose college entry is present.
df.loc[df["college"].notna()]

Unnamed: 0,number,player,college
0,9,Ron Harper,Miami University
1,23,Michael Jrdan,University of North Carolina
2,33,Scottie Pippen,University of Central Arkansas
3,91,Dennis Rodman,Southeastern Oklahoma State University
4,13,Luc Longley,University of New Mexico


In [16]:
# Filling in missing values (even more important).
import pandas as pd
import numpy as np

numbers = [9, 23, 33, 91, 13, 7]
players = ["Ron Harper", "Michael Jrdan", "Scottie Pippen", "Dennis Rodman", "Luc Longley", "Toni Kukoc"]
# The last entry is None, i.e. a missing college.
colleges = [
    "Miami University",
    "University of North Carolina",
    "University of Central Arkansas",
    "Southeastern Oklahoma State University",
    "University of New Mexico",
    None,
]
df = pd.DataFrame({"number": numbers, "player": players, "college": colleges})

# Replace every missing college with the placeholder text "Croatia".
college_filled = df["college"].fillna("Croatia")
df["college"] = college_filled
df

Unnamed: 0,number,player,college
0,9,Ron Harper,Miami University
1,23,Michael Jrdan,University of North Carolina
2,33,Scottie Pippen,University of Central Arkansas
3,91,Dennis Rodman,Southeastern Oklahoma State University
4,13,Luc Longley,University of New Mexico
5,7,Toni Kukoc,Croatia


In [1]:
# Working with dates / time series.
import pandas as pd

numbers = [9, 23, 33, 91, 13]
players = ["Ron Harper", "Michael Jrdan", "Scottie Pippen", "Dennis Rodman", "Luc Longley"]
birth_dates = ["January 20, 1964", "February 17, 1963", "September 25, 1965", "May 13, 1961", "January 19, 1969"]

df = pd.DataFrame({"number": numbers, "player": players, "birth_date": birth_dates})

# The dates are still plain text at this point, so show the column's dtype.
birth_date_dtype = df["birth_date"].dtype
print(birth_date_dtype)
df

object


Unnamed: 0,number,player,birth_date
0,9,Ron Harper,"January 20, 1964"
1,23,Michael Jrdan,"February 17, 1963"
2,33,Scottie Pippen,"September 25, 1965"
3,91,Dennis Rodman,"May 13, 1961"
4,13,Luc Longley,"January 19, 1969"


In [2]:
# Convert the text dates with pandas' to_datetime().
import pandas as pd

numbers = [9, 23, 33, 91, 13]
players = ["Ron Harper", "Michael Jrdan", "Scottie Pippen", "Dennis Rodman", "Luc Longley"]
birth_dates = ["January 20, 1964", "February 17, 1963", "September 25, 1965", "May 13, 1961", "January 19, 1969"]

df = pd.DataFrame({"number": numbers, "player": players, "birth_date": birth_dates})

# Every entry is written as "<month name> <day>, <year>", so state that layout
# explicitly instead of relying on pandas to infer it from the data.
df["birth_date"] = pd.to_datetime(df["birth_date"], format="%B %d, %Y")

# The column now holds datetimes rather than text.
print(df["birth_date"].dtype)
df

datetime64[ns]


Unnamed: 0,number,player,birth_date
0,9,Ron Harper,1964-01-20
1,23,Michael Jrdan,1963-02-17
2,33,Scottie Pippen,1965-09-25
3,91,Dennis Rodman,1961-05-13
4,13,Luc Longley,1969-01-19
