# Pandas
* open-source python data analysis library
* widely used for ML analysis (along with scikit-learn)
* **Series** & **DataFrame** objects are most widly used

In [2]:
import pandas as pd

># 1. Series
* ndarray + index

### generate pd series
* default: 0-based index
* .index & .values

In [7]:
s1 = pd.Series(list(range(1, 6)))
print(s1)
print("index: ", s1.index)
print("values: ", s1.values)

0    1
1    2
2    3
3    4
4    5
dtype: int64
index:  RangeIndex(start=0, stop=5, step=1)
values:  [1 2 3 4 5]


### customize index
* compatible with 0-based index (not when indices are customized with numbers)
* index 중복 가능 / 하지만 지양할 것
* chr : ASCII code $\rightarrow$ character
* ord : character $\rightarrow$ ASCII code

In [12]:
s3 = pd.Series(list(range(1, 4)), index = [chr(i) for i in range(97, 100)])
print(s3, s3['a'], s3[0], sep="\n")

a    1
b    2
c    3
dtype: int64
1
1


### re-use index
* index of a series can be re-used through .index attribute

In [13]:
s4 = pd.Series(2, index = s3.index)
print(s4)

a    2
b    2
c    2
dtype: int64


### generate pd series with dict

In [14]:
s5 = pd.Series({'a' : 1, 'b' : 2, 'c' : 3, 'd' : 4})
print(s5)

a    1
b    2
c    3
d    4
dtype: int64


># 2. Series - operations

### basic attributes & operations

In [8]:
s = pd.Series([0, 1, 1, 2, 3, np.nan])
print(len(s))
print(s.size)
print(s.shape)
print(s.count()) # ignore NaN
print(s.unique()) # return ndarray with unique values
print(s.value_counts()) # ignore NaN / return pd series
type(s.value_counts())

6
6
(6,)
5
[  0.   1.   2.   3.  nan]
1.0    2
3.0    1
2.0    1
0.0    1
dtype: int64


pandas.core.series.Series

### head, tail, take
* head : print first n elements (default = 5)
* tail : print last n elements (default = 5)
* take : return elements matching given index list (only 0-based indexing allowed)

In [9]:
s = pd.Series(range(1,11))
print(s.take([0, 4, 3]))

0    1
4    5
3    4
dtype: int64


### single & multiple value accessing
* single: use index
  * default: 0-based index
  * `loc[]` : use index (인덱스 없는 경우 NaN)
  * `iloc[]` : use 0-based index (인덱스 없는 경우 error)
* multiple: use **list** of index

In [10]:
s = pd.Series([0, 1, 1, 2, 3, 4], index = ['a','b','c','d','e','f'])
print(s[0])
print(s.loc['a'])
print(s.iloc[0])
print(s[[4, 3, 2]])

0
0
0
e    3
d    2
c    1
dtype: int64


### numerical operations
* series & scalar: element-wise
* series & series: index-wise / non-matching index $\rightarrow$ NaN

In [11]:
s1 = pd.Series([1, 2, 3, 4], index=['a', 'b', 'c', 'e'])
s2 = pd.Series([4, 3, 2, 1], index=['d', 'c', 'b', 'a'])
print(s1 + s2)

a    2.0
b    4.0
c    6.0
d    NaN
e    NaN
dtype: float64


### NaN
* default: ignore
* `skipna=False` $\rightarrow$ NaN not ignored

In [15]:
s = pd.Series([1, 2, 3, 4, np.NaN])
print(s.mean())
print(s.mean(skipna=False))

2.5
nan


### Boolean selection
* similar to ndarray boolean selection
* returns pd series

In [16]:
s = pd.Series(np.arange(1, 10))
s[s > 5]
s[(s % 2 == 0) & (s % 3 == 0)] # and
s[(s % 2 == 0) | (s % 3 == 0)] # or
s[s.index > 5]                 # condition for index

6    7
7    8
8    9
dtype: int64

In [17]:
print(s[s >= 7].sum()) # sum() is operated on series
print((s >= 7).sum())  # sum() is operated on Boolean list (True = 1, False = 0)

24
3


### updating series

In [18]:
s = pd.Series([1, 10, 100], index = ['a', 'b', 'c'])
s['d'] = 1000  # add value
s['d'] = 10000 # update value

### slicing

In [19]:
s = pd.Series(np.arange(100, 110), index=np.arange(10, 20))
s[0:5]
s[:5]
s[5:]
s[-3:]

17    107
18    108
19    109
dtype: int64

># 3. DataFrame
* 2-dimensional object / analogous to spreadsheet
* widely used in Data Analysis & ML $\rightarrow$ data: row / feature: column

### generating DataFrame
* **.shape** $\rightarrow$ returns shape

In [21]:
df1 = pd.DataFrame(np.array([[10, 11], [20, 22]]))

# stacking rows
df2 = pd.DataFrame([pd.Series(np.arange(10, 15)), pd.Series(np.arange(20, 25)), pd.Series(np.arange(20, 25))])

# stacking columns
s1 = pd.Series(np.arange(1, 6, 1))
s2 = pd.Series(np.arange(6, 11, 1))
df3 = pd.DataFrame({'c1': s1, 'c2': s2})

# customizing index & columns
df4 = pd.DataFrame(np.array([[22, 180], [23, 170]]), columns = ['Age', 'Height'], index = ['John', 'Bob'])

print(df1, df1.shape, df2, df2.shape, df3, df3.shape, df4, df4.shape, sep="\n\n")

    0   1
0  10  11
1  20  22

(2, 2)

    0   1   2   3   4
0  10  11  12  13  14
1  20  21  22  23  24
2  20  21  22  23  24

(3, 5)

   c1  c2
0   1   6
1   2   7
2   3   8
3   4   9
4   5  10

(5, 2)

      Age  Height
John   22     180
Bob    23     170

(2, 2)


### csv $\rightarrow$ dataframe
* most commonly used method to generate dataframe
* **index_col**: set index column (default: 0-based)
* **usecols**: select columns
* **header**: None $\rightarrow$ no header

In [24]:
sample_file = 'data/sample.csv'
sample_df1 = pd.read_csv(sample_file, header = None)
sample_df2 = pd.read_csv(sample_file, index_col = 'Symbol', usecols = [0, 1, 2, 3, 7]) 
sample_df3 = pd.read_csv(sample_file, index_col = 'Symbol', usecols=['Name', 'Symbol', 'Price'])

### indexing & slicing
* indexing: column
* slicing: row (**.loc** & **.iloc** can be used)

In [26]:
apple_df = pd.read_csv('data/apple.csv')
# indexing: column
apple_df['Date'] #return Series
apple_df[['Date']] #return DataFrame
apple_df[['Date', 'Close']] #return DataFrame
# slicing: row
apple_df[4:7]

Unnamed: 0,Date,Open,High,Low,Close,Volume
4,26-Sep-16,111.64,113.39,111.55,112.88,29869442
5,23-Sep-16,114.42,114.79,111.55,112.71,52481151
6,22-Sep-16,114.35,114.94,114.0,114.62,31073984


### head(), tail(), describe(), info()
* **.head()**: show first $n$ elements (default = 5)
* **.tail()**: show last $n$ elements (default = 5)
* **.describe()** & **.info()**: provide information about DF

In [27]:
apple_df.describe()

Unnamed: 0,Open,High,Low,Close,Volume
count,21.0,21.0,21.0,21.0,21.0
mean,110.673333,111.745238,109.807619,110.857143,46172820.0
std,3.807178,3.655234,3.703923,3.697138,23093820.0
min,102.65,105.72,102.53,103.13,24607410.0
25%,107.7,108.3,107.07,107.73,29869440.0
50%,112.46,113.37,111.55,112.71,36379110.0
75%,113.85,114.64,112.51,113.57,52481150.0
max,115.19,116.18,114.04,115.57,112340300.0


In [28]:
apple_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 21 entries, 0 to 20
Data columns (total 6 columns):
Date      21 non-null object
Open      21 non-null float64
High      21 non-null float64
Low       21 non-null float64
Close     21 non-null float64
Volume    21 non-null int64
dtypes: float64(4), int64(1), object(1)
memory usage: 1.1+ KB


### boolean selection

In [30]:
apple_df[(apple_df['High'] < 115) & (apple_df['Low'] >112)]
apple_df[(apple_df['High'] < 115) & (apple_df['Low'] >112)][['High','Low']]

Unnamed: 0,High,Low
2,114.64,113.43
3,113.18,112.34
6,114.94,114.0
7,113.99,112.44
8,114.12,112.51


### add/delete column

In [29]:
copy = apple_df.copy()
copy['Average'] = (copy['High'] + copy['Low'])/2 # add new column
copy.insert(1, 'Open*2', apple_df['Open'] * 2)   # add new column in designated index
del copy['Open*2']                               # delete column

### add/delete row

In [None]:
df1 = copy[-3:]
copy = copy.append(df1) # add row
copy = copy.drop([20])  # delete row
copy.tail()

># 4. DataFrame - operations

># 4.1. change index

### .set_index(keys, inplace)

In [None]:
#df.set_index('Winners', inplace=True) # inplace=True -> change the original 
df.set_index(['Winners', 'Runners-up'], inplace=True) # multiple index allowed
df.head()

### .reset_index()

In [None]:
df = df.reset_index()
df.head()

### multiple index - data selection

In [None]:
df.loc['Real Madrid']
df.loc['Real Madrid'].loc['Milan']
df.loc['Real Madrid', 'Milan']

># 4.2. group by

### group by

In [31]:
df = pd.read_csv('./data/euro_winners.csv')
df.head()

Unnamed: 0,Season,Nation,Winners,Score,Runners-up,Runner-UpNation,Venue,Attendance
0,1955–56,Spain,Real Madrid,4–3,Stade de Reims,France,"Parc des Princes,Paris",38239
1,1956–57,Spain,Real Madrid,2–0,Fiorentina,Italy,"Santiago Bernabéu Stadium, Madrid",124000
2,1957–58,Spain,Real Madrid,3–2,Milan,Italy,"Heysel Stadium,Brussels",67000
3,1958–59,Spain,Real Madrid,2–0,Stade de Reims,France,"Neckarstadion,Stuttgart",72000
4,1959–60,Spain,Real Madrid,7–3,Eintracht Frankfurt,Germany,"Hampden Park,Glasgow",127621


### .groups

In [32]:
nation_group = df.groupby('Nation')
nation_group.groups

{'England': Int64Index([12, 21, 22, 23, 24, 25, 26, 28, 43, 49, 52, 56], dtype='int64'),
 'France': Int64Index([37], dtype='int64'),
 'Germany': Int64Index([18, 19, 20, 27, 41, 45, 57], dtype='int64'),
 'Italy': Int64Index([7, 8, 9, 13, 29, 33, 34, 38, 40, 47, 51, 54], dtype='int64'),
 'Netherlands': Int64Index([14, 15, 16, 17, 32, 39], dtype='int64'),
 'Portugal': Int64Index([5, 6, 31, 48], dtype='int64'),
 'Romania': Int64Index([30], dtype='int64'),
 'Scotland': Int64Index([11], dtype='int64'),
 'Spain': Int64Index([0, 1, 2, 3, 4, 10, 36, 42, 44, 46, 50, 53, 55], dtype='int64'),
 'Yugoslavia': Int64Index([35], dtype='int64')}

In [33]:
len(nation_group.groups) # no. of groups

10

### .count()

In [34]:
nation_group.count()

Unnamed: 0_level_0,Season,Winners,Score,Runners-up,Runner-UpNation,Venue,Attendance
Nation,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
England,12,12,12,12,12,12,12
France,1,1,1,1,1,1,1
Germany,7,7,7,7,7,7,7
Italy,12,12,12,12,12,12,12
Netherlands,6,6,6,6,6,6,6
Portugal,4,4,4,4,4,4,4
Romania,1,1,1,1,1,1,1
Scotland,1,1,1,1,1,1,1
Spain,13,13,13,13,13,13,13
Yugoslavia,1,1,1,1,1,1,1


### .size()

In [35]:
nation_group.size()

Nation
England        12
France          1
Germany         7
Italy          12
Netherlands     6
Portugal        4
Romania         1
Scotland        1
Spain          13
Yugoslavia      1
dtype: int64

### .size().sort_values(ascending = False)

In [36]:
nation_group.size().sort_values(ascending = False)

Nation
Spain          13
Italy          12
England        12
Germany         7
Netherlands     6
Portugal        4
Yugoslavia      1
Scotland        1
Romania         1
France          1
dtype: int64

### multiple column grouping

In [37]:
club_group = df.groupby(['Nation', 'Winners'])
club_group.size().sort_values(ascending = False)

Nation       Winners          
Spain        Real Madrid          9
Italy        Milan                7
Germany      Bayern Munich        5
England      Liverpool            5
Spain        Barcelona            4
Netherlands  Ajax                 4
England      Manchester United    3
Italy        Internazionale       3
             Juventus             2
Portugal     Porto                2
             Benfica              2
England      Nottingham Forest    2
             Chelsea              1
France       Marseille            1
Yugoslavia   Red Star Belgrade    1
Germany      Borussia Dortmund    1
             Hamburg              1
Netherlands  Feyenoord            1
             PSV Eindhoven        1
Romania      Steaua Bucure?ti     1
Scotland     Celtic               1
England      Aston Villa          1
dtype: int64

># 4.3. group by - continued

In [39]:
df = pd.read_csv('./data/goal_stats_euro_leagues_2012-13.csv')
df.head()

Unnamed: 0,Month,Stat,EPL,La Liga,Serie A,Bundesliga
0,08/01/2012,MatchesPlayed,20.0,20,10.0,10.0
1,09/01/2012,MatchesPlayed,38.0,39,50.0,44.0
2,10/01/2012,MatchesPlayed,31.0,31,39.0,27.0
3,11/01/2012,MatchesPlayed,50.0,41,42.0,46.0
4,12/01/2012,MatchesPlayed,59.0,39,39.0,26.0


### group by with function

In [42]:
df1 = df.set_index('Month')
year_group = df1.groupby(lambda month : month.split('/')[2])
year_group.head()

Unnamed: 0_level_0,Stat,EPL,La Liga,Serie A,Bundesliga
Month,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
08/01/2012,MatchesPlayed,20.0,20,10.0,10.0
09/01/2012,MatchesPlayed,38.0,39,50.0,44.0
10/01/2012,MatchesPlayed,31.0,31,39.0,27.0
11/01/2012,MatchesPlayed,50.0,41,42.0,46.0
12/01/2012,MatchesPlayed,59.0,39,39.0,26.0
01/01/2013,MatchesPlayed,42.0,40,40.0,18.0
02/01/2013,MatchesPlayed,30.0,40,40.0,36.0
03/01/2013,MatchesPlayed,35.0,38,39.0,36.0
04/01/2013,MatchesPlayed,42.0,42,41.0,36.0
05/01/2013,MatchesPlayed,33.0,40,40.0,27.0


### mutiple index $\rightarrow$ grouping with level

In [43]:
df2 = df.set_index(['Month','Stat'])
year_group2 = df2.groupby(level = 1)
year_group3 = df2.groupby(level = 'Stat') # same operation

Unnamed: 0_level_0,Unnamed: 1_level_0,EPL,La Liga,Serie A,Bundesliga
Month,Stat,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
08/01/2012,MatchesPlayed,20.0,20,10.0,10.0
09/01/2012,MatchesPlayed,38.0,39,50.0,44.0
10/01/2012,MatchesPlayed,31.0,31,39.0,27.0
11/01/2012,MatchesPlayed,50.0,41,42.0,46.0
12/01/2012,MatchesPlayed,59.0,39,39.0,26.0
08/01/2012,GoalsScored,57.0,60,21.0,23.0
09/01/2012,GoalsScored,111.0,112,133.0,135.0
10/01/2012,GoalsScored,95.0,88,97.0,77.0
11/01/2012,GoalsScored,121.0,116,120.0,137.0
12/01/2012,GoalsScored,183.0,109,125.0,72.0


In [44]:
# multi level grouping
month_stat_group = df2.groupby(level = [0, 1])
for name, group in month_stat_group:
    print(name) # tuple of indices
    print(group) # data

('01/01/2013', 'GoalsScored')
                          EPL  La Liga  Serie A  Bundesliga
Month      Stat                                            
01/01/2013 GoalsScored  117.0      121    104.0        51.0
('01/01/2013', 'MatchesPlayed')
                           EPL  La Liga  Serie A  Bundesliga
Month      Stat                                             
01/01/2013 MatchesPlayed  42.0       40     40.0        18.0
('02/01/2013', 'GoalsScored')
                         EPL  La Liga  Serie A  Bundesliga
Month      Stat                                           
02/01/2013 GoalsScored  87.0      110    100.0       101.0
('02/01/2013', 'MatchesPlayed')
                           EPL  La Liga  Serie A  Bundesliga
Month      Stat                                             
02/01/2013 MatchesPlayed  30.0       40     40.0        36.0
('03/01/2013', 'GoalsScored')
                         EPL  La Liga  Serie A  Bundesliga
Month      Stat                                           
03/01

># 4.4 aggregation functions

### .sum()

In [45]:
stat_group = df2.groupby(level = 1)
stat_group.sum()
df_total = df2.sum(level = 'Stat') #same operation
df_total

Unnamed: 0_level_0,EPL,La Liga,Serie A,Bundesliga
Stat,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
GoalsScored,1063.0,1133,1003.0,898.0
MatchesPlayed,380.0,380,380.0,306.0


### aggregate methods

In [46]:
stat_group.sum()
stat_group.aggregate(len)
stat_group.aggregate(np.sum) # same as stat_group.sum()
stat_group.aggregate([np.sum, np.mean, np.size])

Unnamed: 0_level_0,EPL,EPL,EPL,La Liga,La Liga,La Liga,Serie A,Serie A,Serie A,Bundesliga,Bundesliga,Bundesliga
Unnamed: 0_level_1,sum,mean,size,sum,mean,size,sum,mean,size,sum,mean,size
Stat,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2,Unnamed: 9_level_2,Unnamed: 10_level_2,Unnamed: 11_level_2,Unnamed: 12_level_2
GoalsScored,1063.0,106.3,11.0,1133,103.0,11,1003.0,100.3,11.0,898.0,89.8,11.0
MatchesPlayed,380.0,38.0,11.0,380,34.545455,11,380.0,38.0,11.0,306.0,30.6,11.0


### practice - calculating "goals / match"

In [47]:
goals_per_game_df = df_total.loc['GoalsScored'] / df_total.loc['MatchesPlayed']
goals_per_game_df = pd.DataFrame(goals_per_game_df).T.rename(index = {0: 'goals per game'})
goals_per_game_df

Unnamed: 0,EPL,La Liga,Serie A,Bundesliga
goals per game,2.797368,2.981579,2.639474,2.934641


In [48]:
df_total = df_total.append(goals_per_game_df)
pd.options.display.float_format='{:.2f}'.format

df_total

Unnamed: 0,EPL,La Liga,Serie A,Bundesliga
GoalsScored,1063.0,1133.0,1003.0,898.0
MatchesPlayed,380.0,380.0,380.0,306.0
goals per game,2.8,2.98,2.64,2.93


># 4.5. transform
* group by $\rightarrow$ manipulation $\rightarrow$ merge

In [None]:
df = pd.read_excel("data/sales_transactions.xlsx")

In [None]:
df.head()

In [None]:
df.groupby('order')["ext price"].sum()

### now, let's add this information to the original DF

In [None]:
order_total = df.groupby('order')["ext price"].sum().rename("total").reset_index()
order_total

In [None]:
result = df.merge(order_total, on='order')
result["portion"] = result["ext price"] / result["total"]
result.head()

### same operation with "transform"

In [None]:
df['total'] = df.groupby('order')["ext price"].transform(np.sum)
df["portion"] = df["ext price"] / df["total"]
df.head()

># 4.6. pivot & pivot_table
* dataframe의 형태 변경 (index, column, data를 명시)

### original DF

In [None]:
plant_df = pd.read_csv('data/PlantGrowth.csv')
plant_df.head(10)

### DF reshaped with pivot

In [None]:
plant_df.pivot(index = 'observation', columns = 'group', values = 'weight')

### DF reshaped with pivot_table

In [None]:
pd.pivot_table(plant_df, values = 'weight', index = 'observation', columns='group')

In [None]:
pd.pivot_table(plant_df, values = 'weight', columns='group')

In [None]:
pd.pivot_table(plant_df, values = 'weight', columns='observation')

### pivot_table uses aggregate method when needed

In [None]:
table = OrderedDict((("Item", ['Item0', 'Item0', 'Item0', 'Item1']), ('CType',['Gold', 'Bronze', 'Gold', 'Silver']), ('US',  [1, 2, 3, 4])))
df = pd.DataFrame(table)
df

In [None]:
df.pivot_table(index='Item', columns='CType', values='US')
# "Item0 - Gold" has two values, 1 and 3
# pivot_table adds up those values

># 4.7. stack & unstack
* stack : inner-most column -> inner-most index
* unstack : inner-most index -> inner-most column

In [None]:
plant_df = pd.read_csv('data/PlantGrowth.csv')
plant_df.head(10)

In [None]:
stacked = plant_df.set_index(['group', 'observation'])
stacked

In [None]:
# 가장 바깥 레벨의 row -> column으로 이동
stacked.unstack()

In [None]:
# 레벨 0의 row -> column으로 이동
stacked.unstack(0)

# 레벨의 순서가 아닌, 네임으로도 명시 가능
# stacked.unstack(level = 'group')

In [None]:
# 컬럼을 다시 가장 하위의 row로 이동
stacked.unstack(0).stack()

># 4.8. concat
* **axis**: 0 $\rightarrow$ row concat (default) / 1 $\rightarrow$ column concat
* **ignore_index**: True $\rightarrow$ set new 0-based index
* if the column names are differnet, NaN is filled instead

In [52]:
df1 = pd.DataFrame({'key1' : np.arange(5), 'value1' : np.random.randn(5)})
df2 = pd.DataFrame({'key1' : np.arange(5), 'value1' : np.random.randn(5)})
pd.concat([df1, df2])

Unnamed: 0,key1,value1
0,0,-1.4
1,1,1.55
2,2,1.14
3,3,0.29
4,4,0.1
0,0,-1.42
1,1,0.68
2,2,0.05
3,3,0.32
4,4,1.03


In [56]:
pd.concat([df1, df2], axis=1, ignore_index=True)

Unnamed: 0,0,1,2,3
0,0,-1.4,0,-1.42
1,1,1.55,1,0.68
2,2,1.14,2,0.05
3,3,0.29,3,0.32
4,4,0.1,4,1.03


In [60]:
df3 = pd.DataFrame({'key2' : np.arange(5), 'value2' : np.random.randn(5)})
pd.concat([df1, df3], ignore_index=True)

Unnamed: 0,key1,key2,value1,value2
0,0.0,,-1.4,
1,1.0,,1.55,
2,2.0,,1.14,
3,3.0,,0.29,
4,4.0,,0.1,
5,,0.0,,-0.33
6,,1.0,,-2.69
7,,2.0,,0.24
8,,3.0,,1.58
9,,4.0,,-1.57


### shared index

In [65]:
df11 = df1.set_index('key1')
df22 = df1.set_index('key1')
pd.concat([df11, df22], axis=1)

Unnamed: 0_level_0,value1,value1
key1,Unnamed: 1_level_1,Unnamed: 2_level_1
0,-1.4,-1.4
1,1.55,1.55
2,1.14,1.14
3,0.29,0.29
4,0.1,0.1


># 4.9. Merge & Join
* **on**: the column to be joined
* **how**: inner(default), left, right, outer

In [3]:
df1 = pd.DataFrame({'season': [1,2,3], 'team': ['LA', 'SF', 'NY'], 'winning rate': np.random.rand(3)})
df1.head()

Unnamed: 0,season,team,winning rate
0,1,LA,0.682425
1,2,SF,0.180681
2,3,NY,0.536681


In [4]:
df2 = pd.DataFrame({'season' : [1,1,2,4], 'team' : ['LA', 'LA', 'NY', 'KC'], 'winning rate' : np.random.rand(4)})
df2.head()

Unnamed: 0,season,team,winning rate
0,1,LA,0.560241
1,1,LA,0.185491
2,2,NY,0.230187
3,4,KC,0.928912


In [5]:
pd.merge(df1, df2, on=['season','team'])

Unnamed: 0,season,team,winning rate_x,winning rate_y
0,1,LA,0.682425,0.560241
1,1,LA,0.682425,0.185491


In [99]:
pd.merge(df1, df2, on='season', how='left') #merge with respect to df1, fill NaN for empty values

Unnamed: 0,season,team_x,winning rate_x,team_y,winning rate_y
0,1,LA,0.62,LA,0.02
1,1,LA,0.62,LA,0.65
2,2,SF,0.71,NY,0.56
3,3,NY,0.75,,


In [100]:
pd.merge(df1, df2, on='season', how='right') #merge with respect to df2, fill NaN for empty values

Unnamed: 0,season,team_x,winning rate_x,team_y,winning rate_y
0,1,LA,0.62,LA,0.02
1,1,LA,0.62,LA,0.65
2,2,SF,0.71,NY,0.56
3,4,,,KC,0.56


In [102]:
pd.merge(df1, df2, on='season', how='outer') #left merge + right merge

Unnamed: 0,season,team_x,winning rate_x,team_y,winning rate_y
0,1,LA,0.62,LA,0.02
1,1,LA,0.62,LA,0.65
2,2,SF,0.71,NY,0.56
3,3,NY,0.75,,
4,4,,,KC,0.56


### multi key join

In [103]:
pd.merge(df1, df2, on=['season', 'team'])

Unnamed: 0,season,team,winning rate_x,winning rate_y
0,1,LA,0.62,0.02
1,1,LA,0.62,0.65


### customize suffix

In [104]:
pd.merge(df1, df2, on=['season', 'team'], how='left', suffixes=('_1/2', '_2/2'))

Unnamed: 0,season,team,winning rate_1/2,winning rate_2/2
0,1,LA,0.62,0.02
1,1,LA,0.62,0.65
2,2,SF,0.71,
3,3,NY,0.75,


### merge over index
* left_index & right_index

In [113]:
df11 = df1.set_index('season')
df22 = df2.set_index('season')
pd.merge(df11, df22, left_index=True, right_index=True, how='outer')

Unnamed: 0_level_0,team_x,winning rate_x,team_y,winning rate_y
season,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
1,LA,0.62,LA,0.02
1,LA,0.62,LA,0.65
2,SF,0.71,NY,0.56
3,NY,0.75,,
4,,,KC,0.56
