# 置换和随机抽样

In [1]:
import numpy as np
import pandas as pd

In [3]:
# 5x4 demo frame holding the integers 0..19 in row-major order.
df = pd.DataFrame(np.arange(20).reshape((5, 4)))

In [4]:
df

Unnamed: 0,0,1,2,3
0,0,1,2,3
1,4,5,6,7
2,8,9,10,11
3,12,13,14,15
4,16,17,18,19


In [5]:
# Random ordering of df's row positions. The length is taken from df itself so
# the permutation always matches the number of rows instead of a hard-coded 5.
# No seed is set before this draw, so the ordering differs between runs.
sampler = np.random.permutation(len(df))

In [6]:
sampler

array([3, 4, 2, 1, 0])

In [7]:
# Reorder the rows of df according to the integer positions in sampler.
df.take(sampler)

Unnamed: 0,0,1,2,3
3,12,13,14,15
4,16,17,18,19
2,8,9,10,11
1,4,5,6,7
0,0,1,2,3


In [11]:
# Pick 3 rows at random; no random_state is passed, so the rows vary per run.
df.sample(n=3)

Unnamed: 0,0,1,2,3
1,4,5,6,7
4,16,17,18,19
2,8,9,10,11


## 有放回抽样(允许重复选择的样本)

In [12]:
# Population of five values to draw from.
choices = pd.Series(data=[5, 7, -1, 6, 4])

In [13]:
# Draw 10 values from the 5-element Series; replace=True allows repeats,
# which is what makes a sample larger than the population possible.
draws = choices.sample(n=10, replace=True)

In [15]:
type(draws)

pandas.core.series.Series

<br><br>
## 计算指标变量/虚拟变量(哑变量)

In [18]:
# Small demo frame: a categorical 'key' column plus a running counter.
df2 = pd.DataFrame({
    'key': list('bbacab'),
    'data1': range(6),
})

In [19]:
df2

Unnamed: 0,data1,key
0,0,b
1,1,b
2,2,a
3,3,c
4,4,a
5,5,b


In [20]:
# Build one indicator column per distinct value of df2['key'].
# NOTE(review): the 0/1 output shown below was presumably produced by an older
# pandas; newer releases may default to boolean indicators - confirm, and pass
# an explicit dtype if integers are required.
pd.get_dummies(df2['key'])

Unnamed: 0,a,b,c
0,0,1,0
1,0,1,0
2,1,0,0
3,0,0,1
4,1,0,0
5,0,1,0


In [22]:
# Same indicator columns, with each column name prefixed by 'key'
# (key_a, key_b, key_c) so they stay identifiable after a join.
dummies = pd.get_dummies(df2['key'], prefix='key')

In [23]:
dummies

Unnamed: 0,key_a,key_b,key_c
0,0,1,0
1,0,1,0
2,1,0,0
3,0,0,1
4,1,0,0
5,0,1,0


In [26]:
# df2[['data1']] (list selector) yields a one-column DataFrame, which is then
# joined on the index with the indicator columns.
df2_with_dummy = df2[['data1']].join(dummies)

In [27]:
df2_with_dummy

Unnamed: 0,data1,key_a,key_b,key_c
0,0,0,1,0
1,1,0,1,0
2,2,1,0,0
3,3,0,0,1
4,4,1,0,0
5,5,0,1,0


## MovieLens数据

In [28]:
# Column names for the movies file: id, title, genres
mnames = ["movie_id", "title", "genres"]

In [29]:
# '::' is a multi-character separator, which pandas interprets as a regular
# expression. Only the Python parsing engine supports that, so request it
# explicitly instead of relying on the automatic fallback, which emits a
# ParserWarning. The file has no header row; column names come from mnames.
movies = pd.read_table('datas/movies.dat', sep='::', header=None, names=mnames, engine='python')

  """Entry point for launching an IPython kernel.


In [30]:
movies.head()

Unnamed: 0,movie_id,title,genres
0,1,Toy Story (1995),Animation|Children's|Comedy
1,2,Jumanji (1995),Adventure|Children's|Fantasy
2,3,Grumpier Old Men (1995),Comedy|Romance
3,4,Waiting to Exhale (1995),Comedy|Drama
4,5,Father of the Bride Part II (1995),Comedy


In [31]:
# Flat list that will hold every genre label of every movie (duplicates kept).
all_genres = []

In [32]:
# Split each movie's '|'-separated genres string and flatten the pieces into
# one list. The list is rebuilt from scratch, so re-running this cell does not
# append the same labels a second time.
all_genres = [genre for x in movies.genres for genre in x.split('|')]

In [33]:
# Deduplicate the genre labels, keeping them in order of first appearance.
# The list is wrapped in an object-dtype Series because passing a plain Python
# list to pd.unique is deprecated in recent pandas releases.
genres = pd.unique(pd.Series(all_genres, dtype=object))

In [34]:
genres

array(['Animation', "Children's", 'Comedy', 'Adventure', 'Fantasy',
       'Romance', 'Drama', 'Action', 'Crime', 'Thriller', 'Horror',
       'Sci-Fi', 'Documentary', 'War', 'Musical', 'Mystery', 'Film-Noir',
       'Western'], dtype=object)

In [40]:
# One row per movie and one column per distinct genre, initialised to 0.0.
zero_matrix = np.zeros((len(movies), len(genres)))

In [45]:
# Wrap the zero matrix in a DataFrame whose columns are the genre labels.
# Note: this rebinds `dummies`, which an earlier cell used for the indicator
# columns of df2['key'].
dummies = pd.DataFrame(zero_matrix, columns=genres)

In [47]:
# Every entry is still 0 at this point; check the (movies, genres) dimensions.
dummies.shape

(3883, 18)

In [60]:
# Take the genres string at index label 9 as a worked example.
gen = movies.genres[9]

In [61]:
gen.split('|')

['Action', 'Adventure', 'Thriller']

In [62]:
# Translate each genre name of the example into its integer column position.
dummies.columns.get_indexer(gen.split('|'))

array([7, 3, 9])

In [53]:
# For each movie (by position i), look up the column positions of its genres
# and set those cells of row i to 1.
# Note: the loop rebinds `gen`, which an earlier cell set to movies.genres[9].
for i, gen in enumerate(movies.genres):
    indices = dummies.columns.get_indexer(gen.split('|'))
    dummies.iloc[i, indices] = 1

In [55]:
dummies.head()

Unnamed: 0,Animation,Children's,Comedy,Adventure,Fantasy,Romance,Drama,Action,Crime,Thriller,Horror,Sci-Fi,Documentary,War,Musical,Mystery,Film-Noir,Western
0,1.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,1.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [56]:
# Prefix every indicator column with 'Genre_' and attach the result to the
# movies frame by index.
movies_windic = movies.join(dummies.add_prefix('Genre_'))

In [57]:
movies_windic.iloc[0]

movie_id                                       1
title                           Toy Story (1995)
genres               Animation|Children's|Comedy
Genre_Animation                                1
Genre_Children's                               1
Genre_Comedy                                   1
Genre_Adventure                                0
Genre_Fantasy                                  0
Genre_Romance                                  0
Genre_Drama                                    0
Genre_Action                                   0
Genre_Crime                                    0
Genre_Thriller                                 0
Genre_Horror                                   0
Genre_Sci-Fi                                   0
Genre_Documentary                              0
Genre_War                                      0
Genre_Musical                                  0
Genre_Mystery                                  0
Genre_Film-Noir                                0
Genre_Western                                  0

## get_dummies 与 cut 函数结合使用

In [63]:
# Fix the global NumPy seed so the random draw that follows is reproducible.
np.random.seed(1234)

In [64]:
# Ten random floats drawn from NumPy's global generator.
values = np.random.rand(10)

In [65]:
values

array([0.19151945, 0.62210877, 0.43772774, 0.78535858, 0.77997581,
       0.27259261, 0.27646426, 0.80187218, 0.95813935, 0.87593263])

In [67]:
# Six evenly spaced edges from 0 to 1, i.e. five bins of width 0.2.
bins = np.linspace(start=0, stop=1, num=6)

In [69]:
# Assign each value to its interval with cut, then turn the resulting
# categories into one indicator column per interval.
# NOTE(review): the 0/1 output shown below was presumably produced by an older
# pandas; newer releases may default to boolean indicators - confirm, and pass
# an explicit dtype if integers are required.
pd.get_dummies(pd.cut(values, bins))

Unnamed: 0,"(0.0, 0.2]","(0.2, 0.4]","(0.4, 0.6]","(0.6, 0.8]","(0.8, 1.0]"
0,1,0,0,0,0
1,0,0,0,1,0
2,0,0,1,0,0
3,0,0,0,1,0
4,0,0,0,1,0
5,0,1,0,0,0
6,0,1,0,0,0
7,0,0,0,0,1
8,0,0,0,0,1
9,0,0,0,0,1
