In [6]:
import pandas as pd
from pandas import Series, DataFrame
import numpy as np

In [7]:
# Sample Series of single-letter labels (with repeats) reused by the cells below
obj = pd.Series(list('cadaabbcc'))
obj

0    c
1    a
2    d
3    a
4    a
5    b
6    b
7    c
8    c
dtype: object

In [9]:
# Distinct values of the Series, listed in order of first appearance.
# Note that the result is a NumPy array, not a Series.
obj.unique()

array(['c', 'a', 'd', 'b'], dtype=object)

In [11]:
# Count how many times each value appears. The result is a Series indexed by
# the distinct values, shown here with the most frequent values first.
obj.value_counts()

c    3
a    3
b    2
d    1
dtype: int64

In [15]:
# value_counts also works on a plain array: wrap the array in a Series first.
# (The top-level pd.value_counts helper is deprecated since pandas 2.1 and
# emits a FutureWarning, so the Series method is used instead.)
pd.Series(obj.values).value_counts()

c    3
a    3
b    2
d    1
dtype: int64

In [18]:
# Membership test: isin returns a boolean Series that is True wherever the
# element equals one of the given values
mask = obj.isin(['b', 'c'])
mask

0     True
1    False
2    False
3    False
4    False
5     True
6     True
7     True
8     True
dtype: bool

In [19]:
# Boolean indexing: keep only the entries where mask is True
obj[mask]

0    c
5    b
6    b
7    c
8    c
dtype: object

In [20]:
# Index.get_indexer is a related but less obvious tool: it maps each value to
# its integer position in an Index.
# First, build the Series of values to look up.
to_match = pd.Series(list('cabbca'))
to_match

0    c
1    a
2    b
3    b
4    c
5    a
dtype: object

In [22]:
# Second Series: the distinct values that will act as the lookup table
unique_vals = pd.Series(list('cba'))
unique_vals

0    c
1    b
2    a
dtype: object

In [24]:
# Build an Index object whose labels are the values of the second Series
pd.Index(unique_vals)

Index(['c', 'b', 'a'], dtype='object')

In [25]:
# The intermediate Series is not required: building the Index directly from a
# plain list gives the same Index
pd.Index(['c', 'b', 'a'])

Index(['c', 'b', 'a'], dtype='object')

In [26]:
# For each value in to_match, look up its integer position in the Index
pd.Index(unique_vals).get_indexer(to_match)
# The values in to_match are 'c', 'a', 'b', 'b', 'c', 'a'
# 'c' is at position 0 in the Index, so it maps to 0
# 'a' is at position 2 in the Index, so it maps to 2
# 'b' is at position 1 in the Index, so it maps to 1

array([0, 2, 1, 1, 0, 2], dtype=int64)

In [27]:
# Histogram example: a small frame with three integer columns whose value
# frequencies are tallied in the cells below
data = pd.DataFrame(
    {
        'Qu1': [1, 3, 4, 3, 4],
        'Qu2': [2, 3, 1, 2, 3],
        'Qu3': [1, 5, 2, 4, 4],
    }
)
data

Unnamed: 0,Qu1,Qu2,Qu3
0,1,2,1
1,3,3,5
2,4,1,2
3,3,2,4
4,4,3,4


In [30]:
# Count the number of appearances of each value in a single column.
# value_counts is a Series method, so select the column and call it directly
# (the top-level pd.value_counts helper is deprecated since pandas 2.1).
data['Qu1'].value_counts()

3    2
4    2
1    1
Name: Qu1, dtype: int64

In [32]:
# To tally every column of a DataFrame, pass the Series method to 'apply' so
# it runs once per column (pd.Series.value_counts replaces the top-level
# pd.value_counts helper, which is deprecated since pandas 2.1).
# Note that the result contains NaN wherever a value never shows up in a column.
# Also, the row index of the result is made of the distinct values found in
# the original dataframe.
data.apply(pd.Series.value_counts)

Unnamed: 0,Qu1,Qu2,Qu3
1,1.0,1.0,1.0
2,,2.0,1.0
3,2.0,2.0,
4,2.0,,2.0
5,,,1.0


In [33]:
# Replace NaN by zero: a value that is absent from a column has a count of 0.
# (pd.Series.value_counts replaces the top-level pd.value_counts helper, which
# is deprecated since pandas 2.1.)
data.apply(pd.Series.value_counts).fillna(0)

Unnamed: 0,Qu1,Qu2,Qu3
1,1.0,1.0,1.0
2,0.0,2.0,1.0
3,2.0,2.0,0.0
4,2.0,0.0,2.0
5,0.0,0.0,1.0


In [34]:
# Same values as the previous frame, except that the last 'Qu1' entry is 10
# instead of 4
data1 = pd.DataFrame(
    {
        'Qu1': [1, 3, 4, 3, 10],
        'Qu2': [2, 3, 1, 2, 3],
        'Qu3': [1, 5, 2, 4, 4],
    }
)
data1

Unnamed: 0,Qu1,Qu2,Qu3
0,1,2,1
1,3,3,5
2,4,1,2
3,3,2,4
4,10,3,4


In [37]:
# Note that the new value 10 now appears as a label in the result's index.
# (pd.Series.value_counts replaces the top-level pd.value_counts helper, which
# is deprecated since pandas 2.1.)
data1.apply(pd.Series.value_counts).fillna(0)

Unnamed: 0,Qu1,Qu2,Qu3
1,1.0,1.0,1.0
2,0.0,2.0,1.0
3,2.0,2.0,0.0
4,1.0,0.0,2.0
5,0.0,0.0,1.0
10,1.0,0.0,0.0
