# Data wrangling

## Arrays

- **Numpy**: numeric computation
 - array operations
 - masking
 - smoothening filter

In [1]:
import numpy as np

usually homogenous

must have all dimensions equal, can't have 
```
[
 [1,2,3], 
 [1,2]
]
```

In [262]:
squares = np.array([0, 1, 4, 9, 16, 25, 36])

In [264]:
squares + 100

array([100, 101, 104, 109, 116, 125, 136])

In [266]:
squares.mean()

13.0

In [324]:
np.arange(10) ** 3

array([  0,   1,   8,  27,  64, 125, 216, 343, 512, 729])

---

In [325]:
mask = (squares > 5)

In [326]:
mask

array([False, False, False,  True,  True,  True,  True])

In [327]:
squares[mask]

array([ 9, 16, 25, 36])

---

In [343]:
# arrays can be n-dimensional
m = np.array([
    [5, 2, 3],
    [4, 5, 1],
    [7, 1, 2],
    [6, 2, 9]
])

In [344]:
m.shape

(4, 3)

In [345]:
m * 10

array([[50, 20, 30],
       [40, 50, 10],
       [70, 10, 20],
       [60, 20, 90]])

In [346]:
m[:2]  # first two rows

array([[5, 2, 3],
       [4, 5, 1]])

In [348]:
m[:, :2]  # all rows, first two columns

array([[5, 2],
       [4, 5],
       [7, 1],
       [6, 2]])

In [349]:
m[:2, :2]  # first two rows of the first two columns

array([[5, 2],
       [4, 5]])

---

reshaping

In [39]:
a = np.arange(12)

In [38]:
a

array([ 0,  1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11])

In [43]:
a.reshape(4, 3)  # 4 rows, 3 columns

array([[ 0,  1,  2],
       [ 3,  4,  5],
       [ 6,  7,  8],
       [ 9, 10, 11]])

---

In [None]:
more than two dimensions

In [23]:
R = [1, 0, 0]  # red
B = [0, 0, 1]  # blue
W = [1, 1, 1]  # white

In [255]:
picture = np.array([
    [B, B, R, R],
    [B, B, W, W],
    [R, R, R, R],
    [W, W, W, W],
    [R, R, R, R],
    [W, W, W, W],
])

In [256]:
picture

array([[[0, 0, 1],
        [0, 0, 1],
        [1, 0, 0],
        [1, 0, 0]],

       [[0, 0, 1],
        [0, 0, 1],
        [1, 1, 1],
        [1, 1, 1]],

       [[1, 0, 0],
        [1, 0, 0],
        [1, 0, 0],
        [1, 0, 0]],

       [[1, 1, 1],
        [1, 1, 1],
        [1, 1, 1],
        [1, 1, 1]],

       [[1, 0, 0],
        [1, 0, 0],
        [1, 0, 0],
        [1, 0, 0]],

       [[1, 1, 1],
        [1, 1, 1],
        [1, 1, 1],
        [1, 1, 1]]])

In [257]:
picture.shape

(6, 4, 3)

In [12]:
# rank is the number of dimensions

In [None]:
most functions allow you to specify an axis

In [258]:
picture.sum()  # overall

44

In [259]:
picture.sum(axis=0)  # sum of each column

array([[4, 2, 4],
       [4, 2, 4],
       [6, 3, 3],
       [6, 3, 3]])

In [260]:
picture.sum(axis=1)  # sum of each row

array([[2, 0, 2],
       [2, 2, 4],
       [4, 0, 0],
       [4, 4, 4],
       [4, 0, 0],
       [4, 4, 4]])

In [261]:
picture.sum(axis=2)  # the sum of all three color channels

array([[1, 1, 1, 1],
       [1, 1, 3, 3],
       [1, 1, 1, 1],
       [3, 3, 3, 3],
       [1, 1, 1, 1],
       [3, 3, 3, 3]])

---

### Broadcasting

In [329]:
picture

array([[[0, 0, 1],
        [0, 0, 1],
        [1, 0, 0],
        [1, 0, 0]],

       [[0, 0, 1],
        [0, 0, 1],
        [1, 1, 1],
        [1, 1, 1]],

       [[1, 0, 0],
        [1, 0, 0],
        [1, 0, 0],
        [1, 0, 0]],

       [[1, 1, 1],
        [1, 1, 1],
        [1, 1, 1],
        [1, 1, 1]],

       [[1, 0, 0],
        [1, 0, 0],
        [1, 0, 0],
        [1, 0, 0]],

       [[1, 1, 1],
        [1, 1, 1],
        [1, 1, 1],
        [1, 1, 1]]])

In [334]:
picture + 10

array([[[10, 10, 11],
        [10, 10, 11],
        [11, 10, 10],
        [11, 10, 10]],

       [[10, 10, 11],
        [10, 10, 11],
        [11, 11, 11],
        [11, 11, 11]],

       [[11, 10, 10],
        [11, 10, 10],
        [11, 10, 10],
        [11, 10, 10]],

       [[11, 11, 11],
        [11, 11, 11],
        [11, 11, 11],
        [11, 11, 11]],

       [[11, 10, 10],
        [11, 10, 10],
        [11, 10, 10],
        [11, 10, 10]],

       [[11, 11, 11],
        [11, 11, 11],
        [11, 11, 11],
        [11, 11, 11]]])

In [333]:
picture + [10, 0, 0]

array([[[10,  0,  1],
        [10,  0,  1],
        [11,  0,  0],
        [11,  0,  0]],

       [[10,  0,  1],
        [10,  0,  1],
        [11,  1,  1],
        [11,  1,  1]],

       [[11,  0,  0],
        [11,  0,  0],
        [11,  0,  0],
        [11,  0,  0]],

       [[11,  1,  1],
        [11,  1,  1],
        [11,  1,  1],
        [11,  1,  1]],

       [[11,  0,  0],
        [11,  0,  0],
        [11,  0,  0],
        [11,  0,  0]],

       [[11,  1,  1],
        [11,  1,  1],
        [11,  1,  1],
        [11,  1,  1]]])

In [359]:
picture + [
    [10, 0,  0],
    [10, 0,  0],
    [0,  0, 10],
    [0,  0, 10],
]  # add this to each horizontal line of pixels in the picture

array([[[10,  0,  1],
        [10,  0,  1],
        [ 1,  0, 10],
        [ 1,  0, 10]],

       [[10,  0,  1],
        [10,  0,  1],
        [ 1,  1, 11],
        [ 1,  1, 11]],

       [[11,  0,  0],
        [11,  0,  0],
        [ 1,  0, 10],
        [ 1,  0, 10]],

       [[11,  1,  1],
        [11,  1,  1],
        [ 1,  1, 11],
        [ 1,  1, 11]],

       [[11,  0,  0],
        [11,  0,  0],
        [ 1,  0, 10],
        [ 1,  0, 10]],

       [[11,  1,  1],
        [11,  1,  1],
        [ 1,  1, 11],
        [ 1,  1, 11]]])

In [None]:
dtype

## Dataframes

- **Pandas**: powerful dataframes
 - pivoting
 - grouping

In [45]:
import pandas as pd

In [None]:
# like matrix, but made for labeled data, and with functionality built in

In [188]:
students = pd.DataFrame({
    'height':    np.random.randint(150, 200, size=5),
    'weight':    np.random.randint(50,  100, size=5),
    'graduated': np.random.random(size=5) > .5,  # same as np.random.randint(0, 1, size=10).astype(bool)
})

In [189]:
students

Unnamed: 0,height,weight,graduated
0,176,97,False
1,157,86,False
2,150,97,True
3,184,50,False
4,150,52,True


In [190]:
students.index = list('abcde')  # it is implicitly indexed by numbers, but we can change that

In [191]:
students

Unnamed: 0,height,weight,graduated
a,176,97,False
b,157,86,False
c,150,97,True
d,184,50,False
e,150,52,True


In [192]:
len(students)

5

In [240]:
students.dtypes

height       int64
weight       int64
graduated     bool
age          int64
can_ride      bool
dtype: object

In [242]:
students.height.astype(float)

a    176.0
b    157.0
c    150.0
d    184.0
e    150.0
x    170.0
Name: height, dtype: float64

### Accessing

In [193]:
students.loc['a']  # we can now acess either by their name

height         176
weight          97
graduated    False
Name: a, dtype: object

In [194]:
students.iloc[0]  # or by their position

height         176
weight          97
graduated    False
Name: a, dtype: object

In [243]:
students[:3]

Unnamed: 0,height,weight,graduated,age,can_ride
a,176,97,False,23,True
b,157,86,False,19,False
c,150,97,True,21,False


In [195]:
students.graduated  # we can also access column-wise

a    False
b    False
c     True
d    False
e     True
Name: graduated, dtype: bool

In [277]:
columns = ['weight', 'height']
students[columns]

Unnamed: 0,weight,height
a,97,176
b,86,157
c,97,150
d,50,184
e,52,150
x,70,170


In [196]:
students[students.graduated]  # use the boolean column as a mask

Unnamed: 0,height,weight,graduated
c,150,97,True
e,150,52,True


In [244]:
for column in students:
    print(column)

height
weight
graduated
age
can_ride


In [246]:
for student, rows in students.iterrows():
    print(student, rows.age)

a 23
b 19
c 21
d 23
e 22
x 23


### Adding

In [197]:
students.loc['x'] = (170, 70, True)  # adding a new row

In [198]:
students

Unnamed: 0,height,weight,graduated
a,176,97,False
b,157,86,False
c,150,97,True
d,184,50,False
e,150,52,True
x,170,70,True


In [199]:
students['age'] = np.random.randint(18, 24, size=6)  # adding a new column

In [200]:
students

Unnamed: 0,height,weight,graduated,age
a,176,97,False,23
b,157,86,False,19
c,150,97,True,21
d,184,50,False,23
e,150,52,True,22
x,170,70,True,23


In [220]:
# create a column based on another
students['can_ride'] = students.height > 170  # "you must be this tall to ride the roller coaster"

In [221]:
students

Unnamed: 0,height,weight,graduated,age,can_ride
a,176,97,False,23,True
b,157,86,False,19,False
c,150,97,True,21,False
d,184,50,False,23,True
e,150,52,True,22,False
x,170,70,True,23,False


---

In [268]:
new_students = pd.DataFrame({
    'height': [160, 180],
    'weight': [ 60,  80],
})

In [273]:
new_students

Unnamed: 0,height,weight
0,160,60
1,180,80


In [294]:
students = pd.concat([students, new_students], sort=False)
students

Unnamed: 0,height,weight,graduated,age,can_ride
a,176,97,False,23.0,True
b,157,86,False,19.0,False
c,150,97,True,21.0,False
d,184,50,False,23.0,True
e,150,52,True,22.0,False
x,170,70,True,23.0,False
0,160,60,,,
1,180,80,,,


In [317]:
n_students = len(students)
new_info = pd.DataFrame({
    'fav_number':   np.random.randint(0, 100, size=n_students),
    'fav_icecream': np.random.choice(['vanilla', 'chocolate', 'strawberry'], size=n_students),
})
new_info.index = students.index

In [305]:
new_info

Unnamed: 0,fav_number,fav_icecream
a,42,strawberry
b,60,chocolate
c,38,strawberry
d,68,strawberry
e,90,vanilla
x,0,chocolate
0,47,vanilla
1,9,chocolate


In [308]:
students = students.merge(new_info, left_index=True, right_index=True)
students

Unnamed: 0,height,weight,graduated,age,can_ride,fav_number,fav_icecream
a,176,97,False,23.0,True,42,strawberry
b,157,86,False,19.0,False,60,chocolate
c,150,97,True,21.0,False,38,strawberry
d,184,50,False,23.0,True,68,strawberry
e,150,52,True,22.0,False,90,vanilla
x,170,70,True,23.0,False,0,chocolate
0,160,60,,,,47,vanilla
1,180,80,,,,9,chocolate


---

### missing values

In [313]:
students.style.highlight_null()

Unnamed: 0,height,weight,graduated,age,can_ride,fav_number,fav_icecream
a,176,97,False,23.0,True,42,strawberry
b,157,86,False,19.0,False,60,chocolate
c,150,97,True,21.0,False,38,strawberry
d,184,50,False,23.0,True,68,strawberry
e,150,52,True,22.0,False,90,vanilla
x,170,70,True,23.0,False,0,chocolate
0,160,60,,,,47,vanilla
1,180,80,,,,9,chocolate


In [319]:
pd.isna(students)

Unnamed: 0,height,weight,graduated,age,can_ride,fav_number,fav_icecream
a,False,False,False,False,False,False,False
b,False,False,False,False,False,False,False
c,False,False,False,False,False,False,False
d,False,False,False,False,False,False,False
e,False,False,False,False,False,False,False
x,False,False,False,False,False,False,False
0,False,False,True,True,True,False,False
1,False,False,True,True,True,False,False


In [322]:
pd.isna(students.age)

a    False
b    False
c    False
d    False
e    False
x    False
0     True
1     True
Name: age, dtype: bool

In [321]:
students.fillna({
    'graduated': False,
    'can_ride':  False,
    'age':       20,
})

Unnamed: 0,height,weight,graduated,age,can_ride,fav_number,fav_icecream
a,176,97,False,23.0,True,42,strawberry
b,157,86,False,19.0,False,60,chocolate
c,150,97,True,21.0,False,38,strawberry
d,184,50,False,23.0,True,68,strawberry
e,150,52,True,22.0,False,90,vanilla
x,170,70,True,23.0,False,0,chocolate
0,160,60,False,20.0,False,47,vanilla
1,180,80,False,20.0,False,9,chocolate


---

### Deleting

In [282]:
students.drop('height', axis=1)  # delete entire columns

Unnamed: 0,weight,graduated,age,can_ride
a,97,False,23,True
b,86,False,19,False
c,97,True,21,False
d,50,False,23,True
e,52,True,22,False
x,70,True,23,False


In [286]:
mask = (students.age > 21)  # delete some rows
students[mask]

Unnamed: 0,height,weight,graduated,age,can_ride
a,176,97,False,23,True
d,184,50,False,23,True
e,150,52,True,22,False
x,170,70,True,23,False


### Array operations

In [203]:
# array-wise functions have been extended to Series

In [204]:
students.mean()  # not that the mean of the boolean variable `graduated` is precisely the percentage of students who graduated

height       164.500000
weight        75.333333
graduated      0.500000
age           21.833333
can_ride       0.333333
dtype: float64

In [205]:
students.mean(axis=1)  # the average for each student, of their height, weight and graduation status.. which does not make much sense

a    59.4
b    52.4
c    53.8
d    51.6
e    45.0
x    52.8
dtype: float64

In [206]:
students.weight - 10  # if only it were this easy to lose weight 😅

a    87
b    76
c    87
d    40
e    42
x    60
Name: weight, dtype: int64

In [229]:
students.height + students.weight

a    273
b    243
c    247
d    234
e    202
x    240
dtype: int64

In [226]:
students.graduated & ~students.can_ride  # students who graduated but cannot ride

a    False
b    False
c     True
d    False
e     True
x     True
dtype: bool

In [235]:
students.weight.apply(lambda w: (w // 10) * 10)  # apply to each element of a column

a    90
b    80
c    90
d    50
e    50
x    70
Name: weight, dtype: int64

In [236]:
students.apply(lambda row: row.height + row.weight, axis=1)  # apply row-wise

a    273
b    243
c    247
d    234
e    202
x    240
dtype: int64

In [238]:
def relabel_boolean(x):
    # if the argument is not a boolean, leave it as it is
    if type(x) is not bool:
        return x
    # True -> yes; False -> no
    return 'yes' if x is True else 'no'

In [239]:
students.applymap(relabel_boolean)  # apply element-wise

Unnamed: 0,height,weight,graduated,age,can_ride
a,176,97,no,23,yes
b,157,86,no,19,no
c,150,97,yes,21,no
d,184,50,no,23,yes
e,150,52,yes,22,no
x,170,70,yes,23,no


### Data manipulation

In [169]:
# pivoting "flips" the data according to a discrete variable
pd.pivot_table(
    students,
    index='graduated',
    values=['height', 'weight'],
    aggfunc='mean',
)

Unnamed: 0_level_0,height,weight
graduated,Unnamed: 1_level_1,Unnamed: 2_level_1
False,177.5,74.5
True,171.75,98.0


In [170]:
pd.pivot_table(
    df,
    index='graduated',
    values=['height', 'weight'],
    aggfunc='max',
)

Unnamed: 0_level_0,height,weight
graduated,Unnamed: 1_level_1,Unnamed: 2_level_1
False,197,117
True,199,118


---

Melting can be thought of as the reverse of pivoting

In [208]:
height_evolution = pd.DataFrame({
    'alice': np.linspace(160, 190, num=5),
    'bob':   np.linspace(170, 180, num=5),
    'year':  range(2000, 2005),
})

In [209]:
height_evolution

Unnamed: 0,alice,bob,year
0,160.0,170.0,2000
1,167.5,172.5,2001
2,175.0,175.0,2002
3,182.5,177.5,2003
4,190.0,180.0,2004


In [212]:
melted = height_evolution.melt(
    id_vars='year',
    value_vars=['alice', 'bob'],

    var_name='person',
    value_name='height',
)
melted

Unnamed: 0,year,person,height
0,2000,alice,160.0
1,2001,alice,167.5
2,2002,alice,175.0
3,2003,alice,182.5
4,2004,alice,190.0
5,2000,bob,170.0
6,2001,bob,172.5
7,2002,bob,175.0
8,2003,bob,177.5
9,2004,bob,180.0


In [216]:
melted.person.str.title()  # apply string functions on the entire column

0    Alice
1    Alice
2    Alice
3    Alice
4    Alice
5      Bob
6      Bob
7      Bob
8      Bob
9      Bob
Name: person, dtype: object

In [278]:
max_height = height_evolution.max()

In [279]:
max_height

alice     190.0
bob       180.0
year     2004.0
dtype: float64

---

In [135]:
# you can also create dataframes row-wise
performance = pd.DataFrame([
    ('alice', 'CS101', 4.0),
    ('alice', 'CS102', 3.0),
    ('alice', 'CS201', 4.0),
    ('bob',   'CS101', 3.0),
    ('bob',   'CS201', 4.0),
], columns=['student', 'class', 'grade'])

In [136]:
performance

Unnamed: 0,student,class,grade
0,alice,CS101,4.0
1,alice,CS102,3.0
2,alice,CS201,4.0
3,bob,CS101,3.0
4,bob,CS201,4.0


In [137]:
performance.groupby('student').grade.mean()

student
alice    3.666667
bob      3.500000
Name: grade, dtype: float64

In [138]:
for student, rows in performance.groupby('student'):
    print(student, 'took', len(rows), 'classes, with an average of', rows.grade.mean())

alice took 3 classes, with an average of 3.6666666666666665
bob took 2 classes, with an average of 3.5


---

### Loading

In [372]:
pd.read_json('example_files/objects.json')

Unnamed: 0,grade,name,year
0,3.9,Alice,2
1,3.8,Bob,3
2,3.85,Chris,1


In [373]:
pd.read_csv('example_files/tabular.csv')

Unnamed: 0,grade,name,year
0,3.9,Alice,2
1,3.8,Bob,3
2,3.85,Chris,1


In [384]:
import html5lib

In [387]:
pd.read_html('http://en.wikipedia.org/wiki/Python_(programming_language)')

ImportError: lxml not found, please install it

In [None]:
performance.to_csv('students_performance.csv')