In [34]:
import numpy as np
import pandas as pd

## Create pandas DataFrames
```python
# dictionary of lists: every key becomes a column, every list that column's values
data = {
    'state': ['Ohio', 'Ohio', 'Ohio', 'Nevada', 'Nevada', 'Nevada'],
    'year': [2000, 2001, 2002, 2001, 2002, 2003],
    'pop': [1.5, 1.7, 3.6, 2.4, 2.9, 3.2],
}
frame = pd.DataFrame(data=data)


```

In [35]:
# From a numpy array: the integers 0..19 laid out as 5 rows x 4 named columns
df = pd.DataFrame(
    data=np.arange(20).reshape(5, 4),
    columns=['alpha', 'beta', 'gamma', 'delta'],
)
df

Unnamed: 0,alpha,beta,gamma,delta
0,0,1,2,3
1,4,5,6,7
2,8,9,10,11
3,12,13,14,15
4,16,17,18,19


In [36]:
# confirm that df is a pandas DataFrame
type(df)

pandas.core.frame.DataFrame

## Indexing

In [37]:
# select a single column by its label; the result is a Series named 'alpha'
df['alpha']

0     0
1     4
2     8
3    12
4    16
Name: alpha, dtype: int64

In [38]:
# select the first two rows by integer position (rows 0 and 1)
df.iloc[:2]

Unnamed: 0,alpha,beta,gamma,delta
0,0,1,2,3
1,4,5,6,7


## DataFrame math

In [39]:
# direct math: arithmetic with a scalar is applied to every element of the frame
df2 = df * (9 / 5) + 32
df2

Unnamed: 0,alpha,beta,gamma,delta
0,32.0,33.8,35.6,37.4
1,39.2,41.0,42.8,44.6
2,46.4,48.2,50.0,51.8
3,53.6,55.4,57.2,59.0
4,60.8,62.6,64.4,66.2


In [40]:
# add two dataframes of same shape: values are summed element by element
df + df2

Unnamed: 0,alpha,beta,gamma,delta
0,32.0,34.8,37.6,40.4
1,43.2,46.0,48.8,51.6
2,54.4,57.2,60.0,62.8
3,65.6,68.4,71.2,74.0
4,76.8,79.6,82.4,85.2


In [41]:
# map a function to each column
def f(x):
    """Return the spread of ``x``: its largest value minus its smallest.

    ``x`` only needs to provide ``max()`` and ``min()`` methods, so it can be
    a DataFrame column (Series) or any similar container.
    """
    return x.max() - x.min()

# call f once per column; the result has one value per column label
df.apply(f)

alpha    16
beta     16
gamma    16
delta    16
dtype: int64

## DataFrame manipulation

In [42]:
# add a column by assigning a list with one value per row
# (this modifies df itself)
df['epsilon'] = ['low', 'medium', 'low', 'high', 'high']
df

Unnamed: 0,alpha,beta,gamma,delta,epsilon
0,0,1,2,3,low
1,4,5,6,7,medium
2,8,9,10,11,low
3,12,13,14,15,high
4,16,17,18,19,high


In [43]:
# (rows, columns): five columns now that 'epsilon' has been added
df.shape

(5, 5)

In [44]:
# IPython-only syntax: a trailing `?` shows the docstring of DataFrame.drop
# (this line is not valid in a plain Python script)
pd.DataFrame.drop?

In [45]:
# delete column: drop() returns a new frame without 'gamma';
# the result is not assigned, so df itself keeps the column
df.drop(columns=['gamma'])


Unnamed: 0,alpha,beta,delta,epsilon
0,0,1,3,low
1,4,5,7,medium
2,8,9,11,low
3,12,13,15,high
4,16,17,19,high


In [46]:
# df still contains 'gamma': the drop() call did not modify it
df

Unnamed: 0,alpha,beta,gamma,delta,epsilon
0,0,1,2,3,low
1,4,5,6,7,medium
2,8,9,10,11,low
3,12,13,14,15,high
4,16,17,18,19,high


In [47]:
# sorting values: order the rows by the 'epsilon' column
# (strings sort alphabetically: high, low, medium)
df.sort_values(by='epsilon')

Unnamed: 0,alpha,beta,gamma,delta,epsilon
3,12,13,14,15,high
4,16,17,18,19,high
0,0,1,2,3,low
2,8,9,10,11,low
1,4,5,6,7,medium


## Load data from file

Using `pd.read_csv()`

In [48]:
# nothing is skipped here: the first row of the file supplies the column names
data = pd.read_csv('data.csv')

In [49]:
# (rows, columns) of the loaded table
data.shape

(293, 14)

In [50]:
# peek at the first five rows; note the '?' placeholders in several columns
data.head()

Unnamed: 0,age,gender,cp,trestbps,chol,fbs,restecg,thalach,exang,oldpeak,slope,ca,thal,num
0,28,1,2,130,132,0,2,185,0,0.0,?,?,?,0
1,29,1,2,120,243,0,0,160,0,0.0,?,?,?,0
2,29,1,2,140,?,0,0,170,0,0.0,?,?,?,0
3,30,0,1,170,237,0,1,170,0,0.0,?,?,6,0
4,31,0,2,100,219,0,1,150,0,0.0,?,?,?,0


In [51]:
# list the column labels
data.columns

Index(['age', 'gender', 'cp', 'trestbps', 'chol', 'fbs', 'restecg', 'thalach',
       'exang', 'oldpeak', 'slope', 'ca', 'thal', 'num'],
      dtype='object')

## Summarize values
What are the mean, std, min, and max of each column?

In [52]:
# numeric_only=True restricts the mean to the numeric columns; without it,
# recent pandas versions raise a TypeError on the string (object) columns
data.mean(numeric_only=True)

age        47.767918
gender      0.723549
cp          2.979522
oldpeak     0.584642
num         0.358362
dtype: float64

In [53]:
# where are the other columns? Check dtypes:
# the columns listed as `object` are the ones missing from the mean
data.dtypes

age           int64
gender        int64
cp            int64
trestbps     object
chol         object
fbs          object
restecg      object
thalach      object
exang        object
oldpeak     float64
slope        object
ca           object
thal         object
num           int64
dtype: object

In [54]:
# convert dtypes
# (left commented out: at this point the frame still contains '?' strings,
#  which presumably cannot be parsed as floats - confirm before re-enabling)
# data.astype('float').dtypes

In [55]:
# replace the '?' placeholder with a real missing value (np.nan) rather than
# the string 'NaN', so that isnull()/fillna() recognise it as missing
data = data.replace('?', np.nan)
data.head()

Unnamed: 0,age,gender,cp,trestbps,chol,fbs,restecg,thalach,exang,oldpeak,slope,ca,thal,num
0,28,1,2,130,132.0,0,2,185,0,0.0,,,,0
1,29,1,2,120,243.0,0,0,160,0,0.0,,,,0
2,29,1,2,140,,0,0,170,0,0.0,,,,0
3,30,0,1,170,237.0,0,1,170,0,0.0,,,6.0,0
4,31,0,2,100,219.0,0,1,150,0,0.0,,,,0


In [56]:
# convert dtypes: cast every column to float now that the '?' placeholders
# are gone, then check that all columns report float64
data = data.astype('float')
data.dtypes

age         float64
gender      float64
cp          float64
trestbps    float64
chol        float64
fbs         float64
restecg     float64
thalach     float64
exang       float64
oldpeak     float64
slope       float64
ca          float64
thal        float64
num         float64
dtype: object

In [57]:
# we could have loaded the data with the na_values argument,
# which makes read_csv treat every '?' as a missing value at load time
data = pd.read_csv('data.csv', na_values='?')

In [58]:
# summary statistics per column; the `count` row is the number of non-missing values
data.describe() # ignores NaN

Unnamed: 0,age,gender,cp,trestbps,chol,fbs,restecg,thalach,exang,oldpeak,slope,ca,thal,num
count,293.0,293.0,293.0,292.0,270.0,285.0,292.0,292.0,292.0,293.0,103.0,3.0,28.0,293.0
mean,47.767918,0.723549,2.979522,132.592466,250.759259,0.070175,0.215753,139.212329,0.30137,0.584642,1.893204,0.0,5.642857,0.358362
std,7.76015,0.448007,0.964928,17.656176,67.767297,0.255892,0.459372,23.587727,0.459641,0.909879,0.34049,0.0,1.615074,0.48034
min,28.0,0.0,1.0,92.0,85.0,0.0,0.0,82.0,0.0,0.0,1.0,0.0,3.0,0.0
25%,42.0,0.0,2.0,120.0,209.0,0.0,0.0,122.0,0.0,0.0,2.0,0.0,5.25,0.0
50%,49.0,1.0,3.0,130.0,243.0,0.0,0.0,140.0,0.0,0.0,2.0,0.0,6.0,0.0
75%,54.0,1.0,4.0,140.0,282.75,0.0,0.0,155.0,1.0,1.0,2.0,0.0,7.0,1.0
max,66.0,1.0,4.0,200.0,603.0,1.0,2.0,190.0,1.0,5.0,3.0,0.0,7.0,1.0


## Find nans
How many NaNs are in each column?

In [59]:
# boolean mask with the same shape as data: True where a value is missing
data.isnull()

Unnamed: 0,age,gender,cp,trestbps,chol,fbs,restecg,thalach,exang,oldpeak,slope,ca,thal,num
0,False,False,False,False,False,False,False,False,False,False,True,True,True,False
1,False,False,False,False,False,False,False,False,False,False,True,True,True,False
2,False,False,False,False,True,False,False,False,False,False,True,True,True,False
3,False,False,False,False,False,False,False,False,False,False,True,True,False,False
4,False,False,False,False,False,False,False,False,False,False,True,True,True,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
288,False,False,False,False,False,False,False,False,False,False,False,True,True,False
289,False,False,False,False,False,False,False,False,False,False,True,True,True,False
290,False,False,False,False,False,False,False,False,False,False,False,True,True,False
291,False,False,False,False,False,False,False,False,False,False,False,True,True,False


In [60]:
# summing the boolean mask column by column counts the missing values per column
data.isnull().sum()

age           0
gender        0
cp            0
trestbps      1
chol         23
fbs           8
restecg       1
thalach       1
exang         1
oldpeak       0
slope       190
ca          290
thal        265
num           0
dtype: int64

In [61]:
# fill each column's missing values with that column's minimum, then summarize;
# every count is now 293 because no NaNs remain
data.fillna(data.min()).describe()

Unnamed: 0,age,gender,cp,trestbps,chol,fbs,restecg,thalach,exang,oldpeak,slope,ca,thal,num
count,293.0,293.0,293.0,293.0,293.0,293.0,293.0,293.0,293.0,293.0,293.0,293.0,293.0,293.0
mean,47.767918,0.723549,2.979522,132.453925,237.74744,0.068259,0.215017,139.017065,0.300341,0.584642,1.313993,0.0,3.25256,0.358362
std,7.76015,0.448007,0.964928,17.784731,78.898698,0.252622,0.458758,23.783334,0.459191,0.909879,0.472217,0.0,0.920301,0.48034
min,28.0,0.0,1.0,92.0,85.0,0.0,0.0,82.0,0.0,0.0,1.0,0.0,3.0,0.0
25%,42.0,0.0,2.0,120.0,198.0,0.0,0.0,122.0,0.0,0.0,1.0,0.0,3.0,0.0
50%,49.0,1.0,3.0,130.0,237.0,0.0,0.0,140.0,0.0,0.0,1.0,0.0,3.0,0.0
75%,54.0,1.0,4.0,140.0,277.0,0.0,0.0,155.0,1.0,1.0,2.0,0.0,3.0,1.0
max,66.0,1.0,4.0,200.0,603.0,1.0,2.0,190.0,1.0,5.0,3.0,0.0,7.0,1.0


## Remove duplicate rows

## Count unique values (a histogram)

In [62]:
# number of rows for each distinct age, most frequent first
data['age'].value_counts()

54    25
48    19
52    17
55    15
49    15
46    13
53    12
43    12
50    12
39    11
41    11
47    10
56    10
51     9
58     9
59     8
37     8
45     8
44     7
42     7
40     7
38     7
36     5
35     5
57     5
32     4
34     4
60     2
33     2
31     2
65     2
61     2
62     2
29     2
30     1
66     1
63     1
28     1
Name: age, dtype: int64