# pandas 03 - DataFrame

by Nova@Douban

The video record of this session is here: https://zoom.us/recording/share/_nQCq70yxJPc9OMt64BtiBghcXzSQVKz0ROTB6yZxQ2wIumekTziMw

---

## 3.1 Concept

A DataFrame:

1. integrates multiple Series objects by aligning them along common index labels.


2. can be thought of as a dictionary-like container of one or more Series objects


3. or as a spreadsheet.

## 3.2 Creating a DataFrame

1. import from disk, e.g.:
    
    1. `pd.read_csv()`
    2. `pd.read_json()`
    
2. convert from other Python objects, e.g.:

    1. `pd.DataFrame.from_dict(dic)`
    
We will introduce these in detail in the next talk.

In [1]:
import pandas as pd

df1 = pd.read_csv('../data/nasdaq.csv')
df1.head()

Unnamed: 0,Date,Open,High,Low,Close,Adj Close,Volume
0,2018-11-23,6919.52002,6987.890137,6919.160156,6938.97998,6938.97998,958950000
1,2018-11-26,7026.5,7083.930176,7003.120117,7081.850098,7081.850098,2011180000
2,2018-11-27,7041.22998,7105.140137,7014.359863,7082.700195,7082.700195,2067360000
3,2018-11-28,7135.080078,7292.709961,7090.97998,7291.589844,7291.589844,2390260000
4,2018-11-29,7267.370117,7319.959961,7217.689941,7273.080078,7273.080078,1983460000


In [2]:
df2 = pd.read_json('../data/nasdaq.json', lines=True)
df2.head()

Unnamed: 0,Adj_close,Close,Date,High,Low,Max_diff,Open,Open_close_diff,Volume
0,6938.97998,6919.52002,2018-11-23,6987.890137,6919.160156,68.729981,6919.52002,6919.52002,958950000
1,7081.850098,7026.5,2018-11-26,7083.930176,7003.120117,80.810059,7026.5,7026.5,2011180000
2,7082.700195,7041.22998,2018-11-27,7105.140137,7014.359863,90.780274,7041.22998,7041.22998,2067360000
3,7291.589844,7135.080078,2018-11-28,7292.709961,7090.97998,201.729981,7135.080078,7135.080078,2390260000
4,7273.080078,7267.370117,2018-11-29,7319.959961,7217.689941,102.27002,7267.370117,7267.370117,1983460000


In [3]:
dic = {'a': [3, 9, 4], 'o': [0, 8, 5], 'k': [7, 5, 7]}
pd.DataFrame.from_dict(dic)

Unnamed: 0,a,o,k
0,3,0,7
1,9,8,5
2,4,5,7


---

## 3.3 Selecting columns

Selecting columns by:

1. by positions

    1. `df.iloc[:, int]`
    
    2. `df.iloc[:, int:int]`

2. by labels

    1. `df.loc[:, column names]`
    
    2. passing a list of values to `[]`

        1. column names cannot contain any space.
    
        2. even a single column name must be passed as a list to the `[]` operator to get a DataFrame back, hence the double set of brackets `[[]]`.

    3. ~~`DataFrame.column_name`~~

In [4]:
df1.iloc[:, 1]

0     6919.520020
1     7026.500000
2     7041.229980
3     7135.080078
4     7267.370117
5     7279.299805
6     7486.129883
7     7407.950195
8     7017.049805
9     7163.490234
10    6959.629883
11    7121.660156
12    7127.000000
13    7135.279785
14    6986.370117
15    6886.459961
16    6809.819824
17    6777.589844
18    6607.759766
19    6573.490234
Name: Open, dtype: float64

In [5]:
df1.iloc[:, 1:3]

Unnamed: 0,Open,High
0,6919.52002,6987.890137
1,7026.5,7083.930176
2,7041.22998,7105.140137
3,7135.080078,7292.709961
4,7267.370117,7319.959961
5,7279.299805,7332.790039
6,7486.129883,7486.509766
7,7407.950195,7421.109863
8,7017.049805,7189.52002
9,7163.490234,7205.370117


In [6]:
df1[['Open','Close']]

Unnamed: 0,Open,Close
0,6919.52002,6938.97998
1,7026.5,7081.850098
2,7041.22998,7082.700195
3,7135.080078,7291.589844
4,7267.370117,7273.080078
5,7279.299805,7330.540039
6,7486.129883,7441.509766
7,7407.950195,7158.430176
8,7017.049805,7188.259766
9,7163.490234,6969.25


In [7]:
df1.loc[:,'Open']

0     6919.520020
1     7026.500000
2     7041.229980
3     7135.080078
4     7267.370117
5     7279.299805
6     7486.129883
7     7407.950195
8     7017.049805
9     7163.490234
10    6959.629883
11    7121.660156
12    7127.000000
13    7135.279785
14    6986.370117
15    6886.459961
16    6809.819824
17    6777.589844
18    6607.759766
19    6573.490234
Name: Open, dtype: float64

In [8]:
df1.loc[:,'Open':'Close']

Unnamed: 0,Open,High,Low,Close
0,6919.52002,6987.890137,6919.160156,6938.97998
1,7026.5,7083.930176,7003.120117,7081.850098
2,7041.22998,7105.140137,7014.359863,7082.700195
3,7135.080078,7292.709961,7090.97998,7291.589844
4,7267.370117,7319.959961,7217.689941,7273.080078
5,7279.299805,7332.790039,7255.680176,7330.540039
6,7486.129883,7486.509766,7392.220215,7441.509766
7,7407.950195,7421.109863,7150.109863,7158.430176
8,7017.049805,7189.52002,6984.339844,7188.259766
9,7163.490234,7205.370117,6945.27002,6969.25


---

## 3.4 Selecting rows by index


Selecting rows by:

1. by positions

    1. `df.iloc[int]`
    
    2. `df.iloc[int:int]`

2. by labels

    1. `df.loc[row name]`
    
    2. passing a slice of row positions to `[]`, e.g. `df[2:3]`
    
3. by boolean selection

    `df[condition]`

In [9]:
df2.iloc[2]

Adj_close                       7082.7
Close                          7041.23
Date               2018-11-27 00:00:00
High                           7105.14
Low                            7014.36
Max_diff                       90.7803
Open                           7041.23
Open_close_diff                7041.23
Volume                      2067360000
Name: 2, dtype: object

In [10]:
df2.iloc[2:4]

Unnamed: 0,Adj_close,Close,Date,High,Low,Max_diff,Open,Open_close_diff,Volume
2,7082.700195,7041.22998,2018-11-27,7105.140137,7014.359863,90.780274,7041.22998,7041.22998,2067360000
3,7291.589844,7135.080078,2018-11-28,7292.709961,7090.97998,201.729981,7135.080078,7135.080078,2390260000


In [11]:
df2.index = df2['Date']
df2.loc['2018-11-27']

Adj_close                       7082.7
Close                          7041.23
Date               2018-11-27 00:00:00
High                           7105.14
Low                            7014.36
Max_diff                       90.7803
Open                           7041.23
Open_close_diff                7041.23
Volume                      2067360000
Name: 2018-11-27 00:00:00, dtype: object

In [12]:
df2.loc['2018-11-27':'2018-11-28']

Unnamed: 0_level_0,Adj_close,Close,Date,High,Low,Max_diff,Open,Open_close_diff,Volume
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
2018-11-27,7082.700195,7041.22998,2018-11-27,7105.140137,7014.359863,90.780274,7041.22998,7041.22998,2067360000
2018-11-28,7291.589844,7135.080078,2018-11-28,7292.709961,7090.97998,201.729981,7135.080078,7135.080078,2390260000


In [13]:
df2[2:3]

Unnamed: 0_level_0,Adj_close,Close,Date,High,Low,Max_diff,Open,Open_close_diff,Volume
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
2018-11-27,7082.700195,7041.22998,2018-11-27,7105.140137,7014.359863,90.780274,7041.22998,7041.22998,2067360000


In [14]:
df2[df2['Open'] > 7000]

Unnamed: 0_level_0,Adj_close,Close,Date,High,Low,Max_diff,Open,Open_close_diff,Volume
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
2018-11-26,7081.850098,7026.5,2018-11-26,7083.930176,7003.120117,80.810059,7026.5,7026.5,2011180000
2018-11-27,7082.700195,7041.22998,2018-11-27,7105.140137,7014.359863,90.780274,7041.22998,7041.22998,2067360000
2018-11-28,7291.589844,7135.080078,2018-11-28,7292.709961,7090.97998,201.729981,7135.080078,7135.080078,2390260000
2018-11-29,7273.080078,7267.370117,2018-11-29,7319.959961,7217.689941,102.27002,7267.370117,7267.370117,1983460000
2018-11-30,7330.540039,7279.299805,2018-11-30,7332.790039,7255.680176,77.109863,7279.299805,7279.299805,2542820000
2018-12-03,7441.509766,7486.129883,2018-12-03,7486.509766,7392.220215,94.289551,7486.129883,7486.129883,2621020000
2018-12-04,7158.430176,7407.950195,2018-12-04,7421.109863,7150.109863,271.0,7407.950195,7407.950195,2635810000
2018-12-06,7188.259766,7017.049805,2018-12-06,7189.52002,6984.339844,205.180176,7017.049805,7017.049805,2833870000
2018-12-07,6969.25,7163.490234,2018-12-07,7205.370117,6945.27002,260.100097,7163.490234,7163.490234,2475160000
2018-12-11,7031.830078,7121.660156,2018-12-11,7129.830078,6983.009766,146.820312,7121.660156,7121.660156,2246060000


In [15]:
df2[(df2['Open'] > 7000) & (df2['Close'] < 7100)]

Unnamed: 0_level_0,Adj_close,Close,Date,High,Low,Max_diff,Open,Open_close_diff,Volume
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
2018-11-26,7081.850098,7026.5,2018-11-26,7083.930176,7003.120117,80.810059,7026.5,7026.5,2011180000
2018-11-27,7082.700195,7041.22998,2018-11-27,7105.140137,7014.359863,90.780274,7041.22998,7041.22998,2067360000
2018-12-06,7188.259766,7017.049805,2018-12-06,7189.52002,6984.339844,205.180176,7017.049805,7017.049805,2833870000


---

## 3.5 A detailed view of `.loc` and `.iloc`

Both: 

1. access a group of rows and columns by index, label(s) or a boolean array;

2. if only one group of indices/labels is passed to `[]`, it is interpreted as selecting rows;

3. if two groups of indices/labels are passed (separated by a comma), the first group is interpreted as rows and the second group as columns;

4. are recommended by the official documentation for accessing data because of their high performance.


`.loc`:

1. is primarily label based;    
    
`.iloc`:

1. is primarily integer position based;
2. index / int.

---

## 3.6 Modifying the structure and content of DataFrame


### 3.6.1 Renaming columns

1. a column can be renamed using the `.rename()` method;

2. source and target names can be put in a dictionary;

3. `inplace` can be set to `True` to modify the DataFrame in place instead of returning a new one.

In [16]:
df2 = pd.read_csv('../data/nasdaq.csv')
print(df2.columns)
col_dic = {'Adj Close': 'Adj_close', 'Open_close_diff': 'Open_Close_diff'}
df2.rename(columns=col_dic, inplace=True)
print()
print(df2.columns)

Index(['Date', 'Open', 'High', 'Low', 'Close', 'Adj Close', 'Volume'], dtype='object')

Index(['Date', 'Open', 'High', 'Low', 'Close', 'Adj_close', 'Volume'], dtype='object')


---

### 3.6.2 Adding and inserting columns


1. use the `[]` operator to assign a Series to the rightmost of a DataFrame:

    1. This will modify the DataFrame in-place;

    2. pandas will first align the data, and then fill in the data from the Series into the new column;


2. use `.insert()` to insert a column at any position of a DataFrame:

    1. `loc` defines the position;
    
    2. `value` defines the values to be inserted;
    
    3. `column` defines the new column name.

In [17]:
df2['Test'] = pd.Series(2, index=df2.index)
df2.head()

Unnamed: 0,Date,Open,High,Low,Close,Adj_close,Volume,Test
0,2018-11-23,6919.52002,6987.890137,6919.160156,6938.97998,6938.97998,958950000,2
1,2018-11-26,7026.5,7083.930176,7003.120117,7081.850098,7081.850098,2011180000,2
2,2018-11-27,7041.22998,7105.140137,7014.359863,7082.700195,7082.700195,2067360000,2
3,2018-11-28,7135.080078,7292.709961,7090.97998,7291.589844,7291.589844,2390260000,2
4,2018-11-29,7267.370117,7319.959961,7217.689941,7273.080078,7273.080078,1983460000,2


In [18]:
df2.insert(loc=1, value=pd.Series(3, index=df2.index), column='Test2')
df2.head()

Unnamed: 0,Date,Test2,Open,High,Low,Close,Adj_close,Volume,Test
0,2018-11-23,3,6919.52002,6987.890137,6919.160156,6938.97998,6938.97998,958950000,2
1,2018-11-26,3,7026.5,7083.930176,7003.120117,7081.850098,7081.850098,2011180000,2
2,2018-11-27,3,7041.22998,7105.140137,7014.359863,7082.700195,7082.700195,2067360000,2
3,2018-11-28,3,7135.080078,7292.709961,7090.97998,7291.589844,7291.589844,2390260000,2
4,2018-11-29,3,7267.370117,7319.959961,7217.689941,7273.080078,7273.080078,1983460000,2


---

### 3.6.3 Replacing the contents of a column


1. if a column already exists, assigning to it with the `[]` operator will replace its contents.

In [19]:
df2['Test'] = pd.Series('Test', index=df2.index)
df2.head()

Unnamed: 0,Date,Test2,Open,High,Low,Close,Adj_close,Volume,Test
0,2018-11-23,3,6919.52002,6987.890137,6919.160156,6938.97998,6938.97998,958950000,Test
1,2018-11-26,3,7026.5,7083.930176,7003.120117,7081.850098,7081.850098,2011180000,Test
2,2018-11-27,3,7041.22998,7105.140137,7014.359863,7082.700195,7082.700195,2067360000,Test
3,2018-11-28,3,7135.080078,7292.709961,7090.97998,7291.589844,7291.589844,2390260000,Test
4,2018-11-29,3,7267.370117,7319.959961,7217.689941,7273.080078,7273.080078,1983460000,Test


---

### 3.6.4 Deleting columns in a DataFrame


1. `del` will simply delete the Series from the DataFrame (in-place) 

2. `pop()` will both delete the Series and return the Series as a result (also in-place) 

3. `drop(labels, axis=1)` will return a new DataFrame with the column(s) removed (the original DataFrame object is not modified)

In [20]:
del(df2['Test2'])
df2.head()

Unnamed: 0,Date,Open,High,Low,Close,Adj_close,Volume,Test
0,2018-11-23,6919.52002,6987.890137,6919.160156,6938.97998,6938.97998,958950000,Test
1,2018-11-26,7026.5,7083.930176,7003.120117,7081.850098,7081.850098,2011180000,Test
2,2018-11-27,7041.22998,7105.140137,7014.359863,7082.700195,7082.700195,2067360000,Test
3,2018-11-28,7135.080078,7292.709961,7090.97998,7291.589844,7291.589844,2390260000,Test
4,2018-11-29,7267.370117,7319.959961,7217.689941,7273.080078,7273.080078,1983460000,Test


In [21]:
df2.pop('Test')
df2.head()

Unnamed: 0,Date,Open,High,Low,Close,Adj_close,Volume
0,2018-11-23,6919.52002,6987.890137,6919.160156,6938.97998,6938.97998,958950000
1,2018-11-26,7026.5,7083.930176,7003.120117,7081.850098,7081.850098,2011180000
2,2018-11-27,7041.22998,7105.140137,7014.359863,7082.700195,7082.700195,2067360000
3,2018-11-28,7135.080078,7292.709961,7090.97998,7291.589844,7291.589844,2390260000
4,2018-11-29,7267.370117,7319.959961,7217.689941,7273.080078,7273.080078,1983460000


In [22]:
df3 = df2.drop('Close', axis=1)
print(df2.columns)
print(df3.columns)

Index(['Date', 'Open', 'High', 'Low', 'Close', 'Adj_close', 'Volume'], dtype='object')
Index(['Date', 'Open', 'High', 'Low', 'Adj_close', 'Volume'], dtype='object')


---

### 3.6.5 Adding rows to a DataFrame

Rows can be added to a DataFrame object via several different operations: 

1. Appending rows with `.append()` (deprecated in pandas 1.4 and removed in pandas 2.0; use `pd.concat()` instead)

	1. no alignment is performed, so duplicate index labels may occur;
    
	2. returns a new DataFrame with the rows from the original DataFrame first, followed by the rows from the second;

	3. The set of columns of the DataFrames do not need to be the same.
    

2. Concatenating DataFrame objects with `pd.concat()`:

	1. The result will have duplicate columns;


3. Adding rows (and columns) via setting with enlargement

	1. Rows can also be added to a DataFrame through the `.loc` property.

	2. The parameter for `.loc` specifies the index label where the row is to be placed.

	3. It updates data in-place, as `.loc` not only retrieves row(s), but also lets you modify the results that are returned.

	4. we use the colon in the rows' position to select all rows to be included to add the new column and value.

In [23]:
df3 = df1.append(df2, sort=False)
df3.describe()

Unnamed: 0,Open,High,Low,Close,Adj Close,Volume,Adj_close
count,40.0,40.0,40.0,40.0,20.0,40.0,20.0
mean,7036.433984,7094.115039,6932.403003,6996.186035,6996.186035,2492132000.0,6996.186035
std,233.718336,231.177982,264.673372,274.823281,278.415894,657921600.0,278.415894
min,6573.490234,6586.680176,6304.629883,6333.0,6333.0,958950000.0,6333.0
25%,6911.255005,6973.870117,6842.670166,6878.972656,6878.972656,2186262000.0,6878.972656
50%,7033.86499,7117.485108,6983.674805,7051.080078,7051.080078,2443730000.0,7051.080078
75%,7142.332397,7227.205078,7092.375,7165.887574,7165.887574,2643168000.0,7165.887574
max,7486.129883,7486.509766,7392.220215,7441.509766,7441.509766,4534120000.0,7441.509766


In [24]:
df3 = pd.concat([df1, df2], axis=0, sort=False)
df3.describe()

Unnamed: 0,Open,High,Low,Close,Adj Close,Volume,Adj_close
count,40.0,40.0,40.0,40.0,20.0,40.0,20.0
mean,7036.433984,7094.115039,6932.403003,6996.186035,6996.186035,2492132000.0,6996.186035
std,233.718336,231.177982,264.673372,274.823281,278.415894,657921600.0,278.415894
min,6573.490234,6586.680176,6304.629883,6333.0,6333.0,958950000.0,6333.0
25%,6911.255005,6973.870117,6842.670166,6878.972656,6878.972656,2186262000.0,6878.972656
50%,7033.86499,7117.485108,6983.674805,7051.080078,7051.080078,2443730000.0,7051.080078
75%,7142.332397,7227.205078,7092.375,7165.887574,7165.887574,2643168000.0,7165.887574
max,7486.129883,7486.509766,7392.220215,7441.509766,7441.509766,4534120000.0,7441.509766


In [25]:
# Work on an explicit copy of two columns of df2 (see the `.copy()` call).
df3 = df2[['Open', 'High']].copy()

# Setting with enlargement: assigning through `.loc` to a new label adds a row.
# NOTE(review): the new values are passed as strings, not numbers; this may
# change the dtype of both columns. Confirm this is intended.
df3.loc['2019-01-01 00:00:00'] = ['8000', '9000']
df3.tail()

Unnamed: 0,Open,High
16,6809.82,6847.27
17,6777.59,6868.86
18,6607.76,6666.2
19,6573.49,6586.68
2019-01-01 00:00:00,8000.0,9000.0


---

### 3.6.6 Removing rows from a DataFrame

Removing rows from a DataFrame object is normally performed using one of three techniques: 

1. Using the `.drop()` method 
2. Boolean selection 
3. Selection using a slice

    1. Remember that this result is a slice, and therefore a view into the original DataFrame.

	2. To avoid changing the original DataFrame through that view, the proper action is to make a copy of the slice.

In [26]:
df3.drop('2019-01-01 00:00:00', axis=0, inplace=True)
df3.tail()

Unnamed: 0,Open,High
15,6886.46,6931.81
16,6809.82,6847.27
17,6777.59,6868.86
18,6607.76,6666.2
19,6573.49,6586.68


In [27]:
df3[df3['Open'] > 7000]

Unnamed: 0,Open,High
1,7026.5,7083.93
2,7041.23,7105.14
3,7135.08,7292.71
4,7267.37,7319.96
5,7279.3,7332.79
6,7486.13,7486.51
7,7407.95,7421.11
8,7017.05,7189.52
9,7163.49,7205.37
11,7121.66,7129.83


In [28]:
df4 = df3[0:15].copy()
df4._is_view

False

---

## 3.7 Exercises

1. What do `axis=1` and `axis=0` mean? What functions in DataFrame have these two options?

2. What do `inplace=True` and `inplace=False` mean? What functions in DataFrame have these two options?

3. Build a `pandas` function: 
    
    1. check input data
    2. replace the spaces in column names.
    
---

Solutions to 3:

<img src="../image/columns.png">


In [18]:
import pandas as pd

def fix_col(df):
    """Return ``df`` with spaces in its column names replaced by underscores.

    Only string column labels that contain a space are renamed. Non-string
    labels (for example the integer labels pandas assigns by default) are
    left untouched instead of raising ``TypeError``.

    Parameters
    ----------
    df : pandas.DataFrame
        The frame whose column names should be cleaned.

    Returns
    -------
    pandas.DataFrame
        A new DataFrame with the renamed columns, or ``df`` itself (the same
        object, not a copy) when no column name contains a space.
    """
    rep_dic = {
        column: column.replace(' ', '_')
        for column in df.columns
        # Guard: `' ' in column` is only meaningful for string labels.
        if isinstance(column, str) and ' ' in column
    }
    # Skip the rename entirely when there is nothing to change.
    return df.rename(columns=rep_dic) if rep_dic else df

df = pd.read_csv('../data/nasdaq.csv')
print(df.columns)

new = fix_col(df)
new.columns

Index(['Date', 'Open', 'High', 'Low', 'Close', 'Adj Close', 'Volume'], dtype='object')


Index(['Date', 'Open', 'High', 'Low', 'Close', 'Adj_Close', 'Volume'], dtype='object')

---