# The Data Frame

The DataFrame is a tabular data structure very similar to a spreadsheet (Excel spreadsheets being the most familiar example). It is designed to extend the Series to multiple dimensions: a DataFrame consists of an ordered collection of columns, each of which can hold values of a different type (numeric, string, Boolean, etc.). <br>
![image.png](attachment:image.png)

In [2]:
import numpy as np
import pandas as pd

---

## Defining a DataFrame

In [23]:
# Column-oriented input: every key becomes a column label and every list
# supplies the values of that column, one entry per row.
data = {
    'color': ['blue', 'green', 'yellow', 'red', 'white'],
    'object': ['ball', 'pen', 'pencil', 'paper', 'mug'],
    'price': [1.2, 1.0, 0.6, 0.9, 1.7],
}
# No index is passed, so the rows receive the default labels 0..4.
frame = pd.DataFrame(data)

In [24]:
frame

Unnamed: 0,color,object,price
0,blue,ball,1.2
1,green,pen,1.0
2,yellow,pencil,0.6
3,red,paper,0.9
4,white,mug,1.7


In [25]:
list(frame.index)

[0, 1, 2, 3, 4]

In [26]:
list(frame.columns)

['color', 'object', 'price']

In [27]:
# Same dictionary, but the rows are labelled 1..5 instead of the default 0..4.
one_based_labels = range(1, 6)
frame1 = pd.DataFrame(data, index=one_based_labels)

In [28]:
frame1

Unnamed: 0,color,object,price
1,blue,ball,1.2
2,green,pen,1.0
3,yellow,pencil,0.6
4,red,paper,0.9
5,white,mug,1.7


In [29]:
# Build the frame from a subset of the dictionary keys; 'object' is left out.
selected_columns = ['color', 'price']
frame2 = pd.DataFrame(data, columns=selected_columns)

In [30]:
frame2

Unnamed: 0,color,price
0,blue,1.2
1,green,1.0
2,yellow,0.6
3,red,0.9
4,white,1.7


In [31]:
frame

Unnamed: 0,color,object,price
0,blue,ball,1.2
1,green,pen,1.0
2,yellow,pencil,0.6
3,red,paper,0.9
4,white,mug,1.7


In [32]:
type(frame.index)

pandas.core.indexes.range.RangeIndex

In [33]:
type(frame.columns)

pandas.core.indexes.base.Index

In [36]:
# Row labels do not have to be numbers: here every row is tagged with a letter.
letter_labels = ['a', 'b', 'c', 'd', 'e']
frame3 = pd.DataFrame(data, index=letter_labels)

In [37]:
frame3

Unnamed: 0,color,object,price
a,blue,ball,1.2
b,green,pen,1.0
c,yellow,pencil,0.6
d,red,paper,0.9
e,white,mug,1.7


In [38]:
# A 3x3 block of random integers in [100, 200), wrapped in a DataFrame with
# explicit row and column labels.
random_block = np.random.randint(100, 200, size=(3, 3))
frame4 = pd.DataFrame(
    random_block,
    index=[0, 1, 2],
    columns=['col1', 'col2', 'col3'],
)

In [39]:
frame4

Unnamed: 0,col1,col2,col3
0,117,120,158
1,158,139,111
2,190,108,139


In [40]:
# Two Series of equal length with the default integer index:
# ser0 holds random integers in [100, 200), ser1 holds standard-normal floats.
n_values = 10
random_ints = np.random.randint(100, 200, size=n_values)
random_floats = np.random.randn(n_values)
ser0 = pd.Series(random_ints)
ser1 = pd.Series(random_floats)

In [45]:
ser1

0   -0.160265
1   -1.886158
2    1.240339
3    0.660729
4    1.468705
5    0.587150
6    0.267705
7   -0.668544
8    0.943668
9   -0.298977
dtype: float64

In [54]:
# Place the two Series side by side as columns. Joining along axis=1 keeps
# each Series' own dtype (ser0 stays integer, ser1 stays float), whereas
# stacking them as rows and transposing forces both columns to a common
# float dtype.
frame5 = pd.concat([ser0, ser1], axis=1)

In [56]:
# Give the two columns descriptive names; assigning to .columns relabels
# the existing frame5 object in place.
frame5.columns = ['col1', 'col2']

## Selecting Elements

In [60]:
frame5.values

array([[ 1.78000000e+02, -1.60265110e-01],
       [ 1.06000000e+02, -1.88615778e+00],
       [ 1.48000000e+02,  1.24033947e+00],
       [ 1.00000000e+02,  6.60729481e-01],
       [ 1.00000000e+02,  1.46870536e+00],
       [ 1.75000000e+02,  5.87150153e-01],
       [ 1.34000000e+02,  2.67705087e-01],
       [ 1.98000000e+02, -6.68544106e-01],
       [ 1.41000000e+02,  9.43668343e-01],
       [ 1.60000000e+02, -2.98977355e-01]])

In [61]:
frame.dtypes

color      object
object     object
price     float64
dtype: object

In [62]:
frame['price']

0    1.2
1    1.0
2    0.6
3    0.9
4    1.7
Name: price, dtype: float64

In [63]:
frame.price

0    1.2
1    1.0
2    0.6
3    0.9
4    1.7
Name: price, dtype: float64

In [64]:
frame

Unnamed: 0,color,object,price
0,blue,ball,1.2
1,green,pen,1.0
2,yellow,pencil,0.6
3,red,paper,0.9
4,white,mug,1.7


In [66]:
frame.iloc[2]

color     yellow
object    pencil
price        0.6
Name: 2, dtype: object

In [67]:
frame.loc[:, 'color']

0      blue
1     green
2    yellow
3       red
4     white
Name: color, dtype: object

In [68]:
frame['color']

0      blue
1     green
2    yellow
3       red
4     white
Name: color, dtype: object

In [69]:
# Compare the two selections by content. An identity check (`is`) only tells
# whether both expressions hand back the very same Python object, which is a
# pandas implementation detail rather than a statement about the data.
frame.loc[:, 'color'].equals(frame['color'])

True

In [70]:
frame.loc[0:2, ['color', 'object']]

Unnamed: 0,color,object
0,blue,ball
1,green,pen
2,yellow,pencil


In [71]:
# Name the row axis and the column axis; both names show up in the header
# when the frame is displayed.
frame.index.name = 'id'
frame.columns.name = 'item'

In [72]:
frame

item,color,object,price
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
0,blue,ball,1.2
1,green,pen,1.0
2,yellow,pencil,0.6
3,red,paper,0.9
4,white,mug,1.7


In [78]:
# Add a column of random integers in [100, 200). The number of values is
# taken from the frame itself, so the assignment keeps working if the row
# count of `frame` ever differs from five.
frame['number'] = np.random.randint(100, 200, len(frame))

In [79]:
frame

item,color,object,price,number
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
0,blue,ball,1.2,134
1,green,pen,1.0,141
2,yellow,pencil,0.6,128
3,red,paper,0.9,107
4,white,mug,1.7,156


In [85]:
# Ten rows by four columns of random integers in [100, 200); rows and
# columns receive the default positional labels.
random_grid = np.random.randint(100, 200, size=(10, 4))
frame6 = pd.DataFrame(random_grid)
frame6

Unnamed: 0,0,1,2,3
0,135,129,114,180
1,105,122,198,199
2,107,116,125,149
3,104,149,134,104
4,166,183,109,134
5,109,176,195,115
6,192,140,114,151
7,173,150,145,174
8,173,196,160,118
9,109,116,164,113


In [83]:
frame6 < 150

Unnamed: 0,0,1,2,3
0,True,False,True,False
1,False,False,False,True
2,True,False,True,True
3,False,True,False,False
4,False,False,False,True
5,False,True,True,True
6,True,False,False,False
7,True,True,False,False
8,False,True,False,False
9,True,True,False,True


In [86]:
frame6.isin([150])

Unnamed: 0,0,1,2,3
0,False,False,False,False
1,False,False,False,False
2,False,False,False,False
3,False,False,False,False
4,False,False,False,False
5,False,False,False,False
6,False,False,False,False
7,False,True,False,False
8,False,False,False,False
9,False,False,False,False


In [87]:
frame

item,color,object,price,number
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
0,blue,ball,1.2,134
1,green,pen,1.0,141
2,yellow,pencil,0.6,128
3,red,paper,0.9,107
4,white,mug,1.7,156


In [88]:
frame.isin(['green', 'paper', 0.9])

item,color,object,price,number
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
0,False,False,False,False
1,True,False,False,False
2,False,False,False,False
3,False,True,True,False
4,False,False,False,False


In [89]:
frame[frame.isin(['green', 'paper', 0.9])]

item,color,object,price,number
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
0,,,,
1,green,,,
2,,,,
3,,paper,0.9,
4,,,,


In [90]:
# Assigning a scalar to a new column label creates the column and repeats
# the value in every row.
frame['new'] = 10

In [91]:
frame

item,color,object,price,number,new
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
0,blue,ball,1.2,134,10
1,green,pen,1.0,141,10
2,yellow,pencil,0.6,128,10
3,red,paper,0.9,107,10
4,white,mug,1.7,156,10


## Deleting a Column

In [92]:
# `del` removes the column from `frame` in place.
del frame['new']

In [93]:
frame

item,color,object,price,number
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
0,blue,ball,1.2,134
1,green,pen,1.0,141
2,yellow,pencil,0.6,128
3,red,paper,0.9,107
4,white,mug,1.7,156


## Filtering

In [94]:
frame6

Unnamed: 0,0,1,2,3
0,135,129,114,180
1,105,122,198,199
2,107,116,125,149
3,104,149,134,104
4,166,183,109,134
5,109,176,195,115
6,192,140,114,151
7,173,150,145,174
8,173,196,160,118
9,109,116,164,113


In [98]:
frame6[frame6 < 150]

Unnamed: 0,0,1,2,3
0,135.0,129.0,114.0,
1,105.0,122.0,,
2,107.0,116.0,125.0,149.0
3,104.0,149.0,134.0,104.0
4,,,109.0,134.0
5,109.0,,,115.0
6,,140.0,114.0,
7,,,145.0,
8,,,,118.0
9,109.0,116.0,,113.0


In [100]:
# Nested mapping of colour -> {year: value}. Note that 'red' has no entry
# for 2011, while the other two colours cover 2011-2013.
nestdict = {
    'red': {2012: 22, 2013: 33},
    'white': {2011: 13, 2012: 22, 2013: 16},
    'blue': {2011: 17, 2012: 27, 2013: 18},
}

In [101]:
# Outer keys become the column labels and inner keys the row labels; a
# combination with no value (here 'red' for 2011) is left as missing.
frame7 = pd.DataFrame(data=nestdict)

In [102]:
frame7

Unnamed: 0,red,white,blue
2011,,13,17
2012,22.0,22,27
2013,33.0,16,18


In [103]:
# Two Series of length 10 are passed as a list, so each one becomes a ROW:
# the first holds integers in [0, 10), the second integers in [100, 200).
small_values = pd.Series(np.random.randint(0, 10, size=10))
large_values = pd.Series(np.random.randint(100, 200, size=10))
frame8 = pd.DataFrame([small_values, large_values])

In [108]:
# Transpose so that each Series becomes a column instead of a row.
# NOTE: this cell rebinds frame8 to its own transpose, so running it a second
# time flips the frame back to the row-wise layout.
frame8 = frame8.T

In [109]:
frame8

Unnamed: 0,0,1
0,9,109
1,4,174
2,7,181
3,9,182
4,0,122
5,3,169
6,5,150
7,9,167
8,4,117
9,9,161


---

# The Index Objects

Index objects hold the labels along each axis, together with other metadata such as the axis name. An Index object is immutable.

In [110]:
# A Series whose index consists of colour names rather than integer positions.
color_labels = ['red', 'blue', 'yellow', 'white', 'green']
ser = pd.Series([5, 0, 3, 8, 4], index=color_labels)
ser.index

Index(['red', 'blue', 'yellow', 'white', 'green'], dtype='object')

## Methods on Index

In [111]:
ser.idxmin()

'blue'

In [112]:
ser.idxmax()

'white'

## Index with Duplicate Labels

In [113]:
# Index labels may repeat: 'white' and 'green' each label two entries.
repeated_labels = ['white', 'white', 'blue', 'green', 'green', 'yellow']
serd = pd.Series(range(6), index=repeated_labels)

In [114]:
serd

white     0
white     1
blue      2
green     3
green     4
yellow    5
dtype: int64

In [115]:
serd['white']

white    0
white    1
dtype: int64

In [116]:
serd.loc['white']

white    0
white    1
dtype: int64

In [118]:
serd.index.is_unique

False

In [119]:
serd.is_unique

True

---

# Important Points

- DataFrame creation:
    - from numpy object
    - from series
    - from a dictionary: each key is a column name, each value is a list, array or Series
    - from a nested dictionary: each outer key is a column name, each inner key is an index label
- In the constructor we can specify the column names and the index labels
- Indexing: 
    - A['column_name']
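    - A['column_name']['row_label'] (select the column first, then the row; A['row', 'column'] is not valid for a plain DataFrame)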
    - A.loc['row_label', 'column_label']
    - A.iloc[row_number, column_number]
- Adding a new column: A['new_column_name'] = values (a scalar is repeated in every row)
- isnull, notnull, isin: element-wise tests, similar to numpy, that return a boolean object
- We can specify object.index.name, object.columns.name
- object.index, return index labels of DataFrame
- object.columns, return column labels of DataFrame
- object.idxmin, return the index label of the minimum value
- object.idxmax, return the index label of the maximum value
- object.index.is_unique, return True if no index label is repeated, False otherwise
- object.is_unique, return True if no value of the Series is repeated, False otherwise