* pandas introduces two new data structures to Python - Series and DataFrame
- SERIES: A Series is a one-dimensional object similar to an array, list, or column in a table. It will assign a labeled index to each item in the Series. By default, each item will receive an index label from 0 to N, where N is the length of the Series minus one.

* DATAFRAME: A DataFrame is a tabular data structure comprised of rows and columns, akin to a spreadsheet or a database table.
 You can also think of a DataFrame as a group of Series objects that share an index (the column names).


In [43]:
import numpy as np
import pandas as pd

In [44]:
# Option 1: build the DataFrame in memory from a dict that maps each column
# name to a list/array of values (all of length 8).
# game_id is 8 evenly spaced values from 100 to 200, cast to int32.
data = { 'game_id': np.linspace(100,200, 8, dtype=np.int32),
        'year': [2010, 2011, 2012, 2011, 2012, 2010, 2011, 2012],
        'team': ['Bears', 'Bears', 'Bears', 'Packers', 'Packers', 'Lions', 'Lions', 'Lions'],
        'wins': [11, 8, 10, 15, 11, 6, 10, 4],
        'losses': [5, 8, 6, 1, 5, 10, 6, 12]}
# `columns=` fixes the column order of the resulting frame.
football = pd.DataFrame(data, columns=['game_id','year', 'team', 'wins', 'losses'])
# OR
# Option 2: load the data from a CSV file. This rebinds `football`, so the
# frame displayed below is the one read from 'football.csv' (it contains
# teams such as Rockets that are not present in `data`).
football = pd.read_csv('football.csv')
football

Unnamed: 0,game_id,year,team,wins,losses
0,100.0,2010.0,Bears,11.0,5.0
1,,2010.0,Rockets,13.0,6.0
2,114.0,2011.0,Bears,8.0,8.0
3,145.0,2012.0,NODATA,9.0,3.0
4,128.0,2012.0,Bears,10.0,6.0
5,167.0,,Rockets,12.0,3.0
6,142.0,2011.0,Packers,15.0,1.0
7,187.0,2012.0,Rockets,13.0,
8,157.0,2012.0,Packers,11.0,5.0
9,187.0,2010.0,Packers,,5.0


In [45]:
football.loc[1] # access one row by its index label (NOTE: label-based row access requires .loc[]); the row comes back as a Series

game_id        NaN
year          2010
team       Rockets
wins            13
losses           6
Name: 1, dtype: object

In [46]:
football.loc[[1,5,7]] # access several rows at once by passing a list of index labels

Unnamed: 0,game_id,year,team,wins,losses
1,,2010.0,Rockets,13.0,6.0
5,167.0,,Rockets,12.0,3.0
7,187.0,2012.0,Rockets,13.0,


In [47]:
# To access a single column we can use [] directly, without loc.
# The result is a Series named after the column.
football['year']

0     2010.0
1     2010.0
2     2011.0
3     2012.0
4     2012.0
5        NaN
6     2011.0
7     2012.0
8     2012.0
9     2010.0
10    2010.0
11    2011.0
12    2012.0
Name: year, dtype: float64

In [48]:
# To access multiple columns, pass a list of column names inside []
# (note the double brackets); the result is a DataFrame.
football[['year', 'team']]

Unnamed: 0,year,team
0,2010.0,Bears
1,2010.0,Rockets
2,2011.0,Bears
3,2012.0,NODATA
4,2012.0,Bears
5,,Rockets
6,2011.0,Packers
7,2012.0,Rockets
8,2012.0,Packers
9,2010.0,Packers


In [49]:
# We can also select columns by integer position with iloc:
# ':' keeps every row, [1,2] picks the columns at positions 1 and 2
# (year and team here).
football.iloc[:, [1,2]]

Unnamed: 0,year,team
0,2010.0,Bears
1,2010.0,Rockets
2,2011.0,Bears
3,2012.0,NODATA
4,2012.0,Bears
5,,Rockets
6,2011.0,Packers
7,2012.0,Rockets
8,2012.0,Packers
9,2010.0,Packers


In [50]:
# We can slice both rows and columns by integer position (the numpy way),
# but this requires iloc. Slice ends are exclusive: ':3' is the first three
# rows and '1:3' is the columns at positions 1 and 2.
football.iloc[:3, 1:3]

Unnamed: 0,year,team
0,2010.0,Bears
1,2010.0,Rockets
2,2011.0,Bears


In [51]:
# Most DataFrame methods return a new DataFrame, while offering an inplace parameter.
football.set_index('game_id') # use game_id as the index. This returns a new DataFrame and won't change the current one

Unnamed: 0_level_0,year,team,wins,losses
game_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
100.0,2010.0,Bears,11.0,5.0
,2010.0,Rockets,13.0,6.0
114.0,2011.0,Bears,8.0,8.0
145.0,2012.0,NODATA,9.0,3.0
128.0,2012.0,Bears,10.0,6.0
167.0,,Rockets,12.0,3.0
142.0,2011.0,Packers,15.0,1.0
187.0,2012.0,Rockets,13.0,
157.0,2012.0,Packers,11.0,5.0
187.0,2010.0,Packers,,5.0


In [52]:
football.set_index('game_id', inplace=True) # with inplace=True the change is applied to this DataFrame itself instead of a new copy

In [53]:
# Display the frame again: game_id is now the index rather than a column.
football

Unnamed: 0_level_0,year,team,wins,losses
game_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
100.0,2010.0,Bears,11.0,5.0
,2010.0,Rockets,13.0,6.0
114.0,2011.0,Bears,8.0,8.0
145.0,2012.0,NODATA,9.0,3.0
128.0,2012.0,Bears,10.0,6.0
167.0,,Rockets,12.0,3.0
142.0,2011.0,Packers,15.0,1.0
187.0,2012.0,Rockets,13.0,
157.0,2012.0,Packers,11.0,5.0
187.0,2010.0,Packers,,5.0


In [54]:
# Access rows using the new index: .loc now looks up game_id values.
# Rows are returned in the order the labels are listed.
# NOTE(review): the 200 row in the output below (Lions) matches the frame
# built from `data`, not the CSV table shown earlier - confirm which frame
# was loaded when this cell ran.
football.loc[[157, 114, 200]]

Unnamed: 0_level_0,year,team,wins,losses
game_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
157.0,2012.0,Packers,11.0,5.0
114.0,2011.0,Bears,8.0,8.0
200.0,2012.0,Lions,4.0,12.0


In [55]:
# Using position index for rows and columns.
# game_id is now the index, not a column, so column positions have shifted:
# '1:3' selects team and wins here (it selected year and team before set_index).
football.iloc[:3, 1:3]

Unnamed: 0_level_0,team,wins
game_id,Unnamed: 1_level_1,Unnamed: 2_level_1
100.0,Bears,11.0
,Rockets,13.0
114.0,Bears,8.0


In [56]:
'''Assignments'''
# Simplest case, one specific location: .loc[row_label, column_name] = value
# sets the single cell at row label 100, column 'wins'.
football.loc[100, 'wins'] = 12
football

Unnamed: 0_level_0,year,team,wins,losses
game_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
100.0,2010.0,Bears,12.0,5.0
,2010.0,Rockets,13.0,6.0
114.0,2011.0,Bears,8.0,8.0
145.0,2012.0,NODATA,9.0,3.0
128.0,2012.0,Bears,10.0,6.0
167.0,,Rockets,12.0,3.0
142.0,2011.0,Packers,15.0,1.0
187.0,2012.0,Rockets,13.0,
157.0,2012.0,Packers,11.0,5.0
187.0,2010.0,Packers,,5.0


In [58]:
# Multiple rows (and multiple columns) at once: every cell selected by the
# row-label list and the column-name list is set to the same scalar.
# NOTE(review): label 200 does not appear in the output table below; how a
# missing label in a list indexer is handled may depend on the pandas
# version - confirm.
football.loc[[100, 200], ['wins','losses'] ]  = 0
football

Unnamed: 0_level_0,year,team,wins,losses
game_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
100.0,2010.0,Bears,0.0,0.0
,2010.0,Rockets,13.0,6.0
114.0,2011.0,Bears,8.0,8.0
145.0,2012.0,NODATA,9.0,3.0
128.0,2012.0,Bears,10.0,6.0
167.0,,Rockets,12.0,3.0
142.0,2011.0,Packers,15.0,1.0
187.0,2012.0,Rockets,13.0,
157.0,2012.0,Packers,11.0,5.0
187.0,2010.0,Packers,,5.0


In [59]:
# Multiple rows & columns.
# The index label 187 is duplicated, so both rows labelled 187 are updated
# (see the output below).
football.loc[[145,187], ['wins', 'losses']] = 10
football

Unnamed: 0_level_0,year,team,wins,losses
game_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
100.0,2010.0,Bears,0.0,0.0
,2010.0,Rockets,13.0,6.0
114.0,2011.0,Bears,8.0,8.0
145.0,2012.0,NODATA,10.0,10.0
128.0,2012.0,Bears,10.0,6.0
167.0,,Rockets,12.0,3.0
142.0,2011.0,Packers,15.0,1.0
187.0,2012.0,Rockets,10.0,10.0
157.0,2012.0,Packers,11.0,5.0
187.0,2010.0,Packers,10.0,10.0


In [42]:
''' NEVER DO ASSIGNMENT LIKE THIS.
This will result in a warning and unintended operation as original dataframe may or may not get modified
but a copy is modified. 
'''
# Chained indexing (anti-pattern, shown on purpose): .loc[...] first returns
# an intermediate object, and the second [...] assigns into that object
# rather than directly into `football`.
football.loc[100]['losses'] = 0 
# OR
football.loc[[100,200]]['losses'] = 10
# Preferred form, as the warning below suggests: do the whole selection in a
# single .loc[row_indexer, col_indexer] = value call, e.g.
#   football.loc[100, 'losses'] = 0

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  after removing the cwd from sys.path.
