# DataFrames with 2-dimensional data 

## DataFrame common methods

In [20]:
import pandas as pd

In [21]:
# load the NBA roster CSV into a dataframe and preview its first five rows
nba = pd.read_csv(filepath_or_buffer="~/Projects/DataAnalysisPandas/Data/nba.csv")
nba.head()

Unnamed: 0,Name,Team,Number,Position,Age,Height,Weight,College,Salary
0,Avery Bradley,Boston Celtics,0.0,PG,25.0,6-2,180.0,Texas,7730337.0
1,Jae Crowder,Boston Celtics,99.0,SF,25.0,6-6,235.0,Marquette,6796117.0
2,John Holland,Boston Celtics,30.0,SG,27.0,6-5,205.0,Boston University,
3,R.J. Hunter,Boston Celtics,28.0,SG,22.0,6-5,185.0,Georgia State,1148640.0
4,Jonas Jerebko,Boston Celtics,8.0,PF,29.0,6-10,231.0,,5000000.0


### Select One, Two or More Columns from a DataFrame

In [22]:
# a single column can be reached as an attribute (dot notation) of the dataframe,
# but only when its name has no spaces -- otherwise use brackets, e.g. nba["Name"]
# selecting a single column returns a Series
nba.Name.head()

0    Avery Bradley
1      Jae Crowder
2     John Holland
3      R.J. Hunter
4    Jonas Jerebko
Name: Name, dtype: object

In [23]:
# to select multiple columns, pass a list of column names inside the brackets
# selecting more than one column always returns a new dataframe
# the columns come back in the order listed, so this is also a way to reorder them
nba[["Name", "Team"]].head()

Unnamed: 0,Name,Team
0,Avery Bradley,Boston Celtics
1,Jae Crowder,Boston Celtics
2,John Holland,Boston Celtics
3,R.J. Hunter,Boston Celtics
4,Jonas Jerebko,Boston Celtics


### Add New Column to DataFrame

In [24]:
# there are two main ways of adding a new column
# 1. assign a value to a column name that does not exist yet
#    (a single value is repeated on every row)
# the new column is appended to the end of the dataframe
nba["New Column"] = "new column data"
nba.head()

Unnamed: 0,Name,Team,Number,Position,Age,Height,Weight,College,Salary,New Column
0,Avery Bradley,Boston Celtics,0.0,PG,25.0,6-2,180.0,Texas,7730337.0,new column data
1,Jae Crowder,Boston Celtics,99.0,SF,25.0,6-6,235.0,Marquette,6796117.0,new column data
2,John Holland,Boston Celtics,30.0,SG,27.0,6-5,205.0,Boston University,,new column data
3,R.J. Hunter,Boston Celtics,28.0,SG,22.0,6-5,185.0,Georgia State,1148640.0,new column data
4,Jonas Jerebko,Boston Celtics,8.0,PF,29.0,6-10,231.0,,5000000.0,new column data


In [25]:
# 2. insert the new column at a chosen position in the dataframe
# DataFrame.insert() raises a ValueError when the column already exists, so only
# insert when it is missing -- this keeps the cell safe to run more than once.
if "new column data 2" not in nba.columns:
    nba.insert(loc = 2, column = "new column data 2", value = "new column data 2")
nba.head(5)

Unnamed: 0,Name,Team,new column data 2,Number,Position,Age,Height,Weight,College,Salary,New Column
0,Avery Bradley,Boston Celtics,new column data 2,0.0,PG,25.0,6-2,180.0,Texas,7730337.0,new column data
1,Jae Crowder,Boston Celtics,new column data 2,99.0,SF,25.0,6-6,235.0,Marquette,6796117.0,new column data
2,John Holland,Boston Celtics,new column data 2,30.0,SG,27.0,6-5,205.0,Boston University,,new column data
3,R.J. Hunter,Boston Celtics,new column data 2,28.0,SG,22.0,6-5,185.0,Georgia State,1148640.0,new column data
4,Jonas Jerebko,Boston Celtics,new column data 2,8.0,PF,29.0,6-10,231.0,,5000000.0,new column data


### Drop Rows or Columns with Null Values

In [26]:
# by default, dropna() removes every row that contains at least one NaN.
# the 'axis' parameter switches between dropping rows and dropping columns, and
# the 'how' parameter chooses whether "any" NaN or "all" NaN triggers the drop.
# when dropping rows, the 'subset' parameter restricts the NaN check to the listed columns.

# here: drop only the columns whose values are all NaN
nba.dropna(axis="columns", how="all", inplace=True)
nba.head()


Unnamed: 0,Name,Team,new column data 2,Number,Position,Age,Height,Weight,College,Salary,New Column
0,Avery Bradley,Boston Celtics,new column data 2,0.0,PG,25.0,6-2,180.0,Texas,7730337.0,new column data
1,Jae Crowder,Boston Celtics,new column data 2,99.0,SF,25.0,6-6,235.0,Marquette,6796117.0,new column data
2,John Holland,Boston Celtics,new column data 2,30.0,SG,27.0,6-5,205.0,Boston University,,new column data
3,R.J. Hunter,Boston Celtics,new column data 2,28.0,SG,22.0,6-5,185.0,Georgia State,1148640.0,new column data
4,Jonas Jerebko,Boston Celtics,new column data 2,8.0,PF,29.0,6-10,231.0,,5000000.0,new column data


### Fill in Null Values with the .fillna() Method

In [30]:
# calling .fillna() on the whole dataframe applies the one fill value to every column,
# so it is better to fill each column (Series) separately with a value that suits it.
# assign the filled Series back to the column: an inplace fillna on nba['College'] acts
# on a temporary Series, which raises a FutureWarning in pandas 2.x and leaves the
# dataframe unchanged once Copy-on-Write is in force (pandas 3.0).

nba['College'] = nba['College'].fillna('No College')
nba['Age'] = nba['Age'].fillna(0.0)

### The .astype() Method

In [33]:
# .astype() converts a column (Series) from one data type to another
# it has no inplace parameter, so assign the converted Series back to the column
nba["Age"] = nba["Age"].astype("int")
nba.head()

Unnamed: 0,Name,Team,new column data 2,Number,Position,Age,Height,Weight,College,Salary,New Column
0,Avery Bradley,Boston Celtics,new column data 2,0.0,PG,25,6-2,180.0,Texas,7730337.0,new column data
1,Jae Crowder,Boston Celtics,new column data 2,99.0,SF,25,6-6,235.0,Marquette,6796117.0,new column data
2,John Holland,Boston Celtics,new column data 2,30.0,SG,27,6-5,205.0,Boston University,,new column data
3,R.J. Hunter,Boston Celtics,new column data 2,28.0,SG,22,6-5,185.0,Georgia State,1148640.0,new column data
4,Jonas Jerebko,Boston Celtics,new column data 2,8.0,PF,29,6-10,231.0,No College,5000000.0,new column data


In [35]:
# .astype('category') can save memory on columns with few distinct values.
# for example, instead of storing the same string 100 times, a category column
# stores each distinct string once and has every row refer to it.
nba["Position"] = nba["Position"].astype("category")

### Sort a DataFrame with the .sort_values() Method

In [37]:
# sort by a column and choose whether rows with NaN in it are placed first or last
# the NaN rows are grouped at that end; the remaining rows are sorted by value
nba.sort_values(by="Salary", na_position="first").head()

Unnamed: 0,Name,Team,new column data 2,Number,Position,Age,Height,Weight,College,Salary,New Column
2,John Holland,Boston Celtics,new column data 2,30.0,SG,27,6-5,205.0,Boston University,,new column data
46,Elton Brand,Philadelphia 76ers,new column data 2,42.0,PF,37,6-9,254.0,Duke,,new column data
171,Dahntay Jones,Cleveland Cavaliers,new column data 2,30.0,SG,35,6-6,225.0,Duke,,new column data
264,Jordan Farmar,Memphis Grizzlies,new column data 2,4.0,PG,29,6-2,180.0,UCLA,,new column data
269,Ray McCallum,Memphis Grizzlies,new column data 2,5.0,PG,24,6-3,190.0,Detroit,,new column data


In [40]:
# sort on multiple columns at once by passing a list of column names
# each column can have its own ascending/descending rule
# the rows keep their original index labels; use .sort_index() to restore index order

nba.sort_values(by=["Team", "Name"], ascending=[True, False]).head()


Unnamed: 0,Name,Team,new column data 2,Number,Position,Age,Height,Weight,College,Salary,New Column
322,Walter Tavares,Atlanta Hawks,new column data 2,22.0,C,24,7-3,260.0,No College,1000000.0,new column data
310,Tim Hardaway Jr.,Atlanta Hawks,new column data 2,10.0,SG,24,6-6,205.0,Michigan,1304520.0,new column data
321,Tiago Splitter,Atlanta Hawks,new column data 2,11.0,C,31,6-11,245.0,No College,9756250.0,new column data
320,Thabo Sefolosha,Atlanta Hawks,new column data 2,25.0,SF,32,6-7,220.0,No College,4000000.0,new column data
315,Paul Millsap,Atlanta Hawks,new column data 2,4.0,PF,31,6-8,246.0,Louisiana Tech,18671659.0,new column data


### Rank Values with the .rank() Method

In [45]:
# .rank() assigns a rank to every value in a column
# ranks can be used later to categorize or group rows together
# missing salaries are replaced with 0 first so the column can be stored as integers
nba["Salary"] = nba["Salary"].fillna(0).astype("int")
# ascending=False gives rank 1 to the highest salary; tied salaries share the average
# of their ranks (the default method) and the int cast drops any fractional part
nba["Salary Rank"] = nba["Salary"].rank(ascending=False).astype("int")
nba.head()

Unnamed: 0,Name,Team,new column data 2,Number,Position,Age,Height,Weight,College,Salary,New Column,Salary Rank
0,Avery Bradley,Boston Celtics,new column data 2,0.0,PG,25,6-2,180.0,Texas,7730337,new column data,97
1,Jae Crowder,Boston Celtics,new column data 2,99.0,SF,25,6-6,235.0,Marquette,6796117,new column data,110
2,John Holland,Boston Celtics,new column data 2,30.0,SG,27,6-5,205.0,Boston University,0,new column data,452
3,R.J. Hunter,Boston Celtics,new column data 2,28.0,SG,22,6-5,185.0,Georgia State,1148640,new column data,322
4,Jonas Jerebko,Boston Celtics,new column data 2,8.0,PF,29,6-10,231.0,No College,5000000,new column data,147
