# **DataFrame Basics III**

## **Sorting dataframes**

In [None]:
import pandas as pd

In [None]:
# Read titanic.csv (path is relative to the notebook's working directory) into a DataFrame.
titanic = pd.read_csv('titanic.csv')

In [None]:
titanic.head()

In [None]:
# Sort just the age column; this yields a new Series and leaves titanic as it was.
titanic['age'].sort_values()

In [None]:
# Whole-frame sort keyed on age; without inplace=True the sorted frame is only displayed.
titanic.sort_values('age')

In [None]:
# The previous sort_values call did not use inplace=True, so titanic is still in its loaded order.
titanic

In [None]:
# inplace=True reorders titanic itself; the call returns None, so the cell shows no output.
titanic.sort_values(by = 'age', inplace = True)

In [None]:
titanic.head()

In [None]:
# Put the rows back into index-label order, modifying titanic in place.
titanic.sort_index(inplace = True)

In [None]:
titanic.head()

In [None]:
# Two sort keys: rows are ordered by age, and rows with equal age are ordered by pclass.
titanic.sort_values(by = ['age', 'pclass'], inplace = True)

In [None]:
titanic.head(20)

In [None]:
# Return to index-label order before trying the next sort.
titanic.sort_index(inplace = True)

In [None]:
# Three sort keys with one direction each: age and pclass ascending, sex descending.
titanic.sort_values(
    by = ['age', 'pclass', 'sex'],
    ascending = [True, True, False],
    inplace = True,
)

In [None]:
# Raise the display limit to 900 rows so the next cells print the frame without truncation.
pd.set_option('display.max_rows', 900)

In [None]:
titanic

In [None]:
# Displays an index-ordered result only; without inplace=True titanic itself keeps its current order.
titanic.sort_index()

In [None]:
# Drop the display limit back to 10 rows so later outputs are truncated again.
pd.set_option('display.max_rows', 10)

In [None]:
titanic

## **Ranking DataFrames**

In [None]:
import pandas as pd

In [None]:
# Daily sales figures indexed by weekday abbreviation (Mon and Sat tie at 15 to illustrate rank ties).
sales = pd.Series([15, 32, 45, 21, 55, 15, 0], index = ["Mon", "Tue", "Wed", "Thu", "Fri", "Sat", "Sun"])

In [None]:
sales

In [None]:
# Largest sales value first.
sales.sort_values(ascending = False)

In [None]:
# ascending=False gives rank 1 to the largest value; method='max' gives tied values
# the highest rank number in their group (the two 15s both get rank 6).
sales.rank(method = 'max', ascending = False)

In [None]:
# method='first' breaks ties by order of appearance, so every rank is distinct;
# sort_values then lists the entries from rank 1 downwards.
sales.rank(method = 'first', ascending = False).sort_values()

In [None]:
# method='average' gives tied values the mean of the ranks they span (the two 15s get 5.5).
sales.rank(method = 'average', ascending = False).sort_values()

In [None]:
# Re-read the CSV so titanic is a freshly loaded frame for the ranking examples.
titanic = pd.read_csv('titanic.csv')

In [None]:
titanic.head()

In [None]:
# Fares from most to least expensive.
titanic['fare'].sort_values(ascending = False)

In [None]:
# Rank fares with 1 = highest; no method is given, so pandas uses its default ('average') for ties.
titanic.fare.rank(ascending = False)

In [None]:
# Rank ascending (1 = lowest fare) with ties broken by order of appearance,
# then list the largest rank numbers first.
titanic.fare.rank(ascending = True, method = 'first').sort_values(ascending = False)

In [None]:
# Descending rank of fare (1 = highest); method='min' gives tied fares the lowest rank number of their group.
fare_rank = titanic.fare.rank(ascending = False, method = 'min')

In [None]:
# Add the ranks as a new column at position 7. DataFrame.insert raises ValueError when
# the column already exists, so the guard keeps this cell safe to run more than once.
if 'fare_rank' not in titanic.columns:
    titanic.insert(column = 'fare_rank', value = fare_rank, loc = 7)

In [None]:
titanic.head()

In [None]:
# Show the rows ordered by fare_rank (rank 1 first); titanic itself is not reordered.
titanic.sort_values(by = 'fare_rank')

## **nunique() and nlargest() / nsmallest() with DataFrames**

In [None]:
import pandas as pd

In [None]:
# Re-read the CSV so titanic is a freshly loaded frame for this section.
titanic = pd.read_csv('titanic.csv')

In [None]:
titanic.head()

In [None]:
titanic.tail()

#### **nunique()**

In [None]:
# unique() returns the distinct age values themselves (NaN included); nunique() only counts them.
titanic.age.unique()

In [None]:
# axis=0 counts distinct values down each column, giving one count per column.
titanic.nunique(axis = 0)

#### **nlargest()**

In [None]:
# The five rows with the highest age values.
titanic.nlargest(5, 'age')

In [None]:
# Alternative route to the oldest passengers: sort by age descending and keep the first five rows.
titanic.sort_values(by = 'age', ascending = False).head()

#### **nsmallest()**

In [None]:
# The five rows with the lowest age values.
titanic.nsmallest(5, 'age')

In [None]:
# Alternative route to the youngest passengers: sort by age ascending and keep the first five rows.
titanic.sort_values(by = 'age').head()

In [None]:
# idxmin() gives the index label of the smallest age; .loc then fetches that single row.
titanic.loc[titanic.age.idxmin()]

## **Summary statistics and accumulations**

In [None]:
import pandas as pd

In [None]:
# Re-read the CSV so titanic is a freshly loaded frame for this section.
titanic = pd.read_csv('titanic.csv')

In [None]:
titanic.head()

In [None]:
titanic.describe()

In [None]:
# Number of non-missing values in each column.
titanic.count()

In [None]:
# Column-wise maximum; numeric_only=True leaves the non-numeric columns out.
titanic.max(numeric_only = True)

In [None]:
# Column-wise minimum; numeric_only=True leaves the non-numeric columns out.
titanic.min(numeric_only = True)

In [None]:
# Column-wise mean; numeric_only=True leaves the non-numeric columns out.
titanic.mean(numeric_only = True)

In [None]:
# Column-wise total; numeric_only=True leaves the non-numeric columns out.
titanic.sum(numeric_only = True)

In [None]:
# Running total of age down the rows (an accumulation: one output value per input row).
titanic.age.cumsum()

In [None]:
# Pairwise correlation matrix across the numeric columns.
titanic.corr(numeric_only = True)

In [None]:
# Correlation between two specific columns: survived and fare.
titanic.survived.corr(titanic.fare)

## **The agg() method**

In [1]:
import pandas as pd

In [2]:
# Re-read the CSV so titanic is a freshly loaded frame for the agg() examples.
titanic = pd.read_csv('titanic.csv')

In [3]:
titanic.head()

Unnamed: 0,survived,pclass,sex,age,sibsp,parch,fare,embarked,deck
0,0,3,male,22.0,1,0,7.25,S,
1,1,1,female,38.0,1,0,71.2833,C,C
2,1,3,female,26.0,0,0,7.925,S,
3,1,1,female,35.0,1,0,53.1,S,C
4,0,3,male,35.0,0,0,8.05,S,


In [4]:
titanic.describe()

Unnamed: 0,survived,pclass,age,sibsp,parch,fare
count,891.0,891.0,714.0,891.0,891.0,891.0
mean,0.383838,2.308642,29.699118,0.523008,0.381594,32.204208
std,0.486592,0.836071,14.526497,1.102743,0.806057,49.693429
min,0.0,1.0,0.42,0.0,0.0,0.0
25%,0.0,2.0,20.125,0.0,0.0,7.9104
50%,0.0,3.0,28.0,0.0,0.0,14.4542
75%,1.0,3.0,38.0,1.0,0.0,31.0
max,1.0,3.0,80.0,8.0,6.0,512.3292


In [5]:
# numeric_only=True is passed as an extra keyword through agg() to the 'mean' aggregation.
titanic.agg('mean', numeric_only = True) # works but it is not recommended

survived     0.383838
pclass       2.308642
age         29.699118
sibsp        0.523008
parch        0.381594
fare        32.204208
dtype: float64

In [6]:
# Narrow the frame to its numeric columns first, then aggregate with a single function name.
titanic.select_dtypes('number').agg('mean')

survived     0.383838
pclass       2.308642
age         29.699118
sibsp        0.523008
parch        0.381594
fare        32.204208
dtype: float64

In [7]:
# A list of function names yields one result row per function and one column per numeric column.
titanic.select_dtypes('number').agg(['mean', 'sum', 'max'])

Unnamed: 0,survived,pclass,age,sibsp,parch,fare
mean,0.383838,2.308642,29.699118,0.523008,0.381594,32.204208
sum,342.0,2057.0,21205.17,466.0,340.0,28693.9493
max,1.0,3.0,80.0,8.0,6.0,512.3292


In [8]:
# A dict maps each column to its own functions; cells for function/column pairs
# that were not requested come back as NaN.
titanic.select_dtypes('number').agg({'age': ['min', 'max'], 'fare': ['mean', 'sum']})

Unnamed: 0,age,fare
min,0.42,
max,80.0,
mean,,32.204208
sum,,28693.9493


In [9]:
def dummy_func(n):
    """Return ``n`` doubled, i.e. the value of ``2 * n``.

    A non-reducing helper: for element-wise inputs the result has the same
    length as ``n``, in contrast to aggregations such as 'mean' or 'sum'.
    """
    doubled = 2 * n
    return doubled

In [10]:
# Each column appears once in the mapping: a dict literal keeps only the last value
# for a repeated key, so a second 'fare' entry would silently discard an earlier one.
# dummy_func does not reduce, so the fare part of the result keeps one row per input row.
titanic.select_dtypes('number').agg({'age': ['min', 'max'], 'fare': dummy_func})

  titanic.select_dtypes('number').agg({'age': ['min', 'max'], 'fare': ['mean', 'sum'], 'fare': dummy_func})


Unnamed: 0_level_0,age,fare
Unnamed: 0_level_1,age,dummy_func
min,0.42,
max,80.00,
0,,14.5000
1,,142.5666
2,,15.8500
...,...,...
886,,26.0000
887,,60.0000
888,,46.9000
889,,60.0000


In [11]:
# transform() applies dummy_func to every numeric column and returns a frame of the same shape.
titanic.select_dtypes('number').transform(dummy_func)

Unnamed: 0,survived,pclass,age,sibsp,parch,fare
0,0,6,44.0,2,0,14.5000
1,2,2,76.0,2,0,142.5666
2,2,6,52.0,0,0,15.8500
3,2,2,70.0,2,0,106.2000
4,0,6,70.0,0,0,16.1000
...,...,...,...,...,...,...
886,0,4,54.0,0,0,26.0000
887,2,2,38.0,0,0,60.0000
888,0,6,,2,4,46.9000
889,2,2,52.0,0,0,60.0000
