# Chapter 1: Pandas DataFrame Basics

In [None]:
import os

import matplotlib.pyplot as plt
import pandas as pd

## 1.2 Load your first dataset

### Import GapMinder Dataset

In [None]:
# Location of the GapMinder TSV file. The default is the original local
# path; set the GAPMINDER_TSV environment variable to point at the file on
# another machine so this cell does not have to be edited to run there.
GAPMINDER_TSV = os.environ.get(
    "GAPMINDER_TSV",
    r"F:\book files\pandas_for_everyone\data\gapminder.tsv",
)

# The file is tab-separated, so the separator is given explicitly
# (read_csv splits on commas by default).
df = pd.read_csv(GAPMINDER_TSV, sep="\t")
df

In [None]:
# Structural overview of the frame: dimensions, column labels and per-column
# dtypes. Each heading is printed first, then the value, then a blank line.
overview = (
    ("Shape\n", df.shape),
    ("Columns\n", df.columns),
    ("Data Types\n", df.dtypes),
)
for heading, value in overview:
    print(heading, value, "\n")

In [None]:
# Print pandas' built-in summary of the DataFrame (info() writes to stdout
# and returns None, so there is no Out[] value for this cell).
df.info()

## 1.3 Look at Columns, Rows, and Cells

In [None]:
# Preview the first five rows of the frame.
df.head(5)

#### 1.3.1 Select and Subset Columns by Name



In [None]:
# The inner list makes this a column-list selection, so the result stays a
# (one-column) DataFrame.
country_df = df[["country"]]

# First five rows of the selection.
country_df.head(5)

In [None]:
# Last five rows of the `country` selection made in the previous cell.
country_df.tail(5)

In [None]:
# Select several columns at once by passing a list of column names.
wanted_columns = ["country", "continent", "year"]
subset = df[wanted_columns]
subset

##### 1.3.1.1 Single Value Returns DataFrame or Series

In [None]:
# Single column name with no surrounding list: compare the type reported
# below with the double-bracket selection in the next cell.
# NOTE(review): this rebinds `country_df`, which an earlier cell bound to
# the double-bracket selection.
country_df = df["country"]
print(country_df)
type(country_df)

In [None]:
# Same column, but requested through a one-element list of names; the type
# shown below differs from the bare-string selection.
country_df_list = df[["country"]]
print(country_df_list)
type(country_df_list)

##### 1.3.1.2 Using Dot Notation to Pull a Column of Values

The following two commands give identical results.

In [None]:
# Bracket access and attribute (dot) access to the same column, printed one
# after the other together with the type of each result.
for column in (df['country'], df.country):
    print(column)
    print(type(column))

#### 1.3.2 Subset Rows
##### 1.3.2.1 Subset Rows by index Label - .loc[]

In [None]:
# Display the frame again so the index labels (left-most column of the
# output) are visible before selecting rows by label.
df

Get the first row

In [None]:
# .loc selects by index label: this is the row labelled 0.
df.loc[0]

Get the 100th row of the DataFrame using `.loc[]`

In [None]:
# Row labelled 99, which is the 100th row because the labels start at 0.
df.loc[99]

Multiple ways to get the last row of a DataFrame

In [None]:
# Row count of the frame; the last label is one less because the labels of
# this frame start at 0.
number_of_rows = len(df)
last_row_index = number_of_rows - 1

# Look the last row up by that label.
df.loc[last_row_index]

In [None]:
# Alternative: ask for the final row directly.
df.tail(1)

Selecting a single row with `.loc[]` returns a Series, whereas `.head(n=1)` returns a DataFrame.

In [None]:
# Fetch the first row in two different ways...
subset_loc = df.loc[0]
subset_head = df.head(1)

# ...and compare the container type each approach hands back.
print(type(subset_loc))
print(type(subset_head))

##### 1.3.2.2 Subsetting Multiple Rows

In [None]:

# Passing a list of labels to .loc selects several rows at once
# (here the rows labelled 0, 99 and 999).
df.loc[[0, 99, 999]]

#### 1.3.3 Subset Rows by Row Number: .iloc[]

Get the first and 100th rows

In [None]:
# .iloc selects by integer position. Print each requested row under its
# heading, followed by a blank line.
for heading, position in (("First Row \n", 0), ("100th Row \n", 99)):
    print(heading, df.iloc[position], "\n")

Get the last row

In [None]:
# Negative positions count from the end, so -1 is the last row.
df.iloc[-1]

Get the first, 100th, 1000th, and last rows

In [None]:
# A list of positions selects several rows; -1 again refers to the last row.
df.iloc[[0, 99, 999, -1]]

#### 1.3.4 Mix it Up
##### 1.3.4.1 Selecting Columns

In [None]:
# Row selector ":" keeps every row; the columns are picked by label.
column_labels = ["year", "pop"]
subset = df.loc[:, column_labels]
subset

In [None]:
# Row selector ":" keeps every row; the columns are picked by integer
# position (-1 is the last column).
column_positions = [2, 4, -1]
subset = df.iloc[:, column_positions]
subset

In [None]:
# Deliberate failure: .loc looks columns up by LABEL, and the integers
# 2, 4 and -1 are not column labels of this frame (the columns are named by
# strings such as "year" and "pop"), so pandas raises a KeyError.
# The error is caught and printed so the lesson is still visible while the
# notebook keeps running under "Restart & Run All". `subset` is left
# unchanged when the lookup fails.
try:
    subset = df.loc[:, [2, 4, -1]]
    print(subset)
except KeyError as err:
    print("KeyError:", err)

In [None]:
# The same integer selectors are valid with .iloc, which works by position.
column_positions = [2, 4, -1]
subset = df.iloc[:, column_positions]
print(subset)

##### 1.3.4.2 Subsetting with range()

In [None]:
# Materialise the integers 0..4 as a list so they can be used as a list of
# column positions.
small_range = list(range(0, 5))
small_range

In [None]:
# Show the column labels, for reference when reading the positional
# selections that follow.
df.columns

In [None]:
# Use the list built from range() as column positions.
# `small_range` comes from an earlier cell (list(range(5)) when the notebook
# is run top to bottom).
subset = df.iloc[:, small_range]
subset

In [None]:
# Column positions 3, 4 and 5, generated with range(start, stop).
small_range = list(range(3, 6))
subset = df.iloc[:, small_range]

# Show the generated positions, then the selected columns.
print(small_range)
subset

In [None]:
# Every second column position below 6, generated with
# range(start, stop, step).
small_range = list(range(0, 6, 2))
subset = df.iloc[:, small_range]

# Show the generated positions, then the selected columns.
print(small_range)
subset

##### 1.3.4.3 Subsetting with Slicing (`:`)

In [None]:
# Column labels again, for reference while comparing range() with slices.
df.columns

In [None]:
# range() version: the first three column positions.
small_range = list(range(0, 3))
subset = df.iloc[:, small_range]

# Show the generated positions, then the selected columns.
print(small_range)
subset

In [None]:
# Slice version of the previous cell: ":3" covers positions 0, 1 and 2.
df.iloc[:, :3]

In [None]:
# range() version: column positions 3, 4 and 5.
small_range = list(range(3, 6))
subset = df.iloc[:, small_range]

# Show the generated positions, then the selected columns.
print(small_range)
subset

In [None]:
# Slice version of the previous cell: "3:6" covers positions 3, 4 and 5.
df.iloc[:, 3:6]

In [None]:
# range() version: column positions 0, 2 and 4.
small_range = list(range(0, 6, 2))
subset = df.iloc[:, small_range]

# Show the generated positions, then the selected columns.
print(small_range)
subset

In [None]:
# Slice version of the previous cell: "0:6:2" is start:stop:step, i.e.
# positions 0, 2 and 4.
df.iloc[:, 0:6:2]

#### 1.3.5 Subset Rows and Columns

In [None]:
# Single cell by labels: row label 42, column label 'country'.
df.loc[42, 'country']

In [None]:
# Single cell by positions: row position 42, column position 0.
df.iloc[42, 0]

In [None]:
# Deliberate failure: .loc expects LABELS, and 0 is a column position, not
# one of this frame's column labels, so pandas raises a KeyError.
# The error is caught and printed so the contrast with .iloc stays visible
# without stopping a top-to-bottom run of the notebook.
try:
    print(df.loc[42, 0])
except KeyError as err:
    print("KeyError:", err)

##### 1.3.5.1 Subsetting Multiple Rows and Columns

In [None]:
# Rows and columns together, both by position: a list of row positions
# followed by a list of column positions.
df.iloc[[0, 99, 999, -1], [0, 3, 5]]

In [None]:
# Rows and columns together, both by label: a list of row labels followed
# by a list of column names.
df.loc[[0, 99, 999], ['country', 'lifeExp', 'gdpPercap']]

## 1.4 Grouped and Aggregated Calculations

In [None]:
# Display the frame once more as the starting point for the grouped
# calculations below.
df

### 1.4.1 Grouped Means

In [None]:
# Mean of `lifeExp` within each `year` group.
summary = (
    df
    .groupby('year')
    ['lifeExp']
    .mean()
)

# Report the container type, then display the values.
print(type(summary))
summary

In [None]:
# Take the grouped mean apart one step at a time and print what each
# intermediate expression is.
grouped_by_year = df.groupby('year')
life_exp_by_year = grouped_by_year['lifeExp']

print(grouped_by_year)            # the grouped frame itself
print(life_exp_by_year)           # one column picked from the groups
print(life_exp_by_year.mean())    # the aggregated result

In [None]:
# Group by two keys and average two value columns in one go.
group_keys = ['year', 'continent']
value_columns = ['lifeExp', 'gdpPercap']
df.groupby(group_keys)[value_columns].mean()

In [None]:
# Same grouped means as the previous cell, written as a single statement
# split over several physical lines with backslash line continuations.
multi_group_var = df\
    .groupby(['year', 'continent'])\
    [['lifeExp', 'gdpPercap']]\
    .mean()

# Show the container type of the result, then its contents.
print(type(multi_group_var))
print(multi_group_var)


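Splitting a method chain across several lines — with backslash continuations as in the cell above, or inside parentheses as in the cell below — is a really useful notation for chaining together multiple operations.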

In [None]:
# The same chain again, wrapped in parentheses so each step can sit on its
# own line without any backslashes.
multi_group_var = (
    df
    .groupby(['year', 'continent'])
    [['lifeExp', 'gdpPercap']]
    .mean()
)
print(multi_group_var)

In [None]:
# reset_index() turns the group keys held in the index back into ordinary
# columns, giving a flat table.
flat = multi_group_var.reset_index()
print(flat)

### 1.4.2 Grouped Frequency Counts

In [None]:
# Number of distinct `country` values within each `continent` group.
df.groupby('continent')['country'].nunique()

## 1.5 Basic Plot

In [None]:
# Mean `lifeExp` per `year`; this is the data plotted in the next cell.
global_yearly_life_expectancy = (
    df
    .groupby('year')
    ['lifeExp']
    .mean()
)
print(global_yearly_life_expectancy)

In [None]:
# Line plot of the yearly mean life expectancy computed in the previous
# cell. `plt` is imported in the notebook's import cell at the top.
# An explicit Figure/Axes pair is created so the labels and title are set
# on the axes that was drawn on, rather than through pyplot's implicit
# "current axes" state.
fig, ax = plt.subplots()
global_yearly_life_expectancy.plot(ax=ax)
ax.set_xlabel('Year')
ax.set_ylabel('Life Expectancy')
ax.set_title('Global Yearly Life Expectancy')
plt.show()