<a href="https://colab.research.google.com/github/AndreGulyi/Pandas_for_everyone/blob/main/Pandas_for_everyone_Part_1_Introduction.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import pandas as pd
import numpy as np

# Pandas DataFrame Basics

## 1.2 Load Your First Data Set

In [None]:
#The Gapminder data set originally comes from https://www.gapminder.org/.

In [None]:
# Read the tab-separated Gapminder file into a DataFrame.
# NOTE(review): the path points into /content/drive/MyDrive, presumably a
# Google Drive mount inside Colab - confirm the drive is mounted (and the
# file exists at this location) before running this cell.
df = pd.read_csv('/content/drive/MyDrive/DataScience/AG:Pandas_for_everyone/data/gapminder.tsv', sep='\t')

## 1.3 Look at Columns, Rows, and Cells

In [None]:
df.head()

In [None]:
#get type
print(type(df))

In [None]:
# get the number of rows and columns
print(df.shape)

In [None]:
#get column names
print(df.columns)

In [None]:
# get the dtype of each column
print(df.dtypes)

In [None]:
# get more information about our data
# (DataFrame.info() writes its summary straight to stdout and returns None,
# so wrapping it in print() would add a stray "None" line to the output)
df.info()

In [None]:
# show the first 5 observations
print(df.head())

In [None]:
# just get the country column and save it to its own variable
country_df = df['country']
print(country_df.head())

In [None]:
# show the last 5 observations
print(country_df.tail())

In [None]:
# Looking at country, continent, and year
subset = df[['country', 'continent', 'year']]
print(subset.head())

In [None]:
print(type(country_df))

In [None]:
country_df_list = df[['country']] # note the double square bracket
print(type(country_df_list))
# If we use a list to subset, we will always get a DataFrame object back.

In [None]:
# using square bracket notation
print(df['country'])

In [None]:
# using dot notation
print(df.country)

### 1.3.2.1 Subset Rows by index Label - .loc[]

In [None]:
# get the first row
# python counts from 0
print(df.loc[0])

In [None]:
# get the 100th row
# python counts from 0
print(df.loc[99])

In [None]:
# get the last row (correctly)
# NOTE: .loc looks rows up by index label, so this assumes the labels
# run from 0 to (number of rows - 1)
# use the first value given from shape to get the number of rows
number_of_rows = df.shape[0]
# subtract 1 from the value since we want the last index value
last_row_index = number_of_rows - 1
# finally do the subset using the index of the last row
print(df.loc[last_row_index])

In [None]:
# there are many ways of doing what you want
print(df.tail(n=1))

In [None]:
# get the first row of data in two different ways
# (row label 0 via .loc, and the first row via .head)
subset_loc = df.loc[0]
subset_head = df.head(n=1)

# type using loc of 1 row
print(type(subset_loc))

In [None]:
# type of using head of 1 row
print(type(subset_head))

### 1.3.2.2 Subsetting Multiple Rows

In [None]:
print(df.loc[[0, 99, 999]])

In [None]:
# get the 2nd row
print(df.iloc[1])

In [None]:
# get the 100th row
# iloc is position-based and counts from 0, so position 99 is the 100th row
print(df.iloc[99])

In [None]:
# using -1 to get the last row
print(df.iloc[-1])

In [None]:
# get the first, 100th, and 1000th row
# passing a list of integer positions to iloc selects several rows at once
print(df.iloc[[0, 99, 999]])

### 1.3.4.1 Selecting Columns

In [None]:
# subset columns with loc
# note the position of the colon
# it is used to select all rows
subset = df.loc[:, ['year', 'pop']]
print(subset)

In [None]:
# subset columns with iloc
# iloc will allow us to use integers
# -1 will select the last column
subset = df.iloc[:, [2, 4, -1]]
print(subset)

Errors

In [None]:
# subset columns with loc
# but pass in integer values
# this will cause an error

#subset = df.loc[:, [2, 4, -1]]
#print(subset)

In [None]:
# subset columns with iloc
# but pass in index names
# this will cause an error

#subset = df.iloc[:, ['year', 'pop']]
#print(subset)

### 1.3.4.2 Subsetting with range()

In [None]:
# create a range of integers from 0 - 4 inclusive
small_range = list(range(5))
print(small_range)

In [None]:
# subset the dataframe with the range
subset = df.iloc[:, small_range]
print(subset)

In [None]:
# create a range from 3 - 5 inclusive
small_range = list(range(3,6))
print(small_range)

In [None]:
subset = df.iloc[:, small_range]
print(subset)

In [None]:
# create a range from 0 - 5 inclusive, step =2
small_range = list(range(0,6,2))
print(small_range)

### 1.3.4.3 Subsetting with Slicing `:`


In [None]:
print(df.columns)

In [None]:
small_range = list(range(3))
print(small_range)

In [None]:
subset = df.iloc[:,small_range]
print(subset)

In [None]:
# slice the first 3 columns
subset = df.iloc[:, :3]
print(subset)

In [None]:
small_range = list(range(3,6))
subset = df.iloc[:,small_range]
print(subset)

In [None]:
subset = df.iloc[:,list(range(0,6,2))]
print(subset)

In [None]:
# slice columns 3 to 5 inclusive
subset = df.iloc[:, 3:6]
print(subset)

In [None]:
small_range = list(range(0, 6, 2))
subset = df.iloc[:, small_range]
print(subset)

In [None]:
# slice every other column among the first 6
# (start 0, stop 6, step 2 -> column positions 0, 2, and 4)
subset = df.iloc[:, 0:6:2]
print(subset)

In [None]:
print(df.iloc[:, ::2])

### 1.3.5 Subsetting Rows and Columns

In [None]:
# using loc
print(df.loc[42, 'country'])

In [None]:
# using iloc
print(df.iloc[42, 0])

#### 1.3.5.1 Subsetting Multiple Rows and Columns

In [None]:
# get the 1st, 100th, and 1000th rows
# from the 1st, 4th, and 6th column
# note the columns we are hoping to get are:
# country, lifeExp, and gdpPercap
print(df.iloc[[0, 99, 999], [0, 3, 5]])

In [None]:
# if we use the column names directly,
# it makes the code a bit easier to read
# note now we have to use loc, instead of iloc
print(df.loc[[0, 99, 999], ['country',
'lifeExp', 'gdpPercap']])

## 1.4 Grouped and Aggregated Calculations


### 1.4.1 Grouped Means

 There are several initial questions that we can ask ourselves:
* For each year in our data, what was the average life expectancy?
* What is the average life expectancy, population, and GDP?
* What if we stratify the data by continent and perform the same
calculations?
* How many countries are listed in each continent?




In [None]:
# For each year in our data, what was the average life expectancy?
# To answer this question, we need to:
# 1. split our data into parts by year
# 2. get the 'lifeExp' column
# 3. calculate the mean
print(df.groupby('year')['lifeExp'].mean())

In [None]:
# create grouped object by year
grouped_year_df = df.groupby('year')
print(type(grouped_year_df))

In [None]:
print(grouped_year_df)

In [None]:
grouped_year_df_lifeExp = grouped_year_df['lifeExp']
print(type(grouped_year_df_lifeExp))

In [None]:
print(grouped_year_df_lifeExp)

In [None]:
mean_lifeExp_by_year = grouped_year_df_lifeExp.mean()
print(mean_lifeExp_by_year)

In [None]:
# group by two columns (year, then continent) and calculate the mean of
# lifeExp and gdpPercap within each group
# wrapping the chained calls in parentheses allows us to break up
# 1 long line of python code into multiple lines (no backslash needed)
multi_group_var = (
    df.groupby(['year', 'continent'])[['lifeExp', 'gdpPercap']]
    .mean()
)
# look at the grouped means
print(multi_group_var)

### 1.4.2 Grouped Frequency Counts

In [None]:
# use the nunique (number unique)
# to calculate the number of unique values in a series
print(df.groupby('continent')['country'].nunique())

## 1.5 Basic Plot

In [None]:
global_yearly_life_expectancy = df.groupby('year')['lifeExp'].mean()
print(global_yearly_life_expectancy)

In [None]:
# matplotlib is the default plotting backend used by pandas
# we need to import it first so that we can call plt.show()
import matplotlib.pyplot as plt

# use the .plot() method of the Series holding the yearly mean lifeExp
global_yearly_life_expectancy.plot()
# show the plot
plt.show()