## Dictionary to DataFrame


In [None]:
# pandas provides the DataFrame type used below
import pandas as pd

# Raw inputs: one list per future column, all in the same country order
names = ["United States", "Australia", "Japan", "India", "Russia", "Morocco", "Egypt"]
dr = [True, False, False, False, True, True, True]
cpc = [809, 731, 588, 18, 200, 70, 45]

# Short codes that become the row index, in the same order as the lists above
row_labels = ["US", "AUS", "JPN", "IN", "RU", "MOR", "EG"]

# Map each column name to its list of values
my_dict = dict(country=names, drives_right=dr, cars_per_cap=cpc)

# Build the DataFrame and attach the row labels in a single step
cars = pd.DataFrame(my_dict, index=row_labels)

# Show the table, then summary statistics for its numeric column
print(cars)
print(cars.describe())

## CSV to DataFrame


In [2]:
# Import pandas as pd
import pandas as pd

# Read the CSV; index_col=0 uses the first column as the row labels
cars = pd.read_csv("datasets/cars.csv", index_col=0)
# Print out cars
print(cars)
print("----------------------------------------------")
# Dimensions as a (rows, columns) tuple
print(cars.shape)
print("----------------------------------------------")
# info() writes its report to stdout itself and returns None, so it is called
# directly: wrapping it in print() adds a stray "None" line to the output
cars.info()
print("----------------------------------------------")
# Summary statistics for numerical data
print(cars.describe())
print("----------------------------------------------")
# Print the values of cars as a 2-D array
print(cars.values)
print("----------------------------------------------")
# Know how many null values a column has
print(cars.isnull().sum())
print("----------------------------------------------")
# Print the column index of cars
print(cars.columns)
print("----------------------------------------------")
# Print the row index of cars
print(cars.index)
print("----------------------------------------------")

     cars_per_cap       country  drives_right
US            809  UnitedStates          True
AUS           731     Australia         False
JPN           588         Japan         False
IN             45         India         False
RU            200        Russia          True
MOR            70       Morocco          True
EG             45         Egypt          True
----------------------------------------------
(7, 3)
----------------------------------------------
<class 'pandas.core.frame.DataFrame'>
Index: 7 entries, US to EG
Data columns (total 3 columns):
 #   Column        Non-Null Count  Dtype 
---  ------        --------------  ----- 
 0   cars_per_cap  7 non-null      int64 
 1   country       7 non-null      object
 2   drives_right  7 non-null      bool  
dtypes: bool(1), int64(1), object(1)
memory usage: 175.0+ bytes
None
----------------------------------------------
       cars_per_cap
count      7.000000
mean     355.428571
std      341.377043
min       45.000000
25%     

## Col access using brackets


In [None]:
import pandas as pd

# Divider printed between the outputs of this cell
SEPARATOR = "----------------------------------------------"

cars = pd.read_csv("datasets/cars.csv", index_col=0)

# Full table first, for reference
print(cars)
print(SEPARATOR)

# A list of column names inside the brackets selects those columns as a DataFrame
country_and_side = cars[["country", "drives_right"]]
print(country_and_side)
print(SEPARATOR)

# A one-element list still yields a DataFrame (single brackets would give a Series)
country_only = cars[["country"]]
print(country_only)
print(SEPARATOR)

## Row access using brackets


In [None]:
import pandas as pd

# Divider printed between the outputs of this cell
SEPARATOR = "----------------------------------------------"

cars = pd.read_csv("datasets/cars.csv", index_col=0)

# Full table first, for reference
print(cars)
print(SEPARATOR)

# A slice inside plain brackets selects rows by position: rows 1, 2 and 3
middle_rows = cars[1:4]
print(middle_rows)
print(SEPARATOR)

# First six rows
first_six = cars.head(6)
print(first_six)

## Rows & Cols access using loc & iloc

`[Subsetting DataFrame]`


In [None]:
import pandas as pd

# Divider printed between the outputs of this cell
SEPARATOR = "----------------------------------------------"

cars = pd.read_csv("datasets/cars.csv", index_col=0)

print(cars)
print(SEPARATOR)

# Selections reused below: labels for .loc, integer positions for .iloc.
# Each .loc/.iloc pair below is meant to pick the same data; the positions
# match the row/column order of the cars table printed above.
picked_rows = ["RU", "IN", "EG"]
picked_row_pos = [4, 3, 6]
picked_cols = ["country", "drives_right"]
picked_col_pos = [1, 2]

# One row: by label, then by position
print(cars.loc[["RU"]])
print(SEPARATOR)
print(cars.iloc[[4]])
print(SEPARATOR)

# A bare label returns the row as a Series
print(cars.loc["RU"])
print(SEPARATOR)

# A list holding the label returns the row as a DataFrame
print(cars.loc[["RU"]])
print(SEPARATOR)

# Several rows: by label, then by position
print(cars.loc[picked_rows])
print(SEPARATOR)
print(cars.iloc[picked_row_pos])
print(SEPARATOR)

# Rows and columns together: by label, then by position
print(cars.loc[picked_rows, picked_cols])
print(SEPARATOR)
print(cars.iloc[picked_row_pos, picked_col_pos])
print(SEPARATOR)

# Every row, a subset of the columns
print(cars.loc[:, picked_cols])
print(SEPARATOR)
print(cars.iloc[:, picked_col_pos])
print(SEPARATOR)

# A single cell, kept as a 1x1 DataFrame
print(cars.loc[["RU"], ["country"]])
print(SEPARATOR)

# Sub-DataFrame: chosen rows crossed with chosen columns
print(cars.loc[picked_rows, picked_cols])
print(SEPARATOR)

## Filtering from dataframe based on a condition


In [None]:
# Import cars data
import pandas as pd

cars = pd.read_csv("datasets/cars.csv", index_col=0)

# Extract drives_right column as Series: dr
dr = cars["drives_right"]
print(dr)
print("----------------------------------------------")

# Use dr to subset cars: sel
# drives_right is a boolean column, so it can be used directly as the row
# mask; comparing it with `== True` first is redundant
sel = cars[dr]

# Print sel
print(sel)

In [None]:
# Import cars data
import pandas as pd

cars = pd.read_csv("datasets/cars.csv", index_col=0)

# Create car_maniac: observations that have a cars_per_cap over 500
cpc = cars["cars_per_cap"]
many_cars = cpc > 500  # boolean mask, True where cars_per_cap exceeds 500
car_maniac = cars[many_cars]

# Print car_maniac (the filtered rows, not the unfiltered cpc column)
print(car_maniac)

In [None]:
# pandas for the table, numpy for the element-wise boolean AND
import pandas as pd
import numpy as np

cars = pd.read_csv("datasets/cars.csv", index_col=0)

# Build medium: rows whose cars_per_cap lies strictly between 100 and 500
cpc = cars["cars_per_cap"]
above_100 = cpc > 100
below_500 = cpc < 500
between = np.logical_and(above_100, below_500)  # True only where both hold
medium = cars[between]

# Show the filtered rows
print(medium)

## Iterate over a dataframe


In [None]:
# Load the cars table
import pandas as pd

cars = pd.read_csv("datasets/cars.csv", index_col=0)

# iterrows() yields (index label, row as a Series) pairs;
# print the label and then the row, each on its own line
for row_label, row_values in cars.iterrows():
    print(row_label, row_values, sep="\n")

In [None]:
# Load the cars table
import pandas as pd

cars = pd.read_csv("datasets/cars.csv", index_col=0)

# Walk the rows as (index label, row Series) pairs and print one
# "<label>: <cars_per_cap>" line per row
labelled_rows = cars.iterrows()
for lab, row in labelled_rows:
    print(f"{lab}: {row['cars_per_cap']}")

## Add column


In [None]:
# Load the cars table
import pandas as pd

# Divider printed between the before and after views
SEPARATOR = "----------------------------------------------"

cars = pd.read_csv("datasets/cars.csv", index_col=0)
print(cars)
print(SEPARATOR)

# New column: cars_per_cap divided by 1000, computed for the whole column at once
cars["cpc per 1000"] = cars["cars_per_cap"].div(1000)

# Show the table with the added column
print(cars)

## Add column using `loc`


In [None]:
# Load the cars table
import pandas as pd

# Divider printed between the before and after views
SEPARATOR = "----------------------------------------------"

cars = pd.read_csv("datasets/cars.csv", index_col=0)
print(cars)
print(SEPARATOR)

# Fill a COUNTRY column one row at a time: for each (index label, row) pair,
# write the upper-cased country name into that row's COUNTRY cell via .loc
for idx, record in cars.iterrows():
    upper_name = record["country"].upper()
    cars.loc[idx, "COUNTRY"] = upper_name

# Show the table with the added column
print(cars)

## Add column using `apply()`


In [None]:
# Load the cars table
import pandas as pd

# Divider printed between the before and after views
SEPARATOR = "----------------------------------------------"

cars = pd.read_csv("datasets/cars.csv", index_col=0)
print(cars)
print(SEPARATOR)

country_names = cars["country"]

# .upper() is a string method, so pass it to apply() as str.upper
cars["COUNTRY"] = country_names.apply(str.upper)
# len() is a plain function, so it can be passed to apply() as-is
cars["country length"] = country_names.apply(len)

print(cars)

## Sorting based on Col


In [None]:
# Load the cars table
import pandas as pd

# Divider printed between the outputs of this cell
SEPARATOR = "----------------------------------------------"

cars = pd.read_csv("datasets/cars.csv", index_col=0)
print(cars)
print(SEPARATOR)

# Single sort key: highest cars_per_cap first
cars_cpc = cars.sort_values(by="cars_per_cap", ascending=False)
print(cars_cpc)
print(SEPARATOR)

# Two sort keys: cars_per_cap first, then country; both ascending
sort_keys = ["cars_per_cap", "country"]
cars_cpc_country = cars.sort_values(by=sort_keys, ascending=[True, True])
print(cars_cpc_country)
print(SEPARATOR)

## Subsetting rows by categorical variables with `isin()`


In [None]:
# Load the cars table
import pandas as pd

# Divider printed between the outputs of this cell
SEPARATOR = "----------------------------------------------"

cars = pd.read_csv("datasets/cars.csv", index_col=0)
print(cars)
print(SEPARATOR)

# Keep only the rows whose country is one of the listed names
countries = ["Japan", "Russia", "Egypt"]
conditions = cars["country"].isin(countries)  # boolean mask, one entry per row
matching_rows = cars[conditions]
print(matching_rows)
print(SEPARATOR)

## Summary Statistics


In [None]:
import pandas as pd

# Divider printed between the outputs of this cell
SEPARATOR = "----------------------------------------------"

sales = pd.read_csv("datasets/sales.csv", index_col=0)
print(sales.head())
print(SEPARATOR)

# Columns summarised below
weekly_sales = sales["weekly_sales"]
dates = sales["date"]

# Mean of weekly_sales
print(weekly_sales.mean())
print(SEPARATOR)

# Median of weekly_sales
print(weekly_sales.median())
print(SEPARATOR)

# Largest value in the date column
print(dates.max())
print(SEPARATOR)

# Smallest value in the date column
print(dates.min())
print(SEPARATOR)


## Use `agg()`

In [None]:
import pandas as pd

# Load the sales data, taking the first CSV column as the row index
sales = pd.read_csv("datasets/sales.csv", index_col=0)

# Preview the first rows, followed by a divider line
preview = sales.head()
print(preview, "----------------------------------------------", sep="\n")

# A custom IQR function
def iqr(column):
    """Return the interquartile range of ``column``.

    Computed as the 0.75 quantile minus the 0.25 quantile, using the
    ``quantile`` method of whatever object is passed in.
    """
    upper_quartile = column.quantile(0.75)
    lower_quartile = column.quantile(0.25)
    return upper_quartile - lower_quartile
    
# Print IQR of the temperature_c column
print(sales['temperature_c'].agg(iqr))
print("----------------------------------------------")

# Update to print IQR of temperature_c, fuel_price_usd_per_l, & unemployment
print(sales[["temperature_c", "fuel_price_usd_per_l", "unemployment"]].agg(iqr))
print("----------------------------------------------")

# Update to print IQR and median of temperature_c, fuel_price_usd_per_l, & unemployment
# The median is requested by name ("median") rather than as np.median: this
# cell never imports numpy, so np.median only worked if an earlier cell had
# already run, and pandas 2.x deprecates passing the NumPy callable to agg
print(sales[["temperature_c", "fuel_price_usd_per_l", "unemployment"]].agg([iqr, "median"]))


## Dropping Duplicates `drop_duplicates(subset = ['',''])`

In [None]:
import pandas as pd

# Divider printed between the outputs of this cell
SEPARATOR = "----------------------------------------------"

sales = pd.read_csv("datasets/sales.csv", index_col=0)

# One row per distinct store/type pair (the first occurrence is kept)
store_types = sales.drop_duplicates(subset=['store', 'type'])
print(store_types.head())
print(SEPARATOR)

# One row per distinct store/department pair
store_depts = sales.drop_duplicates(subset=['store', 'department'])
print(store_depts.head())
print(SEPARATOR)

# Rows flagged as holiday weeks, reduced to one row per distinct date
# NOTE(review): the explicit `== True` comparison is kept from the original;
# it is redundant if is_holiday is a plain bool column - confirm the dtype
holiday_mask = sales['is_holiday'] == True
holiday_rows = sales[holiday_mask]
holiday_dates = holiday_rows.drop_duplicates(subset='date')

# Show only the date column of the de-duplicated holiday rows
print(holiday_dates['date'])

## Counting categorical variables `value_counts(sort = True)`
## Proportion `value_counts(normalize = True)`

In [None]:
import pandas as pd

sales = pd.read_csv("datasets/sales.csv", index_col=0)

# De-duplicate first so each store (or store/department pair) is counted once
store_types = sales.drop_duplicates(subset=['store', 'type'])
store_depts = sales.drop_duplicates(subset=['store', 'department'])

# Columns whose categories are counted below
type_column = store_types['type']
department_column = store_depts['department']

# Number of stores of each type
store_counts = type_column.value_counts()
print(store_counts)

# Share of stores of each type (normalize=True turns counts into fractions)
store_props = type_column.value_counts(normalize=True)
print(store_props)

# Number of occurrences of each department number, sorted by count
dept_counts_sorted = department_column.value_counts(sort=True)
print(dept_counts_sorted)

# Share of each department number, sorted by share
dept_props_sorted = department_column.value_counts(sort=True, normalize=True)
print(dept_props_sorted)



## Grouping using `groupby()`

In [None]:
import pandas as pd

sales = pd.read_csv("datasets/sales.csv", index_col=0)

# Group by type; calc total weekly sales
sales_by_type = sales.groupby("type")["weekly_sales"].sum()


# Get proportion for each type
# Series.sum() is used for the grand total instead of the builtin sum(),
# which would iterate over the Series element by element in Python
sales_propn_by_type = sales_by_type / sales_by_type.sum()
print(sales_propn_by_type)
print("----------------------------------------------")

# Group by type and is_holiday; calc total weekly sales
sales_by_type_is_holiday = sales.groupby(["type","is_holiday"])["weekly_sales"].sum()
print(sales_by_type_is_holiday)
print("----------------------------------------------")

# Group by type and is_holiday; calc max, min, mean & median weekly sales
sales_by_type_is_holiday_stat = sales.groupby(["type","is_holiday"])["weekly_sales"].agg(['max', 'min', 'mean', 'median'])
print(sales_by_type_is_holiday_stat)
print("----------------------------------------------")


## Pivot Tables

In [None]:
import pandas as pd

# Divider printed between the outputs of this cell
SEPARATOR = "----------------------------------------------"

sales = pd.read_csv("datasets/sales.csv", index_col=0)

# Mean weekly_sales per store type (pivot_table aggregates with the mean by default)
mean_sales_by_type = sales.pivot_table(values='weekly_sales', index='type')
print(mean_sales_by_type)
print(SEPARATOR)

# Mean and median weekly_sales per store type
mean_med_sales_by_type = sales.pivot_table(
    values='weekly_sales',
    index='type',
    aggfunc=['mean', 'median'],
)
print(mean_med_sales_by_type)
print(SEPARATOR)

# Mean weekly_sales per store type, with one column per is_holiday value
mean_sales_by_type_holiday = sales.pivot_table(
    values='weekly_sales',
    index='type',
    columns='is_holiday',
    aggfunc='mean',
)
print(mean_sales_by_type_holiday)
print(SEPARATOR)

# Mean weekly_sales by type (rows) and department (columns);
# type/department combinations with no data are filled with 0
mean_sales_by_type_dept = sales.pivot_table(
    values='weekly_sales',
    index='type',
    columns='department',
    aggfunc='mean',
    fill_value=0,
)
print(mean_sales_by_type_dept)
print(SEPARATOR)

# Same table with margins=True, which appends an "All" row and column
# computed with the same aggfunc (the mean here, not a sum)
mean_sales_by_type_dept_margins = sales.pivot_table(
    values='weekly_sales',
    index='type',
    columns='department',
    aggfunc='mean',
    fill_value=0,
    margins=True,
)
print(mean_sales_by_type_dept_margins)
print(SEPARATOR)