In [None]:
import pandas as pd
import numpy as np
# Directory prefix for the CSV files. It is concatenated verbatim in front of
# each file name, so a non-empty value must end with a path separator.
path = ""

In [None]:
# Load the homelessness data set from the configured data directory.
homelessness = pd.read_csv(filepath_or_buffer=path + "homelessness.csv")

In [None]:
# Preview the first rows of homelessness to check that the load worked
homelessness.head()

In [None]:
# Print information about homelessness (index, columns, dtypes, non-null counts).
# DataFrame.info() writes the report itself and returns None, so it is called
# directly; wrapping it in print() would emit a stray "None" after the report.
homelessness.info()

In [None]:
# Print the shape of homelessness as a (number of rows, number of columns) tuple
print(homelessness.shape)

In [None]:
# Print a description of homelessness: describe() is called with its defaults,
# so pandas decides which columns and summary statistics are reported
print(homelessness.describe())

In [None]:
# Print the values of homelessness as a 2-D NumPy array (row/column labels are dropped)
print(homelessness.values)

In [None]:
# Print the column index of homelessness, i.e. the column labels
print(homelessness.columns)

In [None]:
# Print the row index of homelessness, i.e. the row labels
print(homelessness.index)

# Sorting and subsetting

In [None]:
# Order the rows of homelessness by the individuals column, smallest first
# (sort_values sorts ascending unless told otherwise).
homelessness_ind = homelessness.sort_values(by="individuals")

# Show the first rows of the sorted frame
homelessness_ind.head()

In [None]:
# Order the rows by family_members, largest first
homelessness_fam = homelessness.sort_values(
    by="family_members",
    ascending=False,
)

# Show the first rows of the sorted frame
homelessness_fam.head()

In [None]:
# Order the rows by region (ascending) and, within each region,
# by family_members (descending)
homelessness_reg_fam = homelessness.sort_values(
    by=["region", "family_members"],
    ascending=[True, False],
)

# Show the first rows of the sorted frame
homelessness_reg_fam.head()

Subsetting columns

In [None]:
# Pull out the individuals column as a Series (all rows, one column label)
individuals = homelessness.loc[:, "individuals"]

# Show the first entries of the Series
individuals.head()

In [None]:
# Keep only the state and family_members columns (a list of labels yields a DataFrame)
state_fam = homelessness.loc[:, ["state", "family_members"]]

# Show the first rows of the two-column frame
state_fam.head()

In [None]:
# Keep the rows whose individuals count exceeds 10000
ind_gt_10k = homelessness.loc[homelessness["individuals"] > 10000]

# Show the first matching rows
ind_gt_10k.head()

In [None]:
# Keep the rows whose region equals "Mountain"
mountain_reg = homelessness.loc[homelessness["region"] == "Mountain"]

# Show the first matching rows
mountain_reg.head()

In [None]:
# Keep the rows that satisfy both conditions:
# family_members below 1000 AND region equal to "Pacific"
fam_lt_1k_pac = homelessness.loc[
    (homelessness["family_members"] < 1000)
    & (homelessness["region"] == "Pacific")
]

# Show the first matching rows
fam_lt_1k_pac.head()

In [None]:
# Keep the rows whose region is either South Atlantic or Mid-Atlantic;
# a membership test replaces the two equality checks joined with "|"
south_mid_atlantic = homelessness[
    homelessness["region"].isin(["South Atlantic", "Mid-Atlantic"])
]

# Show the first matching rows
south_mid_atlantic.head()

In [None]:
# States that the Mojave Desert extends into
canu = ["California", "Arizona", "Nevada", "Utah"]

# Keep the rows whose state is one of the Mojave Desert states
mojave_homelessness = homelessness.loc[homelessness["state"].isin(canu)]

# Show the first matching rows
mojave_homelessness.head()

# Adding new columns

In [None]:
# New column total: individuals plus family_members, row by row.
# Note that this adds the column to homelessness itself.
homelessness["total"] = homelessness["individuals"].add(homelessness["family_members"])

# New column p_individuals: share of the total made up by individuals
homelessness["p_individuals"] = homelessness["individuals"].div(homelessness["total"])

# Show the frame with the two new columns
homelessness.head()

In [None]:
# New column indiv_per_10k: homeless individuals per 10,000 state residents
# (individuals scaled by 10000 first, then divided by state_pop)
homelessness["indiv_per_10k"] = (
    homelessness["individuals"].mul(10000).div(homelessness["state_pop"])
)

# Keep only the rows where indiv_per_10k is above 20
high_homelessness = homelessness.loc[homelessness["indiv_per_10k"] > 20]

# Order those rows from the highest indiv_per_10k to the lowest
high_homelessness_srt = high_homelessness.sort_values(
    by="indiv_per_10k",
    ascending=False,
)

# Reduce the sorted frame to the state and indiv_per_10k columns
result = high_homelessness_srt.loc[:, ["state", "indiv_per_10k"]]

# Show the first rows of the result
result.head()

# Summary statistics

In [None]:
# Weekly sales, in US dollars, for a set of stores.
# Every store has an id number and a specific store type.
# NOTE(review): the original note called "unemp" the national employment rate
# for the week; the column used later in this notebook is `unemployment`, so
# this is presumably the unemployment rate -- confirm against the data source.
sales = pd.read_csv(path + "sales_subset.csv")
sales.head()

In [None]:
# Print the mean of weekly_sales across all rows of sales
print(sales["weekly_sales"].mean())

In [None]:
# Print the median of weekly_sales across all rows of sales
print(sales["weekly_sales"].median())

In [None]:
# Print the largest, then the smallest, value of the date column.
# NOTE(review): sales["date"] has not been passed through pd.to_datetime at
# this point in the notebook, so the comparison uses whatever dtype read_csv
# inferred (presumably strings) -- confirm the ordering is chronological.
print(sales["date"].max())
print(sales["date"].min())

In [None]:
# Custom aggregation: inter-quartile range
def iqr(column):
    """Return the inter-quartile range of ``column``.

    ``column`` must provide a ``quantile`` method (for example a pandas
    Series). The result is the 0.75 quantile minus the 0.25 quantile.
    """
    upper_quartile = column.quantile(0.75)
    lower_quartile = column.quantile(0.25)
    return upper_quartile - lower_quartile
    
# Print IQR of the temperature_c column by handing the custom iqr function to .agg()
print(sales["temperature_c"].agg(iqr))

In [None]:
# Print IQR of temperature_c, fuel_price_usd_per_l, & unemployment.
# iqr() is the helper defined in the previous cell; it is not redefined here so
# that the notebook keeps a single definition of the function.
print(sales[["temperature_c", "fuel_price_usd_per_l", "unemployment"]].agg(iqr))

In [None]:
# Print IQR and median of temperature_c, fuel_price_usd_per_l, & unemployment.
# The median is requested by name ("median") rather than by passing np.median:
# recent pandas releases deprecate NumPy callables in .agg() and emit a
# FutureWarning for them. The custom iqr function is still passed as a callable.
print(sales[["temperature_c", "fuel_price_usd_per_l", "unemployment"]].agg([iqr, "median"]))

In [None]:
# Rows of sales for store 1, department 1.
# .copy() makes sales_1_1 an independent frame, so the column assignment below
# does not write to a slice of sales (which raises SettingWithCopyWarning).
sales_1_1 = sales[(sales["store"] == 1) & (sales["department"] == 1)].copy()

# Parse the date column into datetimes so it sorts chronologically
sales_1_1["date"] = pd.to_datetime(sales_1_1["date"])
sales_1_1.head()

In [None]:
# Put sales_1_1 in date order so the running statistics follow time
sales_1_1 = sales_1_1.sort_values(by="date")

# cum_weekly_sales: running total of weekly_sales
sales_1_1["cum_weekly_sales"] = sales_1_1["weekly_sales"].cumsum()

# cum_max_sales: running maximum of weekly_sales
sales_1_1["cum_max_sales"] = sales_1_1["weekly_sales"].cummax()

# Show the source columns next to the two running statistics
sales_1_1.loc[:, ["date", "weekly_sales", "cum_weekly_sales", "cum_max_sales"]].head()

# Dropping duplicates

In [None]:
# Parse the date column of sales into datetimes (this modifies sales itself)
sales["date"] = pd.to_datetime(sales["date"])

# Keep the first row of every distinct store/type combination
store_types = sales.drop_duplicates(
    subset=["store", "type"],
    keep="first",
)
store_types.head()

In [None]:
# Keep the first row of every distinct store/department combination
store_depts = sales.drop_duplicates(
    subset=["store", "department"],
    keep="first",
)
store_depts.head()

In [None]:
# Keep the rows flagged is_holiday, then keep one row per distinct date
holiday_dates = (
    sales.loc[sales["is_holiday"]]
    .drop_duplicates(subset="date")
)

# Show the date column of holiday_dates
holiday_dates.loc[:, "date"]

In [None]:
# Count the number of stores of each type; store_types holds one row per
# distinct store/type combination, so each store is counted once per type
store_counts = store_types["type"].value_counts()
print(store_counts)

In [None]:
# Get the proportion of stores of each type; normalize=True turns the counts
# into fractions of the total
store_props = store_types["type"].value_counts(normalize=True)
print(store_props)


In [None]:
# Count how often each department number occurs in store_depts, most frequent
# first. sort=True is the default of value_counts; it is spelled out here to
# make the sorting visible.
dept_counts_sorted = store_depts["department"].value_counts(sort=True)
print(dept_counts_sorted)



In [None]:
# Share of each department number in store_depts, largest share first
dept_props_sorted = store_depts["department"].value_counts(
    normalize=True,
    sort=True,
)
print(dept_props_sorted)

# Grouping

In [None]:
# Total weekly_sales per store type
sales_by_type = (
    sales.groupby(by="type")["weekly_sales"]
    .sum()
)

sales_by_type.head()

In [None]:
# Get proportion for each type: each type's total divided by the grand total.
# Series.sum() keeps the reduction inside pandas instead of iterating over the
# values one by one with the built-in sum().
sales_propn_by_type = sales_by_type / sales_by_type.sum()
print(sales_propn_by_type)

In [None]:
# Total weekly_sales for every combination of store type and is_holiday flag
sales_by_type_is_holiday = (
    sales.groupby(by=["type", "is_holiday"])["weekly_sales"]
    .sum()
)
sales_by_type_is_holiday.head()

In [None]:
# Preview sales again; by now its date column has been converted with pd.to_datetime
sales.head()

In [None]:
# For each store type, aggregate weekly_sales: get min, max, mean, and median.
# The aggregations are requested by name; passing np.min / np.max / np.mean /
# np.median to .agg() is deprecated in recent pandas releases and emits a
# FutureWarning. With names, this cell no longer needs numpy, so the repeated
# import is dropped (numpy is already imported in the first cell).
sales_stats = sales.groupby("type")["weekly_sales"].agg(["min", "max", "mean", "median"])

# Show sales_stats
sales_stats.head()

In [None]:
# For each store type, aggregate unemployment and fuel_price_usd_per_l:
# get min, max, mean, and median.
# The aggregations are requested by name; passing the NumPy callables
# (np.min, np.max, np.mean, np.median) is deprecated in recent pandas releases
# and emits a FutureWarning.
unemp_fuel_stats = sales.groupby("type")[["unemployment", "fuel_price_usd_per_l"]].agg(
    ["min", "max", "mean", "median"]
)

# Show unemp_fuel_stats
unemp_fuel_stats.head()

# Pivot Tables

In [None]:
# Mean weekly_sales per store type. "mean" is pivot_table's default
# aggregation; it is written out here for readability.
mean_sales_by_type = sales.pivot_table(
    values="weekly_sales",
    index="type",
    aggfunc="mean",
)

# Show mean_sales_by_type
mean_sales_by_type.head()

In [None]:
# Pivot for mean and median weekly_sales for each store type.
# The aggregations are requested by name; passing np.mean / np.median as
# aggfunc is deprecated in recent pandas releases and emits a FutureWarning.
mean_med_sales_by_type = sales.pivot_table(
    values="weekly_sales",
    index="type",
    aggfunc=["mean", "median"],
)

# Show mean_med_sales_by_type
mean_med_sales_by_type.head()

In [None]:
# Mean weekly_sales with store types as rows and the is_holiday flag as columns
# (no aggfunc is given, so pivot_table uses its default aggregation)
mean_sales_by_type_holiday = sales.pivot_table(
    values="weekly_sales",
    index="type",
    columns="is_holiday",
)

# Show mean_sales_by_type_holiday
mean_sales_by_type_holiday.head()

In [None]:
# Mean weekly_sales with departments as rows and store types as columns.
# Missing department/type combinations are filled with 0, and margins=True
# appends the summary row and column that pivot_table computes.
sales_data = sales.pivot_table(
    values="weekly_sales",
    index="department",
    columns="type",
    fill_value=0,
    margins=True,
)
sales_data.head()

# Indexes

In [None]:
# Load the temperatures data set from the configured data directory
temperatures = pd.read_csv(filepath_or_buffer=path + "temperatures.csv")
temperatures.head()

In [None]:
# Use the city column as the row index; temperatures itself is left unchanged
temperatures_ind = temperatures.set_index(keys="city")
temperatures_ind.head()

In [None]:
# Move the index of temperatures_ind back into an ordinary column. The result
# is only displayed; temperatures_ind itself is not reassigned.
temperatures_ind.reset_index()

In [None]:
# Reset the index and discard it (drop=True) instead of keeping it as a column.
# The result is only displayed; temperatures_ind itself is not reassigned.
temperatures_ind.reset_index(drop=True)

In [None]:
# Cities to subset on
cities = ["Moscow", "Saint Petersburg"]

# Boolean-mask subset of temperatures: rows whose city is in the list
temperatures.loc[temperatures["city"].isin(cities)]

In [None]:
# Subset temperatures_ind using .loc[]: select the rows whose index label is
# one of the cities. This relies on temperatures_ind being indexed by city;
# the name is re-bound with a different index further down, so the cell only
# works when the notebook is run in order.
temperatures_ind.loc[cities]

In [None]:
# Re-bind temperatures_ind with a two-level index: country (outer), city (inner)
temperatures_ind = temperatures.set_index(keys=["country", "city"])

# (country, city) pairs to select: Brazil/Rio De Janeiro and Pakistan/Lahore
rows_to_keep = [
    ("Brazil", "Rio De Janeiro"),
    ("Pakistan", "Lahore"),
]

# Select the rows whose index matches one of the pairs
temperatures_ind.loc[rows_to_keep]

In [None]:
# Sort temperatures_ind by index values (all index levels, ascending by default).
# The sorted frame is only displayed, not stored.
temperatures_ind.sort_index()

In [None]:
# Sort temperatures_ind by index values at the city level (the inner level of
# the country/city index). The sorted frame is only displayed, not stored.
temperatures_ind.sort_index(level="city")

In [None]:
# Sort by country ascending, then by city descending within each country.
# The sorted frame is only displayed, not stored.
temperatures_ind.sort_index(
    level=["country", "city"],
    ascending=[True, False],
)

# Slicing

In [None]:
# Sort the index of temperatures_ind and keep the result as temperatures_srt;
# the label slices in the following cells are taken from this sorted frame
temperatures_srt = temperatures_ind.sort_index()
temperatures_srt.head()

In [None]:
# Subset rows from Pakistan to Russia: a label slice on the outer (country)
# level of the index; .loc label slices include both endpoints
temperatures_srt.loc["Pakistan":"Russia"]

In [None]:
# Try to subset rows from Lahore to Moscow. The plain strings are matched
# against the outer index level, which holds countries rather than cities, so
# this does not select the Lahore-to-Moscow range of cities; it is kept as a
# demonstration of why the tuple form in the next cell is needed.
temperatures_srt.loc["Lahore":"Moscow"]

In [None]:
# Subset rows from Pakistan, Lahore to Russia, Moscow: each slice bound is a
# (country, city) tuple, one label per index level
temperatures_srt.loc[("Pakistan", "Lahore"):("Russia", "Moscow")]

In [None]:
# Subset in both directions at once: rows from India, Hyderabad to Iraq, Baghdad
# and the columns from date to avg_temp_c
temperatures_srt.loc[("India","Hyderabad"):("Iraq","Baghdad"),"date":"avg_temp_c"]

In [None]:
# Parse the date column of temperatures into datetimes (this modifies
# temperatures itself); the date comparisons and .dt access below rely on it
temperatures["date"] = pd.to_datetime(temperatures["date"])
temperatures.head()

In [None]:
# Boolean-mask subset: rows dated from 2010-01-01 through 2011-12-31
temperatures_bool = temperatures.loc[
    (temperatures["date"] >= "2010-01-01")
    & (temperatures["date"] <= "2011-12-31")
]
temperatures_bool.head()

In [None]:
# Re-bind temperatures_ind, this time indexed by date and sorted by that index
temperatures_ind = (
    temperatures.set_index(keys="date")
    .sort_index()
)

# Label-slice the date index by year: rows in 2010 and 2011
temperatures_ind.loc["2010":"2011"]

In [None]:
# Use .loc[] to subset temperatures_ind for rows from Aug 2010 to Feb 2011;
# the bounds are year-month strings sliced against the date index
temperatures_ind.loc["2010-08":"2011-02"]

In [None]:
# Get 23rd row, 2nd column (index 22, 1); .iloc is position-based and zero-indexed
print(temperatures.iloc[22, 1])

In [None]:
# Use slicing to get the first 5 rows (positions 0-4; the stop position is excluded)
temperatures.iloc[:5]

In [None]:
# Use slicing to get columns 3 to 4 (positions 2 and 3; the stop position is excluded)
temperatures.iloc[:, 2:4]

In [None]:
# Use slicing in both directions at once: first 5 rows, columns at positions 2 and 3
temperatures.iloc[:5, 2:4]

In [None]:
# New column year: the calendar year taken from the date column
# (this adds the column to temperatures itself)
temperatures["year"] = temperatures["date"].dt.year

# avg_temp_c with (country, city) pairs as rows and years as columns;
# no aggfunc is given, so pivot_table uses its default aggregation
temp_by_country_city_vs_year = temperatures.pivot_table(
    values="avg_temp_c",
    index=["country", "city"],
    columns="year",
)

# Show the first rows of the pivot table
temp_by_country_city_vs_year.head()

In [None]:
# Subset for Egypt to India: a label slice on the outer (country) level of the
# pivot table's row index; both endpoints are included
temp_by_country_city_vs_year.loc["Egypt":"India"]

In [None]:
# Subset for Egypt, Cairo to India, Delhi: each slice bound is a
# (country, city) tuple matching the two row-index levels
temp_by_country_city_vs_year.loc[("Egypt", "Cairo"):("India", "Delhi")]

In [None]:
# Subset in both directions at once: rows from (Egypt, Cairo) through
# (India, Delhi) and the year columns 2005 through 2010 (both ends inclusive).
# The column labels were produced by .dt.year and are therefore integers, so
# the slice bounds are integers too; string bounds such as "2005" do not match
# the integer labels.
temp_by_country_city_vs_year.loc[("Egypt", "Cairo"):("India", "Delhi"), 2005:2010]

In [None]:
# Preview the pivot table again before aggregating across its rows and columns
temp_by_country_city_vs_year.head()

In [None]:
# Mean of every year column, taken down the rows (all country/city pairs).
# axis="index" is the default direction of DataFrame.mean(); it is spelled out
# here for readability.
mean_temp_by_year = temp_by_country_city_vs_year.mean(axis="index")
mean_temp_by_year.head()

In [None]:
# Filter for the year that had the highest mean temp: keep the entries of
# mean_temp_by_year that equal its maximum (more than one if there is a tie)
mean_temp_by_year[mean_temp_by_year == mean_temp_by_year.max()]

In [None]:
# Get the mean temp by city: axis="columns" averages across the year columns,
# giving one value per (country, city) row
mean_temp_by_city = temp_by_country_city_vs_year.mean(axis="columns")
mean_temp_by_city.head()

In [None]:
# Filter for the city that had the lowest mean temp: keep the entries of
# mean_temp_by_city that equal its minimum (more than one if there is a tie)
mean_temp_by_city[mean_temp_by_city == mean_temp_by_city.min()]