# Using Python for Data Analysis

In [None]:
import pandas as pd
import seaborn as sns

## Pandas Overview

Pandas is a widely used library for data analysis. We'll be using a small subset of its features for this talk.

In [None]:
# Loading in a CSV
# Both files are read straight from raw.githubusercontent.com, so this cell needs network access.
floods = pd.read_csv(
    filepath_or_buffer='https://raw.githubusercontent.com/ajduberstein/dartmouth_flood_data/master/floods.csv'
)
sfo = pd.read_csv(
    filepath_or_buffer='https://raw.githubusercontent.com/ajduberstein/sf_public_data/master/Air_Traffic_Passenger_Statistics.csv'
)

In [None]:
# See the first 5 rows (5 is also what head() returns when n is omitted)
floods.head(n=5)

In [None]:
# See the last 5 rows in any data set (5 is also what tail() returns when n is omitted)
floods.tail(n=5)

In [None]:
# Get summary statistics; the quartiles listed here are the ones describe() reports by default
floods.describe(percentiles=[0.25, 0.5, 0.75])

In [None]:
# Get a histogram for a single column (10 bins is the pandas default)
floods['area'].hist(bins=10)

In [None]:
# You're also in Python, so you can call in other Python functions
import math

# Apply a log scale to the histogram.
#
# In case logs are murky, log10 is essentially (number of digits - 1):
#   math.log10(100)  == 2
#   math.log10(1000) == 3
#   ...and so on.
floods['area'].apply(func=math.log10).hist(bins=10)

In [None]:
# Aggregation: number of non-null values in each column
floods.count(axis=0)

In [None]:
# Aggregate by group: per-column non-null counts within each main_cause, first 5 groups shown
(
    floods
    .groupby(by='main_cause')
    .count()
    .head(n=5)
)

In [None]:
# Sorting and chaining functions - this gives us the 10 most frequent causes of floods
# (groups are ranked by how many non-null `id` values each one has)
(
    floods
    .groupby(by='main_cause')
    .count()
    .sort_values(by='id', ascending=False)
    .head(n=10)
)

In [None]:
# Log-scale the displaced counts; any value that is not > 0 (where log10 is undefined) becomes 0
floods['log10_displaced'] = floods['displaced'].apply(
    lambda n: 0 if not n > 0 else math.log10(n)
)

## Charts

### relplots

In [None]:
import matplotlib.pyplot as plt
import seaborn as sns

%matplotlib inline
sns.set(rc={'figure.figsize':(24.7, 8.6)})
plt.figure(figsize=(45,10))

In [None]:
# Correlations in Seaborn: log10_displaced on x against area on y
sns.relplot(
    data=floods,
    x='log10_displaced',
    y='area',
    aspect=3,
)

In [None]:
# Plot each row at its (lng, lat); the low alpha lets overlapping points build up visibly
sns.relplot(
    data=floods,
    x='lng',
    y='lat',
    alpha=0.1,
    aspect=2,
)

In [None]:
# Create your own function to recode the data

def recode_cause(cause):
    """Collapse a free-text flood cause into one of five coarse categories.

    The input is converted with ``str`` and lower-cased, then searched for
    keywords in a fixed priority order; the first keyword found decides the
    category.

    Parameters
    ----------
    cause : object
        Raw cause description. Non-string values are stringified first.

    Returns
    -------
    str
        One of 'MONSOON', 'RAIN', 'SNOWMELT', 'TROPICAL STORM', or 'OTHER'
        when none of the keywords occurs in the text.
    """
    # Order matters: an earlier keyword wins over any later one.
    keyword_labels = (
        ('monsoon', 'MONSOON'),
        ('rain', 'RAIN'),
        ('melt', 'SNOWMELT'),
        ('tropical storm', 'TROPICAL STORM'),
    )
    text = str(cause).lower()
    for keyword, label in keyword_labels:
        if keyword in text:
            return label
    return 'OTHER'


# Recode every raw main_cause value, then preview the frame with the new column
floods['cause_recoded'] = floods['main_cause'].apply(func=recode_cause)
floods.head(n=5)

In [None]:
# Show relationship between two variables, colored by the recoded cause
sns.relplot(
    data=floods,
    x='lng',
    y='lat',
    hue='cause_recoded',
    alpha=0.2,
    aspect=3,
)

In [None]:
import pydeck

# Same data on an interactive map
# Assign a color to each recoded cause, then store every row's color in an `rgb` column
color_lookup = pydeck.data_utils.assign_random_colors(floods['cause_recoded'])
floods['rgb'] = floods['cause_recoded'].apply(lambda cause: color_lookup[cause])

# One pickable circle per row: placed at (lng, lat), radius scaled by severity,
# filled with the row's rgb value
scatter = pydeck.Layer(
    'ScatterplotLayer',
    data=floods,
    pickable=True,
    get_position='[lng, lat]',
    get_radius='30000 * severity',
    get_fill_color='rgb',
)
pydeck.Deck(tooltip=True, layers=[scatter]).show()

In [None]:
# Correlation between two variables, again (this time on the raw displaced counts)
corr = sns.relplot(
    data=floods,
    x='displaced',
    y='area',
    hue='cause_recoded',
    alpha=0.25,
    aspect=3,
)

### Barcharts

In [None]:
import matplotlib.pyplot as plt

# Set the larger font scale BEFORE drawing: text sizes are fixed when the axes
# are created, so setting it afterwards would only affect text added later
# (the title) and leave tick and axis labels at the old size.
sns.set(font_scale=2)

# Relative comparisons
sp = sns.barplot(
    x='cause_recoded',
    y='log10_displaced',
    data=floods)

# Rotate the category labels through tick_params, which does not depend on the
# tick label texts having been populated yet.
sp.tick_params(axis='x', labelrotation=30)
sp.set(
    xlabel='Cause',
    ylabel='Log of # of People Displaced',
    title='Relative Distributions of Flood Causes')

In [None]:
# Build a 'YYYY-MM-01' style string from Activity Period: first 4 characters, then the rest
# (presumably the column holds YYYYMM values - verify against the CSV)
sfo['datetime'] = sfo['Activity Period'].apply(
    lambda period: str(period)[:4] + '-' + str(period)[4:] + '-01'
)

In [None]:
# Time series
sns.set(style="whitegrid")
# Select the column BEFORE summing so only Passenger Count is aggregated.
# Summing the whole frame also tries to add up the text columns, which recent
# pandas versions no longer drop silently (it can raise instead).
df = sfo.groupby('datetime')['Passenger Count'].sum().reset_index()
sp = sns.lineplot(
    x='datetime',
    y='Passenger Count',
    data=df,
    linewidth=2)
# Only label the June and December ticks; every other tick gets an empty label
labels = [x if x.endswith(('-06-01', '-12-01')) else '' for x in df['datetime']]
sp.set_xticklabels(labels, rotation=30)
sp

In [None]:
import numpy as np
# Total passengers per month and price category. The column is selected before
# summing so the text columns are never aggregated (recent pandas versions no
# longer drop them silently).
df = (
    sfo.groupby(['datetime', 'Price Category Code'])['Passenger Count']
    .sum()
    .reset_index()
)
# Reshape to wide form: one row per month, one column per price category.
# The string 'sum' replaces np.sum, which newer pandas deprecates as an aggfunc.
pivoted = pd.pivot_table(
    data=df,
    index='datetime',
    columns='Price Category Code',
    values='Passenger Count',
    aggfunc='sum')
# With wide-form data seaborn draws one line per column automatically;
# passing `hue` as well raises a ValueError in seaborn >= 0.11.
c = sns.lineplot(data=pivoted, palette="tab10", linewidth=2.5)
# Rotate via tick_params so we do not depend on tick label texts that may
# still be empty before the figure is drawn.
c.tick_params(axis='x', labelrotation=30)
c

In [None]:
import numpy as np
import pandas as pd
import seaborn as sns
# From seaborn docs
# Switch seaborn to the "whitegrid" style for figures created after this point
sns.set(style="whitegrid")

# Fixed seed so the synthetic series come out the same on every run
rs = np.random.RandomState(365)
# 365 rows x 4 columns of standard-normal steps, accumulated down the rows
values = rs.randn(365, 4).cumsum(axis=0)
# One timestamp per day for 365 days
dates = pd.date_range(start="1 1 2016", periods=365, freq="D")
# Label the four series A-D and smooth each with a 7-row rolling mean
data = (
    pd.DataFrame(data=values, index=dates, columns=["A", "B", "C", "D"])
    .rolling(window=7)
    .mean()
)

In [None]:
import matplotlib.pyplot as plt
import seaborn as sns
# Heatmap from Seaborn docs

# Load the example flights dataset (long-form) and pivot it into a
# month-by-year table of passenger counts
flights_long = sns.load_dataset("flights")
# pivot() only accepts keyword arguments in pandas >= 2.0, so name them explicitly
flights = flights_long.pivot(index="month", columns="year", values="passengers")

# Draw a heatmap with the numeric values in each cell
f, ax = plt.subplots(figsize=(9, 6))
sns.heatmap(flights, annot=True, fmt="d", linewidths=.5, ax=ax)