<a href="https://colab.research.google.com/github/asantucci/Python-Workshop/blob/main/Pandas.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Pandas


In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
!pip install --upgrade plotly

# Covid-19 Analysis !

In [None]:
# Load the New York Times state-level Covid-19 table straight from GitHub.
# parse_dates=[0] converts the first column into datetimes.
states_url = "https://raw.githubusercontent.com/nytimes/covid-19-data/master/us-states.csv"
df = pd.read_csv(states_url, parse_dates=[0])
print(df)

In [None]:
# Reshape the long table into one row per date and one column per state.
# Date/state combinations with no record become 0.
cases_states = (
    df.pivot(index='date', columns='state', values='cases')
      .fillna(0)
)
print(cases_states)
cases_states.plot(y=['California','New York','Florida'])

In [None]:
# Day-over-day change of every state's case count.
# The first row has no predecessor, so its NaN is replaced by 0.
daily_cases_states = cases_states.diff().fillna(0)
print(daily_cases_states)
daily_cases_states.plot(y=['California','New York','Florida'])

# Dataframes

In [None]:
# Build a DataFrame from a dictionary: each key becomes a column.
# Lists and arrays supply one value per row; a scalar ('Stanford') is
# broadcast to every row.
wakeup_times = [pd.Timestamp(t) for t in ('07:00:00', '08:30:00', '07:30:00')]
data = dict(
    Name=['Leo', 'Bob', 'John'],
    WakeupTime=wakeup_times,
    GPA=np.arange(1.0, 4.0),
    School='Stanford',
    Siblings=np.array([1, 2, 0]),
)
df = pd.DataFrame(data)
df

In [None]:
# Promote the 'Name' column to row labels.
# set_index returns a new frame; df itself is left untouched.
df2 = df.set_index(keys='Name')
df2

In [None]:
# Reading a CSV is the most common way to get a DataFrame.
# read_csv accepts a URL as well as a local path.
names_url = "https://web.stanford.edu/~lcambier/pc/names.csv"
df = pd.read_csv(names_url)
df

### Exercise

In [None]:
# Exercise: build the dataframe where *names are the indices*
# and email & age are the columns. Adjust column names accordingly.

# (name, email) pairs, split below into the two lists used by the exercise
people = [
    ('Leo', 'lc@comp.com'),
    ('Bob', 'bob@stanford.edu'),
    ('Jess', 'j@e.ss'),
    ('Casey', 'casey@my.me'),
    ('John', 'john@deer.us'),
    ('Cherr', 'cherr@y.net'),
]
names  = [name for name, _ in people]
emails = [email for _, email in people]
# One random integer age per person, drawn from 1..29 (upper bound excluded)
ages   = np.random.randint(low=1, high=30, size=6)

In [None]:
#@title Solution
# Dictionary keys become the column names; `index=` supplies the row labels.
columns = {'Email': emails, 'Age': ages}
df = pd.DataFrame(data=columns, index=names)
df

### Analyzing data

In [None]:
# Create some data: 12 month-end dates x 3 tickers of normally distributed
# "changes" (mean 0, standard deviation 1.2).
# A seeded generator makes the table, and the selection examples that follow,
# identical on every run.
rng = np.random.default_rng(42)
# An explicit MonthEnd offset replaces the 'M' alias, which recent pandas
# versions deprecate; the resulting dates are the same month ends.
months = pd.date_range(start='20190101', periods=12, freq=pd.offsets.MonthEnd())
change = rng.normal(0, 1.2, (12, 3))
stocks = ['GOOG', 'TSLA', 'APPL']
df = pd.DataFrame(change, index=months, columns=stocks)
df

Quick glance at data

In [None]:
# First three rows, last two rows, then per-column summary statistics,
# each followed by a blank line.
for view in (df.head(3), df.tail(2), df.describe()):
    print(view, '\n')

Selecting data

In [None]:
## Selection by column label, by row position and by row/column labels

# A single column, picked by its label
goog = df['GOOG']
print(goog, '\n')

# A slice of rows by position: rows 2, 3 and 4 (the end is excluded)
print(df.iloc[2:5], '\n')

# Several rows & columns by label.
# Label slices INCLUDE both endpoints, unlike regular Python slicing.
rows = slice('2019-07-31', '2019-09-30')
print(df.loc[rows, ['TSLA', 'GOOG']], '\n')

In [None]:
## Selection using conditions (boolean masks)

print(df, '\n')

# Keep only the rows where GOOG exceeds 2.5
big_goog = df['GOOG'] > 2.5
print(df.loc[big_goog], '\n')

# Keep only the rows dated on or after 2019-08-15
late = df.index >= '2019-08-15'
print(df.loc[late], '\n')

# Element-wise filter over the whole frame: entries failing the test become NaN
print(df.where(df > 0.5), '\n')

## Groupby

In [None]:
# A small table of businesses: name, city, customer count and revenue.
data = {
    'Name': [
        "Tom's Pizza",
        "Leo's Taqueria",
        "John's Burgers",
        "Cindy's Peluqueria",
        "Sergio's Tacos",
        "Bazyli's Pub",
    ],
    'Location': ['NYC', 'SF', 'WDC', 'SF', 'SF', 'NYC'],
    'Num Customers': [5, 3, 8, 4, 6, 8],
    'Revenue': [32.6, 54.6, 43.8, 43.6, 32.6, 97.5],
}
df = pd.DataFrame(data)
df

In [None]:
# Split the rows by 'Location'. Iterating over a GroupBy yields
# (group label, sub-DataFrame) pairs.
groups = df.groupby('Location')
for n, g in groups:
    print("-------\nGroup {}\n".format(n))
    # numeric_only=True skips the text columns ('Name', 'Location'), which
    # cannot be averaged and make mean() raise on recent pandas versions.
    print(g.mean(numeric_only=True))

# The same per-group averages computed in one call: one row per location.
df2 = groups.mean(numeric_only=True)
# Other aggregations follow the same pattern, e.g. groups.max()
print(df2)


## Pivot

In [None]:
# Long-format price table: one row per (date, crypto) observation.
# Prices are stored as numbers, not strings, so the pivoted table built from
# them can be plotted and aggregated.
df = pd.DataFrame({'date':['2020-01-01','2020-01-01','2020-02-01','2020-02-01'],
                   'crypto':['BTC','ETH','ETH','BTC'],
                   'price':[8192, 350, 405, 9510],
                   'exchange':['Coinbase','Bitconnect','Bitconnect','Bitconnect']})
df

In [None]:
# Long -> wide: one row per date, one column per crypto, cells hold the price.
df2 = df.pivot(
    index='date',
    columns='crypto',
    values='price',
)
df2

In [None]:
# Load the stocks table; parse_dates=[1] converts the second column to datetimes.
stocks_url = 'https://web.stanford.edu/~lcambier/pc/stocks.csv'
df = pd.read_csv(stocks_url, parse_dates=[1])
df

In [None]:
# One row per date, one column per stock, cells hold the opening price.
df2 = df.pivot(
    index='Date',
    columns='Stock',
    values='Open',
)
# Draw two of the stock columns against the date index.
df2.plot(y=['APPL','SBUX'])
plt.show()

# Fancy Plotting !

In [None]:
import pandas as pd
import numpy as np
import urllib.request
import plotly.express as px
import json

# County outlines as GeoJSON, passed to the choropleth below.
with urllib.request.urlopen('https://raw.githubusercontent.com/plotly/datasets/master/geojson-counties-fips.json') as response:
    counties = json.load(response)

# Read the identifier columns as strings so FIPS codes are not turned into numbers.
df = pd.read_csv('https://raw.githubusercontent.com/nytimes/covid-19-data/master/us-counties.csv', dtype={"fips": str, "state": str, "county": str})
# Give the "New York City" rows an explicit FIPS code so they can be matched
# on the map. NOTE(review): presumably these rows carry no FIPS in the file — confirm.
df.loc[df['county'] == "New York City",'fips'] = "36061"

# The file has one row per county per date holding running (cumulative) totals,
# so adding the rows up would count the same cases and deaths many times.
# Keep each county's most recent figures instead.
df = (
    df.sort_values('date')
      .groupby(['fips', 'state', 'county'], as_index=False)[['cases', 'deaths']]
      .last()
)

# log10 is undefined at 0: mask counties without deaths so they become NaN
# rather than -inf (and no divide-by-zero warning is raised).
deaths = df['deaths']
df['Deaths (log10)'] = np.log10(deaths.where(deaths > 0))

fig = px.choropleth(df, locations='fips',
                        color='Deaths (log10)',
                        scope='usa',
                        geojson=counties,
                        hover_data=['deaths'])
fig.show()

# Exercise: bikes in Montreal

Link to data: https://stanford.edu/~lcambier/pc/bikes.csv

In [None]:
# (1)
# TODO: Properly read bikes.csv. 
# - Use ; as a separator
# - Parse the column 'Date' as dates and note that the day comes first in the CSV ("non-US" way)
# Check out https://pandas.pydata.org/pandas-docs/stable/reference/api/pandas.read_csv.html
# Yes it's a little scary, it would not be Pandas otherwise :-)
# Search "dayfirst" and you'll find the option
# Print the first 15 rows and check what you just read

In [None]:
#@title Solution
# Semicolon-separated file; 'Date' is parsed day-first and becomes the index.
bikes_url = 'https://stanford.edu/~lcambier/pc/bikes.csv'
df = pd.read_csv(
    bikes_url,
    sep=';',
    parse_dates=["Date"],
    dayfirst=True,
).set_index('Date')
print(df.head(15))

In [None]:
# (2)
# TODO: Plot the number of bikes in "du Parc" as a function of time
# Label the axes and put a title

In [None]:
#@title Solution
# df.plot returns the Axes it drew on; label that Axes directly.
ax = df.plot(y='du Parc')
ax.set_xlabel('Date')
ax.set_ylabel('Number of bikes')
ax.set_title("Du Parc's bikes");

In [None]:
# (3)
# TODO (bonus): Can you try to smooth out the curve ? 
# Use a 1-week moving average.
# You'll have to google that

In [None]:
#@title Solution
# Moving average over a window of 7 consecutive rows, then plot one column.
smoothed = df.rolling(window=7).mean()
smoothed.plot(y=['du Parc'])

In [None]:
# (4) 
# TODO (bonus 2)
#  1. Create a column holding the weekday
#     df.index.weekday will give you that column.
#  2. Sum all cyclists in each neighborhood for each week day
#     Make a bar plot of the cyclists/day for each neighborhood
#  3. Sum all neighborhood and make a pie chart of the total number
#     of cyclists/day
# Hint: 
# - df.index.weekday returns the weekday for each date in the index
# - df.sum(axis=...) sums across rows (axis = 0) or columns (axis = 1)
# - df.plot(kind=...) can do bar plots (kind = bar) or pie (kind = pie)

In [None]:
#@title Solution
# Tag every row with the weekday of its date (this adds a column to df),
# then total each remaining column per weekday.
df['Weekday'] = df.index.weekday
df_per_week = df.groupby(by='Weekday').sum()
print(df_per_week)
df_per_week.plot.bar()
plt.title('Cyclists per day of the week per neighborhood')
plt.show()

In [None]:
#@title Solution
# Collapse the columns: one total per weekday, shown as slices of a pie.
df_per_week_all = df_per_week.sum(axis='columns')
df_per_week_all.plot.pie()
plt.title('Cyclists per day of the week')
plt.show()

# Extra Exercise: 311 Customer complaints

Link to data: https://stanford.edu/~lcambier/pc/311.csv

## Read the data

In [None]:
# TODO: Read the data, get a sense of what's in it by displaying some rows, printing columns names, etc

In [None]:
#@title Solution
complaints_url = 'https://stanford.edu/~lcambier/pc/311.csv'
df = pd.read_csv(complaints_url)
# First rows, summary statistics, then the column names.
for overview in (df.head(5), df.describe(), df.columns):
    print(overview)

## Most common complaints

In [None]:
# TODO: Find the 10 most common complaints and visualize the distribution of complaints
# Hint:
# - df[column].value_counts() can count the number of occurrences of entries in a column

In [None]:
#@title Solution
# value_counts() returns the counts sorted from most to least frequent,
# so the first ten entries are the ten most common complaint types.
ct = df['Complaint Type'].value_counts()
print(ct.iloc[:10])
ct.plot.pie()

## Plumbing complaints per borough

In [None]:
# TODO: Find the borough with the most PLUMBING complaints

In [None]:
#@title Solution
# Keep the PLUMBING rows only, then count them per borough.
is_plumbing = df['Complaint Type'] == 'PLUMBING'
df_p = df.loc[is_plumbing]
df_p_vc = df_p['Borough'].value_counts()
df_p_vc.plot.bar()

## Time of complaint

In [None]:
# (1) 'Created Date' is a string in df.
# Convert it to a proper DatetimeIndex
# and keep the hour only
# Tip: https://pandas.pydata.org/pandas-docs/stable/reference/api/pandas.DatetimeIndex.html

# (2) Count each occurrence and plot the distribution


In [None]:
#@title Solution
# Parse the 'Created Date' strings and keep only the hour of day.
hour = pd.DatetimeIndex(df['Created Date']).hour

# value_counts() orders by frequency; sort by the hour itself so the bars
# run chronologically and the plot reads as a time-of-day distribution.
hour_count = hour.value_counts().sort_index()
hour_count.plot(kind='bar')
plt.xlabel('Hour of day')
plt.ylabel('Number of complaints')

## Harder: Analyse complaints geographical distribution

In [None]:
# TODO: Plot the position of the complaints
# Tip: https://pandas.pydata.org/pandas-docs/stable/reference/api/pandas.DataFrame.plot.html

In [None]:
#@title Solution
# One dot per complaint, placed at its coordinates.
df.plot.scatter(x='Longitude', y='Latitude')
plt.show()

In [None]:
# (2) TODO: Plot the position of the complaints registered in Manhattan only

# (3) Round the longitude and latitude to the nearest 0.01
# Tip: np.around should be helpful

# (4) Group data by (latitude, longitude), count the size of each group
# and aggregate

# (5) Plot the (longitude, latitude, size) on a xy scatter plot


In [None]:
#@title Solution (2)
# (2)
# Boolean mask selecting the complaints whose borough is Manhattan
man = df['Borough'].eq('MANHATTAN')
df.loc[man].plot.scatter(x='Longitude', y='Latitude')
plt.show()

In [None]:
#@title Solution (3-4)
# Round the Manhattan coordinates to 2 decimals in place (this modifies df),
# so that nearby complaints share the same (Latitude, Longitude) pair.
for coord in ('Longitude', 'Latitude'):
    df.loc[man, coord] = df.loc[man, coord].round(2)
# Number of complaints for each rounded (Latitude, Longitude) pair
s = df[man].groupby(['Latitude','Longitude']).size()
print(s.head(10))

In [None]:
#@title Solution (5)
# s is indexed by (Latitude, Longitude). Look the levels up by name so that
# longitude goes on the x axis and latitude on the y axis, like the earlier
# scatter plots; taking the levels in index order would swap the two axes.
ids = s.index
plt.figure()
plt.scatter(x=ids.get_level_values('Longitude'),
            y=ids.get_level_values('Latitude'),
            s=3*s.values)  # marker area grows with the number of complaints
plt.xlabel('Longitude')
plt.ylabel('Latitude')
plt.show()