# Introduction to Pandas and Matplotlib

---

In [None]:
import pandas as pd # Pandas = 'Python Data Analysis Library' (https://pandas.pydata.org/)
import matplotlib  # A 2D plotting library (https://matplotlib.org/)
import matplotlib.pyplot as plt
import seaborn as sns # Statistical data visualization (https://seaborn.pydata.org/)

----

## Let's get started with Data Frames, a table structure of rows and columns used in Pandas

In [None]:
# pd.DataFrame builds a new data frame from a 'list of dictionaries':
# every dictionary is one row, and its keys become the column names.

df = pd.DataFrame(
    data=[
        {"First Name": "Alex", "Last Name": "Siegman"},
        {"First Name": "John", "Last Name": "Doe"},
    ]
)

In [None]:
df  # a bare name on the last line of a cell displays its value

### Let's dig a bit deeper to understand what just happened

In [None]:
# A dictionary maps each <key> to a <value>
dictionary = {
    "First Name": "Alex",    # <key>: <value>
    "Last Name": "Siegman",  # <key>: <value>
}

In [None]:
dictionary["First Name"]  # look up the value stored under the key "First Name"

In [None]:
dictionary["Last Name"]  # look up the value stored under the key "Last Name"

In [None]:
df  # show the data frame again for comparison with the dictionary above

In [None]:
# Square brackets on a data frame look up a column *label*, not a row position.
# There is no column labelled 0, so pandas raises a KeyError. We catch it and
# print it so the demonstration does not stop the notebook on "Run All".
try:
    df[0]
except KeyError as err:
    print("KeyError:", err)

In [None]:
df["First Name"]  # select a column by its label -- this is how we get what we need

# So that's all a data frame is, it's a table of rows and columns! 

---

# Now, let's begin to delve further into Pandas with a different data set. 

In [None]:
!rm -f restaurant.csv* # 'rm' = 'remove'
                            # '-f' means 'force': never prompt, and do not complain if no matching file exists
                            # 'restaurant.csv*' matches any file in the current directory whose name starts with 'restaurant.csv'
                            # in total, this command removes any prior download, if it exists

!curl 'https://data.cityofnewyork.us/api/views/43nn-pn8j/rows.csv?accessType=DOWNLOAD' -o restaurant.csv
                            # 'curl' is a tool to transfer data from or to a server
                            # '-o restaurant.csv' writes the downloaded data to that file
                            # for more on 'curl' visit (https://curl.haxx.se/docs/manpage.html)

# !gzip data/restaurant.csv # compress the file

# Now that we have our data...

In [None]:
# Let pandas print up to 1000 rows and 1000 columns before it truncates the output
pd.set_option("display.max_rows", 1000)
pd.set_option("display.max_columns", 1000)

In [None]:
# Read every column as plain text. The numeric, date and categorical columns
# are converted explicitly in later cells, so no automatic date parsing is
# requested here ('infer_datetime_format' is deprecated, and 'parse_dates=True'
# only ever applied to the index, which this file does not have).
restaurants = pd.read_csv(
    './restaurant.csv',
    encoding='utf-8',  # for more on UTF-8 check (https://www.w3schools.com/charsets/ref_html_utf8.asp)
    dtype=str,         # read every column as a string (stored by pandas as dtype 'object')
    low_memory=False,  # parse the whole file in one pass instead of in chunks
)

---

# For column definitions let's check out [the documentation](https://data.cityofnewyork.us/api/views/43nn-pn8j)

---

In [None]:
restaurants.head(5)  # first five rows of the data frame (5 is also the default)

In [None]:
restaurants.tail(5)  # last five rows of the data frame (5 is also the default)

# Now that we have successfully read our CSV, let's look at some basics

In [None]:
restaurants.describe() # summary statistics for each column

In [None]:
restaurants.info() # column names, non-null counts, dtypes and memory usage

# Note that above all of our data is stored as a non-null object, aka, a string.  But 'Score' is not a string, it's a numerical value. So let's work to alter that in our data frame.

In [None]:
# Overwrite the text column "SCORE" with the same values converted 'to numeric'
restaurants["SCORE"] = restaurants["SCORE"].pipe(pd.to_numeric)

restaurants.dtypes

In [None]:
restaurants["SCORE"].describe()  # summary statistics for just the "SCORE" column

In [None]:
restaurants["SCORE"].hist() # histogram of the scores with the default number of bins

In [None]:
restaurants["SCORE"].hist(bins=50) # same histogram, but split into 50 bins instead of the default

# Let's take a moment to explore what else we can customize in our histogram.

In [None]:
restaurants["SCORE"].hist(
    figsize=(15, 5),  # size of the plot
    range=(0, 50),    # only bin scores between 0 and 50 on the x-axis
    bins=50,          # split that range into 50 bins
    density=False,    # raw counts; density=True would show normalized counts instead
    alpha=0.8,        # 80% opaque, i.e. 20% transparent
)

# We can also use KDE (kernel density estimation) to estimate a continuous function, instead of bucketized as above.

In [None]:
# Kernel density estimate of the scores, drawn in black over the range 0-50
restaurants["SCORE"].plot.kde(color='Black', xlim=(0, 50), figsize=(15, 5))

---

# Now let's do some work with dates.

In [None]:
restaurants["GRADE DATE"].head(n=10)  # the first ten values of this column

# Note that our dates are stored as strings, which doesn't really help us. So, we can convert all of our dates using the 'to_datetime' function, and format them as illustrated below.

In [None]:
# Format codes used below:
#   %m  Month as a zero-padded decimal number
#   %d  Day of month as a zero-padded decimal number
#   %Y  Year with century as a decimal number (lowercase %y is the two-digit year)

for date_column in ("GRADE DATE", "RECORD DATE", "INSPECTION DATE"):
    restaurants[date_column] = pd.to_datetime(restaurants[date_column], format="%m/%d/%Y")

In [None]:
restaurants.dtypes # the three date columns should now show a datetime dtype instead of 'object'

In [None]:
restaurants[["INSPECTION DATE","GRADE DATE","RECORD DATE"]].describe() # summary statistics for the three date columns

---

# Exercise 1: Plot a histogram of our dates

In [None]:
# your code here

# Solution 1:

In [None]:
restaurants["INSPECTION DATE"].hist() # histogram of inspection dates with the default number of bins

---

# Exercise 2: Change the number of bins in our histogram

In [None]:
# your code here

# Solution 2: 

In [None]:
restaurants["INSPECTION DATE"].hist(bins=60) # same histogram, now split into 60 bins

---

# Exercise 3: Focus on the dates 1/1/2014 thru 9/30/2018

In [None]:
# your code here

# Solution 3:

In [None]:
restaurants["INSPECTION DATE"].hist(
    figsize = (15,5),
    bins = 57, # one bin per month: January 2014 through September 2018 is 57 months
    range = ('1/1/2014', '9/30/2018') # only bin inspections between these two dates
)

---

# Now that we've worked with our dates, let's look at categorical values.

# Sometimes we need categorical values, when we have a variable that has an implicit order, for instance an 'ABC' grade (as we do in our restaurants data set).

In [None]:
# Columns whose values are labels with no meaningful order
for nominal_column in ("BORO", "VIOLATION CODE", "CRITICAL FLAG", "ACTION", "CUISINE DESCRIPTION"):
    restaurants[nominal_column] = pd.Categorical(restaurants[nominal_column], ordered=False)

# 'ordered=True' with categories listed from lowest to highest means pandas
# treats 'A' < 'B' < 'C'. Any grade that is not one of these three becomes missing (NaN).
restaurants["GRADE"] = pd.Categorical(restaurants["GRADE"], categories=['A', 'B', 'C'], ordered=True)

restaurants.dtypes

# Let's delve into a particular column, 'CUISINE DESCRIPTION'

In [None]:
restaurants["CUISINE DESCRIPTION"].value_counts().head(5)  # the five most common cuisines and how often each appears

In [None]:
restaurants["CUISINE DESCRIPTION"].value_counts().head(5).plot.bar()  # bar chart of the five most common cuisines

# ^ That is super ugly. Let's shorten the name of 'Latin (Cuban, Dominican...' because it is messing up our formatting.

In [None]:
# Replace the long label with a short one and assign the result back to the column.
# Calling .replace(..., inplace=True) on a column selected from the frame acts on a
# temporary object and, under pandas' Copy-on-Write, leaves the frame unchanged.
# Going through plain strings also avoids .replace on a categorical column, which is deprecated.
restaurants["CUISINE DESCRIPTION"] = (
    restaurants["CUISINE DESCRIPTION"]
    .astype(object)
    .replace(
        to_replace='Latin (Cuban, Dominican, Puerto Rican, South & Central American)', # replace this...
        value='Latin American', # with this
    )
    .astype("category")  # back to an (unordered) categorical column
)

# While we're at it, let's fix the formatting in 'Cafe/Coffee/Tea'

In [None]:
# Normalize the accented label and assign the result back to the column
# (an in-place .replace on a selected column does not reliably change the frame).
# Both the mis-decoded spelling and the correctly decoded one are mapped to the
# plain-ASCII label, so the fix works however the file was decoded.
restaurants["CUISINE DESCRIPTION"] = (
    restaurants["CUISINE DESCRIPTION"]
    .astype(object)
    .replace({
        'CafÃ©/Coffee/Tea': 'Cafe/Coffee/Tea',  # replace this (mis-decoded form)...
        'Café/Coffee/Tea': 'Cafe/Coffee/Tea',   # ...or this (correctly decoded form)
    })
    .astype("category")  # back to an (unordered) categorical column
)

In [None]:
# Count how often each cuisine appears (most common first), then chart the top five
popular = restaurants["CUISINE DESCRIPTION"].value_counts()
popular.head(5).plot.bar()

---

# Exercise 4: What are the 10 most common violation codes? 

In [None]:
# your code here

# Solution 4: 

In [None]:
# value_counts() sorts from most to least frequent, so the first ten rows are the ten most common codes
violation_counts = restaurants["VIOLATION CODE"].value_counts()

violation_counts.head(10)

---

# Exercise 5: Plot the 10 most common violation codes as a bar chart

In [None]:
# your code here

# Solution 5: 

In [None]:
violation_counts[0:10].plot(kind='bar') # the exercise asks for the 10 most common codes

---

# Exercise 6: Plot the number of inspections across each borough

In [None]:
# your code here

# Solution 6:

In [None]:
restaurants["BORO"].value_counts().plot.barh()  # 'barh' is a horizontal bar chart

---

# Imagine we want to get a subset of our data frame based on the columns we're interested in.

In [None]:
# The list of columns we're interested in
columns = ["GRADE DATE", "VIOLATION CODE", "DBA", "SCORE"]
restaurants.loc[:, columns].head(10)  # all rows, only those columns; show the first ten

# What if instead we wanted to select the rows we're interested in? Well, to do that, we can generate a list of boolean (True or False) values, one for each row of our Data Frame, then use that list to choose which rows to keep.

# In this case, '04L' is the code for 'has mice'.


In [None]:
# One True/False value per row: True where the violation code is "04L"
mice = restaurants["VIOLATION CODE"].eq("04L")

mice

In [None]:
# Keep only the rows of our original df ('restaurants') where 'mice' is True,
# and store the result in a new data frame called 'has_mice'
has_mice = restaurants.loc[mice]

has_mice

---

# Exercise 7: Which restaurants have the most mice complaints? 

In [None]:
# your code here

# Solution 7: 

In [None]:
has_mice["DBA"].value_counts().head(20)  # "DBA" ('Doing Business As') is the name of the entity; show the 20 most frequent

---

# Exercise 8: Let's pull up all of Subway's mice complaints

In [None]:
# your code here

# Solution 8:

In [None]:
has_mice[has_mice["DBA"].eq("SUBWAY")]  # keep only the rows whose "DBA" is exactly "SUBWAY"

---

# Now let's do some work with Pivot Tables. First, let's count the number of restaurants inspected every day.

In [None]:
# One row per inspection date, holding the number of distinct restaurants
# (CAMIS ids) seen on that date. The data has one row per violation, so
# 'count' would count violation rows; 'nunique' counts each restaurant once per day.
pivot = pd.pivot_table(
    data=restaurants,
    index = "INSPECTION DATE", # specifies rows
    values = "CAMIS", # specifies content of cells
    aggfunc = "nunique" # count how many different CAMIS values we see
)

In [None]:
pivot # display the pivot table: one row per inspection date

In [None]:
pivot.plot() # line plot of the "CAMIS" column against the date index

In [None]:
pivot.head() # the first five rows, i.e. the earliest dates

# It looks like that 1900-01-01 value is really throwing us off. Let's get rid of it. 

In [None]:
# we use pd.to_datetime to convert the '1900-01-01' string to a datetime data type
# we use axis='index' to specify that we mean delete a row with that index value
# errors='ignore' makes this cell safe to re-run: a second run (or data without
# that date) simply leaves the table unchanged instead of raising a KeyError
pivot = pivot.drop(pd.to_datetime('1900-01-01'), axis='index', errors='ignore')

In [None]:
pivot.plot(figsize=(10,5)) # plot again, now without the 1900-01-01 row, on a 10x5 figure

In [None]:
pivot.tail(30).plot() # let's look at the last 30 rows (the 30 most recent inspection dates, not necessarily 30 calendar days)

In [None]:
pivot.resample('1W').mean().tail(10) # use resample to group the daily rows into weekly bins, then compute the
                                     # mean of each bin (a daily average -- use .sum() for the weekly total)


In [None]:
pivot.resample('7D').mean().plot() # plot the average daily value within each 7-day bin

---

# Exercise 9: Plot the total number of inspections over 1 month

In [None]:
# your code here

# Solution 9: 

In [None]:
pivot.resample('1M').sum().plot() # group by calendar month and plot the monthly total
                                  # NOTE(review): recent pandas versions spell this frequency '1ME' -- confirm against the installed version

---

# We can also add some basic titles to our plot.

In [None]:
# .plot() returns the matplotlib Axes it drew on; we keep it so we can label it
weekly_average = pivot.resample('7D').mean()
plot = weekly_average.plot()
plot.set_ylabel("Average Number of Inspections (7-day average)")
plot.set_xlabel("Date of Inspection")
plot.set_title("Analysis of Number of Inspections over Time")

---

# Exercise 10: Create a pivot table where we break down the results by boro

In [None]:
# your code here

# Solution 10: 

In [None]:
# One row per inspection date and one column per boro; each cell is the number
# of distinct restaurants (CAMIS ids) inspected in that boro on that date.
# The data has one row per violation, so 'count' would count violation rows.
boro_pivot = pd.pivot_table(
    data = restaurants,
    index = 'INSPECTION DATE', # specifies rows
    columns = 'BORO', # specifies columns
    values = 'CAMIS', # specifies content of cells
    aggfunc = 'nunique' # count each restaurant once per date and boro
)

boro_pivot.tail(10)

---

---

# Let's now take some time to explore Matplotlib

---

---

In [None]:
%matplotlib inline 
# a line starting with '%' is a 'magic command'; this one tells Jupyter to draw matplotlib plots inline, directly below the cell that creates them

import matplotlib.pyplot as plt
import numpy as np

In [None]:
np.random.normal(size=10) # draw 10 random samples from a normal distribution (default mean 0, standard deviation 1)

In [None]:
t = np.arange(0.0, 5.0, 0.1)  # evenly spaced values from 0 up to (but not including) 5, at 0.1 intervals

In [None]:
import math 

# Each call adds one curve to the same figure; the third argument is a format
# string giving the colour and the marker/line style.
plt.plot(t, t, 'r--') # plot t as is with red dashes
plt.plot(t, t**2, 'bs') # plot t**2 with blue squares
plt.plot(t, t**1.5, 'g^') # plot t**1.5 with green triangles
plt.plot(t, 2*np.sin(5*t), 'm-') # plot 2*sin(5*t) with a magenta line
plt.show()

# Note, there are lots of predefined styles available, too


In [None]:
print(plt.style.available) # list the names of the styles that can be passed to plt.style.use

In [None]:
# Two independent samples of 1,000 floats each, drawn from a normal distribution
xvalues = np.random.normal(size=1000)
yvalues = np.random.normal(size=1000)

# Switch to one of the predefined styles, then scatter the samples as red circles ('ro')
plt.style.use('fivethirtyeight')
plt.plot(xvalues, yvalues, 'ro')
plt.xlabel("Style: fivethirtyeight")
plt.show()

# Let's break down all the possibilities with Matplotlib

In [None]:
# One figure holding a 2x2 grid of subplots; add_subplot(2, 2, k) selects the k-th cell
fig = plt.figure(figsize=(10,6))

# First subfigure: scatter plot of two independent random samples
sub1 = fig.add_subplot(2,2,1)
sub1.set_xlabel('some random numbers')
sub1.set_ylabel('more random numbers')
sub1.set_title("Random scatterplot")
sub1.plot(np.random.randn(1000), np.random.randn(1000), 'r.')

# Second subfigure: histogram of a normal sample
# (the y-axis of a histogram is a count per bin, not a cumulative sum)
sub2 = fig.add_subplot(2,2,2)
sub2.hist(np.random.normal(size=500), bins=15)
sub2.set_xlabel('sample')
sub2.set_ylabel('frequency')
sub2.set_title("Normal distribution")

# Third subfigure: three noisy curves of the form sin(x) + slope * x
numpoints = 100
x = np.linspace(0, 10, num=numpoints)
sub3 = fig.add_subplot(2,2,3)
sub3.plot(x, np.sin(x) + x + np.random.randn(numpoints), "r")
sub3.plot(x, np.sin(x) + 0.5 * x + np.random.randn(numpoints), "g")
sub3.plot(x, np.sin(x) + 2 * x + np.random.randn(numpoints), "b")
sub3.set_xlabel('x from 0 to 10')
sub3.set_ylabel('function value')

# Fourth subfigure: 2D histogram of two independent normal samples
# (note that x is reused here and no longer holds the linspace values)
sub4 = fig.add_subplot(2,2,4)
x = np.random.randn(10000)
y = np.random.randn(10000)
sub4.hist2d(x,y,bins=100);
sub4.set_xlabel('x axis title')
sub4.set_ylabel('y axis title')

# Adjust the spacing so labels do not overlap, then save the figure to disk
plt.tight_layout()
plt.savefig("normalvars.png", dpi=150)

# A bit more on what can be done...

In [None]:
# A single argument (subplots=True) draws each series in its own subplot

variables = pd.DataFrame({
    'normal': np.random.normal(size=100),
    'gamma': np.random.gamma(1, size=100),
    'poisson': np.random.poisson(size=100),
})

# cumulative sum down the rows of every column, one subplot per column
variables.cumsum(axis=0).plot(subplots=True, figsize=(10, 6))

In [None]:
# Or, have some series displayed on secondary y-axis
# (here the 'normal' column gets its own scale on the second y-axis)

variables.cumsum(0).plot(secondary_y='normal')

In [None]:
# One row of three side-by-side axes; draw each variable's cumulative sum on its own axes
fig, axes = plt.subplots(nrows=1, ncols=3, figsize=(12, 4))
for ax, var in zip(axes, ['normal', 'gamma', 'poisson']):
    variables[var].cumsum(0).plot(ax=ax, title=var)

# Label each y-axis with the variable it shows
axes[0].set_ylabel('cumulative sum (normal)')
axes[1].set_ylabel('cumulative sum (gamma)')
axes[2].set_ylabel('cumulative sum (poisson)')

# Let's check out a new data set

In [None]:
titanic = pd.read_csv('./titanic.csv') # expects titanic.csv in the same directory as this notebook
titanic.head() # show the first five rows

In [None]:
titanic.groupby('Pclass')['Survived'].sum()  # total of the 'Survived' column within each passenger class

In [None]:
titanic.groupby('Pclass')['Survived'].sum().plot.bar()  # the same totals as a bar chart

In [None]:
titanic.groupby(['Sex', 'Pclass'])['Survived'].mean()  # mean of 'Survived' for each (sex, class) combination

In [None]:
titanic.groupby(['Sex', 'Pclass'])['Survived'].mean().plot.barh()  # the same means as a horizontal bar chart

In [None]:
# Cross-tabulate: rows are (class, sex) pairs, columns are Survived as False/True,
# and each cell is the number of passengers in that combination
death_counts = pd.crosstab(
    index=[titanic.Pclass, titanic.Sex],
    columns=titanic.Survived.astype(bool),
)
death_counts

In [None]:
death_counts.plot(kind='bar', stacked=True, color=['red','blue'], grid=True) # stacked bars: red = first column (False), blue = second column (True)

In [None]:
# Divide every row by its own total so each bar shows proportions that add up to 1
death_counts.div(death_counts.sum(axis=1).astype(float), axis=0).plot.barh(stacked=True, color=['red', 'blue'])

---

# Histograms

---

In [None]:
# How were fares distributed aboard the titanic? 
# A histogram of the "Fare" column with the default number of bins

titanic["Fare"].hist()

---

# Exercise 12: How do we divide our histogram into 30 bins? 

In [None]:
# your code here

# Solution 12: 

In [None]:
titanic["Fare"].hist(bins=30) # the 'bins' argument sets how many bins the histogram uses

---

# Density Plots

## Rather than purely represent the underlying data, this is an _estimate_ of the underlying true distribution.

---

In [None]:
titanic["Fare"].plot.kde(xlim=(0, 100), ylim=(0, 0.05))  # density estimate, zoomed in to fares between 0 and 100

In [None]:
titanic["Fare"].plot.kde(xlim=(0, 600))  # notice what happens when we change our xlim

---

# Boxplots

## Think of a boxplot as viewing the data 'from above'. 

---

In [None]:
# One box of ages per passenger class, then overlay the individual ages as faint red dots
bp = titanic.boxplot(column='Age', by='Pclass', grid=False, figsize=(8,8))
for i in (1, 2, 3):
    # ages of the passengers in class i, with missing values removed
    y = titanic.loc[titanic.Pclass == i, 'Age'].dropna()
    # x positions centred on i with a little random "jitter" so the dots don't all overlap
    x = np.random.normal(loc=i, scale=0.04, size=len(y))

    plt.plot(x, y, 'r.', alpha=0.2)

---

# Scatterplots

---

In [None]:
# Scatter plot of age against number of siblings/spouses aboard, with default axis limits
fig = titanic.plot.scatter(x='Age', y='Siblings/Spouses Aboard')

In [None]:
# Same scatter plot with fixed axis limits and a square 5x5 figure
fig = titanic.plot.scatter(
    x='Age',
    y='Siblings/Spouses Aboard',
    xlim=[0,100],
    ylim=[0,10],
    figsize=(5,5),
)

# We can even go so far as to map variables to the size or color of the symbols

In [None]:
# Colour each point by its 'Siblings/Spouses Aboard' value using the 'hot' colormap; s=60 sets the marker size
fig = titanic.plot.scatter(
    x='Age',
    y='Siblings/Spouses Aboard',
    xlim=[0,100],
    ylim=[0,10],
    figsize=(5,5),
    c=titanic['Siblings/Spouses Aboard'],
    s=60,
    cmap='hot',
)

---

# Hexagonal Bin Plot

## This is perfect for when you have a larger number of points to display. It's also useful if your data are too dense to plot each point individually in a scatter plot.

---

In [None]:
# 10,000 rows of random normal values in two columns, 'a' and 'b'
df = pd.DataFrame(np.random.randn(10000, 2), columns=['a', 'b'])
# add the row number to 'b' so that it trends upward from 0 to 9,999
df['b'] = np.arange(10000) + df['b']

In [None]:
df.plot.scatter(x='a', y='b', figsize=(6,4), alpha=0.1)  # alpha=0.1 makes each point nearly transparent, so dense regions look darker

In [None]:
df.plot.hexbin(x='a', y='b', gridsize=40, figsize=(8,4))  # group the points into hexagonal bins (gridsize=40) and colour each bin by its count

----