# Hello World! Welcome to Data Visualization with Python

### A simple plot example for y-axis values

In [1]:
import matplotlib.pyplot as plt
# Only one sequence is passed, so it is used as the y values; matplotlib
# supplies the x values itself (the positions 0, 1, 2, 3).
plt.plot([1,2,3,4])
plt.ylabel('y-axis label')
# xlabel() is the last expression in the cell, so the Text object it returns
# is what the notebook echoes underneath.
plt.xlabel('x-axis label')
# plt.show()

<matplotlib.text.Text at 0x7f80c6957c50>

### How to pass x-axis values ?

In [2]:
plt.grid(True) # draw grid lines on the graph
# Two sequences are passed: the first holds the x values, the second the matching y values.
plt.plot([1, 2, 3, 4], [1, 4, 9, 16]) # y = x^2 equation plotted here
# plt.show()

[<matplotlib.lines.Line2D at 0x7f80c68f1650>]

## Change line color and style ?

In [1]:
# Uncomment to try it. The third argument is a format string: a color letter
# followed by a marker or line style ('r--' = red dashed line).
# plt.plot([1, 2, 3, 4], [1, 4, 9, 16], 'r--') # also try 'bs' (blue squares) and 'g^' (green triangles)
# plt.show()

## Works with numpy module as well

In [3]:
import numpy

# numpy array of values from 1 up to (but not including) 10 in steps of 0.5
x = numpy.arange(1, 10, 0.5)

# x**3 is computed element-wise on the array; 'go' draws green circle markers
plt.plot(x, x**3, 'go') # y = x^3 plot

# plt.show()

[<matplotlib.lines.Line2D at 0x7f80c68f1dd0>]

## Plot multiple lines on the same graph?

In [4]:
# Sample points: 1 up to (but not including) 10 in steps of 0.5
x = numpy.arange(1, 10, 0.5)

# Several curves can share one plot() call as repeated (x, y, format) triples.
# All of them are drawn on the same axes, so they share one scale.
plt.plot(
    x, x**3, 'go',   # y = x^3 as green circles
    x, x**2, 'r--',  # y = x^2 as a red dashed line
    x, x, 'b-',      # y = x as a solid blue line
)

# plt.show()

[<matplotlib.lines.Line2D at 0x7f80c69075d0>,
 <matplotlib.lines.Line2D at 0x7f80c6907650>,
 <matplotlib.lines.Line2D at 0x7f80c6907cd0>]

## Control linewidth of a line ?

In [5]:
# Reuses the array x from the previous cell and plots y = x.
plt.plot(x, x, linewidth=10) # the linewidth keyword argument sets the line thickness (in points)
# plt.show()

[<matplotlib.lines.Line2D at 0x7f80c6907310>]

## Range of values on axes 

In [6]:
# If no range is specified, matplotlib derives the axis limits from the min and max of the data

# create the x-axis values: 1, 2, 3, 4 (the stop value 5 is excluded)
x = numpy.arange(1,5,1) # Caution: mind the spelling - it is arange, not arrange

# plot y = x^2
plt.plot(x, x**2)

# set the axis limits explicitly
plt.axis([0, 6, 0, 20]) # [xmin, xmax, ymin, ymax]

# turn on the grid to make the values easier to read
plt.grid(True) # the order of these calls does not matter as long as they come before .show()

# see what you got?
# plt.show()

## How do I save my graphs?

In [8]:
# Saving a figure to a file.
# matplotlib renders through a backend ("engine"); non-interactive backends
# write files instead of opening a window.

# IMPORTANT: select the backend before importing pyplot from matplotlib.

# Some popular non-interactive backends are:
# AGG - raster images such as PNG (Anti-Grain Geometry renderer)
# PS - PostScript format
# PDF - Portable Document Format (.pdf)

import matplotlib

# NOTE(review): pyplot was already imported in the first cell of this notebook,
# so here use() runs after pyplot is loaded. Depending on the matplotlib version
# the call may warn and be ignored, or switch the backend. Confirm in a fresh kernel.
matplotlib.use('AGG') # must run before pyplot is imported to be sure it takes effect

import matplotlib.pyplot as plt

# let's plot and save our graph

# `numpy` is the module imported in an earlier cell
x = numpy.arange(1,5,0.1) # 1 up to (not including) 5 at intervals of 0.1

y = x**2 # y = x^2

# create a figure
# this is like a blank canvas that can hold multiple graphs
fig = plt.figure()

# add a subplot to the figure (111 = 1 row, 1 column, first subplot)
ax = fig.add_subplot(111)

# draw a scatter plot on the ax subplot
ax.scatter(x, y)

# save the figure we created; the subplot is part of it.
# No file extension is given, so matplotlib falls back to its default
# output format (PNG unless configured otherwise).
fig.savefig('graph')

# close the figure to release its resources; it cannot be drawn on afterwards
# plt.close(fig)

## Complex mathematical models 

In [9]:
"""
matplotlib can be passed input in a variety of ways
"""

# numpy is bound to the short name `np` here; earlier cells use the name `numpy`.
import numpy as np
import matplotlib
# NOTE(review): pyplot is already loaded by earlier cells, so whether this
# backend switch takes effect depends on the matplotlib version - confirm.
matplotlib.use('PDF') # let us save a pdf this time
import matplotlib.pyplot as plt

# some complex value transformation
# you can define your own as well !
def f(t):
    """Return the damped cosine exp(-t) * cos(2*pi*t).

    The numpy functions work element-wise, so ``t`` may be a scalar or a
    numpy array; the result has the same shape as ``t``.
    """
    decay = np.exp(-t)
    oscillation = np.cos(2 * np.pi * t)
    return decay * oscillation

# create two x-axis arrays over 0 to 5 (5 excluded): a coarse one and a fine one
x1 = np.arange(0.0, 5.0, 0.1)
x2 = np.arange(0.0, 5.0, 0.02)

# specify dimensions to avoid a small figure
fig = plt.figure(figsize=(12, 8))

# create a subplot
# a1 = fig.add_subplot(211)

# two different domains on the same subplot
# note that the y values come from calling f() on the whole array:
# f uses numpy functions, so one call evaluates every x value element-wise

# a1.plot(x1, f(x1), 'ro', x2, f(x2), 'g--')

# create one more subplot
# a2 = fig.add_subplot(212) # we examine these numeral arguments in a bit

# plot a straight line (y = 5x) on the second subplot
# a2.plot(x2, 5 * x2)
# a2.set_ylabel('cost')
# a2.set_xlabel('quantity')

# save the figure
# NOTE(review): savefig normally infers the format from the '.pdf' extension,
# so format='pdf' is likely redundant here - confirm for the installed matplotlib
# fig.savefig('multigraph.pdf', format='pdf')


# close the figure
# plt.close(fig)

## 1. What is 212 in fig.add_subplot(212) ?

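The three digits describe how the figure is divided into a grid of subplots: the number of rows, the number of columns, and the position of this subplot within the grid.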

It is equivalent to `fig.add_subplot(2, 1, 2)`

Read it as **On a figure with 2 rows and 1 column, add the subplot in the 2nd cell of the grid**

# Scatter Plot

In [10]:
import pandas
import matplotlib.pyplot as plt

# Read the csv file using pandas.
# The cells below index into `cars`, so this line must actually run:
# with it commented out, a fresh "Restart & Run All" fails with a NameError.
cars = pandas.read_csv('cars.csv')

# Peek at the first 5 rows
cars.head()

In [147]:
# Pull both mileage columns out of the DataFrame; `.values` converts each
# pandas Series to a numpy array.
city_mpg, highway_mpg = (
    cars[column].values for column in ('city-mpg', 'highway-mpg')
)

In [12]:
# Scatter plot: one red point per car, city mileage on x and highway mileage on y
# plt.scatter(city_mpg, highway_mpg, color='red')
# plt.title('Highway MPG v/s City MPG')
# plt.ylabel('Highway MPG')
# plt.xlabel('City MPG')
# plt.show()

# Bar Graph

In [171]:
# Let's count how many cars run on diesel and gas respectively.
# value_counts() returns a Series indexed by fuel type, most frequent first.
fuel_counts = cars['fuel-type'].value_counts()
# print() with a single argument behaves the same on Python 2 and Python 3
print(fuel_counts)

# the different fuel types are the index labels of that Series
fuel = fuel_counts.index.tolist()
print(fuel)

# the matching counts, in the same order as `fuel`, as plain Python ints
counts = fuel_counts.tolist()
print(counts)

gas       185
diesel     20
Name: fuel-type, dtype: int64
['gas', 'diesel']
[185, 20]


In [17]:
# make the bar graph

# bar() is given numeric positions (0, 1, ...) for the bars, one per fuel type;
# the fuel names are attached to those positions as tick labels below
# x_pos = np.arange(len(fuel))

# make a bar graph of counts v/s x_pos
# plt.bar(x_pos, counts, align='center', alpha=0.5)

# label each bar position with its fuel type
# plt.xticks(x_pos, fuel)

# label on y-axis
# plt.ylabel('Counts')

# let's see our creation
# plt.show()

# Histogram

In [24]:
# pull the price column out as a plain Python list
# prices = cars['price'].tolist()

# slight data cleaning is required for the price column: unknown prices appear as the string '?'
# prices = [int(price) for price in prices if price != '?'] # convert to integers, ignore unknown prices

In [21]:
# plot histogram
# bin edges: 0 to 50000 in steps of 10000 (the stop value 50001 keeps 50000 included)
# price_bins = range(0, 50001, 10000)
# print(price_bins)

# hist() itself counts how many prices fall into each bin
# plt.hist(prices, bins=price_bins, color='orange')

# plt.show()