# Matplotlib Intro

<font color='steelblue'>

<font size = 5>
    
**Examples of using the Matplotlib library**<br><br>
</font>
<font size = 4>
    <b>The following examples are included in this notebook:</b>
    <ol>
        <li>Figures and Axes</li>
        <li>Line Plots</li>
        <li>Scatter Plots</li>
        <li>Histograms</li>
        <li>Pie Charts</li>
        <li>Pair Plots</li>
        <li>Plotting using seaborn library</li>
        <li>Time Series Data Plotting</li>
    </ol>    
</font>
</font>

# Check PySpark Set Up

In [None]:
# findspark.init() has to run before the `pyspark` import in the next cell.
# NOTE(review): presumably it locates the local Spark installation and makes
# it importable — confirm for your environment.
import findspark
findspark.init()

In [None]:
from pyspark.sql import SparkSession
# getOrCreate() returns the already-running session if there is one,
# otherwise it builds a new session with the app name "Matplotlib Demo".
spark = SparkSession.builder.appName("Matplotlib Demo").getOrCreate()
sc = spark.sparkContext
# Restrict Spark's own logging to errors so the notebook output stays readable.
sc.setLogLevel("Error")

In [None]:
# Shell escape: print the version of the `python` executable found on PATH
!python --version

In [None]:
# Shell escape: print the version reported by the `pyspark` launcher on PATH
!pyspark --version

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

# Embed the plots in the notebook. Disabled here; if it is needed, the line
# magic is written exactly as below (there is no trailing '%').
#%matplotlib inline

# Use either of these lines for grids or no grids
# NOTE(review): newer Matplotlib releases appear to have renamed these styles
# to 'seaborn-v0_8-whitegrid' / 'seaborn-v0_8-white' — confirm against the
# installed version before uncommenting.
#plt.style.use('seaborn-whitegrid')    # grids in the plots
#plt.style.use('seaborn-white')       # no grids in plots

# Suppress warnings.
# NOTE(review): this filter hides *every* warning for the rest of the session,
# including deprecation notices from pandas / seaborn / matplotlib.
import warnings
warnings.filterwarnings('ignore')

## Figures and Axes

In [None]:
# Figure and axes
# Two-step form: create an empty Figure, then add an Axes to the current figure.
fig = plt.figure()
ax = plt.axes()

In [None]:
# Another way
# plt.subplots() creates the Figure and a single Axes in one call.
fig, ax = plt.subplots()  

## Line Plots

In [None]:
# 100 evenly spaced sample points covering the closed interval [0, 10]
x = np.linspace(start=0, stop=10, num=100)
x

In [None]:
# Cosine and sine of the same x samples, drawn as two lines on one shared Axes.
# The cell ends on a bare plot call, so its return value is echoed as output.
y1 = np.cos(x)
y2 = np.sin(x)
fig, ax = plt.subplots()
ax.plot(x, y1)
ax.plot(x, y2)

In [None]:
# Same two curves as before, but the cell ends with plt.show(), so only the
# figure is displayed (no echoed return value from the last plot call).
fig, ax = plt.subplots()
y1 = np.cos(x)
y2 = np.sin(x)
for curve in (y1, y2):
    ax.plot(x, curve)
plt.show()

In [None]:
# Stack the graphs vertically: two rows, one Axes per row.
# The trailing semicolon on the last statement suppresses the echoed return
# value of that plot call.
fig, axes = plt.subplots(nrows=2)
top_ax, bottom_ax = axes
top_ax.plot(x, y1)
bottom_ax.plot(x, y2);

In [None]:
# Place the graphs side by side: one row, two columns, on a wide 22x4 figure
fig, axes = plt.subplots(nrows=1, ncols=2, figsize=(22, 4))
left_ax, right_ax = axes

left_ax.plot(x, y1)
right_ax.plot(x, y2, color='orange')
plt.show()

## Scatter Plots

In [None]:
# Two independent samples of 30 uniform values in [0, 1) for the scatter demo.
# A seeded generator keeps the figure identical on every Restart & Run All.
# Note: this rebinds `x`, which held the linspace samples in the Line Plots
# section above.
rng = np.random.default_rng(42)
x = rng.random(30)
y = rng.random(30)
x, y

In [None]:
# s is the marker size (Matplotlib measures it in points**2)
# alpha is the opacity of the marker: 0 for fully transparent, 1 for opaque
plt.scatter(x, y, marker='o', color = 'red', s = 200, alpha=.4)
plt.title("scatter plot")
plt.show()

<font size = 4>
    
Try changing the `alpha` and the `s` values in the code above and re-run the cell
    
</font>

In [None]:
# Load the iris measurements; column names are supplied explicitly
column_names = ["sepal_l", "sepal_w", "petal_l", "petal_w", "class"]
iris_data = pd.read_csv("../datasets/iris.csv", names=column_names)

# Encode the categorical species label in 'class' as an integer code
class_codes = {"Iris-setosa": 0, 'Iris-versicolor': 1, 'Iris-virginica': 2}
iris_data['class'] = iris_data['class'].map(class_codes)
iris_data.head()

In [None]:
# c supplies one value per point (the class code); cmap='magma' is the
# colormap that turns those codes into colours
sepal_length = iris_data['sepal_l']
sepal_width = iris_data['sepal_w']

plt.figure(figsize=(6, 5))
plt.scatter(sepal_length, sepal_width,
            c=iris_data['class'], cmap='magma', s=200, alpha=0.8)

plt.title("Sepal Length v/s Sepal Width")
plt.xlabel('sepal_l')
plt.ylabel("sepal_w")
plt.show()

In [None]:
# The same scatter drawn through the Axes object instead of the pyplot functions
fig, ax = plt.subplots(figsize=(10, 8))
marker_style = dict(s=200, alpha=.7, c=iris_data["class"], cmap='cividis')
ax.scatter(iris_data['sepal_l'], iris_data['sepal_w'], **marker_style)

# ax.set() assigns several Axes properties in a single call
ax.set(xlabel='sepal_l', ylabel="sepal_w", title="Iris Data")

plt.show()

In [None]:
# Define multiple scatter plots and provide a legend: one scatter call per
# species so each gets its own colour and legend entry.
#
# Rows are selected by class code rather than by row label. `.loc` label
# slices include BOTH endpoints, so slices such as .loc[:49] and .loc[49:99]
# share row 49 and would draw that flower under two species.
# This relies on 'class' still holding the integer codes 0/1/2 assigned when
# the data was loaded.
fig, ax = plt.subplots(figsize = (9, 8))

species_styles = [
    (0, 'r', 'Iris-setosa'),
    (1, 'blue', 'Iris-versicolor'),
    (2, 'green', 'Iris-virginica'),
]
for class_code, colour, species in species_styles:
    subset = iris_data[iris_data['class'] == class_code]
    ax.scatter(subset['sepal_l'], subset['sepal_w'],
               s = 200, c = colour, alpha=.5, label=species)

# more convenient way of setting properties
ax.set(title="Iris Data", xlabel='sepal_l', ylabel="sepal_w")
ax.legend()

plt.show()

## Histogram

In [None]:
# Distribution of sepal length with the default binning.
# (Alternative fill colour to try: 'xkcd:lilac')
fig, ax = plt.subplots()
ax.hist(iris_data['sepal_l'], edgecolor='black', color='lightgreen')
ax.set(title="Histogram of Sepal Length",
       xlabel="Length (cms)",
       ylabel="Count")
plt.show()

In [None]:
# Overlay one semi-transparent histogram per measurement column; each is
# labelled with its column name so the legend is built from the labels
fig, ax = plt.subplots(figsize=(12, 8))
for column in ['sepal_l', 'sepal_w', 'petal_l', 'petal_w']:
    ax.hist(iris_data[column], alpha=.5, edgecolor='black', label=column)
ax.legend()
ax.set_xlabel("Length (cms)")
ax.set_ylabel("Count")
plt.show()

In [None]:
# Show the column labels of the iris frame (the names passed to read_csv)
iris_data.columns

## Pair Plots

In [None]:
# Pair plotting: a grid of scatter plots for every pair of measurement columns
features = ["sepal_l", "sepal_w", "petal_l", "petal_w"]
feature_frame = iris_data[features]
pair_plot = pd.plotting.scatter_matrix(feature_frame, figsize=(12, 8))

## Pie Chart

In [None]:
# Pie chart drawn on the Axes that was just created.
# explode pulls the third wedge ('China') away from the centre.
labels = ['India', 'USA', 'China', 'UK', 'Japan']
values = [12, 55, 4, 32, 14]
colors = ['r', 'g', 'b', 'c', 'm']
explode = [0, 0, 0.2, 0, 0]

fig, ax = plt.subplots(figsize=(12, 8))
ax.pie(values, explode=explode, labels=labels, colors=colors)
ax.set_title('Students From');

## [XKCD Style](https://matplotlib.org/stable/api/_as_gen/matplotlib.pyplot.xkcd.html)

In [None]:
# Hand-drawn "XKCD" style figure.
# plt.xkcd() is used as a context manager so the sketch settings apply only
# to what is created and shown inside the block; the global rcParams are
# restored on exit instead of staying changed for later cells.
with plt.xkcd():
    fig = plt.figure(figsize = (10, 6))
    ax = fig.add_subplot(1, 1, 1)

    # Hide the top/right frame lines and all ticks
    ax.spines['right'].set_color('none')
    ax.spines['top'].set_color('none')
    ax.set_xticks([])
    ax.set_yticks([])
    ax.set_ylim([-30, 10])

    # Flat at 1 for the first 70 samples, then dropping by 0, 1, ..., 29
    data = np.ones(100)
    data[70:] -= np.arange(30)
    print(data)

    # Arrow pointing at the sample where the curve starts to fall
    ax.annotate(
        'LEARNT MACHINE LEARNING WITH SPARK AND SCI-KIT LEARN',
        xy = (70,1), arrowprops=dict(arrowstyle='->', color='black'), xytext = (15, -10))

    ax.plot(data)
    ax.set_xlabel('Time')
    ax.set_ylabel('My Machine Learning')

    # Render while the XKCD settings are still active
    plt.show()

In [None]:
# Remove the XKCD Mode
# NOTE(review): rcdefaults() resets *all* rcParams, not only the ones changed
# by plt.xkcd() — any style customisation made earlier is presumably lost too.
plt.rcdefaults()

## Seaborn library

In [None]:
# seaborn offers its own pair plot; `hue` names the column whose values pick
# the colour of each point, which is useful for a multi-class target
import seaborn as sns

pair_plot = sns.pairplot(data=iris_data, hue='class')

In [None]:
# Transform integers back to categorical, set the style to dark and change the diagonal plots
sns.set(style = "dark")

# Translate the integer class codes back to species names.
# .replace() leaves values that are not keys untouched, so re-running this
# cell is a no-op. With .map() a second run would find no matching keys and
# overwrite every label with NaN.
class_names = {0: "Iris-setosa", 1: 'Iris-versicolor', 2: 'Iris-virginica'}
iris_data['class'] = iris_data['class'].replace(class_names)

# diag_kind='hist' draws histograms on the diagonal of the grid
pair_plot = sns.pairplot(iris_data, hue='class', diag_kind = 'hist')

In [None]:
# Same pair plot, with a different marker shape for each of the three classes
sns.set(style = "dark")
pair_plot = sns.pairplot(iris_data, hue='class', markers = ['.', 'o', '^'])

In [None]:
# box and whisker plot
# The whole frame is passed as `data`; the trailing semicolon hides the
# echoed return value of the call.
plt.figure(figsize = (8, 4))
sns.boxplot(data = iris_data, width = 0.6, fliersize = 5);

In [None]:
# Correlation
# Pairwise correlation between the numeric measurement columns.
# By this point 'class' holds species names (strings). Recent pandas versions
# raise on non-numeric columns in DataFrame.corr() instead of silently
# dropping them, so the numeric columns are selected explicitly.
corr = iris_data.select_dtypes(include='number').corr()
corr

In [None]:
# Correlation matrix as a heatmap on an explicitly created Axes;
# annot=True writes each value into its cell
fig, ax = plt.subplots(figsize=(8, 6))
sns.heatmap(corr, annot=True, ax=ax)
plt.show()

<font color='gray'>
<font size = 4>
    <b>Heatmap and Color Palette</b>
    <br>
    
[Color Palette](https://seaborn.pydata.org/generated/seaborn.diverging_palette.html)

[Seaborn Heatmap](https://seaborn.pydata.org/generated/seaborn.heatmap.html)
</font>
</font>

In [None]:
# Preview a diverging palette as a row of 9 colour swatches (n=9)
sns.palplot(sns.diverging_palette(240, 10, n=9))

In [None]:
# Larger fonts for the annotated heatmap
sns.set(font_scale=1.4)
f, ax = plt.subplots(figsize=(8,6))
cmap = sns.diverging_palette(240, 10, as_cmap=True)

# Draw the heatmap with a diverging colormap centred on 0.
# The colour scale spans the full correlation range [-1, 1]. A cap such as
# vmax=.3 would paint every correlation above 0.3 in the same saturated
# colour and hide the differences between strongly correlated pairs.
sns.heatmap(corr, cmap=cmap, vmin=-1, vmax=1, center=0,
            square=True, linewidths=.5, cbar_kws={"shrink": .5}, annot = True,
            ax=ax)
plt.show()

<font color='gray'>
<font size = 5>
    
**Time Series Data Plotting**
    
</font>
</font>
<br>

<font size = 3>

- Plot the stocks' `opening data`
- Plot the `2 days mean` of the open values for one company
- Plot the `bi-weekly mean` of the open values for one company
- Use a `certain days window` (rolling mean) to plot the open values
</font>


In [None]:
# Load the stock prices; 'Date' is parsed as datetimes and used as the index,
# which the resample() calls further down depend on.
stocks = pd.read_csv('../datasets/stocks.csv', index_col = 'Date', parse_dates = ['Date'])

In [None]:
# Peek at the first rows of the stocks frame
stocks.head()

In [None]:
# Distinct values in 'Company'; these are the symbols filtered on below
stocks['Company'].unique()

## Dataframes for each company

In [None]:
# Rows whose 'Company' is AMZN
amzn = stocks[stocks["Company"] == "AMZN"]

In [None]:
# Rows whose 'Company' is AAPL
apple = stocks[stocks["Company"] == "AAPL"]

In [None]:
# Rows whose 'Company' is FB
facebook = stocks[stocks["Company"] == "FB"]

In [None]:
# Rows whose 'Company' is SBUX
sbucks = stocks[stocks["Company"] == "SBUX"]

In [None]:
# Rows whose 'Company' is TSLA
tesla = stocks[stocks["Company"] == "TSLA"]

In [None]:
# Plot the 'Open' series of every company on one shared Axes.
# The legend labels are listed in the same order as the frames are drawn.
fig, ax = plt.subplots(figsize=[10, 8])
for company_frame in (apple, amzn, facebook, sbucks, tesla):
    company_frame["Open"].plot(ax=ax)
ax.legend(["Apple", "Amazon", "Facebook", "Star Bucks", "Tesla"])
plt.show()

In [None]:
# Resample Amazon's open price into 2-day bins and plot the mean of each bin
amzn_open_2d = amzn["Open"].resample("2D")
amzn_open_2d_mean = amzn_open_2d.aggregate([np.mean])
amzn_open_2d_mean.plot()
plt.title("Amazon Open 2 days mean ")
plt.ylabel("Amazon Open")
plt.show()

In [None]:
# Sample bi-weekly data: mean open price per two-week bin.
# "2W" is the pandas offset alias for a two-week frequency. The alias "B"
# means *business day*, which would simply reproduce the daily values rather
# than the bi-weekly mean promised by the title.
amzn["Open"].resample("2W").agg(["mean"]).plot()
plt.title("Amazon bi-weekly mean")
plt.ylabel("Amazon Open")
plt.show()

In [None]:
# create a pandas series object for amazon open
# Boolean row mask on 'Company' plus a single column label -> a Series of
# 'Open' values that keeps the frame's index.
amznOpen = stocks.loc[stocks["Company"] == 'AMZN', "Open"]

In [None]:
# First two entries of the Amazon open series
amznOpen.head(2)

In [None]:
# Window based plotting: rolling mean over 25 consecutive observations,
# drawn as a solid green line
rolling_mean = amznOpen.rolling(window=25).mean()
rolling_mean.plot(style="-g")
plt.title("Amazon Open Mean 25 day window")
plt.ylabel("Amazon Open")
plt.show()

In [None]:
# Window based plotting, this time written to disk: the same 25-observation
# rolling mean is saved as AMZN-25.png in the working directory
rolling_mean = amznOpen.rolling(window=25).mean()
rolling_mean.plot(style="-g")
plt.title("Amazon Open Mean 25 day window")
plt.ylabel("Amazon Open")
plt.savefig("AMZN-25.png")