<a href="https://colab.research.google.com/github/agarwal-peeush/MachineLearning/blob/master/Python/Learn/4_Data_visualization.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Basic plotting, axes labels and titles - matplotlib library


In [0]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

In [0]:
# Plot one 1-D numpy array against another.
# Both arrays hold 100 evenly spaced samples, so the curve is a straight line.
x = np.linspace(start=5, stop=100, num=100)
y = np.linspace(start=10, stop=1000, num=100)

plt.plot(x, y)

In [0]:
# Plain Python lists are accepted too; matplotlib converts them to np arrays internally
x_points = [1, 4, 6, 8]
y_points = [3, 8, 3, 5]
plt.plot(x_points, y_points)

In [0]:
# Draw the line first, then decorate the current axes
plt.plot(x, y)

# Title and axis labels
plt.title("Ohm's law")
plt.xlabel('Current')
plt.ylabel('Voltage')

# Restrict the visible range of each axis.
# plt.xlim / plt.ylim each take the lower and upper limit of one axis
# (plt.axis([xmin, xmax, ymin, ymax]) would set all four limits in one call).
plt.xlim(20, 80)
plt.ylim(200, 800)
plt.show()

In [0]:
# Show the built-in documentation of plt.plot
help(plt.plot)

In [0]:
# Changing the colour and the marker / line style through a format string

x = np.linspace(0, 10, num=20)
y = 2 * x

# 'bo' = blue ('b') circle markers ('o'); no connecting line is drawn
plt.plot(x, y, 'bo')
plt.title("Ohm's law")
plt.xlabel('Current')
plt.ylabel('Voltage')

plt.show()

In [0]:
# Several curves can share the same set of axes

x = np.linspace(0, 5, num=10)
y = np.linspace(3, 6, num=10)

# y, y**2 and y**3 against x, each with its own format string:
# red solid line, blue dash-dot line, green triangle markers
plt.plot(x, y, 'r-')
plt.plot(x, y**2, 'b-.')
plt.plot(x, y**3, 'g^')
plt.show()

## Subplots

In [0]:
x = np.linspace(1, 10, num=100)
y = np.log(x)

# Explicitly open figure number 1
plt.figure(1)

# Layout: a grid of 1 row and 2 columns inside figure 1

# Left panel -- plt.subplot(1, 2, 1) is the long form of plt.subplot(121):
# 1 row, 2 columns, 1st position
plt.subplot(1, 2, 1)
plt.plot(x, y)
plt.title('y=log(x)')

# Right panel -- 2nd position of the same figure (figure 1)
plt.subplot(1, 2, 2)
plt.plot(x, y**2)
plt.title('y=log(x)**2')

plt.show()

In [0]:
# Create a figure having 4 subplots arranged in a 2x2 grid.
# NOTE: this cell does not define `x` itself; it reuses the array built in the
# 1x2 subplot cell, so that cell has to run first.

# optional command, since matplotlib creates a figure by default anyway
plt.figure(1)

# (title, y-values) of each panel, listed in subplot order 1..4
panels = [
    ('Linear', x),
    ('Squared', x**2),
    ('log', np.log(x)),
    ('Cubic', x**3),
]

for position, (panel_title, values) in enumerate(panels, start=1):
    # plt.subplot(2, 2, k) selects the k-th cell of the 2x2 grid
    plt.subplot(2, 2, position)
    plt.title(panel_title)
    plt.plot(x, values)

# Re-space the panels: with the default spacing the titles of the bottom row
# run into the tick labels of the top row
plt.tight_layout()
plt.show()

## Common types of plots to show statistical data

In [0]:
# Load the market_fact table of the global_sales_data dataset.
# NOTE: the path points into Google Drive, so Drive presumably has to be mounted
# at /content/gdrive before this cell runs -- no mount step is visible in this notebook.
market_fact_csv = '/content/gdrive/My Drive/Colab Notebooks/Datasets/global_sales_data/market_fact.csv'
df = pd.read_csv(market_fact_csv)
df.head()

### 1. Boxplot
Displays the spread of the data. It is used to visualise the variance in the data and to see what proportion of the data lies above and below the median value.

In [0]:
# Boxplot: shows how a continuous variable is distributed.
# sym='ro' sets the symbol used for the outlier points to red circles.
order_quantity = df['Order_Quantity']
plt.boxplot(order_quantity, sym='ro')
plt.show()

In [0]:
# The Sales boxplot is hard to read, because Sales varies across a wide range
sales = df['Sales']
plt.boxplot(sales, sym='ro')
plt.show()

# Summary statistics of the same column
print(sales.describe())

In [0]:
# Remedy for the unreadable plot above: put the y-axis on a log scale

# Left panel: linear y-axis, as before
plt.subplot(1, 2, 1)
plt.boxplot(df['Sales'], sym='ro')

# Right panel: same data, logarithmic y-axis
plt.subplot(1, 2, 2)
plt.boxplot(df['Sales'], sym='ro')
plt.yscale('log')
plt.show()

### 2. Histogram
Displays the frequency distribution of a single variable. Used to see what fraction of the data lies around which values.

In [0]:
# Histogram: useful for visualising how a single variable is distributed

sales = df['Sales']
plt.hist(sales)
plt.show()

In [0]:
# Histograms can be made more readable by using a log scale on the y-axis (the counts)

sales = df['Sales']
plt.hist(sales)
plt.yscale('log')
plt.show()

### 3. Scatter plots
Displays the scatter of the data. Used to detect outliers and to reveal naturally occurring clusters in the data.

Visualise 2 variables, one on each axis

In [0]:
# Sales on the x-axis against Profit on the y-axis
sales, profit = df['Sales'], df['Profit']
plt.scatter(sales, profit)
plt.show()

# Visualising Distributions of data - Seaborn library

## 1. Visualising Univariate Distributions

In [0]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

# The commonly used alias for seaborn is sns
import seaborn as sns

# Choose the seaborn look used for the plots below
sns.set_style("whitegrid")

# Sales data for the seaborn examples
market_fact_csv = '/content/gdrive/My Drive/Colab Notebooks/Datasets/global_sales_data/market_fact.csv'
df = pd.read_csv(market_fact_csv)

### Histograms and density plots
These plots give you detailed information on the range of values that your data is spread across.

In [0]:
# Simple density plot of Shipping_Cost, using distplot's default settings.
# NOTE: distplot is deprecated in newer seaborn releases; histplot / displot are
# the documented replacements when this notebook is upgraded.
shipping_cost = df['Shipping_Cost']
sns.distplot(shipping_cost)
plt.show()

In [0]:
# Rug plot: rug=True adds the individual observations as a rug along the axis.
# Only the first 200 values of the column are plotted.
shipping_cost_head = df['Shipping_Cost'][:200]
sns.distplot(shipping_cost_head, rug=True)
plt.show()

In [0]:
# Sales distribution with the histogram split into 50 bins
sales = df['Sales']
sns.distplot(sales, bins=50)
plt.show()

In [0]:
# Density curve only: hist=False switches the histogram bars off
sales = df['Sales']
sns.distplot(sales, hist=False)
plt.show()

In [0]:
# Subplots: the distribution of four columns in a 2x2 grid

# (panel title, dataframe column) listed in subplot order 1..4
panels = [
    ('Sales', 'Sales'),
    ('Profit', 'Profit'),
    ('Order quantity', 'Order_Quantity'),
    ('Shipping Cost', 'Shipping_Cost'),
]

for position, (panel_title, column) in enumerate(panels, start=1):
    # plt.subplot(2, 2, k) selects the k-th cell of the 2x2 grid
    plt.subplot(2, 2, position)
    plt.title(panel_title)
    sns.distplot(df[column])

# Every panel carries a title and an x-axis label; with the default spacing the
# top row's labels overlap the bottom row's titles, so re-space the panels
plt.tight_layout()
plt.show()

In [0]:
# Distribution of the first 1000 Sales values, with a rug of the individual observations
sales_head = df['Sales'][:1000]
sns.distplot(sales_head, rug=True)
plt.show()

### 2. Boxplot

In [0]:
# The same boxplot in both orientations, side by side

# Left panel: horizontal box -- the variable is mapped to the x-axis.
# x is passed by keyword: recent seaborn releases treat the first positional
# argument as `data`, not `x`, so a bare positional Series no longer selects
# the horizontal orientation reliably.
plt.subplot(121)
sns.boxplot(x=df['Order_Quantity'])
plt.title('Order quantity')

# Right panel: vertical box -- the variable is mapped to the y-axis
plt.subplot(122)
sns.boxplot(y=df['Order_Quantity'])
plt.title('Order quantity')

plt.show()

## 2. Visualising Bivariate Distributions

In [0]:
# Joint plot of Sales and Profit.
# x and y are passed by keyword: recent seaborn releases make them keyword-only
# and raise a TypeError when they are given positionally.
sns.jointplot(x=df['Sales'], y=df['Profit'])
# Equivalent to sns.jointplot(x='Sales', y='Profit', data=df)
plt.show()

In [0]:
# Remove points having extreme values: keep rows with Profit < 10000 and Sales < 20000

df_1 = df[(df.Profit < 10000) & (df.Sales < 20000)]

# x, y and data are passed by keyword: recent seaborn releases reject the
# positional form sns.jointplot('Sales', 'Profit', df_1) with a TypeError
sns.jointplot(x='Sales', y='Profit', data=df_1)
plt.show()

In [0]:
# Another variant of jointplot: kind='hex' on a narrow slice of the data
# (Profit between -100 and 100, Sales below 200)
df_2 = df[(df.Profit < 100) & (df.Profit > -100) & (df.Sales < 200)]

# x, y and data are passed by keyword: recent seaborn releases reject them
# as positional arguments with a TypeError
sns.jointplot(x='Sales', y='Profit', data=df_2, kind='hex', color='k')
plt.show()

In [0]:
# Load crypto data: one price CSV per cryptocurrency

def _read_prices(csv_path, suffix):
    """Read one price CSV and append `suffix` to every column name.

    The per-currency suffix keeps the columns of the different files apart,
    so that joins are easy.
    """
    prices = pd.read_csv(csv_path)
    prices.columns = [str(column) + suffix for column in prices.columns]
    return prices


btc = _read_prices("/content/gdrive/My Drive/Colab Notebooks/Datasets/crypto_data/bitcoin_price.csv", '_btc')
ether = _read_prices("/content/gdrive/My Drive/Colab Notebooks/Datasets/crypto_data/ethereum_price.csv", '_et')
ltc = _read_prices("/content/gdrive/My Drive/Colab Notebooks/Datasets/crypto_data/litecoin_price.csv", '_ltc')
monero = _read_prices("/content/gdrive/My Drive/Colab Notebooks/Datasets/crypto_data/monero_price.csv", '_mon')
neo = _read_prices("/content/gdrive/My Drive/Colab Notebooks/Datasets/crypto_data/neo_price.csv", '_neo')
quantum = _read_prices("/content/gdrive/My Drive/Colab Notebooks/Datasets/crypto_data/qtum_price.csv", '_qt')
ripple = _read_prices("/content/gdrive/My Drive/Colab Notebooks/Datasets/crypto_data/ripple_price.csv", '_rip')

btc.head()

In [0]:
# Join all the price tables on their date columns, one table at a time.
# Every join is an inner join against Date_btc, so only dates present in
# every table remain in the final frame.
m1 = btc.merge(ether, how="inner", left_on="Date_btc", right_on="Date_et")
m2 = m1.merge(ltc, how="inner", left_on="Date_btc", right_on="Date_ltc")
m3 = m2.merge(monero, how="inner", left_on="Date_btc", right_on="Date_mon")
m4 = m3.merge(neo, how="inner", left_on="Date_btc", right_on="Date_neo")
m5 = m4.merge(quantum, how="inner", left_on="Date_btc", right_on="Date_qt")
crypto = m5.merge(ripple, how="inner", left_on="Date_btc", right_on="Date_rip")

crypto.head()

In [0]:
# Keep only the closing-price columns for plotting.
# NOTE(review): ripple is part of the merged frame, but no ripple column is
# selected here -- confirm whether leaving it out is intentional.
closing_columns = ["Close_btc", "Close_et", 'Close_ltc', "Close_mon", "Close_neo", "Close_qt"]
curr = crypto[closing_columns]
curr.head()

### Pairwise scatter plots


In [0]:
# Pairwise scatter plots of all the columns in `curr`

pair_grid = sns.pairplot(curr)
plt.show()

### HeatMap

In [0]:
# Correlation between the currencies, computed with DataFrame.corr()
# and displayed rounded to 3 decimal places
cor = curr.corr()
cor.round(3)

In [0]:
# Open a 10x8 figure with a single set of axes for the heatmap
fig, ax = plt.subplots(figsize=(10, 8))

# Heatmap of the correlation matrix: 'YlGnBu' colormap, annot=True turns annotations on
sns.heatmap(cor, annot=True, cmap='YlGnBu', ax=ax)
plt.show()