### <font color="brown">Visualization - continued</font>

In [None]:
from matplotlib import pyplot as plt

---

#### <font color="brown">Plot Grid</font>

**Creating a grid of subplots**

https://python-course.eu/numerical-programming/creating-subplots-in-matplotlib.php

In [None]:
fig, axes = plt.subplots(nrows=1, ncols=2)  # one row holding two side-by-side axes

**Change the figure size to widen it, and introduce padding between the subplots**

In [None]:
# an 8x4 inch canvas; tight_layout(pad=...) adds room around and between the subplots
fig, axes = plt.subplots(nrows=1, ncols=2, figsize=(8, 4))
fig.tight_layout(pad=3.0)

In [None]:
# same two-panel layout, now with explicit y ticks (0, 10, ..., 50) on the first subplot
fig, axes = plt.subplots(nrows=1, ncols=2, figsize=(8, 4))
fig.tight_layout(pad=3.0)
# on an Axes object the method is set_yticks(); there is no Axes.yticks()
axes[0].set_yticks(list(range(0, 51, 10)))
plt.show()

#### Do subplots, sharing x and y axes

In [None]:
# Apple and Facebook share prices side by side, sharing both the x and the y axis
fig, axes = plt.subplots(nrows=1, ncols=2, figsize=(10, 4), sharex=True, sharey=True)
fig.tight_layout(pad=3.0)

years = [2015, 2016, 2017, 2018, 2019]
apple_share_price = [105, 116, 169, 148, 297]
facebook_share_price = [104, 115, 176, 134, 208]

# one (axes, title, prices) entry per subplot, drawn left to right
panels = [
    (axes[0], 'Apple Stock', apple_share_price),
    (axes[1], 'Facebook Stock', facebook_share_price),
]
for ax, title, prices in panels:
    ax.plot(years, prices)
    ax.set_title(title)

# tick labels can differ from the tick values; the x axis is shared,
# so ticks and labels are only set on the first subplot
axes[0].set_xticks(years)
axes[0].set_xticklabels(
    ['Dec\n2015', 'Dec\n2016', 'Dec\n2017', 'Dec\n2018', 'Dec\n2019'])

plt.show()

In [None]:
# do plots for Apple, Facebook and Google on a 2x2 grid
# share the y-axes and x-axes
fig, axes = plt.subplots(nrows=2, ncols=2, figsize=(10, 8), sharex=True, sharey=True)
fig.tight_layout(h_pad=5.0, w_pad=3.0)

years = [2015, 2016, 2017, 2018, 2019]
apple_share_price = [105, 116, 169, 148, 297]
facebook_share_price = [104, 115, 176, 134, 208]
google_share_price = [766, 830, 1053, 1046, 1354]

# (grid position, title, prices) for each of the three subplots in use
panels = [
    ((0, 0), 'Apple Stock', apple_share_price),
    ((0, 1), 'Facebook Stock', facebook_share_price),
    ((1, 0), 'Google Stock', google_share_price),
]
for (row, col), title, prices in panels:
    axes[row][col].plot(years, prices)
    axes[row][col].set_title(title)

# last subplot unused, turn it off
axes[1][1].set_axis_off()

# x ticks and labels are set on the first subplot only, relying on the shared x axis
axes[0][0].set_xticks(years)
axes[0][0].set_xticklabels(
    ['Dec\n2015', 'Dec\n2016', 'Dec\n2017', 'Dec\n2018', 'Dec\n2019'])

plt.show()

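**In the above, `sharex=True` makes matplotlib show x tick labels only on the bottom row of subplots. Since the bottom-right subplot [1][1] is turned off, no tick labels appear under the [0][1] subplot above it. So below we drop `sharex` and explicitly set the ticks and labels on every subplot, including [0][1].**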

In [None]:
# do plots for Apple, Facebook and Google on a 2x2 grid
# share only the y-axes; the x ticks are set on every subplot instead
fig, axes = plt.subplots(nrows=2, ncols=2, figsize=(10, 8), sharey=True)
fig.tight_layout(h_pad=5.0, w_pad=3.0)

years = [2015, 2016, 2017, 2018, 2019]
apple_share_price = [105, 116, 169, 148, 297]
facebook_share_price = [104, 115, 176, 134, 208]
google_share_price = [766, 830, 1053, 1046, 1354]

# (grid position, title, prices) for each of the three subplots in use
panels = [
    ((0, 0), 'Apple Stock', apple_share_price),
    ((0, 1), 'Facebook Stock', facebook_share_price),
    ((1, 0), 'Google Stock', google_share_price),
]
for (row, col), title, prices in panels:
    axes[row][col].plot(years, prices)
    axes[row][col].set_title(title)

# last subplot unused, turn it off
axes[1][1].set_axis_off()

# x ticks and labels for all four axes (axes.flat walks the grid row by row)
for ax in axes.flat:
    ax.set_xticks(years)
    ax.set_xticklabels(
        ['Dec\n2015', 'Dec\n2016', 'Dec\n2017', 'Dec\n2018', 'Dec\n2019'])

plt.show()

---

---

### <font color="brown">Data Curation and Exploration</font>

In [None]:
import pandas as pd
import numpy as np
from pandas import DataFrame
from pandas import Series

#### <font color="brown">Titanic Dataset from Kaggle</font>
https://www.kaggle.com/c/titanic

**On the Kaggle page above, if you look at the Data tab, you will see files named train.csv (training set), and test.csv (test set). The training set has 891 instances, and the test set has 418 instances. The training set was downloaded and renamed as titanic_train.csv**

In [None]:
titanic_file = 'titanic_train.csv'
titanic = pd.read_csv(titanic_file)

In [None]:
titanic

**An explanation for the columns is given in the Data tab, under Data Dictionary**

In [None]:
# column data information
titanic.info()

##### **Cabin (only 204/891) and Age (only 714/891) have lots of missing values, and Embarked has 2 missing values**

---

##### <font color="brown">Missing values in numeric columns</font>

In [None]:
titanic.describe()

##### **AGE: mean is about 29, median (50% cutoff) is 28. Let's replace missing values with mean or median, since they are about the same.**

In [None]:
titanic[titanic['Age'].isnull()]

In [None]:
# fill the missing ages with the column median (computed before any value is filled in)
median_age = titanic['Age'].median()
titanic['Age'] = titanic['Age'].fillna(median_age)

In [None]:
# review info: the Age column should now have no missing values
titanic.info()

In [None]:
titanic.loc[[5,17,19,26,28]]  # verify: spot-check a few rows (presumably ones whose Age was missing -- confirm against the null listing)

---

##### <font color="brown">Missing values in non-numeric (categorical) columns</font>

##### **Embarkation ports**

In [None]:
titanic[titanic['Embarked'].isnull()]

In [None]:
# let's see what is the distribution of Embarked values
embarked_vc = titanic['Embarked'].value_counts()
print(embarked_vc)

In [None]:
# bar chart of passenger counts per embarkation port
fig, ax = plt.subplots(figsize=(3, 4))
positions = range(len(embarked_vc))
ax.bar(positions, embarked_vc.values, width=0.6)
# one tick per bar, labelled with the port code from the value_counts index
ax.set_xticks(positions)
ax.set_xticklabels(embarked_vc.index)
ax.set_title('Embarkation', fontsize=14)
ax.set_xlabel('Port', fontsize=12)
ax.set_ylabel('Number of Passengers', fontsize=12)  # label previously read 'Number of Passenger'
plt.show()

##### **<font color="brown">Can also plot directly off Pandas</font>**

In [None]:
# first make a dataframe just for the embarkation ports data
edf = pd.DataFrame(embarked_vc)
print(edf)

In [None]:
# we need the S,C,Q index values to be in a separate column.
# The frame is built straight from embarked_vc so the cell gives the same result however
# often it is run: calling reset_index() on edf a second time would add another index
# column and make the two-name column assignment fail.
edf = embarked_vc.rename_axis('Port').reset_index(name='Passengers')
print(edf)

In [None]:
# rot is the rotation of the x-tick labels in degrees; 0 keeps them exactly horizontal
# (the previous rot=1 tilted them by one degree). See what happens when you take it out.
axis = edf.plot(x='Port', y='Passengers', kind='bar', rot=0, title='Embarkation', figsize=(4,4))

In [None]:
# remove legend, label the y axis
# legend=False stops pandas from creating a legend at all; axis.legend('') instead
# replaced it with an empty legend object and echoed its repr as the cell output
axis = edf.plot(x='Port', y='Passengers', kind='bar', rot=0, legend=False, figsize=(4,4))
axis.set_title('Embarkation', fontsize=14)
axis.set_ylabel('Passengers');

##### **An often used option for replacing null categorical values is to use the most frequent value**

In [None]:
# replace the missing Embarked values with the most frequent port ('S' for this data);
# mode() is computed on the non-null values, so it is derived rather than hard-coded
most_common_port = titanic['Embarked'].mode().iloc[0]
titanic['Embarked'] = titanic['Embarked'].fillna(most_common_port)
titanic.loc[[61, 829]]

In [None]:
# verify that count for S has been updated, from 644 to 646
titanic['Embarked'].value_counts()

---

##### <font color="brown">Plot the age distribution</font>

In [None]:
titanic.describe()

In [None]:
titanic.hist(column='Age')

In [None]:
titanic.hist(column='Age',grid=False,color='#76c9ef',edgecolor='#056590')

---

#### <font color="brown">Visualizing the IRIS Dataset from UCI</font>
https://archive.ics.uci.edu/ml/datasets/Iris

**Load data into DataFrame from URL**

In [None]:
# fetch over https rather than plain http, so the download is not sent in the clear
IRIS_URL = 'https://archive.ics.uci.edu/ml/machine-learning-databases/iris/iris.data'
# names= supplies the column headers for the read
columns = ['sepal_length','sepal_width','petal_length','petal_width','iris-type']
iris = pd.read_csv(IRIS_URL,names=columns)

In [None]:
iris

In [None]:
iris.describe()

In [None]:
iris['iris-type'].value_counts()

**Plot sepal width vs sepal length for each of the iris types<br>
Extract subset DataFrames for each of the iris types, and plot directly off DataFrame**

In [None]:
# one scatter figure per iris type, each drawn in its own colour
iris_types = iris['iris-type'].unique()
colors = ['blue','red','green']
for color, iris_type in zip(colors, iris_types):
    is_type = iris['iris-type'] == iris_type
    iris[is_type].plot(x='sepal_width', y='sepal_length', kind='scatter', color=color)

**Do all of them in the same plot, for easy comparison**

In [None]:
# overlay all iris types on a single 5x5 inch axes for easy comparison
fig, ax = plt.subplots(figsize=(5, 5))
for color, iris_type in zip(colors, iris_types):
    subset = iris.loc[iris['iris-type'] == iris_type]
    # the legend label drops the first five characters of the type name
    # (presumably the 'Iris-' prefix -- confirm against the data)
    subset.plot(x='sepal_width', y='sepal_length', kind='scatter',
                color=color, label=iris_type[5:], ax=ax)
# optional: ax.legend(loc='center right') repositions the legend


**Repeat for sepal length vs petal length and petal width<br>
Share the y axis since they are all sepal length**

In [None]:
# sepal length against each of the other three measurements, one subplot per measurement
fig, axes = plt.subplots(nrows=1, ncols=3, sharey=True, figsize=(15, 5))
x_features = ['sepal_width', 'petal_length', 'petal_width']
for panel, x_feature in zip(axes, x_features):
    for color, iris_type in zip(colors, iris_types):
        subset = iris.loc[iris['iris-type'] == iris_type]
        subset.plot(x=x_feature, y='sepal_length', kind='scatter',
                    color=color, label=iris_type[5:], ax=panel)

# keep the legend on the middle subplot only
for panel in (axes[0], axes[2]):
    panel.get_legend().remove()

---

---

#### <font color="brown">Exploring the California Housing Dataset</font>

#### Presented in Aurelien Geron's book "Hands-on Machine Learning with Scikit-Learn, Keras, and TensorFlow"
https://github.com/ageron/handson-ml2/tree/master/datasets/housing

Also see:<br>
https://developers.google.com/machine-learning/crash-course/california-housing-data-description

**Get the housing data<br>**
URL: https://raw.githubusercontent.com/ageron/handson-ml2/master/datasets/housing/housing.tgz

In [None]:
import tarfile
import urllib.request  # a bare 'import urllib' does not guarantee the request submodule is loaded

housing_url = "https://raw.githubusercontent.com/ageron/handson-ml2/master/datasets/housing/housing.tgz"
# download the archive into the working directory
urllib.request.urlretrieve(housing_url, 'housing.tgz')
# the with-block closes the archive even if extraction raises
with tarfile.open('housing.tgz') as housing_tgz:
    # NOTE(review): extractall() trusts the member paths inside the archive; on Python
    # versions that support it, consider passing filter='data' for downloaded files
    housing_tgz.extractall()

**Load the extracted csv file into a DataFrame**

In [None]:
housing = pd.read_csv('housing.csv')

##### **Big Picture**

In [None]:
housing.head(10)

**See https://developers.google.com/machine-learning/crash-course/california-housing-data-description<br>
for a description of each of these columns. In particular, median income is in tens of thousands of dollars.**

In [None]:
housing.info()

**The only column that has null values is total_bedrooms**

---

##### **Ocean Proximity**

In [None]:
housing['ocean_proximity'].value_counts()

In [None]:
ocp = DataFrame(housing['ocean_proximity'].value_counts())
ocp

In [None]:
# Build the two-column table straight from the value counts so the cell can be re-run
# safely: repeating ocp.reset_index() would add a further index column each time and
# the two-name column assignment would then fail.
ocp = (housing['ocean_proximity'].value_counts()
       .rename_axis('Ocean Proximity')
       .reset_index(name='Blocks'))
ocp

In [None]:
# bar chart of the number of blocks per ocean-proximity category
# legend=False suppresses the legend at creation (axis.legend('') only swapped in an
# empty legend object); rot=0 keeps the category labels exactly horizontal
axis = ocp.plot(x='Ocean Proximity', y='Blocks', kind='bar', rot=0, legend=False, figsize=(6,4))
axis.set_ylabel('Blocks', fontsize=14)
axis.set_xlabel('')
axis.set_title('Ocean Proximity', fontsize=14);

---

##### **Histograms for numerical attributes**

In [None]:
housing.describe()

In [None]:
housing.hist(bins=50, figsize=(20,15))

---

##### **Income Categories**
Make a new column for income categories by partitioning median income into 5 classes: 1,2,3,4,5 for incomes in the ranges (0, 1.5], (1.5, 3.0], (3.0, 4.5], (4.5, 6.0], and above 6.0 (each range includes its upper bound). Use the **pandas.cut** function.

In [None]:
# pd.cut bins are right-closed by default, so the classes are
# (0, 1.5], (1.5, 3.0], (3.0, 4.5], (4.5, 6.0], (6.0, inf], labelled 1 to 5 in that order
housing['income_cat'] = pd.cut(housing['median_income'],
                               bins=[0., 1.5, 3.0, 4.5, 6.0, np.inf],
                               labels=[1,2,3,4,5])

In [None]:
housing.head(5)

In [None]:
housing.info()

In [None]:
ic_vals = housing['income_cat'].value_counts()
print(ic_vals)

In [None]:
# number of rows of housing that fall in each income category
plt.bar(ic_vals.index, ic_vals.values)
plt.xlabel("Income Category")
# the bars count rows, which this notebook labels 'Blocks' elsewhere -- not households
plt.ylabel("Blocks")
plt.show()

---

##### **Longitude-Latitude map-like plot**

In [None]:
housing.plot(kind='scatter',x='longitude',y='latitude')

In [None]:
housing.plot(kind='scatter',x='longitude',y='latitude',alpha=0.1)

In [None]:
# bubble scatter: marker size scales with population, colour encodes median house value
housing.plot(
    kind='scatter',
    x='longitude',
    y='latitude',
    figsize=(10,7),
    alpha=0.4,
    s=housing['population']/100,
    c='median_house_value',
    colorbar=True,
    label='population',
)

---