In [None]:
%matplotlib inline

from pathlib import Path

import pandas as pd
import numpy as np
import seaborn as sns
import missingno as msno

import matplotlib.pylab as plt

In [None]:
# Read the West Roxbury housing data into a DataFrame
data_df = pd.read_csv("WestRoxbury_V2.csv")

# Turn off every pandas display limit (rows, columns, total width, cell width)
# so that frames are shown in full instead of being truncated
for display_option in ('display.max_rows',
                       'display.max_columns',
                       'display.width',
                       'display.max_colwidth'):
    pd.set_option(display_option, None)

In [None]:
# Show the first 10 rows of the dataset

![Data%20Viz%201.png](attachment:Data%20Viz%201.png)

In [None]:
# Show the last 10 rows of the dataset

![Data%20Viz%202.png](attachment:Data%20Viz%202.png)

In [None]:
# Show the descriptive statistics for numeric variables

![Data%20Viz%203a.png](attachment:Data%20Viz%203a.png)

In [None]:
# Show the variable names, non-null counts, and datatypes

![Data%20Viz%203.png](attachment:Data%20Viz%203.png)

In [None]:
# One sample has a YR BUILT value of zero (a data entry error); that row is removed rather than repaired.

# Keep only the samples where YR BUILT > 0. The explicit .copy() makes the result an
# independent DataFrame, so the columns that later cells add to data_df are written
# to this frame itself and cannot trigger pandas' SettingWithCopyWarning.
data_df = data_df[data_df['YR BUILT'] > 0].copy()

In [None]:
# Tally how many samples fall into each REMODEL class

data_df.REMODEL.value_counts()

In [None]:
# Spaces and hyphens are awkward in variable names (they prevent attribute-style
# access such as data_df.GROSS_AREA), so both are turned into underscores.

data_df.columns = [
    name.strip().replace(' ', '_').replace('-', '_')  # trim, then swap spaces and hyphens for underscores
    for name in data_df.columns
]
data_df.columns

#### Scatterplots

In [None]:
# The basic pandas scatterplot: TOTAL_VALUE (y) against GROSS_AREA (x).
# TODO: replace the two ??? placeholders with the plotting accessor and the chart
# method; the later scatterplot cells in this notebook spell the same kind of call
# as `data_df.plot.scatter(...)`.

data_df.???.???(x='GROSS_AREA', y='TOTAL_VALUE', legend=False, color='mediumblue')

![Data%20Viz%204.png](attachment:Data%20Viz%204.png)

In [None]:
# Matplotlib scatterplot of gross area (x) against a second column (y)

fig, ax = plt.subplots() # creates a Figure (the whole picture) and a single Axes (the plot area drawn inside it)

fig.set_size_inches(???, ???) # TODO: fill in the width and height so the plot is 10 x 6 inches

# TODO: replace ??? with the y column (the y-axis label below suggests the total value -- confirm).
# NOTE(review): both color and facecolor are passed; presumably the marker fill is
# orangered and the outline steelblue -- confirm against the rendered figure.
ax.scatter(data_df.GROSS_AREA, data_df.???, color='steelblue', facecolor='orangered') # draws the points on ax

plt.xlabel('Gross Area') # axis labels may be different than the variable names
plt.ylabel('Total Value')

plt.tight_layout() # adjusts the padding so the axes and their labels fit inside the figure area

plt.show() # renders the finished figure

![Data%20Viz%205.png](attachment:Data%20Viz%205.png)

In [None]:
# Generate a scatterplot showing lot size versus year built.
# TODO: replace the ??? placeholders with the quoted column names for x and y
# (the log-transform cell further down refers to these columns as LOT_SQFT and YR_BUILT).

data_df.plot.scatter(x=???, y=???, legend=False, color='mediumblue')

![Data%20Viz%206.png](attachment:Data%20Viz%206.png)

In [None]:
# Log-transform the variables before plotting to reveal patterns.
# np.log1p(x) computes log(1 + x): the +1 keeps a value of zero finite, and the
# dedicated function is the idiomatic (and numerically more accurate) spelling
# of np.log(x + 1).

data_df['LOT_SQFT_log'] = np.log1p(data_df.LOT_SQFT)

data_df['YR_BUILT_log'] = np.log1p(data_df.YR_BUILT)

# the basic pandas scatterplot, now drawn from the two log-transformed columns
data_df.plot.scatter(x='LOT_SQFT_log', y='YR_BUILT_log', legend=False, color='mediumblue')

![Data%20Viz%207.png](attachment:Data%20Viz%207.png)

In [None]:
# Add a categorical variable to a scatterplot by colouring each point:
# rows whose category value equals the string "None" are drawn in dark orange,
# every other row in royal blue.
# TODO: replace ??? with the categorical column to colour by.

data_df.plot.scatter(x='GROSS_AREA', y='TOTAL_VALUE', 
                     c=['darkorange' if c == "None" else 'royalblue' for c in data_df.???])

![Data%20Viz%208.png](attachment:Data%20Viz%208.png)

#### Bar Charts

In [None]:
# Bar chart of a per-REMODEL-group average.
# TODO: replace the first ??? with the column to plot (the y-axis label below
# suggests TOTAL_VALUE -- confirm) and the figsize ??? pair with the width and
# height in inches.
# NOTE(review): .mean() is applied to every non-key column; on recent pandas this
# may need numeric_only=True if any other text column is present -- confirm
# against the dtypes of data_df.
ax = data_df.groupby('REMODEL').mean().???.plot(kind='bar', figsize=[???, ???], color='orangered')

ax.set_ylabel('Avg. TOTAL VALUE')

plt.tight_layout()
plt.show()

![Data%20Viz%209.png](attachment:Data%20Viz%209.png)

#### Line Graph

In [None]:
# Load the daily bicycle-rental data as a DataFrame.
# No `squeeze` argument is passed: pandas 2.0 removed it from read_csv (it raises
# a TypeError there), and the cells below need a DataFrame anyway because they
# access the dteday and cnt columns.
bike_df = pd.read_csv('bicycle_by_day.csv')

In [None]:
# Show the first five rows of the bicycle data

In [None]:
# Show the descriptive statistics for the numeric variables

In [None]:
# Parse the month/day/year text in dteday into real timestamps
bike_df['Date'] = pd.to_datetime(bike_df['dteday'], format='%m/%d/%Y')

# Rental counts as a Series indexed by those dates, ready to plot as a time series
rental_ts = pd.Series(data=bike_df['cnt'].values, index=bike_df['Date'])

In [None]:
# Line graph of the rental counts over time, y-axis fixed to 0-10000
rental_ts.plot(color='darkorange', figsize=[6, 4], ylim=[0, 10000], legend=False)

# Label both axes of the plot that was just drawn
plt.gca().set(xlabel='Year', ylabel='Rentals')

plt.tight_layout()
plt.show()

![Data%20Viz%2010.png](attachment:Data%20Viz%2010.png)

#### Boxplots

In [None]:
# Boxplot of TOTAL_VALUE split into groups.
# TODO: replace ??? with the quoted name of the grouping column.
ax = data_df.boxplot(column='TOTAL_VALUE', by=???)

ax.set_ylabel('TOTAL_VALUE')

# Only the title of the current axes is blanked below; the figure-level
# suptitle call is left disabled.
#plt.suptitle('')  # Suppress the titles
plt.title('')

plt.show()

![Data%20Viz%2011.png](attachment:Data%20Viz%2011.png)

In [None]:
# Three boxplots side by side (one row, three columns) on a shared 10 x 6 inch figure.
fig, axes = plt.subplots(nrows=1, ncols=3, figsize = (10, 6))

# TODO: replace each ??? below -- `column` is the variable to summarise, `by` is the
# grouping column, and `ax` selects which of the three subplots to draw on.
data_df.boxplot(column='TOTAL_VALUE', by=???, ax=axes[0])

data_df.boxplot(column=???, by='REMODEL', ax=axes[1])

data_df.boxplot(column=???, by=???, ax=axes[???])

# Give every panel the same x-axis label
for ax in axes:
    ax.set_xlabel('REMODEL')
    
plt.suptitle('')  # Suppress the overall title
plt.tight_layout()  # Re-space the subplots so that they do not overlap

plt.show()

![Data%20Viz%2012.png](attachment:Data%20Viz%2012.png)

#### Violin Plots

In [None]:
# Violin plot drawn on a new 10 x 6 inch figure.
# TODO: replace the ??? placeholders with the quoted column names for the x and y axes.
plt.figure(figsize=(10,6))

sns.violinplot(x=???, y=???, data=data_df)

![Data%20Viz%2013.png](attachment:Data%20Viz%2013.png)

#### Histogram with KDE

In [None]:
# Histogram with a KDE curve overlaid (kde=True) on a single-axes figure.
fig, ax = plt.subplots(1,1)

# TODO: replace ???.??? with the data to plot (the label and title suggest the
# TOTAL_VALUE column of data_df -- confirm) and bins=??? with the number of bins.
# NOTE(review): no ax= is passed, so this presumably draws on the axes created
# just above because it is the current one -- confirm.
sns.histplot(???.???, 
             bins=???,
             kde=True,
             label='TOTAL_VALUE', 
             color='mediumblue')

ax.set_title('Total Value Distribution', fontsize=20)
ax.set(xlabel='Total Value', ylabel='count')

![Data%20Viz%2014.png](attachment:Data%20Viz%2014.png)

#### Color-Coded Correlation Chart with Numeric Value Overlay

In [None]:
# color-coded heatmap with correlation values

# TODO: replace ??? with the call that produces the correlation matrix of data_df.
# NOTE(review): if data_df still holds text columns, the call may have to be
# restricted to numeric columns on recent pandas -- confirm.
corr = data_df.???

fig, ax = plt.subplots()

fig.set_size_inches(11, 7)

# annot=True with fmt=".1f" writes each value on its cell with one decimal place;
# center=0 centres the RdBu colormap on zero.
# TODO: replace ??? with the matrix to draw (presumably `corr` from above -- confirm).
sns.heatmap(???, annot=True, fmt=".1f", cmap="RdBu", center=0, ax=ax)

plt.show()

![Data%20Viz%2015.png](attachment:Data%20Viz%2015.png)

#### Missing Value Analysis

In [None]:
# Missing values in list format: count the nulls in every column and show
# the counts as a one-column table labelled 'miss.val'

data_df.isnull().sum().to_frame(name='miss.val')

![Data%20Viz%2016.png](attachment:Data%20Viz%2016.png)

In [None]:
# This shows missing values in a bar chart format.
# TODO: replace the first ??? with the missingno (msno) chart function for a bar
# chart and the second ??? with the DataFrame to inspect.

msno.???(???, color='deepskyblue')

![Data%20Viz%2017.png](attachment:Data%20Viz%2017.png)

In [None]:
# This shows the nullity correlations.
# TODO: replace the first ??? with the missingno (msno) chart function for nullity
# correlations and the second ??? with the DataFrame to inspect.

msno.???(???)

![Data%20Viz%2018.png](attachment:Data%20Viz%2018.png)

In [None]:
from autoviz.AutoViz_Class import AutoViz_Class

# Automated visual exploration of the housing data with AutoViz.
# NOTE(review): the file name is left empty and the in-memory data_df is handed
# over through dfte -- presumably AutoViz then analyses the frame instead of
# reading a file; confirm against the AutoViz documentation.
AV = AutoViz_Class()

filename = ""
separator = ","

dft = AV.AutoViz(
    filename,
    # where the data comes from and how it would be parsed
    dfte=data_df,
    sep=separator,
    header=0,
    # target variable of the analysis
    depVar="TOTAL_VALUE",
    # output and charting options
    chart_format="png",
    lowess=False,
    verbose=0,
    # upper bounds on how much data is analysed
    max_rows_analyzed=150000,
    max_cols_analyzed=30,
)