# Data Science


In [None]:
# Clear all variable values previously set
# from IPython import get_ipython
# get_ipython().magic('reset -sf')

## Imports


In [None]:
# Provides ways to work with large multidimensional arrays
import numpy as np

# Allows for further data manipulation and analysis
import pandas as pd

# In Anaconda -> Environments -> Not Installed -> pandas-datareader -> Apply
from pandas_datareader import data as web # Reads stock data 
import matplotlib.pyplot as plt # Plotting
import matplotlib.dates as mdates # Styling dates
%matplotlib inline

import warnings
warnings.simplefilter('ignore')

## Constants


In [None]:
# Define path to files

# For MacOS or Linux
PATH = '/home/ali/Projects/Network/notebook/python4finance/data/'

# For Windows
# PATH = 'D:/python4finance/'

In [None]:
# Make a NumPy array
l1 = [1, 2, 3, 4, 5]
npa1 = np.array(l1)
npa1

## Numpy Crash Course


NumPy is an amazing scientific computing library that is used by numerous other Python Data Science libraries. It contains many mathematical, array and string functions that are extremely useful. Along with all the basic math functions you'll also find them for Linear Algebra, Statistics, Simulation, etc.

NumPy works with vectors (1D arrays) and matrices (2D arrays).


## Create Array in Range


In [None]:
# Creates array from 0 to 4
npa2 = np.arange(0, 5)
npa2

In [None]:
# Define a step
npa3 = np.arange(0, 10, 2)
npa3

## Matrices


In [None]:
# Create a 4 row matrix with 3 columns with all having a value of 1
mat1 = np.ones((4, 3))
mat1

In [None]:
# Create a 4 row matrix with 3 columns with all having a value of 0
mat2 = np.zeros((4, 3))
mat2

## Random Matrices


In [None]:
# Random values between 0 and 50 as a matrix with 4 rows & 3 columns
mat3 = np.random.randint(0, 50, (4, 3))
mat3

## Generate Defined Number of Values in Range


In [None]:
# Generate 10 equally distanced values between 1 and 10
mat4 = np.linspace(1, 10, 10)
mat4

## Reshape Arrays


In [None]:
# Create array with 12 values
mat6 = np.random.randint(0, 50, 12)
mat6

In [None]:
# Reshape to a 3 row 4 column array
mat6 = mat6.reshape(3, 4)
mat6

In [None]:
# Reshape into a 3D array with 3 blocks, 2 rows, 2 columns
mat7 = mat6.reshape(3, 2, 2)
mat7

In [None]:
# Reshape into a 3D array with 2 blocks, 3 rows, 2 columns
mat8 = mat6.reshape(2, 3, 2)
mat8

In [None]:
# Get the value in the 2nd block, 3rd row and 1st column
mat8[1, 2, 0]


## Filter Array


In [None]:
# Provide a boolean array where values are above 20
print(mat6)
mat6 > 20

In [None]:
# Return an array with just values over 20
mat6[mat6 > 20]


## Statistics Operations


In [None]:
# Generate 50 random values between 0 and 100
mat5 = np.random.randint(0, 100, 50)
mat5

In [None]:
print('Mean :', mat5.mean())
print('Standard Deviation :', mat5.std())
print('Variance :', mat5.var())
print('Min :', mat5.min())
print('Max :', mat5.max())

## The Seed Function


In [None]:
# Used when you want to replicate randomization
np.random.seed(500)
mat9 = np.random.randint(0, 50, 10)
mat9

In [None]:
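# The seed is not re-applied on later calls: this draw continues the seeded
# sequence, so it differs from mat9. Call np.random.seed(500) again to repeat mat9.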
mat10 = np.random.randint(0, 50, 10)
mat10

# Pandas


Pandas provides numerous tools to work with tabular data like you'd find in spreadsheets or databases. It is widely used for data preparation, cleaning, and analysis. It can work with a wide variety of data and provides many visualization options. It is built on top of NumPy.


## Read Data from a CSV


In [None]:
def get_df_from_csv(ticker):
    """Load the CSV for *ticker* from PATH into a DataFrame.

    The file read is ``PATH + ticker + '.csv'``; its ``Date`` column is
    parsed as dates and used as the index.

    Returns the DataFrame, or None (after printing a notice) when the
    file does not exist.
    """
    csv_file = PATH + ticker + '.csv'
    try:
        return pd.read_csv(csv_file, index_col='Date', parse_dates=True)
    except FileNotFoundError:
        print('File Doesn\'t Exist')
        return None

In [None]:
msft_df = get_df_from_csv('MSFT')
msft_df

## Read Data from Excel


In [None]:
def get_df_from_excel(file):
    """Read the Excel workbook at *file* into a DataFrame.

    Returns the DataFrame, or None (after printing a notice) when the
    file does not exist.
    """
    try:
        return pd.read_excel(file)
    except FileNotFoundError:
        print('File Doesn\'t Exist')
        return None

In [None]:
# You may have to run this in the Qt Console : pip install openpyxl
file = PATH + 'stock_sectors.xlsx'
# file = PATH + 'Wilshire-5000-Stocks.xlsx'
w_stocks = get_df_from_excel(file)
w_stocks

## Read Data from HTML


In [None]:
g_data = pd.read_html(
    'https://en.wikipedia.org/wiki/List_of_current_United_States_governors'
)
g_data

In [None]:
# We can define that we want the 2nd table on the page
g_data = pd.read_html(
    'https://en.wikipedia.org/wiki/List_of_current_United_States_governors'
)[1]
g_data

## US Demographics


In [None]:
# You can also search for phrases in the table
d_data = pd.read_html(
    'https://en.wikipedia.org/wiki/Demographics_of_the_United_States',
    match='Average population'
)[0]
d_data

## Replace Spaces in Column Names


In [None]:
d_data.columns = [x.replace(' ', '_') for x in d_data.columns]
d_data

## Remove Characters in Columns


In [None]:
# Remove parentheses and what's inside them.
# The patterns are regular expressions, so regex=True is passed explicitly;
# pandas 2.0+ otherwise treats the pattern as literal text and removes nothing.
d_data.columns = d_data.columns.str.replace(r'\(.*\)', '', regex=True)
# Remove brackets and what's inside them
d_data.columns = d_data.columns.str.replace(r'\[.*\]', '', regex=True)
d_data

## Rename Columns


In [None]:
# You could add additional with commas between {}
d_data = d_data.rename(columns={'Unnamed:_0': 'Year'})
d_data

## Remove Characters in Columns


In [None]:
# Remove brackets and what is inside them for the whole Year column.
# regex=True is required for the pattern to be treated as a regular
# expression on pandas 2.0+ (where the default is a literal match).
d_data.Year = d_data.Year.str.replace(r'\[.*\]', '', regex=True)
d_data

## Select Columns


In [None]:
d_data.Live_births

In [None]:
d_data['Deaths']

## Make a Column an Index


In [None]:
d_data.set_index('Year', inplace=True)
d_data

## Grab Data from Multiple Columns


In [None]:
d_data[['Live_births', 'Deaths']]

## Grab a Row


In [None]:
d_data.loc['2020']

In [None]:
d_data.iloc[85]

## Add a Column


In [None]:
# Create a column showing population growth for each year
d_data['Pop_Growth'] = d_data['Live_births'] - d_data['Deaths']
d_data

## Delete Column


In [None]:
d_data.drop('Pop_Growth', axis=1, inplace=True)
d_data

## Delete a Row


In [None]:
d_data.drop('1935', axis=0, inplace=True)
d_data

## Manipulating Data


In [None]:
c_data = pd.read_html(
    'https://en.wikipedia.org/wiki/List_of_countries_by_GDP_(nominal)'
)[2]
c_data

In [None]:
for col in c_data.columns:
    print(col)

In [None]:
# Delete a level of a multilevel column name
c_data.columns = c_data.columns.droplevel()
c_data


In [None]:
# Keep only columns if they haven't used the same name prior
c_data = c_data.loc[:, ~c_data.columns.duplicated()]
c_data

In [None]:
# Delete any rows with NaN values by taking only rows that don't contain NaNs
c_data = c_data[c_data['Estimate'].notna()]
c_data


In [None]:
# Remove []s and what is in them for the whole Year column.
# regex=True is required for the pattern to be treated as a regular
# expression on pandas 2.0+ (where the default is a literal match).
c_data.Year = c_data.Year.str.replace(r'\[.*\]', '', regex=True)
c_data

In [None]:
# Rename country column
c_data.rename(
    columns={
        'Country/Territory': 'Country',
        'Estimate': 'GDP'
    }, inplace=True
)
c_data

In [None]:
# Remove * in Country column.
# '*' is a regex metacharacter, so match it literally with regex=False
# to get the same result on every pandas version.
c_data.Country = c_data.Country.str.replace('*', '', regex=False)
c_data

In [None]:
# Groupby allows you to group rows based on a column and perform a function
# Mean GDP by region
# numeric_only=True skips text columns such as Country, which otherwise make
# mean() raise a TypeError on pandas 2.0+
c_data.groupby('Region').mean(numeric_only=True)

In [None]:
# Median GDP by region
# numeric_only=True skips text columns such as Country, which otherwise make
# median() raise a TypeError on pandas 2.0+
c_data.groupby('Region').median(numeric_only=True)

## More Ways of Messing with Data


In [None]:
# Dictionary with ice cream sales data
dict1 = {
    'Store': [1, 2, 1, 2],
    'Flavor': ['Choc', 'Van', 'Straw', 'Choc'],
    'Sales': [26, 12, 18, 22]
}

# Convert to Dataframe
ic_data = pd.DataFrame(dict1)
print(ic_data)

# Group data by the store number
by_store = ic_data.groupby('Store')

# Get mean sales by store
# numeric_only=True leaves out the text column 'Flavor'; without it pandas
# 2.0+ raises a TypeError when it tries to average strings
print(by_store.mean(numeric_only=True))

# Get sales total just for store 1
# numeric_only=True keeps the flavor names from being concatenated
print(by_store.sum(numeric_only=True).loc[1])

# describe() applies several summary functions at once
by_store.describe()

## Plotly


Plotly allows you to create over 40 beautiful interactive web-based visualizations that can be displayed in Jupyter notebooks or saved to HTML files. It is widely used to plot scientific, statistical and financial data.


You can install Plotly using Anaconda under the Environments tab by searching for Plotly. You'll also need Cufflinks and a few other packages, which you can install by running `conda install -c conda-forge cufflinks-py` in your command line or terminal. Alternatively, you can use `pip install plotly` and `pip install cufflinks`. Cufflinks connects Plotly to Pandas.


In [None]:
import cufflinks as cf
import plotly.express as px
import plotly.graph_objects as go

# Make Plotly work in your Jupyter Notebook
from plotly.offline import download_plotlyjs, init_notebook_mode, plot, iplot

init_notebook_mode(connected=True)
# Use Plotly locally
cf.go_offline()

## Line Plot


In [None]:
# Plot the value of a dollar invested over time
# Use included Google price data to make one plot
df_stocks = px.data.stocks()
# labels is keyed by DataFrame column name (not by axis letter), so the
# keys must be 'date' and 'GOOG' for the axis titles to change
px.line(
    df_stocks,
    x='date',
    y='GOOG',
    labels={
        'date': 'Date',
        'GOOG': 'Value of Dollar'
    }
)


In [None]:
# Make multiple line plots
# With a list of y columns Plotly Express treats the data as wide-form:
# the y-axis label is keyed by 'value' and the legend title by 'variable'
px.line(
    df_stocks,
    x='date',
    y=['GOOG', 'AAPL'],
    labels={
        'date': 'Date',
        'value': 'Value of Dollar',
        'variable': 'Stock'
    },
    title='Apple vs. Google'
)

In [None]:
# Multiple plots
r_x1 = np.linspace(0, 1, 100)
r_y0 = np.random.randn(100) + 5
r_y1 = np.random.randn(100) - 5
r_y2 = np.random.randn(100)

fig = go.Figure()
# There are many line styles
fig.add_trace(go.Scatter(x=r_x1, y=r_y0, mode='lines', name='Rand 1'))
fig.add_trace(go.Scatter(x=r_x1, y=r_y1, mode='lines+markers', name='Rand 2'))
fig.add_trace(go.Scatter(x=r_x1, y=r_y2, mode='markers', name='Rand 3'))

fig.show()

## Add Details to Plot


In [None]:
aapl_df = pd.read_csv('data/AAPL.csv')
x = aapl_df['Date']
y = aapl_df['Adj Close']
aapl_df

In [None]:
fig = go.Figure()
fig.add_trace(go.Scatter(x=x, y=y))
fig.update_xaxes(rangeslider_visible=True, title='Zoom on Dates Using Slider')
fig.update_yaxes(title='Stock Price (USD)')

## More Details


In [None]:
# Load Microsoft prices and pick the series to plot
msft_df = pd.read_csv('data/MSFT.csv')
x, y = msft_df['Date'], msft_df['Adj Close']

fig = go.Figure(data=go.Scatter(x=x, y=y))

# Add a range slider with buttons for dates
fig.update_layout(
    xaxis={
        'rangeselector': {
            'buttons': [
                {
                    'count': 10, 'label': '10D',
                    'step': 'day', 'stepmode': 'backward'
                },
                {
                    'count': 1, 'label': '1M',
                    'step': 'month', 'stepmode': 'backward'
                },
                {
                    'count': 6, 'label': '6M',
                    'step': 'month', 'stepmode': 'todate'
                },
                {
                    'count': 1, 'label': 'YTD',
                    'step': 'year', 'stepmode': 'todate'
                },
                {
                    'count': 1, 'label': '1Y',
                    'step': 'year', 'stepmode': 'backward'
                },
                {'label': 'All', 'step': 'all'},
            ]
        },
        'rangeslider': {'visible': True},
        'type': 'date',
    }
)
# Last expression of the cell: returns the figure so the notebook renders it
fig.update_yaxes(title='Stock Price (USD)')

## Candlestick Plots


In [None]:
# Candlestick charts are useful because they show the open and close in the
# wide part 'Real Body' and the high and low using the Shadows or Wicks.
# They can be used to sense emotion in a stock in the near term.
# This cell reads aapl_df, which must already have been loaded.
x = aapl_df['Date']
# NOTE(review): close is taken from 'Adj Close' while open/high/low use the
# raw columns; if adjusted and raw prices differ, the candle bodies may not
# line up with the wicks - confirm whether an unadjusted close should be used.
close = aapl_df['Adj Close']
high = aapl_df['High']
low = aapl_df['Low']
openp = aapl_df['Open']

fig = go.Figure()

fig.add_trace(go.Candlestick(x=x, high=high, low=low, open=openp, close=close))

# Same range selector buttons and range slider as the line plot above
fig.update_layout(
    xaxis=dict(
        rangeselector=dict(
            buttons=list(
                [
                    dict(
                        count=10, label='10D', step='day', stepmode='backward'
                    ),
                    dict(
                        count=1, label='1M', step='month', stepmode='backward'
                    ),
                    dict(count=6, label='6M', step='month', stepmode='todate'),
                    dict(count=1, label='YTD', step='year', stepmode='todate'),
                    dict(
                        count=1, label='1Y', step='year', stepmode='backward'
                    ),
                    dict(label='All', step='all')
                ]
            )
        ),
        rangeslider=dict(visible=True),
        type='date'
    )
)
# Axis titles and chart title
fig.update_layout(
    xaxis_title='Dates',
    yaxis_title='Stock Price (USD)',
    title='Apple Candlestick Chart'
)

# Add an annotation
fig.update_layout(
    annotations=[
        dict(
            x='2020-02-21',
            y=85,
            text='Pandemic Effects Market',
            xanchor='right'
        )
    ]
)
# Add line defining an area
# NOTE(review): no shape type or xref/yref is given, so Plotly's defaults
# apply; y0=0 / y1=1 were presumably meant as paper-relative - TODO confirm.
fig.update_layout(shapes=[dict(x0='2020-02-21', x1='2020-04-06', y0=0, y1=1)])
# Add a rectangle
# fig.add_shape(type='rect',
#     x0='2020-02-21', y0=0, x1='2020-04-06', y1=145,
#     line=dict(
#         color='RoyalBlue',
#         width=2,
#     ),
#     fillcolor='LightSkyBlue',
# )

# Add a vertical rectangle with opacity
# This runs after the shapes assignment above, so it adds to that list
fig.add_vrect(
    x0='2020-02-21',
    x1='2020-04-06',
    fillcolor='LightSalmon',
    opacity=0.5,
    layer='below',
    line_width=0,
)

fig.show()

## Open-High-Low-Close (OHLC) Chart


In [None]:
# While a candlestick chart has a rectangle representing the open and close,
# an OHLC chart shows all 4 data points using ticks. When the opening and
# closing prices are far apart, that shows momentum up or down.
# The tick on the left is the opening price and the tick on the right is the close
fig = go.Figure()

fig.add_trace(go.Ohlc(x=x, high=high, low=low, open=openp, close=close))

fig.show()

## Multiple Plots


In [None]:
fig = go.Figure()

fig.add_trace(go.Ohlc(x=x, high=high, low=low, open=openp, close=close))

a_x = aapl_df['Date']
a_y = aapl_df['Adj Close']
fig.add_trace(
    go.Scatter(x=a_x, y=a_y, line=dict(color='blue', width=1.5, dash='dot'))
)

fig.show()

## Scatter Plots


In [None]:
x = np.arange(0, 50, 2)
y = np.arange(0, 50, 2)

# Create a figure to which I'll add plots
fig = go.Figure()

# Markers just shows the dots
fig.add_trace(go.Scatter(x=x, y=y, mode='markers'))

fig.show()

In [None]:
# Use included Iris data set
df_iris = px.data.iris()
# Create a scatter plot by defining x, y, different color for count of provided
# column, size based on supplied column and additional data to display on hover
fig = px.scatter(
    df_iris,
    x='sepal_width',
    y='sepal_length',
    color='species',
    size='petal_length',
    hover_data=['petal_width']
)
# A notebook cell only renders its last expression, so each figure in this
# cell is shown explicitly; otherwise only the final one would appear
fig.show()

# Create a customized scatter with black marker edges with line width 2, opaque
# and colored based on width. Also show a scale on the right
fig = go.Figure()
fig.add_trace(
    go.Scatter(
        x=df_iris.sepal_width,
        y=df_iris.sepal_length,
        mode='markers',
        marker_color=df_iris.sepal_width,
        text=df_iris.species,
        marker=dict(showscale=True)
    )
)
fig.update_traces(marker_line_width=2, marker_size=10)
fig.show()

# Working with a lot of data use Scattergl
fig = go.Figure(
    data=go.Scattergl(
        x=np.random.randn(100000),
        y=np.random.randn(100000),
        mode='markers',
        marker=dict(
            color=np.random.randn(100000), colorscale='Viridis', line_width=1
        )
    )
)
fig.show()

## Histogram


In [None]:
# Plot histogram based on rolling 2 dice
dice_1 = np.random.randint(1, 7, 5000)
dice_2 = np.random.randint(1, 7, 5000)
dice_sum = dice_1 + dice_2
# bins represent the number of bars to make (sums run from 2 to 12 -> 11 bars)
# Can define x label, color, title
# marginal creates another plot (violin, box, rug)
fig = px.histogram(
    dice_sum,
    nbins=11,
    labels={'value': 'Dice Roll'},
    title='5000 Dice Roll Histogram',
    marginal='violin',
    color_discrete_sequence=['green']
)

# The y-axis of a histogram is how often each sum occurred, not the sum itself
fig.update_layout(
    xaxis_title_text='Dice Roll',
    yaxis_title_text='Count',
    bargap=0.2,
    showlegend=False
)
# Show explicitly: a notebook cell only renders its last expression, and
# this figure is not the last one in the cell
fig.show()

# Stack histograms based on different column data
df_tips = px.data.tips()
px.histogram(df_tips, x='total_bill', color='sex')