In [None]:
import numpy as np
from collections import Counter
import matplotlib.pyplot as plt

def bucketize(point, bucket_size):
    """Round ``point`` down to the nearest multiple of ``bucket_size``.

    The result is the lower edge of the bucket that contains ``point``
    (e.g. 24 with a bucket size of 10 gives 20.0). ``np.floor`` is used, so
    negative points are rounded towards minus infinity.
    """
    bucket_index = np.floor(point / bucket_size)
    return bucket_size * bucket_index

def make_hist(points, bucket_size):
    """Count how many of ``points`` land in each bucket.

    Every point is mapped to its bucket with ``bucketize``; the returned
    ``Counter`` maps that bucket value to the number of points in it.
    """
    histogram = Counter()
    for point in points:
        histogram[bucketize(point, bucket_size)] += 1
    return histogram

def plot_hist(points, bucket_size, title='Default Title'):
    """Bucket ``points`` with ``make_hist`` and show the counts as a bar chart.

    Args:
        points: iterable of numeric samples.
        bucket_size: width of each histogram bucket.
        title: text placed above the figure.
    """
    bucketed_points = make_hist(points, bucket_size)
    plt.figure(figsize=(10, 8))
    plt.title(title)
    # Each key is the lower edge of its bucket. Draw every bar from that edge
    # across the whole bucket; matplotlib's defaults (width=0.8, centred on x)
    # would render thin bars straddling the edge for any bucket_size != ~1.
    plt.bar(list(bucketed_points.keys()), list(bucketed_points.values()),
            width=bucket_size, align='edge')
    plt.show()
    
    


""" points = [24, 26, 33, 37, 12, 27, 39, 44, 31, 32, 33, 9, 1]

bucketed_points = make_hist(points, bucket_size=10)

plot_hist(bucketed_points) """

# 10,000 samples: each np.random.uniform() draw is stretched by 200 and shifted by -100.
# NOTE: no random seed is set in this cell, so the plots differ on every run.
uniform = [np.random.uniform() * 200 - 100 for _ in range(10000)]

# 10,000 samples from np.random.normal with loc=0 and scale=57.
normal = np.random.normal(0, 57, 10000)

# Both histograms use buckets of width 10.
plot_hist(uniform, 10, 'Uniform Histogram')

plot_hist(normal, 10, 'Normal Histogram')




In [None]:
# Shared x values for both series.
xs = np.random.normal(0, 20, 1000)

y_spread = 10

# ys1 = xs plus noise. The noise array is drawn first and then replaced by the
# combined list, so the order of random draws (xs, ys1 noise, ys2 noise) is fixed.
ys1 = np.random.normal(0, y_spread, 1000)
ys1 = [x_val + noise for x_val, noise in zip(xs, ys1)]

# ys2 = -xs plus noise.
ys2 = np.random.normal(0, y_spread, 1000)
ys2 = [-x_val + noise for x_val, noise in zip(xs, ys2)]

# plot_hist(xs, 1, 'XS')

plt.figure(figsize=(10,8))
plt.scatter(xs, ys1, marker='.', color='blue', label='ys1')
plt.scatter(xs, ys2, marker='.', color='green', label='ys2')
plt.xlabel('xs')
plt.ylabel('ys')
# loc=9 is matplotlib's numeric location code for the legend position.
plt.legend(loc=9)
plt.show()

# Off-diagonal entry of the 2x2 correlation matrix is the xs/ys correlation.
print('Correlation between xs and ys1:', np.corrcoef(xs, ys1)[1, 0])
print('Correlation between xs and ys2:', np.corrcoef(xs, ys2)[1, 0])

In [None]:
# many dimensions
import seaborn as sns
import pandas as pd
sns.set()

# Collect the three series from the previous cell as columns of one frame,
# in the order xs, ys1, ys2.
df = pd.DataFrame({'xs': xs, 'ys1': ys1, 'ys2': ys2})

# One scatter panel per pair of columns.
sns.pairplot(df, height=3.5)
plt.show()


In [None]:
# cleaning and munging
""" def parse_row(input_row, parsers):
    return [parser(value) if parser is not None else value for value, parser in zip(input_row, parsers)] """

def parse_row(input_row, parsers):
    """Parse one row by applying the positionally matching parser to each value.

    ``input_row`` and ``parsers`` are paired with ``zip``. A parser of ``None``
    leaves the value as is; any other parser is applied through
    ``try_or_none``.
    """
    parsed = []
    for value, parser in zip(input_row, parsers):
        if parser is None:
            parsed.append(value)
        else:
            parsed.append(try_or_none(parser, value))
    return parsed

def parse_rows_with(reader, parsers):
    """Lazily yield every row of ``reader`` after passing it through ``parse_row``."""
    yield from (parse_row(row, parsers) for row in reader)

def try_or_none(f, x):
    """Return ``f(x)``, or ``None`` if ``f`` raises an ordinary exception.

    Only ``Exception`` subclasses are swallowed. ``KeyboardInterrupt`` and
    ``SystemExit`` propagate, so a long parse can still be interrupted.
    """
    try:
        return f(x)
    except Exception:
        return None


import dateutil.parser
import csv

data = []

# The csv module requires files to be opened with newline='' so that it can
# handle line endings (including newlines inside quoted fields) itself.
with open('./files/comma_sep_stock_prices.csv', 'rt', newline='') as f:
    reader = csv.reader(f)
    # Parsers are applied by column position: the first column goes through
    # dateutil, the second is left untouched, the third is converted to float.
    for line in parse_rows_with(reader, [dateutil.parser.parse, None, float]):
        data.append(line)

print(data)

# Show rows in which a parser failed (try_or_none turned the value into None).
for row in data:
    if any(x is None for x in row):
        print(row)



In [None]:
def try_parse_field(field_name, value, parser_dict):
    """Parse ``value`` with the parser registered for ``field_name``, if any.

    When ``parser_dict`` has no parser for the field (or maps it to ``None``)
    the value is returned as is; otherwise it goes through ``try_or_none``.
    """
    parser = parser_dict.get(field_name)
    if parser is None:
        return value
    return try_or_none(parser, value)

def parse_dict(input_dict, parser_dict):
    """Return a new dict with each value of ``input_dict`` run through ``try_parse_field``."""
    parsed = {}
    for field_name, value in input_dict.items():
        parsed[field_name] = try_parse_field(field_name, value, parser_dict)
    return parsed

In [None]:
# manipulating data

# get from a file
data = []

# The csv module requires files to be opened with newline='' so that it can
# handle line endings (including newlines inside quoted fields) itself.
with open('./files/stocks.csv', 'r', newline='') as f:
    reader = csv.DictReader(f)
    # Each row is a dict keyed by the header line; all values stay strings.
    for line in reader:
        data.append(line)

data[0]

In [None]:
from collections import defaultdict
# Rows come straight from csv.DictReader, so "Close" is text. Convert to float
# before taking the max; comparing the raw strings would be lexicographic
# (e.g. "99.5" > "100.2").
max_appl_price = max(float(row["Close"]) for row in data if row["Symbol"] == "AAPL")

# group rows by symbol
by_symbol = defaultdict(list)
for row in data:
    by_symbol[row["Symbol"]].append(row)

print('By symbol keys:', by_symbol.keys())

# use a dict comprehension to find the (numeric) max close for each symbol
max_price_by_symbol = {symbol: max(float(row["Close"]) for row in grouped_rows)
                       for symbol, grouped_rows in by_symbol.items()}
print('Max of each symbol:', max_price_by_symbol)

def picker(field_name):
    """Build a function that looks up ``field_name`` in whatever row it is given."""
    def pick(row):
        return row[field_name]
    return pick

def pluck(field_name, rows):
    """Return a lazy ``map`` over ``rows`` yielding each row's ``field_name`` value."""
    extract = picker(field_name)
    return map(extract, rows)

def group_by(grouper, rows, value_transform=None):
    """Group ``rows`` by the key ``grouper(row)``.

    Args:
        grouper: callable producing the (hashable) group key for a row.
        rows: iterable of rows.
        value_transform: optional callable applied to each group's list of
            rows; when omitted the raw groups are returned.

    Returns:
        A ``defaultdict(list)`` of key -> rows when ``value_transform`` is
        ``None``, otherwise a plain dict of key -> ``value_transform(rows)``.
    """
    grouped = defaultdict(list)
    for row in rows:
        grouped[grouper(row)].append(row)
    # Identity check: `== None` would defer to a transform's own __eq__.
    if value_transform is None:
        return grouped
    return {key: value_transform(members) for key, members in grouped.items()}

# "Close" values are strings read by csv.DictReader; convert to float so max()
# picks the numerically largest price instead of the lexicographically largest text.
max_price_by_symbol = group_by(picker("Symbol"), data,
                               lambda rows: max(map(float, pluck("Close", rows))))
max_price_by_symbol

In [None]:
def percent_price_change(yesterday, today):
    """Return the fractional change in "Close" from ``yesterday`` to ``today``.

    Both rows' "Close" values are converted with ``float`` first; the result is
    ``today / yesterday - 1`` (0.1 means a 10% rise).
    """
    previous_close = float(yesterday["Close"])
    current_close = float(today["Close"])
    return current_close / previous_close - 1

def day_over_day_changes(grouped_rows):
    """Compute the change between each pair of consecutive rows, ordered by "Date".

    Returns a list of dicts with keys "symbol", "date" (both taken from the
    later row) and "change" (from ``percent_price_change``).
    """
    # NOTE(review): rows are sorted on the raw "Date" value; if that is text,
    # the order is only chronological for sortable formats such as ISO dates.
    ordered = sorted(grouped_rows, key=picker("Date"))
    changes = []
    for yesterday, today in zip(ordered, ordered[1:]):
        changes.append({
            "symbol": today["Symbol"],
            "date": today["Date"],
            "change": percent_price_change(yesterday, today),
        })
    return changes

# Per-symbol list of day-over-day change records.
changes_by_symbol = group_by(picker("Symbol"), data, day_over_day_changes)
# Bare expression in the middle of the cell; not the cell's last expression.
changes_by_symbol

# Flatten the per-symbol lists into one list of change records.
all_changes = [change for changes in changes_by_symbol.values()
    for change in changes]

# print(all_changes)

# Records with the largest and smallest "change" value across all symbols.
print('max:', max(all_changes, key=picker("change")))
print('min:', min(all_changes, key=picker("change")))



In [None]:
# rescaling
import data_analysis_tools as da

# Pairwise distances between three 2-D points a, b and c (first coordinate
# labelled as inches by the print below).
a_to_b = da.euclidean([63, 150], [67, 160])
a_to_c = da.euclidean([63, 150], [70, 171])
b_to_c = da.euclidean([67, 160], [70, 171])

print('Inches', a_to_b, a_to_c, b_to_c)

# Same three points with only the first coordinate changed (labelled as
# centimeters below); the second coordinate is identical in both groups.
# The trailing comments record the expected distances.
a_to_b = da.euclidean([160, 150], [170.2, 160]) # 14.28
a_to_c = da.euclidean([160, 150], [177.8, 171]) # 27.53
b_to_c = da.euclidean([170.2, 160], [177.8, 171]) # 13.37



print('Centimeters', a_to_b, a_to_c, b_to_c)

In [None]:
def scale(data_matrix):
    """Return per-column means and standard deviations of ``data_matrix``.

    ``data_matrix`` must expose a two-element ``.shape``. Columns are fetched
    with ``da.get_column`` and summarised with ``da.mean`` and
    ``da.standard_deviation``.

    Returns:
        ``(means, stdevs)``: two lists with one entry per column.
    """
    _, num_cols = data_matrix.shape
    columns = [da.get_column(data_matrix, j) for j in range(num_cols)]
    means = [da.mean(column) for column in columns]
    stdevs = [da.standard_deviation(column) for column in columns]
    return means, stdevs

def rescale(data_matrix):
    """Standardise each column of ``data_matrix`` to mean 0 and deviation 1.

    Columns whose standard deviation (from ``scale``) is not positive are
    copied unchanged. The result is built with ``da.make_matrix``.
    """
    means, stdevs = scale(data_matrix)
    num_rows, num_cols = data_matrix.shape

    def rescaled(i, j):
        value = data_matrix[i][j]
        if stdevs[j] > 0:
            return (value - means[j]) / stdevs[j]
        # No spread in this column: leave the entry as it is.
        return value

    return da.make_matrix(num_rows, num_cols, rescaled)

# 3x5 example. The first and third columns hold a single repeated value, so
# they exercise rescale's "no spread" branch (assuming da.standard_deviation
# reports 0 for a constant column; confirm).
example_matrix = [[19, 2, 3, 2, 17], 
                  [19, 2, 3, 5, 2], 
                  [19, -2, 3, 5, -2]]


# rescale needs an object with .shape, hence the conversion to a NumPy array.
rescale(np.array(example_matrix))


In [None]:
# dimensionality reduction

def de_mean_matrix(A):
    """Return a matrix shaped like ``A`` with each column's mean subtracted.

    Column means come from ``scale``; the result is built with
    ``da.make_matrix``.
    """
    column_means, _ = scale(A)
    nr, nc = A.shape
    return da.make_matrix(nr, nc, entry_fn=lambda i, j: A[i][j] - column_means[j])

# 15 sample x values. NOTE: this rebinds the `xs` used by earlier cells.
xs = da.random.normal(0, 10, 15)
# Each y is half its x plus integer noise. The noise draw asks for one sample
# and therefore comes back as a one-element sequence; index it before int(),
# because implicitly converting a size-1 array to a scalar is deprecated in
# NumPy 1.25+.
# NOTE(review): assumes da.random is NumPy's random module; confirm.
ys = list(xi / 2 + int(da.random.normal(0, 4, 1)[0]) for xi in xs)


plt.figure(figsize=(10, 8))
plt.scatter(xs, ys)
plt.show()

print(ys)


In [None]:
import data_analysis_tools as da
from functools import partial

# Sample vector used below to demonstrate direction().
vector = [3, 6, -4, -9]

def direction(w):
    """Return ``w`` divided element-wise by ``da.magnitude(w)``.

    Raises ``ZeroDivisionError`` when the magnitude is a plain numeric 0.
    """
    mag = da.magnitude(w)
    unit = []
    for wi in w:
        unit.append(wi / mag)
    return unit

def directional_variance_i(xi, w):
    """Return the variance contribution of row ``xi`` in the direction of ``w``.

    This is the squared length of the projection of ``xi`` onto
    ``direction(w)``. The square is required for the value to agree with
    ``directional_variance_gradient_i``, whose ``2 * projection * x_ij`` form
    is the derivative of the squared projection.
    """
    return da.dot_product(xi, direction(w)) ** 2

def directional_variance(x, w):
    """Add up ``directional_variance_i(xi, w)`` over every row ``xi`` of ``x``."""
    total = 0
    for xi in x:
        total += directional_variance_i(xi, w)
    return total

# Quick demonstration of direction() on the sample vector.
print(f'direction of vector={vector} is {direction(vector)}')

def directional_variance_gradient_i(xi, w):
    """Return row ``xi``'s gradient contribution for the direction of ``w``.

    Each component is ``2 * projection_length * xij``, where
    ``projection_length`` is the dot product of ``xi`` with ``direction(w)``.
    """
    unit_w = direction(w)
    doubled_projection = 2 * da.dot_product(xi, unit_w)
    return [doubled_projection * xij for xij in xi]

def directional_variance_gradient(x, w):
    """Combine the per-row gradients of all rows of ``x`` with ``da.vector_sum``."""
    # Handed over as a lazy generator, one gradient vector per row.
    per_row_gradients = (directional_variance_gradient_i(xi, w) for xi in x)
    return da.vector_sum(per_row_gradients)


def first_principal_component(x):
    """Estimate the first principal component of ``x`` with ``da.maximize_batch``.

    The search starts from an all-ones vector with one entry per coordinate of
    the first row. It maximises ``directional_variance`` using
    ``directional_variance_gradient``; the maximiser is then passed through
    ``direction`` before being returned.
    """
    guess = [1 for _coordinate in x[0]]
    target_fn = partial(directional_variance, x)
    gradient_fn = partial(directional_variance_gradient, x)
    unscaled_maximizer = da.maximize_batch(target_fn, gradient_fn, guess)
    return direction(unscaled_maximizer)


def first_principal_component_sgd(x):
    """Estimate the first principal component of ``x`` with ``da.maximize_stochastic``.

    The optimiser is given per-row target and gradient functions that ignore
    their second argument, the rows of ``x``, a list of zeros with one entry
    per row in the fourth position, and an all-ones starting guess. The result
    is passed through ``direction`` before being returned.
    """
    def target_fn(x_i, _unused, w):
        return directional_variance_i(x_i, w)

    def gradient_fn(x_i, _unused, w):
        return directional_variance_gradient_i(x_i, w)

    guess = [1 for _coordinate in x[0]]
    placeholders = [0 for _row in x]
    unscaled_maximized = da.maximize_stochastic(
        target_fn, gradient_fn, x, placeholders, guess)
    return direction(unscaled_maximized)

In [None]:
# First principal component of the (xs, ys) points via the batch optimiser.
w = first_principal_component(list(zip(xs, ys)))
w

In [None]:
# Same estimate via the stochastic optimiser, for comparison with `w`.
first_principal_component_sgd(list(zip(xs, ys)))

In [None]:
def project(v, w):
    """Return ``w`` scaled by the dot product of ``v`` and ``w``.

    This is the projection of ``v`` onto ``w`` when ``w`` has unit length
    (presumably always the case for callers here; verify).
    """
    return da.scalar_multiply(da.dot_product(v, w), w)

def remove_projection_from_vector(v, w):
    """Return ``v`` with ``project(v, w)`` subtracted from it."""
    projection = project(v, w)
    return da.vector_substract(v, projection)

def remove_projection(x, w):
    """Apply ``remove_projection_from_vector`` to every row of ``x``; returns a list."""
    residuals = []
    for xi in x:
        residuals.append(remove_projection_from_vector(xi, w))
    return residuals


def principal_component_analysis(x, num_components):
    """Return ``num_components`` components of ``x``, found one at a time.

    Each round takes ``first_principal_component`` of the current data and
    then removes that direction with ``remove_projection`` before the next
    round. The caller's ``x`` is not modified.
    """
    components = []
    remaining = x
    for _round in range(num_components):
        component = first_principal_component(remaining)
        components.append(component)
        remaining = remove_projection(remaining, component)
    return components

In [None]:
# xs/ys pair up into 2-D points, so only two orthogonal directions exist.
# Once both are projected out the residual carries no signal, and any further
# "component" would be meaningless. Request exactly as many components as the
# points have coordinates.
points = list(zip(xs, ys))
components = principal_component_analysis(points, len(points[0]))
components

In [None]:
# Same analysis using the library implementation; this rebinds `components`.
# NOTE(review): 3 components are requested from 2-D points. If da's version
# works like the notebook's own principal_component_analysis, the third one
# is not meaningful; confirm.
components = da.principal_component_analysis(list(zip(xs, ys)), 3)

In [None]:
# Display whichever `components` value was assigned most recently.
components