# The Python ecosystem for scientific computing
    
__[Most of this talk is based on the SciPy Lecture Notes](https://scipy-lectures.org/_downloads/ScipyLectures-simple.pdf)__


In [None]:
# A list is ordered and may mix element types (strings and integers here)
my_list = ['a', 1, 3, 'name']
# Show the list itself, then its length, one per line
print(my_list, len(my_list), sep='\n')

In [None]:
my_list.append(5)
print(my_list)

In [None]:
my_list + ['fdf', 77]

In [None]:
my_list

In [None]:
my_list[:2] # slicing is uniform over all data structures

# Function abstraction

In [None]:
def accumulate(n):
    """Add up every counter value 0, 1, 2, ... that is smaller than ``n``.

    Each pass prints the current counter next to the running total
    (the total already includes that counter).

    Returns the final total, which is 0 when ``n`` is not greater than 0.
    """
    total = 0
    counter = 0
    while counter < n:
        total += counter
        print(counter, total)
        counter += 1
    return total
# run the function
accumulate(6)

In [None]:
import numpy as np
from matplotlib import pyplot as plt
import pandas as pd

In [None]:
# Build a one-dimensional array from a plain Python list
a = np.array([0, 1, 2, 3])

print(a)
# .shape is a tuple with one entry per axis; .ndim is the number of axes
print(f'Shape of the numpy array is: {a.shape}')
print(f'Number of dimensions: {a.ndim}')

In [None]:
# arange(10) gives the integers 0 through 9 along a single axis
a = np.arange(10)
# Array contents, number of axes, and shape tuple, one per line
print(a, a.ndim, a.shape, sep='\n')

In [None]:
a*a

In [None]:
a**2

In [None]:
# Interval end points and how many evenly spaced samples to take (both ends included)
start_x, end_x, n_points = 0, 10, 11
a = np.linspace(start=start_x, stop=end_x, num=n_points)


In [None]:
# Dashed line through cos(a) evaluated at the sample points in `a`
plt.plot(a, np.cos(a), linestyle='--')
plt.show()

In [None]:
"""
np.random.randn(d0, d1, ..., dn)

Return a sample (or samples) from the "standard normal" distribution.

.. note::
    This is a convenience function for users porting code from Matlab,
    and wraps `standard_normal`. That function takes a
    tuple to specify the size of the output, which is consistent with
    other NumPy functions like `numpy.zeros` and `numpy.ones`.
"""
np.random.randn?

In [None]:
np.random.randn(3,3)

In [None]:
# Synthetic signal: a linear trend with slope `mean_value`, a cosine wiggle,
# and Gaussian noise. No random seed is set, so the curve differs on every run.
n_points = 100
mean_value = 300
xs = np.linspace(0, 10, n_points)
ys = mean_value * xs + 1000 * np.cos(xs) + 300 * np.random.randn(n_points)

plt.plot(xs, ys, '--')
# Shade between the curve and the constant level `mean_value`:
# green where xs > 4, yellow where xs <= 4.
# NOTE(review): `mean_value` is the trend's slope above but a constant y
# baseline here -- confirm the flat baseline (not mean_value * xs) is intended.
plt.fill_between(xs, ys, mean_value, where= (xs > 4), facecolor='g', alpha=0.3)
plt.fill_between(xs, ys, mean_value, where= (xs <= 4), facecolor='y', alpha=0.7)

plt.title("Sample Visualization")
plt.show()

# Loading data from files is important

In [None]:
import numpy as np
population_str = """
# year	hare	lynx	carrot
1900	30e3	4e3	48300
1901	47.2e3	6.1e3	48200
1902	70.2e3	9.8e3	41500
1903	77.4e3	35.2e3	38200
1904	36.3e3	59.4e3	40600
1905	20.6e3	41.7e3	39800
1906	18.1e3	19e3	38600
1907	21.4e3	13e3	42300
1908	22e3	8.3e3	44500
1909	25.4e3	9.1e3	42100
1910	27.1e3	7.4e3	46000
1911	40.3e3	8e3	46800
1912	57e3	12.3e3	43800
1913	76.6e3	19.5e3	40900
1914	52.3e3	45.7e3	39400
1915	19.5e3	51.1e3	39000
1916	11.2e3	29.7e3	36700
1917	7.6e3	15.8e3	41800
1918	14.6e3	9.7e3	43300
1919	16.2e3	10.1e3	41300
1920	24.7e3	8.6e3	47300"""

# np.loadtxt reads from file-like objects, so wrap the in-memory table in a
# text buffer and hand that over as if it were an open file
import io
population_txt = io.StringIO(population_str)
# loadtxt already treats '#' lines as comments and skips no rows by default;
# with a real file on disk this would be np.loadtxt('data/populations.txt')
data_np = np.loadtxt(population_txt)

print(f'current data: \n {data_np}')

In [None]:
data_np.T

In [None]:
# Transposing turns every column of the table into a row, so tuple unpacking
# yields one 1-D array per column (year, hares, lynxes, carrots).
year, hares, lynxes, carrots = data_np.T # trick: columns to variables
import matplotlib.pyplot as plt
fig = plt.figure(figsize=(12,7))
# One plot call with three (x, y) pairs draws all three series against year
plt.plot(year, hares, year, lynxes, year, carrots)
# Legend labels follow the order the series were plotted in; loc is an
# explicit (x, y) position rather than a named corner
plt.legend(('Hare', 'Lynx', 'Carrot'), loc=(.77, 0.75))

# Clamp the x axis to the data range and place 5 evenly spaced ticks on it
plt.xlim(year.min(), year.max())
plt.xticks(np.linspace(year.min(), year.max(), 5))

# Fixed y range 0..80,000 with 9 ticks, i.e. one tick every 10,000
plt.ylim(0, 80_000)
plt.yticks(np.linspace(0, 80_000, 9))

plt.grid(True)
plt.show()

In [None]:
def dummy_math_func(x, y):
    """Evaluate (1 - x/2 + x**5 + y**3) * exp(-x**2 - y**2).

    The arithmetic and ``np.exp`` apply element by element, so ``x`` and
    ``y`` may be scalars or NumPy arrays of compatible shapes.
    """
    polynomial = 1 - x / 2 + x ** 5 + y ** 3
    envelope = np.exp(-x ** 2 - y ** 2)
    return polynomial * envelope

# Sample the square [-3, 3] x [-3, 3] with n points along each axis
n = 256
x = np.linspace(-3, 3, n) 
y = np.linspace(-3, 3, n)

# meshgrid expands the two 1-D axes into 2-D coordinate arrays, so the
# function is evaluated at every (x, y) grid point in one vectorized call
X, Y = np.meshgrid(x, y)
Z = dummy_math_func(X, Y)
fig = plt.figure(figsize=(10,10))
# Filled contours first, then black contour lines on top; the positional 8
# is the requested number of contour levels
plt.contourf(X, Y, Z, 8, alpha=.75, cmap='jet')
# C keeps the contour-line result; it is not used again in this cell
C = plt.contour(X, Y, Z, 8, colors='black')
plt.show()

In [None]:
import matplotlib.pyplot as plt
from mpl_toolkits.mplot3d import Axes3D  # noqa: F401 -- importing registers the '3d' projection on older Matplotlib

fig = plt.figure()
# Create the 3-D axes through the figure. Constructing Axes3D(fig) directly
# relied on the axes adding itself to the figure, which has been deprecated
# since Matplotlib 3.4; on current releases that leaves the figure empty.
ax = fig.add_subplot(111, projection='3d')
# rstride/cstride of 1 draw every row and column of the grid
ax.plot_surface(X, Y, Z, rstride=1, cstride=1, cmap='hot')
plt.show()

In [None]:
# The buffer has already been read once, so rewind it to the beginning
# before handing it to pandas
population_txt.seek(0)
# Same tab-separated text, this time parsed into a DataFrame
data_df = pd.read_csv(population_txt, delimiter='\t')
print(data_df)

In [None]:
print(data_df.columns)
data_df.columns = ['year', 'hare', 'lynx', 'carrot']
print(data_df.columns)

In [None]:
# Promote 'year' to the row index. Reassigning (instead of inplace=True) and
# guarding on the column keeps this cell safe to run more than once: a second
# in-place set_index('year') raises KeyError because the column is gone.
if 'year' in data_df.columns:
    data_df = data_df.set_index('year')
print(data_df)

In [None]:
# Row lookup by index label (.loc) followed by column selection by name
print(data_df.loc[1900], data_df['hare'], sep='\n')

In [None]:
data_df.hare.rolling(window=3).mean()

In [None]:
# Raw hare counts first, then their 2-sample rolling mean
data_df['hare'].plot()
data_df['hare'].rolling(window=2).mean().plot()

In [None]:
data_df.describe()

In [None]:
import seaborn as sns
# Pairwise scatter plots with a fitted regression line (kind='reg') for the
# three population columns.
# A pandas-only alternative without the regression fit:
#   pd.plotting.scatter_matrix(data_df[['hare', 'lynx', 'carrot']], figsize=(10, 10))
ax = sns.pairplot(
    data_df,
    kind='reg',
    vars=['hare', 'lynx', 'carrot'],
)


In [None]:
g = sns.pairplot(data_df, diag_kind="kde")
g.map_lower(sns.kdeplot, levels=3, color=".2")

In [None]:
import pandas as pd
# Tab-separated table fetched over the network (requires internet access)
olympics_df = pd.read_csv("https://raw.githubusercontent.com/mojones/binders/master/olympics.csv", sep="\t")
print(olympics_df)
# Count the rows per country once and keep the 30 most frequent. The same
# expression used to be evaluated twice, with the first result thrown away.
top_countries = olympics_df['Country'].value_counts().head(30)
top_countries.plot(kind='barh', figsize=(20,10))

In [None]:
from sklearn.linear_model import LinearRegression

def math_func(x):
    """Return the line 0.5 * x + 1.0 with fresh Gaussian noise added.

    The noise is drawn from NumPy's global random state on every call,
    has the same shape as ``x`` and is scaled by 3, so two calls with the
    same ``x`` give different results unless the state is reseeded.
    """
    noise = 3*np.random.normal(size=x.shape)
    return 0.5 * x + 1.0 + noise

x_max = 100

# 20 training inputs as a (20, 1) column, scaled from [0, 1) up to [0, x_max)
x = x_max * np.random.random((20, 1))

# noisy samples of y = a*x + b
y = math_func(x)

# build the linear regression model and fit it in one step (fit returns the model)
model = LinearRegression().fit(x, y)

# evaluate on an evenly spaced grid; math_func draws new noise here, so
# y_true is a noisy reference rather than the exact line
x_new = np.linspace(0, x_max, num=50)
y_true = math_func(x_new)
# predict expects a 2-D input, so the grid is turned into a single column
y_pred = model.predict(x_new.reshape(-1, 1))


# Figure 1: blue stars are the samples the model was fitted on, red crosses
# the noisy reference values at x_new, black line with dots the prediction.
fig = plt.figure(figsize=(12,12))
plt.plot(x,y, '*', c='b')
plt.plot(x_new,y_true, 'x', c='r')
plt.plot(x_new,y_pred, '-o', c='k')
# plt.xlim(0, 100)
plt.grid(True)
plt.show()


# Figure 2: reference against prediction; the closer the points lie to the
# diagonal, the better the two agree.
fig = plt.figure(figsize=(12,12))
ax = fig.add_subplot(111)

ax.scatter(y_true, y_pred)
# The pyplot calls below act on the current axes, i.e. the one just created
plt.xlabel('reference')
plt.ylabel('prediction')
plt.grid(True)
plt.show()


In [None]:
from sklearn.metrics import mean_squared_error, max_error
# Mean squared error and the single largest absolute deviation between the
# reference values and the model's predictions
mse = mean_squared_error(y_true=y_true, y_pred=y_pred)
max_e = max_error(y_true=y_true, y_pred=y_pred)
print(mse, max_e)


In [None]:
# Make y_true a column vector, laid out as (n_samples, n_outputs).
# Subtracting a 1-D array from a column vector would broadcast to a 2-D
# matrix instead of pairing the elements one-to-one.
y_true = y_true.reshape(-1, 1)
print(y_true.shape)
# y_true is already a column here, so it can be used directly
np_mse = np.mean((y_true - y_pred) ** 2)
print(f"Mean square error: {np_mse}")

In [None]:
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import PolynomialFeatures
# Degree-7 polynomial feature expansion feeding a linear regression, fitted
# on the same (x, y) samples; fit returns the fitted pipeline itself
model = make_pipeline(PolynomialFeatures(degree=7), LinearRegression()).fit(x, y)

# evaluate on an evenly spaced grid; math_func draws new noise for y_true
x_new = np.linspace(0, x_max, num=50)
y_true = math_func(x_new)
# predict expects a 2-D input, so the grid is turned into a single column
y_pred = model.predict(x_new.reshape(-1, 1))


# Figure 1: blue stars are the samples the model was fitted on, red crosses
# the noisy reference values at x_new, black line with dots the prediction.
fig = plt.figure(figsize=(12,12))
plt.plot(x,y, '*', c='b')
plt.plot(x_new,y_true, 'x', c='r')
plt.plot(x_new,y_pred, '-o', c='k')
plt.grid(True)
plt.show()


# Figure 2: reference against prediction; the closer the points lie to the
# diagonal, the better the two agree.
fig = plt.figure(figsize=(12,12))
ax = fig.add_subplot(111)

ax.scatter(y_true, y_pred)
# The pyplot calls below act on the current axes, i.e. the one just created
plt.xlabel('reference')
plt.ylabel('prediction')
plt.grid(True)
plt.show()

In [None]:
# Mean squared error and the single largest absolute deviation between the
# reference values and the model's predictions
mse = mean_squared_error(y_true=y_true, y_pred=y_pred)
max_e = max_error(y_true=y_true, y_pred=y_pred)
print(mse, max_e)


In [None]:
# Clustering example: three Gaussian blobs, one around each centre
centers = [[1, 1], [-1, -1], [1, -1]]
n_clusters = len(centers)
n_points = 100

# n_points two-dimensional samples per centre, spread with standard deviation 0.2
data_list = [center_ + 0.2*np.random.randn(n_points, 2) for center_ in centers]
# matching ground-truth labels: the centre's position in `centers`, repeated
label_list = [[idx_] * n_points for idx_ in range(n_clusters)]

# Stack the per-cluster pieces into one (n_clusters * n_points, 2) array and
# one flat label vector. Note that this rebinds `data_np`, which earlier
# cells used for the population table.
data_np = np.concatenate(data_list)
labels_np = np.concatenate(label_list)

# One colour per cluster, looked up by the cluster's integer label
color_list = ['k', 'g', 'b']
fig = plt.figure(figsize=(8, 8))
ax = fig.add_subplot(111)

for c_ in range(n_clusters):
    # Boolean mask picking the rows whose label equals the current cluster
    c_filter = (labels_np == c_)
    # Column 0 goes on the x axis and column 1 on the y axis
    ax.scatter(data_np[c_filter, 0], data_np[c_filter, 1], 
               color=color_list[c_], marker='*')
    
plt.show()

In [None]:
import time

from sklearn.cluster import KMeans

# Three clusters, with 10 differently initialised runs (n_init)
k_means = KMeans(n_clusters=3, n_init=10)
# perf_counter is a monotonic, high-resolution clock intended for measuring
# intervals; time.time() follows the wall clock and can jump if the system
# time is adjusted while the fit is running.
t0 = time.perf_counter()
k_means.fit(data_np)
# Elapsed fitting time in seconds
t_batch = time.perf_counter() - t0

In [None]:
# Cluster index assigned to every sample, then the fitted cluster centres
print(k_means.labels_, k_means.cluster_centers_, sep='\n')
