# Python for mathematics, science and engineering
https://scipy.org/

## Scipy
(pronounced "Sigh Pie")

Higher level algorithms on top of `numpy`

* numerical integration
* optimization
* interpolation
* signal processing
* linear algebra
  * with sparse matrices
* statistics

In [None]:
import numpy, scipy
import scipy.linalg
import scipy.sparse
import scipy.sparse.linalg

%matplotlib inline
import matplotlib.pyplot

### Iris

In [None]:
from sklearn import datasets

# Load the Iris data set shipped with scikit-learn and show its metadata
# followed by the raw feature matrix.
iris = datasets.load_iris()
print('Target names:', iris.target_names)
print('Features:', iris.feature_names)
print(iris.data)

# Split the samples into one array per class label (0, 1 and 2).
first, second, third = (iris.data[iris.target == label] for label in (0, 1, 2))
print(len(first), len(second), len(third))

In [None]:
# Per-class mean of every feature column (axis=0 averages over the samples).
for caption, samples in (
    ("first average:", first),
    ("second average:", second),
    ("third average:", third),
):
    print(caption, samples.mean(axis=0))

In [None]:
# The notebook imports `scipy.linalg` and `scipy.sparse` explicitly but not
# `scipy.stats`; older SciPy releases do not load subpackages on a bare
# `import scipy`, so import it here instead of relying on another package
# having pulled it in.
import scipy.stats

# `pearsonr` returns the correlation coefficient first; `[0]` keeps only that.
# Columns 0/1 and 2/3 of the data are compared over the whole data set ...
print("sepal width and length: ", scipy.stats.pearsonr(iris.data[:, 0], iris.data[:, 1])[0])
print("petal width and length: ", scipy.stats.pearsonr(iris.data[:, 2], iris.data[:, 3])[0])
print("")
# ... and columns 0/1 again, separately within each of the three classes.
print("sepal width and length for first class: ", scipy.stats.pearsonr(first[:, 0], first[:, 1])[0])
print("sepal width and length for second class: ", scipy.stats.pearsonr(second[:, 0], second[:, 1])[0])
print("sepal width and length for third class: ", scipy.stats.pearsonr(third[:, 0], third[:, 1])[0])

# Dense linear system A x = b.
# A is 8x8 with +0.5 on the first superdiagonal and -0.5 on the first
# subdiagonal; b is a vector of ones with one entry per row of A.
A = 0.5*(numpy.diag(numpy.ones(7), k=1) - numpy.diag(numpy.ones(7), k=-1))
b = numpy.ones(len(A))

# Show the augmented matrix [A|b]; b is reshaped into a column to append it.
print('[A|b]:\n{}'.format(numpy.concatenate((A, b.reshape(-1,1)), axis=1)))

x = scipy.linalg.solve(A, b)
# Display the solution, as the sparse example does with `xs`.
print(x)

### Sparse linalg

In [None]:
# Sparse version of the same kind of system: -0.5 on the first subdiagonal
# and +0.5 on the first superdiagonal, with a right-hand side of ones.
half = 0.5 * numpy.ones(7)
As = scipy.sparse.diags([-half, half], offsets=[-1, 1])
bs = numpy.ones(8)

# Densify only for display: print the augmented matrix [As|bs].
augmented = numpy.hstack((As.toarray(), bs[:, None]))
print('[As|bs]:\n{}'.format(augmented))

# Convert to CSR before handing the matrix to the sparse solver.
xs = scipy.sparse.linalg.spsolve(As.tocsr(), bs)
print(xs)

## Document-term matrix decomposition

__Download [the file](http://sandbox.hlt.bme.hu/~gaebor/ea_anyag/python_nlp/movies.txt) and put it in the same folder as your notebook!__

### Task 1.

In [None]:
# Parse movies.txt, read as bytes: each line is "<title>\t<description>".
#   movie_descriptions: title -> list of space-separated description tokens
#   vocab:              token -> integer id, numbered 0, 1, 2, ... in order of
#                       first appearance in the file
movie_descriptions = {}
vocab = {}
with open("movies.txt", "rb") as f:
    for line in f:
        line = line.strip()
        if not line:
            # Tolerate blank lines instead of failing on the unpacking below.
            continue
        # Split on the first tab only, so a tab inside the description does
        # not break the two-way unpacking.
        title, description = line.split(b'\t', 1)
        movie_descriptions[title] = description.split(b' ')
        # Walk the tokens in reading order rather than through a set: set
        # iteration order for bytes depends on per-process hash randomization,
        # which would hand out different word ids on every run.
        for word in movie_descriptions[title]:
            # len(vocab) is evaluated before insertion, so a new word gets the
            # next free id and a known word keeps the id it already has.
            vocab.setdefault(word, len(vocab))

In [None]:
# Number of distinct tokens collected in the vocabulary.
vocab_size = len(vocab)
print(vocab_size)

# Re-assemble the tokenised description of one movie into a single line.
matrix_tokens = movie_descriptions[b"The Matrix"]
print(b" ".join(matrix_tokens))

In [None]:
# Number the movies in dictionary order and build the reverse lookups
# (id -> movie title, id -> word) by pairing each mapping's values with its keys.
movie_to_id = dict(zip(movie_descriptions, range(len(movie_descriptions))))
id_to_movie = dict(zip(movie_to_id.values(), movie_to_id.keys()))
id_to_word = dict(zip(vocab.values(), vocab.keys()))

# Spot-check the lookup tables in both directions.
last_id = len(movie_to_id) - 1
print("The Matrix:", movie_to_id[b"The Matrix"])
print("0th movie:", id_to_movie[0])
print(last_id, "th movie:", id_to_movie[last_id])
print("word id of dog:", vocab[b"dog"])
print("0th word:", id_to_word[0])

### Task 2.

In [None]:
from collections import Counter

# Coordinate-format triplets for the document-term matrix:
#   i[n] = movie (row) id, j[n] = word (column) id,
#   k[n] = number of times that word occurs in that movie's description.
i, j, k = [], [], []

for title, description in movie_descriptions.items():
    counts = Counter(description)
    # One entry per distinct word of this movie; all of them share the row id.
    i.extend([movie_to_id[title]] * len(counts))
    j.extend(vocab[token] for token in counts)
    k.extend(counts.values())

# The matrix shape is inferred from the largest row and column index.
Matrix = scipy.sparse.csc_matrix((k, (i, j)), dtype="float32")
print(Matrix.shape)

### Task 3.

In [None]:
# Truncated SVD of the document-term matrix, keeping k=40 singular triplets.
U, d, Vh = scipy.sparse.linalg.svds(Matrix, k=40)

# Rescale in place: every row of U and every column of Vh is divided by its
# Euclidean length.
U /= numpy.sqrt(numpy.square(U).sum(axis=1, keepdims=True))
Vh /= numpy.sqrt(numpy.square(Vh).sum(axis=0, keepdims=True))

print(U.shape)
print(Vh.shape)

### Task 4.

In [None]:
def closests(v, k=1, vectors=None):
    """Return the row indices of the ``k`` vectors nearest to ``v``.

    Nearness is measured by squared Euclidean distance.

    Args:
        v: 1-D query vector with one entry per column of ``vectors``.
        k: Number of neighbours wanted. Values larger than the number of
            rows are clamped to it; values <= 0 give an empty result.
        vectors: 2-D array whose rows are the candidates. Defaults to the
            notebook's global ``U``.

    Returns:
        1-D integer array of row indices, ordered from nearest to farthest.
    """
    if vectors is None:
        vectors = U  # default candidates: the normalised left singular vectors
    vectors = numpy.asarray(vectors)
    v = numpy.asarray(v)

    distances = ((vectors - v[None, :]) ** 2).sum(axis=1)

    k = min(k, len(distances))
    if k <= 0:
        return numpy.empty(0, dtype=numpy.intp)

    # kth = k - 1 places the k smallest distances in the first k slots and,
    # unlike kth = k, stays in bounds when every row is requested.
    nearest = numpy.argpartition(distances, k - 1)[:k]
    # argpartition leaves those k entries unordered; sort them by distance.
    return nearest[numpy.argsort(distances[nearest], kind="stable")]

In [None]:
# Probe the search with an all-ones query that has one entry per row of Vh.
probe = numpy.ones(Vh.shape[0])
closests(probe, 3)

Now you can search for similar movies!

In [None]:
# For each query title, look up its row in U and list the titles of the
# 5 rows that `closests` returns for it.
for query_title in (b"Monsters, Inc.", b"Popeye"):
    neighbours = closests(U[movie_to_id[query_title]], 5)
    print([id_to_movie[i] for i in neighbours])

Or even search for a mixture of movies by adding their _"movie vectors"_!

In [None]:
# Add the two movies' rows of U and search with the sum as the query vector.
mixture = U[movie_to_id[b"Popeye"]] + U[movie_to_id[b"Monsters, Inc."]]
[id_to_movie[i] for i in closests(mixture, 10)]