# Data Types of MLlib

In [1]:
from __future__ import print_function, division
from pyspark import SparkConf, SparkContext
from pyspark.sql import SparkSession

In [2]:
# Start Spark (not needed if a session is already running)
spark = SparkSession.builder.master("local[2]").appName("test").getOrCreate()
sc = spark.sparkContext

## import Library

In [3]:
import numpy as np
from pyspark.mllib.linalg import Vectors

## Dense vector

In [4]:
# Build a dense vector from a plain Python list
x = [1, 2, 3, 4, 5]
dense_x = Vectors.dense(x)
print("dense_x = {!s}".format(dense_x))

dense_x = [1.0,2.0,3.0,4.0,5.0]


In [5]:
type(dense_x)

pyspark.mllib.linalg.DenseVector

## Sparse vector

In [None]:
# The dense vector [1, 0, 0, 0, 3] is written sparsely as:
#   size 5, {index: value} = {0: 1, 4: 3}
Vectors.sparse(5, {0: 1, 4: 3})

In [9]:
# Three ways to create a sparse vector of size 5

# 1) a {index: value} dictionary
sparse_x = Vectors.sparse(5, {1: 1.0, 3: 5.5})
print("sparse_x = {!s}".format(sparse_x))

# 2) a list of (index, value) pairs
sparse_y = Vectors.sparse(5, [(1, 1.0), (3, 5.5)])
print("sparse_y = {!s}".format(sparse_y))

# 3) a list of indices and a parallel list of values
sparse_z = Vectors.sparse(5, [1, 3], [1.0, 5.5])
print("sparse_z = {!s}".format(sparse_z))

sparse_x = (5,[1,3],[1.0,5.5])
sparse_y = (5,[1,3],[1.0,5.5])
sparse_z = (5,[1,3],[1.0,5.5])


## 确认 Sparse vector

In [13]:
sparse_x.toArray()

array([ 0. ,  1. ,  0. ,  5.5,  0. ])

In [14]:
def print_sparse(x):
    """Print every position of vector ``x`` on its own line.

    Positions ``0 .. x.size - 1`` are read with ``x[i]``. When indexing a
    position raises ``IndexError``, ``0.0`` is printed for it instead.
    """
    def element_or_zero(position):
        # Per the original note: a sparse vector whose trailing positions hold
        # no stored value may raise IndexError there because they are omitted.
        try:
            return x[position]
        except IndexError:
            return 0.0

    for position in range(x.size):
        print(element_or_zero(position))

        
print_sparse(sparse_x)

0.0
1.0
0.0
5.5
0.0


In [17]:
sparse_x[2]

0.0

## Vector 方法

In [None]:
# Dot Product
# https://github.com/apache/spark/blob/12206058e8780e202c208b92774df3773eff36ae/python/pyspark/mllib/linalg/__init__.py

In [None]:
# Source lists: a and b have two elements each, c has three
a = [1, 2]
b = [3, 4]
c = [1, 1, 1]

# Wrap each list in a dense vector
dense_a, dense_b, dense_c = Vectors.dense(a), Vectors.dense(b), Vectors.dense(c)

In [None]:
# Dot product of [1, 2] and [3, 4]: 1 * 3 + 2 * 4 = 11
dense_a.dot(dense_b)

In [None]:
# dense_a has 2 elements and dense_c has 3, so this dot product demonstrates a
# dimension mismatch. The error is caught and shown so that Run All continues.
try:
    print(dense_a.dot(dense_c))
except (AssertionError, ValueError) as err:
    print("dot failed: " + str(err))

In [None]:
# Sum of squares: a vector dotted with itself
dense_a.dot(dense_a)

In [None]:
# Dense and sparse vectors can be combined in one operation (their dimensions must match)
dense_x.dot(sparse_x)

In [None]:
# Compute the squared distance between a dense and a sparse vector
dense_x.squared_distance(sparse_y)

## DenseVector = numpy.ndarray

In [None]:
type(dense_x)

In [None]:
# A DenseVector is a local object, not an RDD, so calling `reduce` on it is
# expected to fail (it must be parallelized first). The call is kept as a
# demonstration; the error is caught and shown so that Run All continues.
try:
    print(dense_x.reduce(lambda x, y : x + y))
except AttributeError as err:
    print("AttributeError: " + str(err))

In [None]:
# Turn the vector into an RDD through the Spark context first, then reduce it
sc.parallelize(dense_x).reduce(lambda x, y : x + y)

In [None]:
# A sparse vector also has to be turned into an RDD through the Spark context
sc.parallelize(sparse_x).reduce(lambda x, y : x + y)

In [None]:
sc.parallelize(dense_x).sum()

In [None]:
sc.parallelize(sparse_x).sum()

## 将vector 以 Row为单位叠成 data set

In [None]:
data = [sparse_x, sparse_y, sparse_z]

In [None]:
data

In [None]:
# Statistics can be computed on the dataset: parallelize the list of vectors
# into an RDD and take the mean from the column statistics
from pyspark.mllib.stat import Statistics
Statistics.colStats(sc.parallelize(data)).mean()

## Label Point

In [18]:
# Note: LabeledPoint and Vectors are imported from different modules
from pyspark.mllib.linalg import Vectors
from pyspark.mllib.regression import LabeledPoint

### LabeledPoint(label, [feature1, feature2, feature3])

In [19]:
# Each LabeledPoint takes the label first, then the list of feature values
data_label = [
     LabeledPoint(0.0, [0.0,1.0,1.0]),
     LabeledPoint(1.0, [1.0,1.0,2.0]),
     LabeledPoint(1.0, [2.0,3.0,2.0]),
     LabeledPoint(0.0, [3.0,2.0,5.0])
    ]

In [20]:
data_label

[LabeledPoint(0.0, [0.0,1.0,1.0]),
 LabeledPoint(1.0, [1.0,1.0,2.0]),
 LabeledPoint(1.0, [2.0,3.0,2.0]),
 LabeledPoint(0.0, [3.0,2.0,5.0])]

# Read Data From CSV File

In [21]:
from pyspark.mllib.linalg import Vectors
from pyspark.mllib.stat import Statistics


In [23]:
# header=True takes column names from the first line. inferSchema=True makes
# Spark parse the columns as numeric types; without it every CSV column is read
# as a string and later numeric steps depend on implicit string-to-float parsing.
data = spark.read.csv("../data/ratings.csv", header=True, inferSchema=True)

In [24]:
data.show()

+------+-------+------+----------+
|userId|movieId|rating| timestamp|
+------+-------+------+----------+
|     1|      2|   3.5|1112486027|
|     1|     29|   3.5|1112484676|
|     1|     32|   3.5|1112484819|
|     1|     47|   3.5|1112484727|
|     1|     50|   3.5|1112484580|
|     1|    112|   3.5|1094785740|
|     1|    151|   4.0|1094785734|
|     1|    223|   4.0|1112485573|
|     1|    253|   4.0|1112484940|
|     1|    260|   4.0|1112484826|
|     1|    293|   4.0|1112484703|
|     1|    296|   4.0|1112484767|
|     1|    318|   4.0|1112484798|
|     1|    337|   3.5|1094785709|
|     1|    367|   3.5|1112485980|
|     1|    541|   4.0|1112484603|
|     1|    589|   3.5|1112485557|
|     1|    593|   3.5|1112484661|
|     1|    653|   3.0|1094785691|
|     1|    919|   3.5|1094785621|
+------+-------+------+----------+
only showing top 20 rows



In [25]:
# Positional arguments: withReplacement=False, fraction=0.001, seed=1
sample_data = data.sample(False, 0.001, 1)

In [27]:
sample_data.count()

19976

In [29]:
type(sample_data)

pyspark.sql.dataframe.DataFrame

In [None]:
# `rdd.map` requires a function argument, so preview the underlying RDD of
# Row objects here before mapping it.
sample_data.rdd.take(5)

In [None]:
# Convert every Row of the sampled DataFrame into a dense vector
dense_data = sample_data.rdd.map(
    lambda row: Vectors.dense(row)
)

In [None]:
dense_data.take(5)

In [None]:
Statistics.colStats(dense_data).mean()