In [4]:
# 1 of k 编码
# 特征值为programmer的特征提取
from pyspark import SparkContext
import numpy as np
sc.stop()
sc = SparkContext("local")
user_data = sc.textFile("file:///E:/Code/python_spark/six_cases"
                        "/movie_lens/data/u.user")
user_fields = user_data.map(lambda x:x.split("|"))
all_occupations = user_fields.map(lambda x:x[3]).distinct().collect()

all_occupations.sort()

idx=0
all_occupations_dict = {}
for occupation in all_occupations:
    all_occupations_dict[occupation]=idx
    idx=idx+1
K = len(all_occupations_dict)
binary_k = np.zeros(K)
programmer = all_occupations_dict["programmer"]
binary_k[programmer] = 1
print("程序员的1 of k编码为: %s" % binary_k)

程序员的1 of k编码为: [0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 1. 0. 0. 0. 0. 0. 0.]


In [7]:
# 派生特征提取
rating_data_raw = sc.textFile("file:///E:/Code/python_spark/six_cases"
                        "/movie_lens/data/u.data")
# 获取评分RDD
rating_data = rating_data_raw.map(lambda line: line.split("\t"))
ratings = rating_data.map(lambda fields: int(fields[2]))
def extract_datatime(x):
    import datetime
    return datetime.datetime.fromtimestamp(x)
timestamps = rating_data.map(lambda x:int(x[3]))
hour_of_day = timestamps.map(lambda x:extract_datatime(x).hour)

def assign_tod(hr):
    times_of_day = {
                'morning' : range(7, 12),
                'lunch' : range(12, 14),
                'afternoon' : range(14, 18),
                'evening' : range(18, 23),
                'night' : range(23, 7)
                }
    for k, v in times_of_day.items():
        if hr in v: 
            return k
 
# 获取新的分类变量RDD
time_of_day = hour_of_day.map(lambda hr: assign_tod(hr))
print(time_of_day.take(5))

[None, None, 'afternoon', 'lunch', 'lunch']


In [9]:
# 文本特征提取
movie_data = sc.textFile("file:///E:/Code/python_spark/six_cases"
                        "/movie_lens/data/u.item")
movie_fields = movie_data.map(lambda lines: lines.split("|"))
 
# 函数: 剔除掉标题中的(年份)部分
def extract_title(raw):
    import re
    grps = re.search("\((\w+)\)", raw)
    if grps:
        return raw[:grps.start()].strip()
    else:
        return raw
 
# 获取影片名RDD
raw_titles = movie_fields.map(lambda fields: fields[1])
 
# 剔除影片名中的(年份)
movie_titles = raw_titles.map(lambda m: extract_title(m))
title_terms = movie_titles.map(lambda t: t.split(" "))
 
# 搜集所有的词
all_terms = title_terms.flatMap(lambda x: x).distinct().collect()

idx = 0
all_terms_dict = {}
for term in all_terms:
    all_terms_dict[term] = idx
    idx +=1
num_terms = len(all_terms_dict)
 
# 函数: 采用稀疏向量格式保存编码后的特征并返回
def create_vector(terms, term_dict):
    from scipy import sparse as sp
    x = sp.csc_matrix((1, num_terms))
    for t in terms:
        if t in term_dict:
            idx = term_dict[t]
            x[0, idx] = 1
    return x
 
# 将字典保存为广播数据格式类型。因为各个worker都要用
all_terms_bcast = sc.broadcast(all_terms_dict)
# 采用稀疏矩阵格式保存影片名特征
term_vectors = title_terms.map(lambda terms: create_vector(terms, all_terms_bcast.value))
# 展示提取结果
print(term_vectors.take(5))

[<1x2645 sparse matrix of type '<class 'numpy.float64'>'
	with 2 stored elements in Compressed Sparse Column format>, <1x2645 sparse matrix of type '<class 'numpy.float64'>'
	with 1 stored elements in Compressed Sparse Column format>, <1x2645 sparse matrix of type '<class 'numpy.float64'>'
	with 2 stored elements in Compressed Sparse Column format>, <1x2645 sparse matrix of type '<class 'numpy.float64'>'
	with 2 stored elements in Compressed Sparse Column format>, <1x2645 sparse matrix of type '<class 'numpy.float64'>'
	with 1 stored elements in Compressed Sparse Column format>]


In [11]:
# 归一化特征
# 导入Spark库中的正则化类
from pyspark.mllib.feature import Normalizer
# 初始化正则化对象
normalizer = Normalizer()
np.random.seed(42)
x = np.random.randn(10)
vector = sc.parallelize([x])
normalized_x_mllib = normalizer.transform(vector).first().toArray()
  
# 结果展示
print("向量x:\n%s" % x)
print("被MLlib归一化后的向量x:\n%s" % normalized_x_mllib)
print("被MLlib归一化后的向量x的二阶范数: %2.4f" % 
      np.linalg.norm(normalized_x_mllib))

向量x:
[ 0.49671415 -0.1382643   0.64768854  1.52302986 -0.23415337 -0.23413696
  1.57921282  0.76743473 -0.46947439  0.54256004]
被MLlib归一化后的向量x:
[ 0.19172213 -0.05336737  0.24999534  0.58786029 -0.09037871 -0.09037237
  0.60954584  0.29621508 -0.1812081   0.20941776]
被MLlib归一化后的向量x的二阶范数: 1.0000
