In [1]:
import pyspark
# Local mode with one worker thread per CPU core ('local[*]').
# getOrCreate() returns the already-running context when this cell is executed
# again, instead of failing because a SparkContext already exists in the kernel.
sc = pyspark.SparkContext.getOrCreate(
    pyspark.SparkConf().setMaster('local[*]'))

## 只读广播变量

In [4]:
# Ordinary driver-side variable: maps each single-letter key to a one-hot tuple.
ohe = {
    'M': (1, 0, 0),
    'F': (0, 1, 0),
    'U': (0, 0, 1),
}

# Broadcast variable wrapping the table; tasks read it through `.value`.
broadcast_map = sc.broadcast(ohe)

# Small sample RDD of keys to be encoded.
rdd2 = sc.parallelize(['M', 'F', 'U', 'F', 'M', 'U'])


def ohe_map_func(x, shared_map):
    """Return the entry stored under key ``x`` in ``shared_map``.

    This is a plain subscript lookup, so whatever ``shared_map[x]`` raises
    for an unknown key (``KeyError`` for a dict) propagates to the caller.
    """
    encoded = shared_map[x]
    return encoded


# Encode every record; the `.value` attribute is how a task reads the
# broadcast variable's contents. Being the last expression of the cell, the
# collected list is what the notebook displays.
rdd2.map(lambda key: ohe_map_func(key, broadcast_map.value)).collect()

[(1, 0, 0), (0, 1, 0), (0, 0, 1), (0, 1, 0), (1, 0, 0), (0, 0, 1)]

In [5]:
# The lookup table is no longer needed after the cell above.
# NOTE(review): per the PySpark docs, unpersist() drops the cached copies of
# the broadcast on the executors — confirm for the Spark version in use.
broadcast_map.unpersist()

In [7]:
# Accumulator with initial value 0; split_line() below adds 1 to it for every
# empty line it sees.
accum = sc.accumulator(0)


def split_line(line):
    """Map one input line to the constant 1, tallying empty lines on the side.

    Side effect: when ``len(line)`` is 0, adds 1 to the module-level
    accumulator ``accum``.

    Returns:
        Always 1, i.e. one marker per input line.
    """
    is_empty = len(line) == 0
    if is_empty:
        accum.add(1)  # empty line: bump the shared counter
    return 1  # every line contributes exactly one


# NOTE(review): machine-specific absolute path; adjust before running elsewhere.
SHAKESPEARE_PATH = 'file:///D:/Projects/python_projects/big_data/shakespeare_all.txt'

rdd3 = sc.textFile(SHAKESPEARE_PATH)

# split_line emits 1 for every line, so counting the mapped RDD gives the
# total number of lines; `accum` is read only after this job, in a later cell.
tot_lines = rdd3.map(split_line).count()

In [8]:
# Read the accumulated total (number of empty lines seen by split_line) after
# the count() job in the previous cell has run.
empty_lines = accum.value

In [9]:
# Total number of lines counted in the text file.
print(tot_lines)

169442


In [10]:
# Number of lines for which split_line() found a length of 0.
print(empty_lines)

28854


In [22]:
from sklearn.datasets import load_iris
# Broadcast the object returned by load_iris(); apply_classifier() reads its
# `.data` and `.target` attributes through `.value`.
bcast_dataset = sc.broadcast(load_iris())

In [14]:
from pyspark import AccumulatorParam


class ErrorAccumulator(AccumulatorParam):
    """Accumulator parameter that gathers records into a single flat list.

    Each operand may be a single record (for example a tuple) or a list of
    records. The two operands are combined without regard to which side is
    which kind, so the order of the collected records is not meaningful.
    """

    def zero(self, initialList):
        """Return the identity element for list concatenation.

        Args:
            initialList: The accumulator's initial value (unused).

        Returns:
            A fresh empty list. The zero value must be neutral under
            ``addInPlace``; returning ``initialList`` itself would cause a
            non-empty initial value to be merged in again with every partial
            result, duplicating its records.
        """
        return []

    def addInPlace(self, v1, v2):
        """Combine two partial results into one new list.

        Args:
            v1: A list of records, or a single record.
            v2: A list of records, or a single record.

        Returns:
            A new list holding the records of ``v1`` followed by those of
            ``v2``; a non-list operand is treated as a one-element list.
        """
        left = v1 if isinstance(v1, list) else [v1]
        right = v2 if isinstance(v2, list) else [v2]
        return left + right


# Accumulator that starts as an empty list and is merged with ErrorAccumulator;
# apply_classifier() adds one (name, message) tuple per failure.
errAccum = sc.accumulator([], ErrorAccumulator())

In [28]:
def apply_classifier(clf, dataset):
    """Fit ``clf`` on a broadcast dataset and score it on that same data.

    Args:
        clf: Estimator object; ``fit(X, y)`` and ``predict(X)`` are called
            on it.
        dataset: Object whose ``.value`` exposes ``.data`` and ``.target``
            (here, a broadcast variable).

    Returns:
        ``[(class_name, accuracy)]`` on success, or ``[]`` when anything in
        the import/fit/predict/score sequence raises. In the failure case a
        ``(class_name, str(exception))`` tuple is added to the module-level
        accumulator ``errAccum``. Returning a list lets callers use this
        function with ``flatMap`` so failures simply vanish from the output.
    """
    name = clf.__class__.__name__
    payload = dataset.value
    features = payload.data
    labels = payload.target
    try:
        from sklearn.metrics import accuracy_score
        clf.fit(features, labels)
        predicted = clf.predict(features)
        score = accuracy_score(labels, predicted)
    except Exception as err:
        # Record the failure instead of aborting the whole job.
        errAccum.add((name, str(err)))
        return []
    return [(name, score)]

In [29]:
from sklearn.linear_model import SGDClassifier
from sklearn.dummy import DummyClassifier
from sklearn.decomposition import PCA
from sklearn.manifold import MDS

# Estimators to evaluate in parallel. PCA and MDS are included alongside the
# two classifiers; apply_classifier() reports any estimator that fails through
# the errAccum accumulator instead of raising.
classifiers = [
    DummyClassifier(strategy='most_frequent'),
    SGDClassifier(random_state=0),  # fixed seed so the printed accuracy is reproducible
    PCA(),
    MDS(),
]

# One task per estimator; flatMap drops the empty lists returned on failure.
print(sc.parallelize(classifiers).flatMap(lambda clf: apply_classifier(clf, bcast_dataset)).collect())

[('DummyClassifier', 0.3333333333333333), ('SGDClassifier', 0.92)]


In [30]:
# Show the (name, message) tuples collected by apply_classifier() failures.
print('The errors are:', errAccum.value)

The errors are: [('MDS', "'MDS' object has no attribute 'predict'"), ('PCA', "'PCA' object has no attribute 'predict'")]


In [27]:
from sklearn.linear_model import LogisticRegression
# Scratch check: the expression apply_classifier() uses to obtain an
# estimator's class name yields a plain string.
lr=LogisticRegression()
lr.__class__.__name__

'LogisticRegression'