In [4]:
from pyspark.sql import SparkSession
import pandas as pd
from pyspark.ml import Pipeline
from pyspark.ml.feature import StringIndexer, OneHotEncoder, VectorAssembler

In [5]:
spark = SparkSession\
     .builder\
     .appName("RandomForest")\
     .master("local[*]") \
     .enableHiveSupport()\
     .getOrCreate()
data=spark.sql("select * from  ml.adult")
#OneHotEncoder不能处理空字符串。所以我们需要将数据集中的空字符串提前处理一下
df=data.na.replace('','NA')
cols = df.columns #和pandas一样看列名
df.printSchema()

root
 |-- age: string (nullable = true)
 |-- workclass: string (nullable = true)
 |-- fnlwgt: string (nullable = true)
 |-- education: string (nullable = true)
 |-- education_num: string (nullable = true)
 |-- marital_status: string (nullable = true)
 |-- occupation: string (nullable = true)
 |-- relationship: string (nullable = true)
 |-- race: string (nullable = true)
 |-- sex: string (nullable = true)
 |-- capital_gain: string (nullable = true)
 |-- capital_loss: string (nullable = true)
 |-- hours_per_week: string (nullable = true)
 |-- native_country: string (nullable = true)
 |-- salary: string (nullable = true)



In [6]:
#找到所有的string类型的变量
#dtypes用来看数据变量类型
cat_features = [item[0] for item in df.dtypes if item[1]=='string']
# 需要删除 salary列，否则标签泄露
cat_features.remove('salary')
#找到所有数字变量
num_features = [item[0] for item in df.dtypes if item[1]!='string']

In [7]:
stages = []
for col in cat_features:
    # 字符串转成索引
    string_index = StringIndexer(inputCol = col, outputCol = col + 'Index')
    # 转换为OneHot编码
    encoder = OneHotEncoder(inputCol=string_index.getOutputCol(), outputCol=col + "_one_hot")
    # 将每个字段的转换方式 放到stages中
    stages += [string_index, encoder]

In [8]:
# 将salary转换为索引
label_string_index = StringIndexer(inputCol = 'salary', outputCol = 'label')
# 添加到stages中
stages += [label_string_index]

In [9]:
# 类别变量 + 数值变量
assembler_cols = [c + "_one_hot" for c in cat_features] + num_features
assembler = VectorAssembler(inputCols=assembler_cols, outputCol="features")
stages += [assembler]


# 使用pipeline完成数据处理
pipeline = Pipeline(stages=stages)
pipeline_model = pipeline.fit(df)
df = pipeline_model.transform(df)
selected_cols = ["label", "features"] + cols
df = df.select(selected_cols)

In [10]:
pd.DataFrame(df.take(2), columns = df.columns)

Unnamed: 0,label,features,age,workclass,fnlwgt,education,education_num,marital_status,occupation,relationship,race,sex,capital_gain,capital_loss,hours_per_week,native_country,salary
0,1.0,"(0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...",49,Private,101320,Assoc-acdm,12.0,Married-civ-spouse,,Wife,White,Female,0,1902,40,United-States,>=50k
1,1.0,"(0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...",44,Private,236746,Masters,14.0,Divorced,Exec-managerial,Not-in-family,White,Male,10520,0,45,United-States,>=50k


In [11]:
train, test = df.randomSplit([0.7, 0.3], seed=2021)
print(train.count())
print(test.count())

22777
9784


In [12]:
from pyspark.ml.classification import RandomForestClassifier
# 随机森林
rf = RandomForestClassifier(featuresCol = 'features', labelCol = 'label')
rf_model = rf.fit(train)
predictions = rf_model.transform(test)
predictions.printSchema()
selected = predictions.select("label", "prediction", "probability", "age", "occupation")
display(selected)

root
 |-- label: double (nullable = false)
 |-- features: vector (nullable = true)
 |-- age: string (nullable = true)
 |-- workclass: string (nullable = true)
 |-- fnlwgt: string (nullable = true)
 |-- education: string (nullable = true)
 |-- education_num: string (nullable = true)
 |-- marital_status: string (nullable = true)
 |-- occupation: string (nullable = true)
 |-- relationship: string (nullable = true)
 |-- race: string (nullable = true)
 |-- sex: string (nullable = true)
 |-- capital_gain: string (nullable = true)
 |-- capital_loss: string (nullable = true)
 |-- hours_per_week: string (nullable = true)
 |-- native_country: string (nullable = true)
 |-- salary: string (nullable = true)
 |-- rawPrediction: vector (nullable = true)
 |-- probability: vector (nullable = true)
 |-- prediction: double (nullable = false)



DataFrame[label: double, prediction: double, probability: vector, age: string, occupation: string]

In [13]:
from pyspark.ml.evaluation import BinaryClassificationEvaluator

evaluator = BinaryClassificationEvaluator()
evaluator.evaluate(predictions)

0.7116663558561737

In [17]:
fi = rf_model.featureImportances

In [18]:
print(len(fi))

22132


In [67]:
importancesList=[float(col) for col in  fi]

In [70]:
#colList=['tempcabinetnacelle_1sec','blade3tempbattbox_1sec','blade1tempbattbox_1sec']
result=dict(zip(cols,importancesList))
print(result)

{'age': 0.0, 'workclass': 0.0, 'fnlwgt': 0.0, 'education': 0.019343781065179207, 'education_num': 0.0, 'marital_status': 0.0, 'occupation': 0.0, 'relationship': 0.0, 'race': 0.0040582328515082495, 'sex': 0.0, 'capital_gain': 0.0, 'capital_loss': 0.0, 'hours_per_week': 0.0, 'native_country': 0.0, 'salary': 0.0}


In [71]:
data_df = pd.DataFrame([result])

In [61]:
result['age']

0.0

In [62]:
for col in cols:
    print(result[col])

0.0
0.0
0.0
0.019343781065179207
0.0
0.0
0.0
0.0
0.0040582328515082495
0.0
0.0
0.0
0.0
0.0
0.0


In [63]:
dd = pd.DataFrame()

In [64]:
for col in cols:
    #res = {"name":col,"importance":result[col]}
    res = pd.Series([col, result[col]],index=['name', 'importance'])
    dd = dd.append(res, ignore_index=True)

In [65]:
dd.sort_values(by="importance",ascending=False)

Unnamed: 0,importance,name
3,0.019344,education
8,0.004058,race
0,0.0,age
1,0.0,workclass
2,0.0,fnlwgt
4,0.0,education_num
5,0.0,marital_status
6,0.0,occupation
7,0.0,relationship
9,0.0,sex
