# PYSPARK2...

In [1]:
import pyspark
from pyspark import SparkContext
sc = SparkContext()

21/09/18 05:44:34 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable
Using Spark's default log4j profile: org/apache/spark/log4j-defaults.properties
Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
21/09/18 05:44:35 WARN Utils: Service 'SparkUI' could not bind on port 4040. Attempting port 4041.


In [2]:
# create an RDD object
nums = sc.parallelize([1,2,3,4])
nums

ParallelCollectionRDD[0] at readRDDFromFile at PythonRDD.scala:274

In [3]:
type(nums)

pyspark.rdd.RDD

In [4]:
nums.take(1)

[1]

In [5]:
nums.collect()

[1, 2, 3, 4]

In [6]:
squared = nums.map(lambda x: x*x).collect()

In [7]:
print(nums.collect())
print(squared)

[1, 2, 3, 4]
[1, 4, 9, 16]


### sql context

In [8]:
from pyspark.sql import Row
from pyspark.sql import SQLContext

sqlContext = SQLContext(sc)

In [9]:
list_p = [('John',19),('Smith',29),('Adam',35),('Henry',50)]

In [10]:
rdd = sc.parallelize(list_p)

In [11]:
ppl = rdd.map(lambda x: Row(name=x[0], age=int(x[1])))

In [12]:
df_ppl = sqlContext.createDataFrame(ppl)

In [13]:
df_ppl.show()

+-----+---+
| name|age|
+-----+---+
| John| 19|
|Smith| 29|
| Adam| 35|
|Henry| 50|
+-----+---+



In [14]:
df_ppl.printSchema()

root
 |-- name: string (nullable = true)
 |-- age: long (nullable = true)



### Machine Learning in Pyspark

In [15]:
import pandas as pd

In [16]:
url = "https://raw.githubusercontent.com/sadhana1002/PredictingSalaryClass-Classification/master/adult.csv"
names = names=['Age','workclass','fnlwgt','education','education_num','marital','occupation','relationship','race',
               'sex','capital_gain','capital_loss','hours_week','native_country','label']
df = sqlContext.createDataFrame(pd.read_csv(url, names=names))

In [17]:
df.printSchema()

root
 |-- Age: long (nullable = true)
 |-- workclass: string (nullable = true)
 |-- fnlwgt: long (nullable = true)
 |-- education: string (nullable = true)
 |-- education_num: long (nullable = true)
 |-- marital: string (nullable = true)
 |-- occupation: string (nullable = true)
 |-- relationship: string (nullable = true)
 |-- race: string (nullable = true)
 |-- sex: string (nullable = true)
 |-- capital_gain: long (nullable = true)
 |-- capital_loss: long (nullable = true)
 |-- hours_week: long (nullable = true)
 |-- native_country: string (nullable = true)
 |-- label: string (nullable = true)



In [18]:
df.show(5, truncate = False)

+---+-----------------+------+----------+-------------+-------------------+------------------+--------------+------+-------+------------+------------+----------+--------------+------+
|Age|workclass        |fnlwgt|education |education_num|marital            |occupation        |relationship  |race  |sex    |capital_gain|capital_loss|hours_week|native_country|label |
+---+-----------------+------+----------+-------------+-------------------+------------------+--------------+------+-------+------------+------------+----------+--------------+------+
|39 | State-gov       |77516 | Bachelors|13           | Never-married     | Adm-clerical     | Not-in-family| White| Male  |2174        |0           |40        | United-States| <=50K|
|50 | Self-emp-not-inc|83311 | Bachelors|13           | Married-civ-spouse| Exec-managerial  | Husband      | White| Male  |0           |0           |13        | United-States| <=50K|
|38 | Private         |215646| HS-grad  |9            | Divorced          | Hand

In [19]:
from pyspark.sql.types import *

In [20]:
# Write a custom function to convert the data type of DataFrame columns
def convertColumn(df, names, newType):
    for name in names:
        df = df.withColumn(name, df[name].cast(newType))
    return df

In [21]:
# convert dtypes of columns with continuous features to float dtype
CONTI_FEATURES  = ['age', 'fnlwgt','capital_gain', 'education_num', 'capital_loss', 'hours_week']
df = convertColumn(df, CONTI_FEATURES, FloatType())

In [22]:
df.printSchema()

root
 |-- age: float (nullable = true)
 |-- workclass: string (nullable = true)
 |-- fnlwgt: float (nullable = true)
 |-- education: string (nullable = true)
 |-- education_num: float (nullable = true)
 |-- marital: string (nullable = true)
 |-- occupation: string (nullable = true)
 |-- relationship: string (nullable = true)
 |-- race: string (nullable = true)
 |-- sex: string (nullable = true)
 |-- capital_gain: float (nullable = true)
 |-- capital_loss: float (nullable = true)
 |-- hours_week: float (nullable = true)
 |-- native_country: string (nullable = true)
 |-- label: string (nullable = true)



In [23]:
#select columns
df.select('age','fnlwgt').show(5)

+----+--------+
| age|  fnlwgt|
+----+--------+
|39.0| 77516.0|
|50.0| 83311.0|
|38.0|215646.0|
|53.0|234721.0|
|28.0|338409.0|
+----+--------+
only showing top 5 rows



In [24]:
df.groupBy("education").count().sort("count",ascending=True).show()



+-------------+-----+
|    education|count|
+-------------+-----+
|    Preschool|   51|
|      1st-4th|  168|
|      5th-6th|  333|
|    Doctorate|  413|
|         12th|  433|
|          9th|  514|
|  Prof-school|  576|
|      7th-8th|  646|
|         10th|  933|
|   Assoc-acdm| 1067|
|         11th| 1175|
|    Assoc-voc| 1382|
|      Masters| 1723|
|    Bachelors| 5355|
| Some-college| 7291|
|      HS-grad|10501|
+-------------+-----+



                                                                                

In [25]:
df.describe().show()



+-------+------------------+------------+------------------+-------------+-----------------+---------+-----------------+------------+-------------------+-------+------------------+-----------------+------------------+--------------+------+
|summary|               age|   workclass|            fnlwgt|    education|    education_num|  marital|       occupation|relationship|               race|    sex|      capital_gain|     capital_loss|        hours_week|native_country| label|
+-------+------------------+------------+------------------+-------------+-----------------+---------+-----------------+------------+-------------------+-------+------------------+-----------------+------------------+--------------+------+
|  count|             32561|       32561|             32561|        32561|            32561|    32561|            32561|       32561|              32561|  32561|             32561|            32561|             32561|         32561| 32561|
|   mean| 38.58164675532078|        null

                                                                                

In [26]:
# single column summary statistic
df.describe('capital_gain').show()

+-------+------------------+
|summary|      capital_gain|
+-------+------------------+
|  count|             32561|
|   mean|1077.6488437087312|
| stddev| 7385.292084840329|
|    min|               0.0|
|    max|           99999.0|
+-------+------------------+



In [27]:
# crosstab i.e. group by age
df.crosstab('age', 'label').sort("age_label").show()

+---------+------+-----+
|age_label| <=50K| >50K|
+---------+------+-----+
|     17.0|   395|    0|
|     18.0|   550|    0|
|     19.0|   710|    2|
|     20.0|   753|    0|
|     21.0|   717|    3|
|     22.0|   752|   13|
|     23.0|   865|   12|
|     24.0|   767|   31|
|     25.0|   788|   53|
|     26.0|   722|   63|
|     27.0|   754|   81|
|     28.0|   748|  119|
|     29.0|   679|  134|
|     30.0|   690|  171|
|     31.0|   705|  183|
|     32.0|   639|  189|
|     33.0|   684|  191|
|     34.0|   643|  243|
|     35.0|   659|  217|
|     36.0|   635|  263|
+---------+------+-----+
only showing top 20 rows



In [28]:
df.drop('education_num').columns

['age',
 'workclass',
 'fnlwgt',
 'education',
 'marital',
 'occupation',
 'relationship',
 'race',
 'sex',
 'capital_gain',
 'capital_loss',
 'hours_week',
 'native_country',
 'label']

In [29]:
# show only people older than 40
df.filter(df.age > 40).show(5)

+----+-----------------+--------+----------+-------------+--------------------+------------------+--------------+------+-------+------------+------------+----------+--------------+------+
| age|        workclass|  fnlwgt| education|education_num|             marital|        occupation|  relationship|  race|    sex|capital_gain|capital_loss|hours_week|native_country| label|
+----+-----------------+--------+----------+-------------+--------------------+------------------+--------------+------+-------+------------+------------+----------+--------------+------+
|50.0| Self-emp-not-inc| 83311.0| Bachelors|         13.0|  Married-civ-spouse|   Exec-managerial|       Husband| White|   Male|         0.0|         0.0|      13.0| United-States| <=50K|
|53.0|          Private|234721.0|      11th|          7.0|  Married-civ-spouse| Handlers-cleaners|       Husband| Black|   Male|         0.0|         0.0|      40.0| United-States| <=50K|
|49.0|          Private|160187.0|       9th|          5.0| M

In [30]:
df.groupby('marital').agg({'capital_gain': 'mean'}).show()

+--------------------+------------------+
|             marital| avg(capital_gain)|
+--------------------+------------------+
|             Widowed| 571.0715005035247|
| Married-spouse-a...| 653.9832535885167|
|   Married-AF-spouse| 432.6521739130435|
|  Married-civ-spouse|1764.8595085470085|
|            Divorced| 728.4148098131893|
|       Never-married|376.58831788823363|
|           Separated| 535.5687804878049|
+--------------------+------------------+



### Data Preprocessing

In [31]:
from pyspark.sql.functions import *

In [32]:
# add an age squared feature
age_square = df.select(col("age")**2)
df = df.withColumn("age_square", col("age")**2)

In [33]:
df.printSchema()

root
 |-- age: float (nullable = true)
 |-- workclass: string (nullable = true)
 |-- fnlwgt: float (nullable = true)
 |-- education: string (nullable = true)
 |-- education_num: float (nullable = true)
 |-- marital: string (nullable = true)
 |-- occupation: string (nullable = true)
 |-- relationship: string (nullable = true)
 |-- race: string (nullable = true)
 |-- sex: string (nullable = true)
 |-- capital_gain: float (nullable = true)
 |-- capital_loss: float (nullable = true)
 |-- hours_week: float (nullable = true)
 |-- native_country: string (nullable = true)
 |-- label: string (nullable = true)
 |-- age_square: double (nullable = true)



In [34]:
# rearrange columns
COLUMNS = ['age', 'age_square', 'workclass', 'fnlwgt', 'education', 'education_num', 'marital',
           'occupation', 'relationship', 'race', 'sex', 'capital_gain', 'capital_loss',
           'hours_week', 'native_country', 'label']
df = df.select(COLUMNS)

In [35]:
# Return 1st row of dataframe
df.first()

Row(age=39.0, age_square=1521.0, workclass=' State-gov', fnlwgt=77516.0, education=' Bachelors', education_num=13.0, marital=' Never-married', occupation=' Adm-clerical', relationship=' Not-in-family', race=' White', sex=' Male', capital_gain=2174.0, capital_loss=0.0, hours_week=40.0, native_country=' United-States', label=' <=50K')

In [36]:
df.filter(df.native_country == 'Holand-Netherlands').count()

0

In [37]:
df.groupby('native_country').agg({'native_country': 'count'}).sort(asc("count(native_country)")).show()

+--------------------+---------------------+
|      native_country|count(native_country)|
+--------------------+---------------------+
|  Holand-Netherlands|                    1|
|            Scotland|                   12|
|            Honduras|                   13|
|             Hungary|                   13|
| Outlying-US(Guam...|                   14|
|          Yugoslavia|                   16|
|            Thailand|                   18|
|                Laos|                   18|
|            Cambodia|                   19|
|     Trinadad&Tobago|                   19|
|                Hong|                   20|
|             Ireland|                   24|
|             Ecuador|                   28|
|              Greece|                   29|
|              France|                   29|
|                Peru|                   31|
|           Nicaragua|                   34|
|            Portugal|                   37|
|                Iran|                   43|
|         

In [38]:
# removing the single row
df_remove = df.filter(df.native_country != 'Holand-Netherlands')

### PIPELINING

In [39]:
from pyspark.ml.feature import StringIndexer, OneHotEncoder, VectorAssembler

In [40]:
### Example encoder

stringIndexer = StringIndexer(inputCol="workclass", outputCol="workclass_encoded")
model = stringIndexer.fit(df)
indexed = model.transform(df)
encoder = OneHotEncoder(dropLast=False, inputCol="workclass_encoded", outputCol="workclass_vec").fit(indexed)
encoded = encoder.transform(indexed)
encoded.show(2)

+----+----------+-----------------+-------+----------+-------------+-------------------+----------------+--------------+------+-----+------------+------------+----------+--------------+------+-----------------+-------------+
| age|age_square|        workclass| fnlwgt| education|education_num|            marital|      occupation|  relationship|  race|  sex|capital_gain|capital_loss|hours_week|native_country| label|workclass_encoded|workclass_vec|
+----+----------+-----------------+-------+----------+-------------+-------------------+----------------+--------------+------+-----+------------+------------+----------+--------------+------+-----------------+-------------+
|39.0|    1521.0|        State-gov|77516.0| Bachelors|         13.0|      Never-married|    Adm-clerical| Not-in-family| White| Male|      2174.0|         0.0|      40.0| United-States| <=50K|              4.0|(9,[4],[1.0])|
|50.0|    2500.0| Self-emp-not-inc|83311.0| Bachelors|         13.0| Married-civ-spouse| Exec-manage

In [41]:
from pyspark.ml import Pipeline

In [42]:
# Encode Categorical Data
CATE_FEATURES = ['workclass', 'education', 'marital', 'occupation', 'relationship', 'race', 'sex', 'native_country']
stages = [] # stages in our Pipeline
for categoricalCol in CATE_FEATURES:
    stringIndexer = StringIndexer(inputCol=categoricalCol, outputCol=categoricalCol + "Index")
    encoder = OneHotEncoder(inputCols=[stringIndexer.getOutputCol()],
                                     outputCols=[categoricalCol + "classVec"])
    stages += [stringIndexer, encoder]

In [43]:
# encode the label
label_stringIdx =  StringIndexer(inputCol="label", outputCol="newlabel")
stages += [label_stringIdx]

In [44]:
assemblerInputs = [c + "classVec" for c in CATE_FEATURES] + CONTI_FEATURES

In [45]:
assembler = VectorAssembler(inputCols=assemblerInputs, outputCol="features")
stages += [assembler]

In [46]:
# Create a Pipeline.
pipeline = Pipeline(stages=stages)
pipelineModel = pipeline.fit(df_remove)
model = pipelineModel.transform(df_remove)

In [47]:
model.columns

['age',
 'age_square',
 'workclass',
 'fnlwgt',
 'education',
 'education_num',
 'marital',
 'occupation',
 'relationship',
 'race',
 'sex',
 'capital_gain',
 'capital_loss',
 'hours_week',
 'native_country',
 'label',
 'workclassIndex',
 'workclassclassVec',
 'educationIndex',
 'educationclassVec',
 'maritalIndex',
 'maritalclassVec',
 'occupationIndex',
 'occupationclassVec',
 'relationshipIndex',
 'relationshipclassVec',
 'raceIndex',
 'raceclassVec',
 'sexIndex',
 'sexclassVec',
 'native_countryIndex',
 'native_countryclassVec',
 'newlabel',
 'features']

### Build Classifier

In [48]:
# converting to dense vector for efficiency purposes
from pyspark.ml.linalg import DenseVector
input_data = model.rdd.map(lambda x: (x["newlabel"], DenseVector(x["features"])))

In [49]:
df_train = sqlContext.createDataFrame(input_data, ["label", "features"])
df_train.show(2)

+-----+--------------------+
|label|            features|
+-----+--------------------+
|  0.0|[0.0,0.0,0.0,0.0,...|
|  0.0|[0.0,1.0,0.0,0.0,...|
+-----+--------------------+
only showing top 2 rows



In [50]:
# Split the data into train and test sets
train_data, test_data = df_train.randomSplit([.8,.2],seed=1234)

In [51]:
train_data.groupby('label').agg({'label': 'count'}).show()

                                                                                

+-----+------------+
|label|count(label)|
+-----+------------+
|  0.0|       19841|
|  1.0|        6299|
+-----+------------+



In [52]:
from pyspark.ml.classification import LogisticRegression

# Initialize `lr`
lr = LogisticRegression(labelCol="label",
                        featuresCol="features",
                        maxIter=10,
                        regParam=0.3)

# Fit the data to the model
linearModel = lr.fit(train_data)

21/09/18 05:48:34 WARN BLAS: Failed to load implementation from: com.github.fommil.netlib.NativeSystemBLAS
21/09/18 05:48:34 WARN BLAS: Failed to load implementation from: com.github.fommil.netlib.NativeRefBLAS
                                                                                

In [53]:
# Print the coefficients and intercept for logistic regression
print("Coefficients: " + str(linearModel.coefficients))
print("Intercept: " + str(linearModel.intercept))

Coefficients: [-0.07862676190898268,-0.12685154711836472,-0.08483571074133787,-0.16304986499396437,-0.12789996748064827,0.18152619311181464,0.19162909004707776,-0.5964172032665382,-0.19247328636572683,-0.07182824573351107,0.22563244231629012,0.4011787833698674,-0.006138082546978121,-0.30949425809526904,-0.04622714970574136,-0.31733522096117256,-0.4294498386571168,0.5194375473741066,-0.3959852254251923,-0.22080038821934414,0.6086344876643154,-0.36811683041036564,-0.38154324261483524,0.3194863268496144,-0.3509006679046948,-0.20688122864747688,-0.21282976205403806,-0.1445918417263158,-0.12519218981003666,0.17687174182921886,-0.06356788538844274,0.28935221625403873,-0.1155862629017342,0.036535666295128125,-0.2975997952605835,-0.21377604009863269,-0.1647801951473389,-0.10551411363274672,-0.3006489731441749,-0.3431492575217507,0.1391303158641853,0.19703179899956813,-0.26740432789693186,0.25207959738035196,-0.19086954246072682,-0.30542384712187737,-0.24880393884618424,0.45692826116644714,-0.0

In [54]:
# Make predictions on test data using the transform() method.
predictions = linearModel.transform(test_data)

In [55]:
predictions.printSchema()

root
 |-- label: double (nullable = true)
 |-- features: vector (nullable = true)
 |-- rawPrediction: vector (nullable = true)
 |-- probability: vector (nullable = true)
 |-- prediction: double (nullable = false)



In [56]:
selected = predictions.select("label", "prediction", "probability")
selected.show(20)

[Stage 88:>                                                         (0 + 1) / 1]

+-----+----------+--------------------+
|label|prediction|         probability|
+-----+----------+--------------------+
|  0.0|       0.0|[0.94922375088895...|
|  0.0|       0.0|[0.68160076275416...|
|  0.0|       0.0|[0.87267381703213...|
|  0.0|       0.0|[0.75529205823388...|
|  0.0|       1.0|[0.48940275753711...|
|  0.0|       0.0|[0.56232752328745...|
|  0.0|       0.0|[0.60649324003818...|
|  0.0|       0.0|[0.76644552158144...|
|  0.0|       1.0|[0.46573604594580...|
|  0.0|       1.0|[0.41775445782641...|
|  0.0|       0.0|[0.80996035172040...|
|  0.0|       0.0|[0.79025477926771...|
|  0.0|       0.0|[0.83422389007416...|
|  0.0|       0.0|[0.85567228208777...|
|  0.0|       0.0|[0.82706419236230...|
|  0.0|       0.0|[0.77628740663001...|
|  0.0|       0.0|[0.81226148159560...|
|  0.0|       0.0|[0.81829333045441...|
|  0.0|       0.0|[0.88403691585775...|
|  0.0|       0.0|[0.82974989321008...|
+-----+----------+--------------------+
only showing top 20 rows



                                                                                

### evaluate model

In [57]:
cm = predictions.select("label", "prediction")

In [58]:
cm.groupby('label').agg({'label': 'count'}).show()

                                                                                

+-----+------------+
|label|count(label)|
+-----+------------+
|  0.0|        4879|
|  1.0|        1542|
+-----+------------+



In [59]:
cm.groupby('prediction').agg({'prediction': 'count'}).show()

                                                                                

+----------+-----------------+
|prediction|count(prediction)|
+----------+-----------------+
|       0.0|             5770|
|       1.0|              651|
+----------+-----------------+



In [60]:
cm.filter(cm.label == cm.prediction).count() / cm.count()

                                                                                

0.8272854695530292

In [61]:
def accuracy_m(model): 
    predictions = model.transform(test_data)
    cm = predictions.select("label", "prediction")
    acc = cm.filter(cm.label == cm.prediction).count() / cm.count()
    print("Model accuracy: %.3f%%" % (acc * 100)) 
accuracy_m(model = linearModel)

[Stage 115:>                                                        (0 + 4) / 4]

Model accuracy: 82.729%




In [62]:
### Use ROC 
from pyspark.ml.evaluation import BinaryClassificationEvaluator

# Evaluate model
evaluator = BinaryClassificationEvaluator(rawPredictionCol="rawPrediction")
print(evaluator.evaluate(predictions))
print(evaluator.getMetricName())

                                                                                

0.8951509805782415
areaUnderROC
