In [82]:
!pip install pyspark



In [83]:
import pyspark
import pandas as pd
import numpy as np
from pyspark import SparkConf, SparkContext
from pyspark.sql import SparkSession
from google.colab import drive
import time
from pyspark.sql import SQLContext
drive.mount('/content/drive')
import collections
from pyspark.mllib.linalg import *
from pyspark.ml.feature import HashingTF, IDF, Tokenizer, StopWordsRemover
from pyspark.mllib.linalg import SparseVector
from scipy.spatial import distance
from pyspark.ml import Pipeline
from pyspark.ml.feature import StringIndexer
import json
conf = SparkConf().setMaster("local[8]").setAppName("Exercise")
sc = SparkContext.getOrCreate(conf=conf)
spark = SparkSession(sc)
from pyspark.sql.types import StructField
from pyspark.sql.types import StructType, StringType, FloatType
from pyspark.sql.functions import concat, lit
from pyspark.sql.functions import monotonically_increasing_id 

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [84]:
df = spark.read.csv('/content/drive/MyDrive/BIGDATA/Week5/bank.csv', header = True, inferSchema=True)
df.printSchema()

root
 |-- age: integer (nullable = true)
 |-- job: string (nullable = true)
 |-- marital: string (nullable = true)
 |-- education: string (nullable = true)
 |-- default: string (nullable = true)
 |-- balance: integer (nullable = true)
 |-- housing: string (nullable = true)
 |-- loan: string (nullable = true)
 |-- contact: string (nullable = true)
 |-- day: integer (nullable = true)
 |-- month: string (nullable = true)
 |-- duration: integer (nullable = true)
 |-- campaign: integer (nullable = true)
 |-- pdays: integer (nullable = true)
 |-- previous: integer (nullable = true)
 |-- poutcome: string (nullable = true)
 |-- deposit: string (nullable = true)



In [85]:
df = df.toPandas()
df.deposit = df.deposit.apply(lambda x: 1 if x == 'yes' else 0)

In [86]:
class_name = 'deposit'
df.groupby(class_name).count()

Unnamed: 0_level_0,age,job,marital,education,default,balance,housing,loan,contact,day,month,duration,campaign,pdays,previous,poutcome
deposit,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1
0,5873,5873,5873,5873,5873,5873,5873,5873,5873,5873,5873,5873,5873,5873,5873,5873
1,5289,5289,5289,5289,5289,5289,5289,5289,5289,5289,5289,5289,5289,5289,5289,5289


In [87]:
df = spark.createDataFrame(df)

In [88]:
indexers = [StringIndexer(inputCol=column, outputCol=column+"_index").fit(df) for column in list(set(df.columns)-set(['date'])) ]
pipeline = Pipeline(stages=indexers)
df_r = pipeline.fit(df).transform(df)
df_r.show()

+---+-----------+--------+---------+-------+-------+-------+----+-------+---+-----+--------+--------+-----+--------+--------+-------+-----------+-------------+---------+-------------+--------------+--------------+--------------+-----------+-------------+-------------+--------------+---------+-------------+---------------+----------+-------------+---------+
|age|        job| marital|education|default|balance|housing|loan|contact|day|month|duration|campaign|pdays|previous|poutcome|deposit|month_index|default_index|job_index|contact_index|duration_index|campaign_index|poutcome_index|pdays_index|housing_index|marital_index|previous_index|day_index|deposit_index|education_index|loan_index|balance_index|age_index|
+---+-----------+--------+---------+-------+-------+-------+----+-------+---+-----+--------+--------+-----+--------+--------+-------+-----------+-------------+---------+-------------+--------------+--------------+--------------+-----------+-------------+-------------+--------------

In [89]:
df_r = df_r.drop(df_r.deposit_index)

In [90]:
from pyspark.ml.feature import VectorAssembler

eature_names = df_r.columns[17:]
print(feature_names)
assembler = VectorAssembler()
assembler.setInputCols(feature_names).setOutputCol('features')
transformed_data = assembler.transform(df_r)

transformed_data.show()

['month_index', 'default_index', 'job_index', 'contact_index', 'duration_index', 'campaign_index', 'poutcome_index', 'pdays_index', 'housing_index', 'marital_index', 'previous_index', 'day_index', 'education_index', 'loan_index', 'balance_index', 'age_index']
+---+-----------+--------+---------+-------+-------+-------+----+-------+---+-----+--------+--------+-----+--------+--------+-------+-----------+-------------+---------+-------------+--------------+--------------+--------------+-----------+-------------+-------------+--------------+---------+---------------+----------+-------------+---------+--------------------+
|age|        job| marital|education|default|balance|housing|loan|contact|day|month|duration|campaign|pdays|previous|poutcome|deposit|month_index|default_index|job_index|contact_index|duration_index|campaign_index|poutcome_index|pdays_index|housing_index|marital_index|previous_index|day_index|education_index|loan_index|balance_index|age_index|            features|
+---+---

In [91]:
[training_data, test_data] = transformed_data.randomSplit([0.8,0.2])

In [92]:
training_data.toPandas()

Unnamed: 0,age,job,marital,education,default,balance,housing,loan,contact,day,month,duration,campaign,pdays,previous,poutcome,deposit,month_index,default_index,job_index,contact_index,duration_index,campaign_index,poutcome_index,pdays_index,housing_index,marital_index,previous_index,day_index,education_index,loan_index,balance_index,age_index,features
0,18,student,single,primary,no,608,no,no,cellular,12,aug,267,1,-1,0,unknown,1,1.0,0.0,7.0,0.0,166.0,0.0,0.0,0.0,0.0,1.0,0.0,9.0,2.0,0.0,249.0,64.0,"(1.0, 0.0, 7.0, 0.0, 166.0, 0.0, 0.0, 0.0, 0.0..."
1,18,student,single,unknown,no,3,no,no,cellular,25,aug,130,2,-1,0,unknown,1,1.0,0.0,7.0,0.0,28.0,1.0,0.0,0.0,0.0,1.0,0.0,26.0,3.0,0.0,2.0,64.0,"(1.0, 0.0, 7.0, 0.0, 28.0, 1.0, 0.0, 0.0, 0.0,..."
2,18,student,single,unknown,no,108,no,no,cellular,8,sep,169,1,-1,0,unknown,1,9.0,0.0,7.0,0.0,63.0,0.0,0.0,0.0,0.0,1.0,0.0,10.0,3.0,0.0,20.0,64.0,"(9.0, 0.0, 7.0, 0.0, 63.0, 0.0, 0.0, 0.0, 0.0,..."
3,18,student,single,unknown,no,108,no,no,cellular,10,aug,167,1,-1,0,unknown,1,1.0,0.0,7.0,0.0,62.0,0.0,0.0,0.0,0.0,1.0,0.0,27.0,3.0,0.0,20.0,64.0,"(1.0, 0.0, 7.0, 0.0, 62.0, 0.0, 0.0, 0.0, 0.0,..."
4,19,student,single,primary,no,103,no,no,cellular,10,jul,104,2,-1,0,unknown,1,2.0,0.0,7.0,0.0,32.0,1.0,0.0,0.0,0.0,1.0,0.0,27.0,2.0,0.0,105.0,60.0,"(2.0, 0.0, 7.0, 0.0, 32.0, 1.0, 0.0, 0.0, 0.0,..."
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
8893,86,retired,divorced,unknown,no,157,no,no,telephone,7,sep,147,1,-1,0,unknown,0,9.0,0.0,5.0,2.0,138.0,0.0,0.0,0.0,0.0,2.0,0.0,16.0,3.0,0.0,113.0,68.0,"(9.0, 0.0, 5.0, 2.0, 138.0, 0.0, 0.0, 0.0, 0.0..."
8894,87,retired,married,secondary,no,433,no,no,telephone,15,dec,150,1,-1,0,unknown,0,11.0,0.0,5.0,2.0,3.0,0.0,0.0,0.0,0.0,0.0,0.0,4.0,0.0,0.0,1123.0,69.0,"(11.0, 0.0, 5.0, 2.0, 3.0, 0.0, 0.0, 0.0, 0.0,..."
8895,88,retired,married,primary,no,648,no,no,telephone,3,sep,318,1,-1,0,unknown,0,9.0,0.0,5.0,2.0,246.0,0.0,0.0,0.0,0.0,0.0,0.0,21.0,2.0,0.0,1882.0,70.0,"(9.0, 0.0, 5.0, 2.0, 246.0, 0.0, 0.0, 0.0, 0.0..."
8896,88,retired,married,secondary,no,433,no,no,telephone,15,sep,161,1,274,1,failure,0,9.0,0.0,5.0,2.0,1.0,0.0,1.0,111.0,0.0,0.0,1.0,4.0,0.0,0.0,1123.0,70.0,"[9.0, 0.0, 5.0, 2.0, 1.0, 0.0, 1.0, 111.0, 0.0..."


In [93]:
class_name = 'deposit'

In [94]:
from pyspark.ml.classification import LogisticRegression
model = LogisticRegression(featuresCol = 'features',labelCol=class_name, maxIter=30)

In [95]:
M = model.fit(training_data)

In [96]:
predictions = M.transform(test_data)

In [98]:
from pyspark.ml.evaluation import MulticlassClassificationEvaluator

multi_evaluator = MulticlassClassificationEvaluator(labelCol = 'deposit', metricName = 'accuracy')
print('Logistic Regression Accuracy:', multi_evaluator.evaluate(predictions))

Logistic Regression Accuracy: 0.7773851590106007
