In [30]:
import re
import pyspark
from pyspark.sql import SparkSession,SQLContext
from pyspark.sql import Row
from pyspark.sql.functions import col, split
from pyspark.ml.feature import RegexTokenizer, CountVectorizer, PCA ,StopWordsRemover,StringIndexer
from pyspark.ml.evaluation import MulticlassClassificationEvaluator
from pyspark.ml.classification import RandomForestClassifier,LogisticRegression,NaiveBayes

In [None]:
spark = SparkSession.builder.appName("spooky").master('local[*]').config("spark.executor.memory", '6G').config("spark.driver.memory", '6G').config("spark.driver.maxResultSize", '7G').getOrCreate()

df = spark.read.format('com.databricks.spark.csv').options(header='true', inferschema='true').load('/home/anant/data-science_2/spam classification/spam.csv')

In [2]:
df=df.drop('_c2').drop('_c3').drop('_c4')

In [3]:
df = df.selectExpr("v1 as class", "v2 as text")

In [4]:
from pyspark.sql import functions as F
df=df.withColumn('String_Label', F.regexp_replace('class', '\\W', ''))

In [5]:
df=df.filter(df.text != '')

In [6]:
# tokenizer
regexTokenizer = RegexTokenizer(inputCol="text", outputCol="tokenized", pattern="\\W")
resultantdf=regexTokenizer.transform(df)
resultantdf.show()

+-----+--------------------+------------+--------------------+
|class|                text|String_Label|           tokenized|
+-----+--------------------+------------+--------------------+
|  ham|Go until jurong p...|         ham|[go, until, juron...|
|  ham|Ok lar... Joking ...|         ham|[ok, lar, joking,...|
| spam|Free entry in 2 a...|        spam|[free, entry, in,...|
|  ham|U dun say so earl...|         ham|[u, dun, say, so,...|
|  ham|Nah I don't think...|         ham|[nah, i, don, t, ...|
| spam|FreeMsg Hey there...|        spam|[freemsg, hey, th...|
|  ham|Even my brother i...|         ham|[even, my, brothe...|
|  ham|As per your reque...|         ham|[as, per, your, r...|
| spam|WINNER!! As a val...|        spam|[winner, as, a, v...|
| spam|Had your mobile 1...|        spam|[had, your, mobil...|
|  ham|I'm gonna be home...|         ham|[i, m, gonna, be,...|
| spam|SIX chances to wi...|        spam|[six, chances, to...|
| spam|URGENT! You have ...|        spam|[urgent, you, 

In [7]:
# removal of stop word
stopwordsRemover = StopWordsRemover(inputCol="tokenized", outputCol="filtered")

resultantdf=stopwordsRemover.transform(resultantdf)

resultantdf=resultantdf.drop('text')

In [8]:
#count vectorizer implemetation
cv = CountVectorizer(inputCol="filtered", outputCol="features")
model=cv.fit(resultantdf)

result=model.transform(resultantdf)

result=result.drop('tokenized').drop('filtered')

In [9]:
result=result.drop('class')

In [10]:
result.show()

+------------+--------------------+
|String_Label|            features|
+------------+--------------------+
|         ham|(8537,[11,16,37,6...|
|         ham|(8537,[0,9,244,36...|
|        spam|(8537,[2,10,23,24...|
|         ham|(8537,[0,58,85,86...|
|         ham|(8537,[53,136,366...|
|        spam|(8537,[9,15,21,26...|
|         ham|(8537,[15,132,286...|
|         ham|(8537,[149,157,31...|
|        spam|(8537,[1,64,82,14...|
|        spam|(8537,[0,1,10,31,...|
|         ham|(8537,[3,22,29,33...|
|        spam|(8537,[6,17,21,24...|
|        spam|(8537,[10,24,26,5...|
|         ham|(8537,[45,77,84,1...|
|         ham|(8537,[479,677],[...|
|        spam|(8537,[24,37,80,1...|
|         ham|(8537,[3,41,63,27...|
|         ham|(8537,[0,2,71,73,...|
|         ham|(8537,[0,72,91,13...|
|        spam|(8537,[5,24,26,42...|
+------------+--------------------+
only showing top 20 rows



In [11]:
indexer = StringIndexer(inputCol="String_Label", outputCol="label")

In [12]:
indexed = indexer.fit(result).transform(result)

In [16]:
indexed=indexed.drop('String_Label')

In [17]:
indexed.show(2)

+--------------------+-----+
|            features|label|
+--------------------+-----+
|(8537,[11,16,37,6...|  0.0|
|(8537,[0,9,244,36...|  0.0|
+--------------------+-----+
only showing top 2 rows



In [19]:
# Splitting of data set
(trainingData, testData) = indexed.randomSplit([0.7, 0.3], seed = 100)

In [22]:
trainingData.show(2)
testData.show(2)

+------------+-----+
|    features|label|
+------------+-----+
|(8537,[],[])|  0.0|
|(8537,[],[])|  0.0|
+------------+-----+
only showing top 2 rows

+----------------+-----+
|        features|label|
+----------------+-----+
|    (8537,[],[])|  0.0|
|(8537,[0],[1.0])|  0.0|
+----------------+-----+
only showing top 2 rows



# random forest

In [35]:
 # Using RandomForestClassifier to train the model
rf = RandomForestClassifier(labelCol="label",featuresCol="features",numTrees = 30,maxDepth = 20)
# Train model with Training Data
rfModel = rf.fit(trainingData)
# Prediction
predictions = rfModel.transform(testData)

In [36]:
predictions.show(2)

+----------------+-----+--------------------+--------------------+----------+
|        features|label|       rawPrediction|         probability|prediction|
+----------------+-----+--------------------+--------------------+----------+
|    (8537,[],[])|  0.0|[28.4578731936966...|[0.94859577312322...|       0.0|
|(8537,[0],[1.0])|  0.0|[28.4578731936966...|[0.94859577312322...|       0.0|
+----------------+-----+--------------------+--------------------+----------+
only showing top 2 rows



In [37]:
evaluator = MulticlassClassificationEvaluator(predictionCol="prediction")
evaluator.evaluate(predictions)

0.9444926928147461

# logistic Regression

In [29]:
lr=LogisticRegression(labelCol="label",featuresCol="features")
# train the model 
lrmodel=lr.fit(trainingData)
# Prediction
predictions = rfModel.transform(testData)
evaluatorlr = MulticlassClassificationEvaluator(predictionCol="prediction")
evaluatorlr.evaluate(predictions)

0.9542490584776779

# navies Bayes

In [31]:
nb = NaiveBayes(smoothing=1)
model = nb.fit(trainingData)
predictions= model.transform(testData)
evaluatorlr = MulticlassClassificationEvaluator(predictionCol="prediction")
evaluatorlr.evaluate(predictions)

0.9846263105130661

# Support Vector Machine

In [40]:
from pyspark.ml.classification import LinearSVC
lsvc = LinearSVC(maxIter=10, regParam=0.1)

# Fit the model
lsvcModel = lsvc.fit(trainingData)

# Print the coefficients and intercept for linearsSVC
#print("Coefficients: " + str(lsvcModel.coefficients))
#print("Intercept: " + str(lsvcModel.intercept))

In [41]:
predictions = lsvcModel.transform(testData)
evaluatorsvm = MulticlassClassificationEvaluator(predictionCol="prediction")
evaluatorsvm.evaluate(predictions)

0.9694534959727179

# K Mean Clustering

In [62]:
from pyspark.ml.clustering import KMeans
# Trains a k-means model.
kmeans = KMeans().setK(2).setSeed(1)
kmodel = kmeans.fit(trainingData)
predictions=kmodel.transform(testData)

In [63]:
predictions.show()

+--------------------+-----+----------+
|            features|label|prediction|
+--------------------+-----+----------+
|        (8537,[],[])|  0.0|         0|
|    (8537,[0],[1.0])|  0.0|         0|
|    (8537,[0],[1.0])|  0.0|         0|
|(8537,[0,1],[1.0,...|  0.0|         0|
|(8537,[0,1,2,5,31...|  1.0|         1|
|(8537,[0,1,2,5,31...|  1.0|         1|
|(8537,[0,1,2,5,31...|  1.0|         1|
|(8537,[0,1,2,5,43...|  1.0|         1|
|(8537,[0,1,2,13,1...|  1.0|         0|
|(8537,[0,1,2,13,1...|  1.0|         0|
|(8537,[0,1,2,19,6...|  1.0|         1|
|(8537,[0,1,2,19,6...|  1.0|         1|
|(8537,[0,1,2,19,6...|  1.0|         1|
|(8537,[0,1,2,34,4...|  0.0|         0|
|(8537,[0,1,2,113,...|  1.0|         0|
|(8537,[0,1,3,378,...|  0.0|         0|
|(8537,[0,1,5,14,1...|  0.0|         1|
|(8537,[0,1,6,11,3...|  0.0|         0|
|(8537,[0,1,9,30,6...|  0.0|         0|
|(8537,[0,1,10,22,...|  1.0|         0|
+--------------------+-----+----------+
only showing top 20 rows



In [64]:
# converting column datatype from integer to double
from pyspark.sql.types import DoubleType
predictions = predictions.withColumn("prediction", predictions["prediction"].cast(DoubleType()))
evaluatorsvm = MulticlassClassificationEvaluator(predictionCol="prediction")
evaluatorsvm.evaluate(predictions)

0.8105478731033524

In [None]:
model=arge