In [15]:
from pyspark.sql import SparkSession
from pyspark.ml.feature import Tokenizer,StopWordsRemover,CountVectorizer
from pyspark.ml.classification import MultilayerPerceptronClassifier
from pyspark.ml.evaluation import BinaryClassificationEvaluator
from pyspark.ml.tuning import TrainValidationSplit,ParamGridBuilder
from pyspark.sql.functions import UserDefinedFunction
from pyspark.sql.types import StringType

def removePunctuation(text):
    text = text.replace('.','')
    text = text.replace(',','')
    text = text.replace(';','')
    text = text.replace(':','')
    text = text.replace('ç','c')
    text = text.replace('ş','s')
    return text

spark = SparkSession.builder.getOrCreate()
textDF = spark.read.option('delimiter','\t').option('inferSchema','true').csv('datasets/movie_turkish_train.txt')
textDF = textDF.withColumnRenamed('_c0','orig')
textDF = textDF.withColumnRenamed('_c1','label')

myUDF = UserDefinedFunction(removePunctuation,StringType())

textDF = textDF.withColumn('Text',myUDF('orig'))


#textDF.show()
tokenizer = Tokenizer(inputCol='Text',outputCol='tokenized')
textDF = tokenizer.transform(textDF)

trStopWords = StopWordsRemover.loadDefaultStopWords('turkish')
sRemover = StopWordsRemover(inputCol='tokenized',outputCol='removed',stopWords=trStopWords)
textDF = sRemover.transform(textDF)

vSize = 3000

vectorizer = CountVectorizer(inputCol='removed',outputCol='features',vocabSize=vSize)
textDF = vectorizer.fit(textDF).transform(textDF)

textDF = textDF.select('features','label')
trainDF, testDF = textDF.randomSplit([0.75,0.25],seed=123) 

mlpClassifier= MultilayerPerceptronClassifier(layers=[vSize,5,2])
model = mlpClassifier.fit(trainDF)

resultDF = model.transform(testDF)

eva = BinaryClassificationEvaluator()
successRate = eva.evaluate(resultDF)
print("Accuracy : ",successRate)
resultDF.show()

Accuracy :  0.9142736231183116
+--------------------+-----+--------------------+--------------------+----------+
|            features|label|       rawPrediction|         probability|prediction|
+--------------------+-----+--------------------+--------------------+----------+
|(3000,[0,1,2,3,5,...|    1|[-7.6775772113474...|[3.03041489059042...|       1.0|
|(3000,[0,1,2,3,6,...|    0|[19.4932738963183...|[1.0,9.1898637980...|       0.0|
|(3000,[0,1,2,3,7,...|    1|[21.0027392902773...|[1.0,5.4634869348...|       0.0|
|(3000,[0,1,2,3,7,...|    0|[17.2848249669199...|[0.99999999999999...|       0.0|
|(3000,[0,1,2,3,9,...|    1|[-13.497930568764...|[3.07889798084543...|       1.0|
|(3000,[0,1,2,3,19...|    0|[19.5160805425421...|[1.0,1.0061128358...|       0.0|
|(3000,[0,1,2,4,16...|    0|[18.1136112194893...|[0.99999999999999...|       0.0|
|(3000,[0,1,2,5,6,...|    1|[-16.881695622704...|[5.90384063554461...|       1.0|
|(3000,[0,1,2,5,6,...|    1|[9.60995731436413...|[0.99999999067833.

In [None]:
import pyspark.sql.functions as F

df2 = df.select(
    [F.regexp_replace(col, r',|\.|&|\\|\||-|_', '').alias(col) for col in df.columns]
)