In [6]:
from pyspark.sql import SparkSession
from pyspark.ml.feature import Tokenizer,StopWordsRemover,CountVectorizer,NGram
from pyspark.ml.classification import MultilayerPerceptronClassifier
from pyspark.ml.evaluation import BinaryClassificationEvaluator
from pyspark.ml.tuning import TrainValidationSplit,ParamGridBuilder
from pyspark.sql.functions import UserDefinedFunction
from pyspark.sql.types import StringType

def removePunctuation(text):
    text = text.replace('.','')
    text = text.replace(',','')
    text = text.replace(';','')
    text = text.replace(':','')
    text = text.replace('ç','c')
    text = text.replace('ş','s')
    return text

spark = SparkSession.builder.getOrCreate()
textDF = spark.read.option('delimiter','\t').option('inferSchema','true').csv('datasets/movie_turkish_train.txt')
textDF = textDF.withColumnRenamed('_c0','orig')
textDF = textDF.withColumnRenamed('_c1','label')

myUDF = UserDefinedFunction(removePunctuation,StringType())

textDF = textDF.withColumn('Text',myUDF('orig'))


#textDF.show()
tokenizer = Tokenizer(inputCol='Text',outputCol='tokenized')
textDF = tokenizer.transform(textDF)

trStopWords = StopWordsRemover.loadDefaultStopWords('turkish')
sRemover = StopWordsRemover(inputCol='tokenized',outputCol='removed',stopWords=trStopWords)
textDF = sRemover.transform(textDF)


ng = NGram(inputCol='removed',outputCol='ng',n=2)
textDF = ng.transform(textDF)
#textDF.show()


vSize = 3000

vectorizer = CountVectorizer(inputCol='ng',outputCol='features',vocabSize=vSize)
textDF = vectorizer.fit(textDF).transform(textDF)

textDF = textDF.select('features','label')
trainDF, testDF = textDF.randomSplit([0.75,0.25],seed=123) 

mlpClassifier= MultilayerPerceptronClassifier(layers=[vSize,5,2])
model = mlpClassifier.fit(trainDF)

resultDF = model.transform(testDF)

eva = BinaryClassificationEvaluator()
successRate = eva.evaluate(resultDF)
print("Accuracy : ",successRate)
resultDF.show()

Accuracy :  0.8567498942022855
+--------------------+-----+--------------------+--------------------+----------+
|            features|label|       rawPrediction|         probability|prediction|
+--------------------+-----+--------------------+--------------------+----------+
|        (3000,[],[])|    0|[-0.1681649410330...|[0.55999450111549...|       0.0|
|        (3000,[],[])|    0|[-0.1681649410330...|[0.55999450111549...|       0.0|
|        (3000,[],[])|    0|[-0.1681649410330...|[0.55999450111549...|       0.0|
|        (3000,[],[])|    0|[-0.1681649410330...|[0.55999450111549...|       0.0|
|        (3000,[],[])|    0|[-0.1681649410330...|[0.55999450111549...|       0.0|
|        (3000,[],[])|    1|[-0.1681649410330...|[0.55999450111549...|       0.0|
|        (3000,[],[])|    1|[-0.1681649410330...|[0.55999450111549...|       0.0|
|        (3000,[],[])|    1|[-0.1681649410330...|[0.55999450111549...|       0.0|
|        (3000,[],[])|    1|[-0.1681649410330...|[0.55999450111549.