In [7]:
from pyspark.sql import SparkSession
from pyspark.ml.feature import Tokenizer,StopWordsRemover,HashingTF,IDF
from pyspark.ml.classification import MultilayerPerceptronClassifier
from pyspark.ml.evaluation import BinaryClassificationEvaluator
from pyspark.ml.tuning import TrainValidationSplit,ParamGridBuilder
from pyspark.sql.functions import UserDefinedFunction
from pyspark.sql.types import StringType

def removePunctuation(text):
    text = text.replace('.','')
    text = text.replace(',','')
    text = text.replace(';','')
    text = text.replace(':','')
    text = text.replace('รง','c')
    text = text.replace('ล','s')
    return text

spark = SparkSession.builder.getOrCreate()
textDF = spark.read.option('delimiter','\t').option('inferSchema','true').csv('datasets/movie_turkish_train.txt')
textDF = textDF.withColumnRenamed('_c0','orig')
textDF = textDF.withColumnRenamed('_c1','label')

myUDF = UserDefinedFunction(removePunctuation,StringType())

textDF = textDF.withColumn('Text',myUDF('orig'))


#textDF.show()
tokenizer = Tokenizer(inputCol='Text',outputCol='tokenized')
textDF = tokenizer.transform(textDF)

trStopWords = StopWordsRemover.loadDefaultStopWords('turkish')
sRemover = StopWordsRemover(inputCol='tokenized',outputCol='removed',stopWords=trStopWords)
textDF = sRemover.transform(textDF)

vSize = 4000

tf = HashingTF(inputCol='removed',outputCol='tf',numFeatures=vSize)
textDF = tf.transform(textDF)
idf = IDF(inputCol='tf',outputCol='features')
textDF = idf.fit(textDF).transform(textDF)
#textDF.show()

textDF = textDF.select('features','label')
trainDF, testDF = textDF.randomSplit([0.75,0.25],seed=123) 

mlpClassifier= MultilayerPerceptronClassifier(layers=[vSize,5,2])
model = mlpClassifier.fit(trainDF)

resultDF = model.transform(testDF)

eva = BinaryClassificationEvaluator()
successRate = eva.evaluate(resultDF)
print("Accuracy : ",successRate)
resultDF.show()

Accuracy :  0.8886483886483891
+--------------------+-----+--------------------+--------------------+----------+
|            features|label|       rawPrediction|         probability|prediction|
+--------------------+-----+--------------------+--------------------+----------+
|(4000,[1,19,215,3...|    0|[10.4468761373359...|[0.99999999914874...|       0.0|
|(4000,[1,452,847,...|    1|[-7.8628942328295...|[4.35380198507654...|       1.0|
|(4000,[4,6,194,38...|    0|[10.4780836049589...|[0.99999999920435...|       0.0|
|(4000,[4,120,215,...|    0|[9.04060283588073...|[0.99999998209185...|       0.0|
|(4000,[5,120,592,...|    1|[9.15674644775305...|[0.99999998662080...|       0.0|
|(4000,[5,215,265,...|    0|[10.5246671394350...|[0.99999999928024...|       0.0|
|(4000,[5,354,676,...|    0|[7.22375798971401...|[0.99999911645315...|       0.0|
|(4000,[5,366,844,...|    1|[9.43823924505575...|[0.99999999254359...|       0.0|
|(4000,[5,534,853,...|    0|[10.3178276432579...|[0.99999999887146.

In [None]:
import pyspark.sql.functions as F

df2 = df.select(
    [F.regexp_replace(col, r',|\.|&|\\|\||-|_', '').alias(col) for col in df.columns]
)