In [1]:
from pyspark.sql import SparkSession
from pyspark.ml.feature import Tokenizer,StopWordsRemover,HashingTF,IDF
from pyspark.ml.classification import RandomForestClassifier
from pyspark.ml.evaluation import BinaryClassificationEvaluator
from pyspark.ml.tuning import TrainValidationSplit,ParamGridBuilder
from pyspark.sql.functions import UserDefinedFunction
from pyspark.sql.types import StringType

def removePunctuation(text):
    text = text.replace('.','')
    text = text.replace(',','')
    text = text.replace(';','')
    text = text.replace(':','')
    text = text.replace('ç','c')
    text = text.replace('ş','s')
    return text

spark = SparkSession.builder.getOrCreate()
textDF = spark.read.option('delimiter','\t').option('inferSchema','true').csv('datasets/movie_turkish_train.txt')
textDF = textDF.withColumnRenamed('_c0','orig')
textDF = textDF.withColumnRenamed('_c1','label')

myUDF = UserDefinedFunction(removePunctuation,StringType())

textDF = textDF.withColumn('Text',myUDF('orig'))


#textDF.show()
tokenizer = Tokenizer(inputCol='Text',outputCol='tokenized')
textDF = tokenizer.transform(textDF)

trStopWords = StopWordsRemover.loadDefaultStopWords('turkish')
sRemover = StopWordsRemover(inputCol='tokenized',outputCol='removed',stopWords=trStopWords)
textDF = sRemover.transform(textDF)

vSize = 4000

tf = HashingTF(inputCol='removed',outputCol='tf',numFeatures=vSize)
textDF = tf.transform(textDF)
idf = IDF(inputCol='tf',outputCol='features')
textDF = idf.fit(textDF).transform(textDF)
#textDF.show()

textDF = textDF.select('features','label')
trainDF, testDF = textDF.randomSplit([0.75,0.25],seed=123) 

rfClassifier= RandomForestClassifier()
model = rfClassifier.fit(trainDF)

resultDF = model.transform(testDF)

eva = BinaryClassificationEvaluator()
successRate = eva.evaluate(resultDF)
print("Accuracy : ",successRate)
resultDF.show()

Accuracy :  0.7966269841269842
+--------------------+-----+--------------------+--------------------+----------+
|            features|label|       rawPrediction|         probability|prediction|
+--------------------+-----+--------------------+--------------------+----------+
|(4000,[1,19,215,3...|    0|[10.6824717469211...|[0.53412358734605...|       0.0|
|(4000,[1,452,847,...|    1|[10.5977503688360...|[0.52988751844180...|       0.0|
|(4000,[4,6,194,38...|    0|[10.5046573669400...|[0.52523286834700...|       0.0|
|(4000,[4,120,215,...|    0|[10.5715373191994...|[0.52857686595997...|       0.0|
|(4000,[5,120,592,...|    1|[9.23344876423652...|[0.46167243821182...|       1.0|
|(4000,[5,215,265,...|    0|[10.4189639056833...|[0.52094819528416...|       0.0|
|(4000,[5,354,676,...|    0|[11.3251850960092...|[0.56625925480046...|       0.0|
|(4000,[5,366,844,...|    1|[10.5624623395617...|[0.52812311697808...|       0.0|
|(4000,[5,534,853,...|    0|[10.8156563009385...|[0.54078281504692.

In [None]:
import pyspark.sql.functions as F

df2 = df.select(
    [F.regexp_replace(col, r',|\.|&|\\|\||-|_', '').alias(col) for col in df.columns]
)