<a href="https://colab.research.google.com/github/baris-unver/SparkNLP/blob/main/bert.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
import os
# Install java
! apt-get update -qq
! apt-get install -y openjdk-8-jdk-headless -qq > /dev/null
os.environ["JAVA_HOME"] = "/usr/lib/jvm/java-8-openjdk-amd64"
os.environ["PATH"] = os.environ["JAVA_HOME"] + "/bin:" + os.environ["PATH"]
! java -version
# Install pyspark
! pip install pyspark
# Install Spark NLP
! pip install spark-nlp

openjdk version "1.8.0_292"
OpenJDK Runtime Environment (build 1.8.0_292-8u292-b10-0ubuntu1~18.04-b10)
OpenJDK 64-Bit Server VM (build 25.292-b10, mixed mode)
Collecting pyspark
  Downloading pyspark-3.2.0.tar.gz (281.3 MB)
[K     |████████████████████████████████| 281.3 MB 38 kB/s 
[?25hCollecting py4j==0.10.9.2
  Downloading py4j-0.10.9.2-py2.py3-none-any.whl (198 kB)
[K     |████████████████████████████████| 198 kB 52.2 MB/s 
[?25hBuilding wheels for collected packages: pyspark
  Building wheel for pyspark (setup.py) ... [?25l[?25hdone
  Created wheel for pyspark: filename=pyspark-3.2.0-py2.py3-none-any.whl size=281805912 sha256=d948f9d18937b1c5c93bdf5871016021982160efb6df40cbbe402d790bd1e591
  Stored in directory: /root/.cache/pip/wheels/0b/de/d2/9be5d59d7331c6c2a7c1b6d1a4f463ce107332b1ecd4e80718
Successfully built pyspark
Installing collected packages: py4j, pyspark
Successfully installed py4j-0.10.9.2 pyspark-3.2.0
Collecting spark-nlp
  Downloading spark_nlp-3.3.1-py2.py3-

In [2]:
!wget http://setup.johnsnowlabs.com/colab.sh -O - | bash

--2021-10-31 06:48:32--  http://setup.johnsnowlabs.com/colab.sh
Resolving setup.johnsnowlabs.com (setup.johnsnowlabs.com)... 51.158.130.125
Connecting to setup.johnsnowlabs.com (setup.johnsnowlabs.com)|51.158.130.125|:80... connected.
HTTP request sent, awaiting response... 302 Found
Location: https://setup.johnsnowlabs.com/colab.sh [following]
--2021-10-31 06:48:32--  https://setup.johnsnowlabs.com/colab.sh
Connecting to setup.johnsnowlabs.com (setup.johnsnowlabs.com)|51.158.130.125|:443... connected.
HTTP request sent, awaiting response... 302 Moved Temporarily
Location: https://raw.githubusercontent.com/JohnSnowLabs/spark-nlp/master/scripts/colab_setup.sh [following]
--2021-10-31 06:48:32--  https://raw.githubusercontent.com/JohnSnowLabs/spark-nlp/master/scripts/colab_setup.sh
Resolving raw.githubusercontent.com (raw.githubusercontent.com)... 185.199.108.133, 185.199.109.133, 185.199.110.133, ...
Connecting to raw.githubusercontent.com (raw.githubusercontent.com)|185.199.108.133|:44

In [3]:
import sparknlp

# Start a Spark session through Spark NLP's helper and report which
# library versions this run is actually using.
spark = sparknlp.start()
print(f"Spark NLP version: {sparknlp.version()}")
print(f"Apache Spark version: {spark.version}")

Spark NLP version: 3.3.1
Apache Spark version: 3.0.3


In [4]:
from pyspark.ml import Pipeline
from sparknlp.annotator import *
from sparknlp.common import *
from sparknlp.base import *

In [5]:
# Downloading data set for use
! wget https://raw.githubusercontent.com/murat-gunay/NLP/master/02_NLP_Projects/2-project_2_Turkish_sparkNLP_Classification/turkish_categorical_corpus.csv
# Creating a Spark DataFrame
df_Spark = spark.read \
  .option("header", True) \
  .csv("turkish_categorical_corpus.csv")

--2021-10-31 06:50:49--  https://raw.githubusercontent.com/murat-gunay/NLP/master/02_NLP_Projects/2-project_2_Turkish_sparkNLP_Classification/turkish_categorical_corpus.csv
Resolving raw.githubusercontent.com (raw.githubusercontent.com)... 185.199.108.133, 185.199.109.133, 185.199.110.133, ...
Connecting to raw.githubusercontent.com (raw.githubusercontent.com)|185.199.108.133|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 10627541 (10M) [text/plain]
Saving to: ‘turkish_categorical_corpus.csv’


2021-10-31 06:50:50 (97.8 MB/s) - ‘turkish_categorical_corpus.csv’ saved [10627541/10627541]



In [6]:
# Peek at the first rows (text cut to 50 characters), then check how many
# documents each category contains.
df_Spark.show(n=5, truncate=50)
category_counts = df_Spark.groupBy("category").count()
category_counts.show()

+--------+--------------------------------------------------+
|category|                                              text|
+--------+--------------------------------------------------+
|siyaset | 3 milyon ile ön seçim vaadi mhp nin 10 olağan ...|
|siyaset | mesut_yılmaz yüce_divan da ceza alabilirdi pro...|
|siyaset | disko lar kaldırılıyor başbakan_yardımcısı arı...|
|siyaset | sarıgül anayasa_mahkemesi ne gidiyor mustafa_s...|
|siyaset | erdoğan idamın bir haklılık sebebi var demek k...|
+--------+--------------------------------------------------+
only showing top 5 rows

+----------+-----+
|  category|count|
+----------+-----+
|   kultur |  700|
|  siyaset |  700|
|teknoloji |  700|
|   saglik |  700|
|  ekonomi |  700|
|     spor |  700|
|    dunya |  700|
+----------+-----+



In [7]:
# Import only the function that is used. A star import of
# pyspark.sql.functions rebinds Python built-ins such as sum, min, max and
# round to Spark column functions for the rest of the notebook.
from pyspark.sql.functions import regexp_replace

# The corpus joins multi-word expressions with underscores
# (e.g. "mesut_yılmaz"); turn them back into spaces before tokenising.
# Re-running this cell is harmless: once replaced, no underscores remain.
df_Spark = df_Spark.withColumn('text', regexp_replace('text', '_', ' '))

In [8]:
# 80 % / 20 % train/test split; the fixed seed keeps the split repeatable.
train_news, test_news = df_Spark.randomSplit(weights=[0.8, 0.2], seed=100)



In [9]:
# Spark NLP preprocessing stages, chained by column name:
# text -> document -> sentence -> token -> cleanTokens -> lemma -> token_features

# Wrap the raw "text" column as a Spark NLP document annotation.
document = (
    DocumentAssembler()
    .setInputCol("text")
    .setOutputCol("document")
)

# Split each document into sentences.
sentence = (
    SentenceDetector()
    .setInputCols(["document"])
    .setOutputCol("sentence")
)

# Split each sentence into tokens.
token = (
    Tokenizer()
    .setInputCols(["sentence"])
    .setOutputCol("token")
)

# Drop Turkish stop words using the pretrained list, ignoring case.
stop_words = (
    StopWordsCleaner.pretrained("stopwords_tr", "tr")
    .setInputCols(["token"])
    .setOutputCol("cleanTokens")
    .setCaseSensitive(False)
)

# Reduce the remaining tokens to lemmas with the pretrained Turkish model.
lemmatizer = (
    LemmatizerModel.pretrained("lemma", "tr")
    .setInputCols(["cleanTokens"])
    .setOutputCol("lemma")
)

# Convert the lemma annotations into a plain array column that Spark ML
# feature stages can consume, keeping the annotation columns as well.
finisher = (
    Finisher()
    .setInputCols(["lemma"])
    .setOutputCols(["token_features"])
    .setOutputAsArray(True)
    .setCleanAnnotations(False)
)

stopwords_tr download started this may take some time.
Approximate size to download 2 KB
[OK!]
lemma download started this may take some time.
Approximate size to download 14.8 MB
[OK!]


In [17]:
from pyspark.ml.feature import HashingTF, IDF, StringIndexer, SQLTransformer,IndexToString
from pyspark.ml.classification import LogisticRegression
from pyspark.ml.evaluation import MulticlassClassificationEvaluator

In [18]:
# TF-IDF features computed over the lemma arrays produced by the finisher.
hashTF = HashingTF(inputCol="token_features", outputCol="raw_features")
idf = IDF(inputCol="raw_features", outputCol="features", minDocFreq=5)

# Fit the label indexer up front (on the training split, exactly the data the
# pipeline would have fitted it on) so that its learned category names can be
# handed to IndexToString below. A fitted model is a valid pipeline stage.
label_strIdx = StringIndexer(inputCol="category", outputCol="label").fit(train_news)

logReg = LogisticRegression(maxIter=10)

# Decode the classifier's *prediction* back to a category name.
# Previously this stage read the "label" column, so "article_class" merely
# echoed the true category instead of naming the predicted class. The labels
# are passed explicitly, taken from the fitted indexer.
label_Idxstr = IndexToString(
    inputCol="prediction",
    outputCol="article_class",
    labels=label_strIdx.labels,
)

# Pipeline for Logistic Regression Classifier:
# Spark NLP preprocessing -> TF-IDF -> label indexing -> classifier -> decoding
nlp_pipeline_lr = Pipeline(
    stages=[
        document,
        sentence,
        token,
        stop_words,
        lemmatizer,
        finisher,
        hashTF,
        idf,
        label_strIdx,
        logReg,
        label_Idxstr,
    ]
)

In [19]:
# Fit every stage of the pipeline on the training split.
classification_model_lr = nlp_pipeline_lr.fit(dataset=train_news)

In [24]:
from sklearn.metrics import classification_report, accuracy_score

# Score the held-out split, then collect only the columns needed for the
# report into a pandas DataFrame so scikit-learn can consume them.
scored_test = classification_model_lr.transform(test_news)
df_lr = scored_test.select("category", "label", "prediction").toPandas()

# Per-class precision / recall / F1, keyed by the numeric label index.
print(classification_report(df_lr["label"], df_lr["prediction"]))

              precision    recall  f1-score   support

         0.0       0.81      0.84      0.83       114
         1.0       0.87      0.85      0.86       132
         2.0       0.83      0.86      0.84       136
         3.0       0.94      0.96      0.95       142
         4.0       0.97      0.98      0.98       143
         5.0       0.92      0.86      0.89       149
         6.0       0.94      0.95      0.94       150

    accuracy                           0.90       966
   macro avg       0.90      0.90      0.90       966
weighted avg       0.90      0.90      0.90       966



In [25]:
# Persist the fitted pipeline to 'model2'.
# write().overwrite() replaces an existing 'model2' directory; plain save()
# raises when the path already exists, so this cell used to fail on every
# re-run after the first.
classification_model_lr.write().overwrite().save('model2')

In [51]:
# Run the fitted pipeline over the held-out split (lazy; nothing is computed
# until an action such as show() is called on the result).
predictions = classification_model_lr.transform(dataset=test_news)

In [None]:
# Display the first ten scored rows.
predictions.show(n=10)

In [None]:
# Score the held-out split and keep the decoded article_class column next to
# the numeric label and prediction, then preview the first rows in pandas.
scored_with_names = classification_model_lr.transform(test_news)
df_lr = scored_with_names.select(
    "category", "label", "prediction", "article_class"
).toPandas()
df_lr.head()