In [1]:
import os

# Install java
! apt-get update -qq
! apt-get install -y openjdk-8-jdk-headless -qq > /dev/null

os.environ["JAVA_HOME"] = "/usr/lib/jvm/java-8-openjdk-amd64"
os.environ["PATH"] = os.environ["JAVA_HOME"] + "/bin:" + os.environ["PATH"]
! java -version

# Install pyspark
! pip install --ignore-installed pyspark==2.4.4

# Install Spark NLP
! pip install --ignore-installed spark-nlp==2.6.4

openjdk version "1.8.0_275"
OpenJDK Runtime Environment (build 1.8.0_275-8u275-b01-0ubuntu1~18.04-b01)
OpenJDK 64-Bit Server VM (build 25.275-b01, mixed mode)
Collecting pyspark==2.4.4
[?25l  Downloading https://files.pythonhosted.org/packages/87/21/f05c186f4ddb01d15d0ddc36ef4b7e3cedbeb6412274a41f26b55a650ee5/pyspark-2.4.4.tar.gz (215.7MB)
[K     |████████████████████████████████| 215.7MB 66kB/s 
[?25hCollecting py4j==0.10.7
[?25l  Downloading https://files.pythonhosted.org/packages/e3/53/c737818eb9a7dc32a7cd4f1396e787bd94200c3997c72c1dbe028587bd76/py4j-0.10.7-py2.py3-none-any.whl (197kB)
[K     |████████████████████████████████| 204kB 43.5MB/s 
[?25hBuilding wheels for collected packages: pyspark
  Building wheel for pyspark (setup.py) ... [?25l[?25hdone
  Created wheel for pyspark: filename=pyspark-2.4.4-py2.py3-none-any.whl size=216130389 sha256=bc470d47088635587595049280aec48e403937cfde8910b23670f7d2cc442df2
  Stored in directory: /root/.cache/pip/wheels/ab/09/4d/0d18423005

In [2]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [3]:
import sparknlp 

spark = sparknlp.start()

In [4]:
df = spark.read.option("header", True).csv('drive/My Drive/news.csv').drop('Title')

In [5]:
from pyspark.sql.functions import *
from pyspark.sql.types import *
# pn_udf = udf(lambda x: [str(x)], ArrayType(StringType()) )
pn_udf = udf(lambda x: str(x), StringType())
dfl = df.withColumn('label',pn_udf('Class')).drop('Class')

In [6]:
df_l = dfl.selectExpr("label as category","Description as text")
df_l.show(5)

+--------+--------------------+
|category|                text|
+--------+--------------------+
|       3|Reuters - Short-s...|
|       3|Reuters - Private...|
|       3|Reuters - Soaring...|
|       3|Reuters - Authori...|
|       3|AFP - Tearaway wo...|
+--------+--------------------+
only showing top 5 rows



In [13]:
df_l.count()

120000

In [7]:
df_train, df_test = df_l.randomSplit([.7, .3])

In [8]:
from sparknlp.base import *
from sparknlp.annotator import *
from pyspark.ml import Pipeline
import pandas as pd

In [9]:

document = DocumentAssembler()\
    .setInputCol("text")\
    .setOutputCol("document")
tokenizer = Tokenizer()\
  .setInputCols(['document'])\
  .setOutputCol('token')
normalizer = Normalizer()\
  .setInputCols(['token'])\
  .setOutputCol('normalized')
stopwords_cleaner = StopWordsCleaner()\
  .setInputCols(['normalized'])\
  .setOutputCol('cleanTokens')\
  .setCaseSensitive(False)

word_embeddings = BertEmbeddings\
  .pretrained("bert_base_cased", "en")\
  .setInputCols(['document','cleanTokens'])\
  .setOutputCol("embeddings")\
  .setCaseSensitive(False)

use = UniversalSentenceEncoder.pretrained() \
 .setInputCols(["document","embeddings"])\
 .setOutputCol("sentence_embeddings")


classsifierdl = ClassifierDLApproach()\
  .setInputCols(["sentence_embeddings"])\
  .setOutputCol("class")\
  .setLabelColumn("category")\
  .setMaxEpochs(5)\
  .setEnableOutputLogs(True)

pipeline = Pipeline(
    stages = [
        document,
        tokenizer,
        normalizer,
        stopwords_cleaner,
        word_embeddings,
        use,
        classsifierdl
    ])

bert_base_cased download started this may take some time.
Approximate size to download 389.1 MB
[OK!]
tfhub_use download started this may take some time.
Approximate size to download 923.7 MB
[OK!]


In [10]:
bert_pipelineModel = pipeline.fit(df_train)

In [11]:
from sklearn.metrics import classification_report, accuracy_score
df = bert_pipelineModel.transform(df_test).select('category','text','class.result').toPandas()
df['result'] = df['result'].apply(lambda x: x[0])

In [14]:
from sklearn.metrics import confusion_matrix
print('confusion matrix is ')
print(confusion_matrix(df.category, df.result))
print('accuracy is:')
print(classification_report(df.category, df.result))
print('f1 score is:')
print(accuracy_score(df.category, df.result))

confusion matrix is 
[[7929  276  455  328]
 [  93 8717   58   70]
 [ 394   84 7448 1277]
 [ 292   62  587 8073]]
accuracy is:
              precision    recall  f1-score   support

           1       0.91      0.88      0.90      8988
           2       0.95      0.98      0.96      8938
           3       0.87      0.81      0.84      9203
           4       0.83      0.90      0.86      9014

    accuracy                           0.89     36143
   macro avg       0.89      0.89      0.89     36143
weighted avg       0.89      0.89      0.89     36143

f1 score is:
0.8899925296737957
