In [None]:
import os

# Install java
! apt-get update -qq
! apt-get install -y openjdk-8-jdk-headless -qq > /dev/null

os.environ["JAVA_HOME"] = "/usr/lib/jvm/java-8-openjdk-amd64"
os.environ["PATH"] = os.environ["JAVA_HOME"] + "/bin:" + os.environ["PATH"]
! java -version

# Install pyspark
! pip install --ignore-installed pyspark==2.4.4

# Install Spark NLP
! pip install --ignore-installed spark-nlp==2.5.0

openjdk version "1.8.0_275"
OpenJDK Runtime Environment (build 1.8.0_275-8u275-b01-0ubuntu1~18.04-b01)
OpenJDK 64-Bit Server VM (build 25.275-b01, mixed mode)
Collecting pyspark==2.4.4
[?25l  Downloading https://files.pythonhosted.org/packages/87/21/f05c186f4ddb01d15d0ddc36ef4b7e3cedbeb6412274a41f26b55a650ee5/pyspark-2.4.4.tar.gz (215.7MB)
[K     |████████████████████████████████| 215.7MB 30kB/s 
[?25hCollecting py4j==0.10.7
[?25l  Downloading https://files.pythonhosted.org/packages/e3/53/c737818eb9a7dc32a7cd4f1396e787bd94200c3997c72c1dbe028587bd76/py4j-0.10.7-py2.py3-none-any.whl (197kB)
[K     |████████████████████████████████| 204kB 41.4MB/s 
[?25hBuilding wheels for collected packages: pyspark
  Building wheel for pyspark (setup.py) ... [?25l[?25hdone
  Created wheel for pyspark: filename=pyspark-2.4.4-py2.py3-none-any.whl size=216130389 sha256=d12ae20410104192d76fdcdb936b4997cc8fa05ad29896e816bff7d85b32dc94
  Stored in directory: /root/.cache/pip/wheels/ab/09/4d/0d18423005

In [None]:
import sparknlp 

# Start a Spark session through Spark NLP's helper.
spark = sparknlp.start()

# Report the library versions in use so the run can be matched to the pins above.
for label, version in (
    ("Spark NLP version: ", sparknlp.version()),
    ("Apache Spark version: ", spark.version),
):
    print(label, version)

Spark NLP version:  2.5.0
Apache Spark version:  2.4.4


In [None]:
# Mount the user's Google Drive into the Colab filesystem at /content/drive
# so files stored there can be read by later cells.
from google.colab import drive
drive.mount('/content/drive')


Mounted at /content/drive


In [None]:
# Load the BBC text CSV from the mounted Drive, using the first row as the header.
# The path is absolute (Drive is mounted at /content/drive) so the read does not
# depend on the kernel's current working directory.
Dataset = spark.read.option("header", True).csv('/content/drive/My Drive/bbc-text.csv')

In [None]:
# Preview the first 10 rows of the loaded data.
Dataset.show(n=10)

+-------------+--------------------+
|     category|                text|
+-------------+--------------------+
|         tech|tv future in the ...|
|     business|worldcom boss  le...|
|        sport|tigers wary of fa...|
|        sport|yeading face newc...|
|entertainment|ocean s twelve ra...|
|     politics|howard hits back ...|
|     politics|blair prepares to...|
|        sport|henman hopes ende...|
|        sport|wilkinson fit to ...|
|entertainment|last star wars  n...|
+-------------+--------------------+
only showing top 10 rows



In [None]:
# Split the data roughly 70/30 into training and test sets.
# A fixed seed makes the split (and therefore the reported metrics)
# repeatable across Restart & Run All; without it every run samples differently.
SPLIT_SEED = 42
df_train, df_test = Dataset.randomSplit([.7, .3], seed=SPLIT_SEED)

In [None]:
# Preview the first 5 rows of the training split.
df_train.show(n=5)

+--------+--------------------+
|category|                text|
+--------+--------------------+
|business|absa and barclays...|
|business|aids and climate ...|
|business|alfa romeos  to g...|
|business|algeria hit by fu...|
|business|amex shares up on...|
+--------+--------------------+
only showing top 5 rows



In [None]:
from sparknlp.base import *
from sparknlp.annotator import *
from pyspark.ml import Pipeline
import pandas as pd


In [None]:
# Each stage reads the column(s) produced by the previous one:
# text -> document -> token -> normalized -> cleanTokens -> lemma
#      -> embeddings -> sentence_embeddings -> class

# Wrap the raw 'text' column as a 'document' annotation.
document_assembler = (
    DocumentAssembler()
    .setInputCol('text')
    .setOutputCol('document')
)

# Split each document into tokens.
tokenizer = (
    Tokenizer()
    .setInputCols(['document'])
    .setOutputCol('token')
)

# Normalize the tokens.
normalizer = (
    Normalizer()
    .setInputCols(['token'])
    .setOutputCol('normalized')
)

# Drop stop words, matching them case-insensitively.
stopwords_cleaner = (
    StopWordsCleaner()
    .setInputCols(['normalized'])
    .setOutputCol('cleanTokens')
    .setCaseSensitive(False)
)

# Lemmatize with the pretrained 'lemma_antbnc' model (downloaded on first use).
lemma = (
    LemmatizerModel.pretrained('lemma_antbnc')
    .setInputCols(['cleanTokens'])
    .setOutputCol('lemma')
)

# Token-level embeddings from the pretrained English 'bert_base_cased' model.
word_embeddings = (
    BertEmbeddings.pretrained('bert_base_cased', 'en')
    .setInputCols(['document', 'lemma'])
    .setOutputCol("embeddings")
    .setCaseSensitive(False)
)

# Pool the token embeddings into one vector per document by averaging.
embeddingsSentence = (
    SentenceEmbeddings()
    .setInputCols(['document', 'embeddings'])
    .setOutputCol("sentence_embeddings")
    .setPoolingStrategy('AVERAGE')
)

# Trainable classifier: learns 'category' from the sentence embeddings,
# for at most 5 epochs, with training output logs enabled.
classifierdl = (
    ClassifierDLApproach()
    .setInputCols(['sentence_embeddings'])
    .setOutputCol('class')
    .setLabelColumn('category')
    .setMaxEpochs(5)
    .setEnableOutputLogs(True)
)

# Assemble the stages in execution order.
pipeline_stages = [
    document_assembler,
    tokenizer,
    normalizer,
    stopwords_cleaner,
    lemma,
    word_embeddings,
    embeddingsSentence,
    classifierdl,
]
bert_pipeline = Pipeline(stages=pipeline_stages)


lemma_antbnc download started this may take some time.
Approximate size to download 907.6 KB
[OK!]
bert_base_cased download started this may take some time.
Approximate size to download 389.2 MB
[OK!]


In [None]:
# Fit every pipeline stage (including the classifier) on the training split.
bert_pipelineModel = bert_pipeline.fit(dataset=df_train)

In [None]:
from sklearn.metrics import classification_report, accuracy_score
# Run the fitted pipeline on the held-out split and pull the label, text and
# predicted class into pandas for scoring.
predictions = bert_pipelineModel.transform(df_test)
df = predictions.select('category', 'text', 'class.result').toPandas()

# 'result' holds a list per row; keep its first element as the predicted label.
df['result'] = [labels[0] for labels in df['result']]

# Per-class precision/recall/F1, followed by overall accuracy.
print(classification_report(df['category'], df['result']))
print(accuracy_score(df['category'], df['result']))

               precision    recall  f1-score   support

     business       0.96      0.96      0.96       153
entertainment       0.93      0.94      0.94       109
     politics       0.94      0.94      0.94       109
        sport       0.98      0.99      0.99       153
         tech       0.96      0.92      0.94       116

     accuracy                           0.96       640
    macro avg       0.95      0.95      0.95       640
 weighted avg       0.96      0.96      0.96       640

0.95625
