<a href="https://colab.research.google.com/github/aydinmyilmaz/BootCampAssignments/blob/master/OneMillion_Classification_with_Medical_WordEmbeddings.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
import os

# Install java
! apt-get update -qq
! apt-get install -y openjdk-8-jdk-headless -qq > /dev/null

os.environ["JAVA_HOME"] = "/usr/lib/jvm/java-8-openjdk-amd64"
os.environ["PATH"] = os.environ["JAVA_HOME"] + "/bin:" + os.environ["PATH"]
! java -version

# Install pyspark
! pip install --ignore-installed -q pyspark==2.4.4
! pip install --ignore-installed -q spark-nlp==2.5.4

openjdk version "1.8.0_265"
OpenJDK Runtime Environment (build 1.8.0_265-8u265-b01-0ubuntu2~18.04-b01)
OpenJDK 64-Bit Server VM (build 25.265-b01, mixed mode)
[K     |████████████████████████████████| 215.7MB 62kB/s 
[K     |████████████████████████████████| 204kB 37.6MB/s 
[?25h  Building wheel for pyspark (setup.py) ... [?25l[?25hdone
[K     |████████████████████████████████| 133kB 2.8MB/s 
[?25h

In [2]:
import sparknlp

# Create the Spark session through Spark NLP's start helper.
spark = sparknlp.start()

# Print the library versions this run is using, one per line.
for caption, version in (
    ("Spark NLP version", sparknlp.version()),
    ("Apache Spark version:", spark.version),
):
    print(caption, version)

Spark NLP version 2.5.4
Apache Spark version: 2.4.4


In [3]:
! wget https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.de.300.vec.gz

--2020-09-10 20:01:26--  https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.de.300.vec.gz
Resolving dl.fbaipublicfiles.com (dl.fbaipublicfiles.com)... 172.67.9.4, 104.22.74.142, 104.22.75.142, ...
Connecting to dl.fbaipublicfiles.com (dl.fbaipublicfiles.com)|172.67.9.4|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 1278030050 (1.2G) [binary/octet-stream]
Saving to: ‘cc.de.300.vec.gz’


2020-09-10 20:02:37 (17.4 MB/s) - ‘cc.de.300.vec.gz’ saved [1278030050/1278030050]



In [4]:
!gunzip cc.de.300.vec.gz

In [5]:

# Read the labelled posts; the first CSV line supplies the column names.
# No schema is requested, so every column (label and len included) is read as a string.
csv_reader = spark.read.format("csv").option("header", "true")
df1 = csv_reader.load("oneMillion_3271.csv")

# Preview the first ten rows.
df1.head(10)

[Row(_c0='2952', label='0', ID_Post='3326', Clean_text='Top qualifizierte Leute verdienen auch viel.', len='44'),
 Row(_c0='3055', label='1', ID_Post='5321', Clean_text='Gott sei dank ist für sie eine Umfrage alles, alles Negative wird für sie wegen einer Umfrage unwichtig, weil Manager befragt wurden...', len='135'),
 Row(_c0='3232', label='0', ID_Post='5590', Clean_text='Sorry, aber die FPÖ tut eigentlich gar nichts und gewinnt TROTZDEM.', len='67'),
 Row(_c0='3278', label='1', ID_Post='6015', Clean_text='Weil es dein meisten Leuten verständlicherweise vollkommen egal ist, was die Gesellschaft oder jede andere dahergelaufene Diskursgruppe von ihnen hält.', len='151'),
 Row(_c0='3301', label='0', ID_Post='8213', Clean_text='Na wer weis was da vorgefallen ist...', len='37'),
 Row(_c0='3333', label='1', ID_Post='9724', Clean_text='Gabalier?Künstler?Bruahahaha!', len='29'),
 Row(_c0='3360', label='0', ID_Post='12986', Clean_text='Very Urgent Bastard,l am Mr. Benson Baakari, the branch ma

In [6]:
from sparknlp.base import *
from sparknlp.annotator import *
from pyspark.ml import Pipeline

# Wrap the raw "Clean_text" column into Spark NLP document annotations.
documentAssembler = (
    DocumentAssembler()
    .setInputCol("Clean_text")
    .setOutputCol("document")
)

# Split each document into tokens.
tokenizer = (
    Tokenizer()
    .setInputCols(["document"])
    .setOutputCol("token")
)

# Cleanup patterns handed to the Normalizer.
#
# `pattern` is a raw string so that `\d` reaches the regex engine untouched
# instead of being parsed by Python as an (invalid) string escape. It spans a
# whole token of at least 8 characters that contains both a letter and a digit.
pattern = r'^.*(?=.{8,})(?=.*[a-zA-ZäöüÄÖÜß])(?=.*\d).*$'
# `pattern2` is a character class of punctuation/symbols plus the digits 0-9.
pattern2 = '''[~!@#$^%&*\\(\\)_+={}\\[\\]|;:\"'<,>.?`/\\\\–[0-9]]'''
# Alternative punctuation-only class, currently disabled:
#pattern2 = '!"#$%&\'()*+,-./:;<=>?@[\\]^_`{|}~'
normalizer = (
    Normalizer()
    .setInputCols(["token"])
    .setOutputCol("normalized")
    .setLowercase(False)  # keep the original casing
    # pattern2 contains [0-9], so digits are stripped together with punctuation.
    # NOTE(review): when no cleanup patterns are set the Normalizer uses its
    # built-in default, which reportedly keeps letters only; confirm in the docs.
    .setCleanupPatterns([pattern, pattern2])
)

# Word embeddings looked up for the normalized tokens.
# Active source: 200-dimensional binary vectors stored in 'model_fst200.bin'.
# NOTE(review): no visible cell creates or downloads 'model_fst200.bin'; it has
# to be in the working directory before the pipeline is fitted.
custom_embeddings = WordEmbeddings()\
  .setInputCols(["document", "normalized"])\
  .setOutputCol("word_embeddings")\
  .setStoragePath('model_fst200.bin', "BINARY")\
  .setDimension(200)  # ATTENTION: must match the vector size stored in the file

# Alternative (disabled): the 300-dimensional FastText text vectors 'cc.de.300.vec'.
# custom_embeddings = WordEmbeddings()\
#   .setInputCols(["document", "normalized"])\
#   .setOutputCol("word_embeddings")\
#   .setStoragePath('cc.de.300.vec', "TEXT")\
#   .setDimension(300)


# Pool the word vectors of each document into one vector using the AVERAGE strategy.
embeddingsSentence = (
    SentenceEmbeddings()
    .setInputCols(["document", "word_embeddings"])
    .setOutputCol("sentence_embeddings")
    .setPoolingStrategy("AVERAGE")
)

# Trainable classifier on the pooled vectors: targets come from the "label"
# column, training is capped at 100 epochs, and output logs are enabled.
classsifierdl = (
    ClassifierDLApproach()
    .setInputCols(["sentence_embeddings"])
    .setOutputCol("prediction")
    .setLabelColumn("label")
    .setMaxEpochs(100)
    .setEnableOutputLogs(True)
)


# Stages run in list order; each one reads columns produced by earlier stages.
pipeline_stages = [
    documentAssembler,    # text   -> document
    tokenizer,            # document -> token
    normalizer,           # token  -> normalized
    custom_embeddings,    # normalized -> word_embeddings
    embeddingsSentence,   # word_embeddings -> sentence_embeddings
    classsifierdl,        # sentence_embeddings -> prediction
]
pipeline = Pipeline(stages=pipeline_stages)


In [7]:
# 80/20 train/test split; the seed is fixed for reproducibility.
(trainingData, testData) = df1.randomSplit([0.8, 0.2], seed = 575)
# Report the size of each split under its own label (the training count was
# previously printed as "Test Dataset Count").
print("Training Dataset Count: " + str(trainingData.count()))
print("Test Dataset Count: " + str(testData.count()))

Test Dataset Count: 2616
Test Dataset Count: 655


In [8]:
# Fit the whole pipeline on the training split; the result is the fitted
# model used for scoring the test split.
pipelineModel = pipeline.fit(trainingData)

In [9]:
from sklearn.metrics import classification_report, accuracy_score

# Score the held-out split and bring the selected columns into pandas.
scored = pipelineModel.transform(testData)
df2 = scored.select("document","word_embeddings","label",'prediction.result').toPandas()

# Each 'result' cell is a sequence; keep its first entry as the predicted label.
df2['result'] = df2['result'].map(lambda preds: preds[0])

# Per-class precision/recall/F1, followed by overall accuracy.
print(classification_report(df2.label, df2.result))
print(accuracy_score(df2.label, df2.result))


              precision    recall  f1-score   support

           0       0.56      0.58      0.57       348
           1       0.51      0.49      0.50       307

    accuracy                           0.54       655
   macro avg       0.54      0.54      0.54       655
weighted avg       0.54      0.54      0.54       655

0.5389312977099237


In [10]:
# How many test rows were assigned to each predicted class.
df2["result"].value_counts()

0    358
1    297
Name: result, dtype: int64

In [11]:
# How many test rows carry each true label, for comparison with the predictions.
df2["label"].value_counts()

0    348
1    307
Name: label, dtype: int64