In [22]:
# --- Input data -------------------------------------------------------------
DATA_DIR = 'data'                  # directory that holds the dataset CSV
DATASET_ENCODING = "ISO-8859-1"    # character encoding passed to the CSV reader

# --- Split weights (passed to DataFrame.randomSplit) --------------------------
TRAIN_SIZE = 0.8
TEST_SIZE = 0.2
VAL_SIZE = 0.2

# --- Feature sizes ------------------------------------------------------------
VOCABULARY_SIZE = 5_000   # presumably the intended vocabulary cap -- TODO confirm against the vectorizer
FEATURES_NUMBER = 50      # presumably the target width after dimensionality reduction -- TODO confirm

In [13]:
from pyspark.sql import *
from pyspark.sql.types import *
from pyspark.sql.functions import udf
from pyspark.ml import Pipeline
from pyspark.ml.tuning import TrainValidationSplit
from pyspark.sql.functions import when
import utils
import numpy as np
import nltk
import pandas as pd
import os
import re

In [3]:
# Get (or create) the SparkSession used by every cell below.
session_builder = SparkSession.builder.appName('sentimentAnalysis')
spark = session_builder.getOrCreate()

In [4]:
def _get_schema_structure():
    """Build the StructType describing the six dataset columns.

    Columns, in order: ``target`` (integer), ``ids`` (long), then ``date``,
    ``flag``, ``user`` and ``text`` (all strings). Every field is nullable.

    Returns:
        StructType: the schema to pass to ``spark.read.schema`` or
        ``spark.createDataFrame``.
    """
    # (column name, Spark data type) pairs, in file order
    column_types = [
        ("target", IntegerType()),
        ("ids", LongType()),
        ("date", StringType()),
        ("flag", StringType()),
        ("user", StringType()),
        ("text", StringType()),
    ]
    return StructType(
        fields=[StructField(name, dtype, True) for name, dtype in column_types]
    )

In [14]:


# Reading the dataset: header-less CSV, parsed with the explicit schema
dataset_path = os.path.join(DATA_DIR, 'training_1600000_processed_noemoticon.csv')
df = (
    spark.read
    .schema(_get_schema_structure())
    .option('encoding', DATASET_ENCODING)
    .option('header', 'false')
    .csv(dataset_path)
)

In [6]:
# Preview the first five rows of the raw dataset.
df.show(5)

+------+----------+--------------------+--------+---------------+--------------------+
|target|       ids|                date|    flag|           user|                text|
+------+----------+--------------------+--------+---------------+--------------------+
|     0|1467810369|Mon Apr 06 22:19:...|NO_QUERY|_TheSpecialOne_|@switchfoot http:...|
|     0|1467810672|Mon Apr 06 22:19:...|NO_QUERY|  scotthamilton|is upset that he ...|
|     0|1467810917|Mon Apr 06 22:19:...|NO_QUERY|       mattycus|@Kenichan I dived...|
|     0|1467811184|Mon Apr 06 22:19:...|NO_QUERY|        ElleCTF|my whole body fee...|
|     0|1467811193|Mon Apr 06 22:19:...|NO_QUERY|         Karoli|@nationwideclass ...|
+------+----------+--------------------+--------+---------------+--------------------+
only showing top 5 rows



Remap the target labels from {0, 4} to {0, 1}: having values between 0 and 1 makes it easier to calculate the loss afterwards.

In [15]:
# Rewrite label 4 as 1; every other label value is kept unchanged.
is_label_four = df["target"] == 4
remapped_target = when(is_label_four, 1).otherwise(df["target"])
targetDf = df.withColumn("target", remapped_target)

In [16]:
# Preview the first five rows after the label remap.
targetDf.show(5)

+------+----------+--------------------+--------+---------------+--------------------+
|target|       ids|                date|    flag|           user|                text|
+------+----------+--------------------+--------+---------------+--------------------+
|     0|1467810369|Mon Apr 06 22:19:...|NO_QUERY|_TheSpecialOne_|@switchfoot http:...|
|     0|1467810672|Mon Apr 06 22:19:...|NO_QUERY|  scotthamilton|is upset that he ...|
|     0|1467810917|Mon Apr 06 22:19:...|NO_QUERY|       mattycus|@Kenichan I dived...|
|     0|1467811184|Mon Apr 06 22:19:...|NO_QUERY|        ElleCTF|my whole body fee...|
|     0|1467811193|Mon Apr 06 22:19:...|NO_QUERY|         Karoli|@nationwideclass ...|
+------+----------+--------------------+--------+---------------+--------------------+
only showing top 5 rows



In [17]:
# Count rows per target value to inspect the class distribution of the full dataset.
targetDf.groupBy('target').count().show()

+------+------+
|target| count|
+------+------+
|     1|800000|
|     0|800000|
+------+------+



Split the data into train, validation, and test sets.

In [23]:
# Two-stage random split with one fixed seed:
#   1) hold out the test set from the full data,
#   2) split the remaining rows into train and validation.
split_seed = 2
train_df, test = targetDf.randomSplit([TRAIN_SIZE, TEST_SIZE], seed=split_seed)
train, val = train_df.randomSplit([TRAIN_SIZE, VAL_SIZE], seed=split_seed)

Check that the target classes are still balanced in the training split.

In [24]:
# Count rows per target value in the training split.
train.groupBy('target').count().show()

+------+------+
|target| count|
+------+------+
|     1|511913|
|     0|512823|
+------+------+



In [25]:
%%time

from transformers.PreProcessingGeneral import PreProcessingGeneral
from transformers.LemmatizeStemStopWords import LemmatizeStemStopWords
from transformers.NegateSequence import NegateSequence
from transformers.PosProcessingGeneral import PosProcessingGeneral
from pyspark.ml.feature import CountVectorizer, IDF, Tokenizer, NGram


# TODO add spelling correction: Levenshtein Distance
# Earlier vectorizer idea, kept for reference:
#vectorizer = TfidfVectorizer(ngram_range=(1,2), max_features = VOCABULARY_SIZE, use_idf = False)

# Text-cleaning stages. The first reads "text" and writes "data_prep"; the
# next three are configured with "data_prep" as both input and output column.
pre_processing_general = PreProcessingGeneral(
    inputCol="text",
    outputCol="data_prep",
)
lemmatize_stem_stop_words = LemmatizeStemStopWords(
    inputCol="data_prep",
    outputCol="data_prep",
)
negate_sequence = NegateSequence(
    inputCol="data_prep",
    outputCol="data_prep",
)
pos_processing_general = PosProcessingGeneral(
    inputCol="data_prep",
    outputCol="data_prep",
)

# Feature stages: bigrams -> term counts (vocabulary capped at 15000) -> IDF weighting.
ngram = NGram(n=2, inputCol="data_prep", outputCol="ngrams")
count_vectorizer = CountVectorizer(
    inputCol="ngrams",
    outputCol="count_vect",
    vocabSize=15000,
)
idf = IDF(inputCol="count_vect", outputCol="features")

pipeline_stages = [
    pre_processing_general,
    lemmatize_stem_stop_words,
    negate_sequence,
    pos_processing_general,
    ngram,
    count_vectorizer,
    idf,
]
pipeline = Pipeline(stages=pipeline_stages)

# Fit the pipeline to training documents.
data_prep = pipeline.fit(train)

CPU times: user 81 ms, sys: 34 ms, total: 115 ms
Wall time: 7min 33s


In [26]:
%%time

# Apply the fitted pipeline model to the training split; the result carries the "features" column.
data = data_prep.transform(train)

CPU times: user 37.8 ms, sys: 15.5 ms, total: 53.3 ms
Wall time: 165 ms


Reduce the dimensionality of the TF-IDF features with PCA.

In [29]:
%%time

from pyspark.ml.feature import PCA

# Project the TF-IDF vectors onto the first FEATURES_NUMBER principal components.
#
# The previous k=10000 kept most of the (up to 15000) input dimensions, so it
# was not a real reduction. The recorded run also died inside
# RowMatrix.computeCovariance with "SparkContext was shut down".
#
# NOTE(review): Spark's PCA builds the full n_features x n_features covariance
# matrix, so memory is also driven by the CountVectorizer vocabulary size. If
# the fit still fails with a small k, lower the vocabulary size or raise the
# driver memory -- confirm on the target machine.
pca = PCA(k=FEATURES_NUMBER, inputCol='features', outputCol='pcaFeatures')
model = pca.fit(data)

Py4JJavaError: An error occurred while calling o679.fit.
: org.apache.spark.SparkException: Job 36 cancelled because SparkContext was shut down
	at org.apache.spark.scheduler.DAGScheduler$$anonfun$cleanUpAfterSchedulerStop$1.apply(DAGScheduler.scala:932)
	at org.apache.spark.scheduler.DAGScheduler$$anonfun$cleanUpAfterSchedulerStop$1.apply(DAGScheduler.scala:930)
	at scala.collection.mutable.HashSet.foreach(HashSet.scala:78)
	at org.apache.spark.scheduler.DAGScheduler.cleanUpAfterSchedulerStop(DAGScheduler.scala:930)
	at org.apache.spark.scheduler.DAGSchedulerEventProcessLoop.onStop(DAGScheduler.scala:2128)
	at org.apache.spark.util.EventLoop.stop(EventLoop.scala:84)
	at org.apache.spark.scheduler.DAGScheduler.stop(DAGScheduler.scala:2041)
	at org.apache.spark.SparkContext$$anonfun$stop$6.apply$mcV$sp(SparkContext.scala:1949)
	at org.apache.spark.util.Utils$.tryLogNonFatalError(Utils.scala:1340)
	at org.apache.spark.SparkContext.stop(SparkContext.scala:1948)
	at org.apache.spark.SparkContext$$anonfun$2.apply$mcV$sp(SparkContext.scala:575)
	at org.apache.spark.util.SparkShutdownHook.run(ShutdownHookManager.scala:216)
	at org.apache.spark.util.SparkShutdownHookManager$$anonfun$runAll$1$$anonfun$apply$mcV$sp$1.apply$mcV$sp(ShutdownHookManager.scala:188)
	at org.apache.spark.util.SparkShutdownHookManager$$anonfun$runAll$1$$anonfun$apply$mcV$sp$1.apply(ShutdownHookManager.scala:188)
	at org.apache.spark.util.SparkShutdownHookManager$$anonfun$runAll$1$$anonfun$apply$mcV$sp$1.apply(ShutdownHookManager.scala:188)
	at org.apache.spark.util.Utils$.logUncaughtExceptions(Utils.scala:1945)
	at org.apache.spark.util.SparkShutdownHookManager$$anonfun$runAll$1.apply$mcV$sp(ShutdownHookManager.scala:188)
	at org.apache.spark.util.SparkShutdownHookManager$$anonfun$runAll$1.apply(ShutdownHookManager.scala:188)
	at org.apache.spark.util.SparkShutdownHookManager$$anonfun$runAll$1.apply(ShutdownHookManager.scala:188)
	at scala.util.Try$.apply(Try.scala:192)
	at org.apache.spark.util.SparkShutdownHookManager.runAll(ShutdownHookManager.scala:188)
	at org.apache.spark.util.SparkShutdownHookManager$$anon$2.run(ShutdownHookManager.scala:178)
	at org.apache.hadoop.util.ShutdownHookManager$1.run(ShutdownHookManager.java:54)
	at org.apache.spark.scheduler.DAGScheduler.runJob(DAGScheduler.scala:737)
	at org.apache.spark.SparkContext.runJob(SparkContext.scala:2061)
	at org.apache.spark.SparkContext.runJob(SparkContext.scala:2158)
	at org.apache.spark.rdd.RDD$$anonfun$fold$1.apply(RDD.scala:1098)
	at org.apache.spark.rdd.RDDOperationScope$.withScope(RDDOperationScope.scala:151)
	at org.apache.spark.rdd.RDDOperationScope$.withScope(RDDOperationScope.scala:112)
	at org.apache.spark.rdd.RDD.withScope(RDD.scala:363)
	at org.apache.spark.rdd.RDD.fold(RDD.scala:1092)
	at org.apache.spark.rdd.RDD$$anonfun$treeAggregate$1.apply(RDD.scala:1161)
	at org.apache.spark.rdd.RDDOperationScope$.withScope(RDDOperationScope.scala:151)
	at org.apache.spark.rdd.RDDOperationScope$.withScope(RDDOperationScope.scala:112)
	at org.apache.spark.rdd.RDD.withScope(RDD.scala:363)
	at org.apache.spark.rdd.RDD.treeAggregate(RDD.scala:1137)
	at org.apache.spark.mllib.linalg.distributed.RowMatrix.computeColumnSummaryStatistics(RowMatrix.scala:433)
	at org.apache.spark.mllib.linalg.distributed.RowMatrix.computeCovariance(RowMatrix.scala:348)
	at org.apache.spark.mllib.linalg.distributed.RowMatrix.computePrincipalComponentsAndExplainedVariance(RowMatrix.scala:401)
	at org.apache.spark.mllib.feature.PCA.fit(PCA.scala:53)
	at org.apache.spark.ml.feature.PCA.fit(PCA.scala:99)
	at sun.reflect.NativeMethodAccessorImpl.invoke0(Native Method)
	at sun.reflect.NativeMethodAccessorImpl.invoke(NativeMethodAccessorImpl.java:62)
	at sun.reflect.DelegatingMethodAccessorImpl.invoke(DelegatingMethodAccessorImpl.java:43)
	at java.lang.reflect.Method.invoke(Method.java:498)
	at py4j.reflection.MethodInvoker.invoke(MethodInvoker.java:244)
	at py4j.reflection.ReflectionEngine.invoke(ReflectionEngine.java:357)
	at py4j.Gateway.invoke(Gateway.java:282)
	at py4j.commands.AbstractCommand.invokeMethod(AbstractCommand.java:132)
	at py4j.commands.CallCommand.execute(CallCommand.java:79)
	at py4j.GatewayConnection.run(GatewayConnection.java:238)
	at java.lang.Thread.run(Thread.java:745)


----------------------------------------
Exception happened during processing of request from ('127.0.0.1', 61149)
Traceback (most recent call last):
  File "/Users/carolinaabs/anaconda/lib/python3.6/socketserver.py", line 320, in _handle_request_noblock
    self.process_request(request, client_address)
  File "/Users/carolinaabs/anaconda/lib/python3.6/socketserver.py", line 351, in process_request
    self.finish_request(request, client_address)
  File "/Users/carolinaabs/anaconda/lib/python3.6/socketserver.py", line 364, in finish_request
    self.RequestHandlerClass(request, client_address, self)
  File "/Users/carolinaabs/anaconda/lib/python3.6/socketserver.py", line 724, in __init__
    self.handle()
  File "/Users/carolinaabs/anaconda/lib/python3.6/site-packages/pyspark/accumulators.py", line 269, in handle
    poll(accum_updates)
  File "/Users/carolinaabs/anaconda/lib/python3.6/site-packages/pyspark/accumulators.py", line 241, in poll
    if func():
  File "/Users/carolinaabs/a

In [35]:
from pyspark.mllib.linalg import Vectors as MLlibVectors
from pyspark.mllib.regression import LabeledPoint, RidgeRegressionModel, RidgeRegressionWithSGD

# RidgeRegressionWithSGD is part of the RDD-based MLlib API: it trains on an
# RDD of LabeledPoint, not on a DataFrame. Convert each row to a LabeledPoint,
# turning the pyspark.ml feature vector into its pyspark.mllib equivalent.
labeled_points = data.select('target', 'features').rdd.map(
    lambda row: LabeledPoint(row['target'], MLlibVectors.fromML(row['features']))
)

# initialWeights is left at its default so the weight vector matches the
# feature width; the earlier one-element array could not match it.
# NOTE(review): this fits a regression on a 0/1 label -- confirm that a
# classifier is not what is wanted here.
lrm = RidgeRegressionWithSGD.train(labeled_points, iterations=10)

ERROR:py4j.java_gateway:An error occurred while trying to connect to the Java server (127.0.0.1:61093)
Traceback (most recent call last):
  File "/Users/carolinaabs/anaconda/lib/python3.6/site-packages/py4j/java_gateway.py", line 929, in _get_connection
    connection = self.deque.pop()
IndexError: pop from an empty deque

During handling of the above exception, another exception occurred:

Traceback (most recent call last):
  File "/Users/carolinaabs/anaconda/lib/python3.6/site-packages/py4j/java_gateway.py", line 1067, in start
    self.socket.connect((self.address, self.port))
ConnectionRefusedError: [Errno 61] Connection refused


Py4JNetworkError: An error occurred while trying to connect to the Java server (127.0.0.1:61093)

In [None]:
# Apply the fitted PCA model; the projected vectors are written to the "pcaFeatures" column.
df_pca = model.transform(data)

In [None]:
# Keep only the label and the PCA-projected features (no action is triggered here).
df_pca.select('target','pcaFeatures')

In [None]:
# NOTE(review): `dfComp` is not defined in any visible cell of this notebook --
# confirm where it comes from, or remove this cell.
dfComp.show()

In [None]:
# Run the fitted pipeline model on the one-row test fixture.
# NOTE(review): `dframe_test` is only created in a later cell, so this cell
# fails on a fresh top-to-bottom run -- confirm the intended cell order.
data_test = data_prep.transform(dframe_test)

In [None]:
# Display the transformed training data (default row limit of show()).
data.show()

In [None]:
# Display only the final "features" column produced by the pipeline.
data.select('features').show()

In [None]:
# Print the column names and types of the transformed test fixture.
data_test.printSchema()

### Test cells

Let's start by testing out your code.

In [None]:
# NOTE(review): `numerical_dataframe` and `get_stop_words_list` are not defined
# in any visible cell, and `problem_unittests` is not among the notebook's
# top-level imports. This cell looks like a leftover from a template --
# confirm whether it still belongs here.

# importing tests
import problem_unittests as tests
# informal testing, print out the results of a called function
# create new `transformed_df`
transformed_df = numerical_dataframe()


# test numerical_dataframe function
tests.test_numerical_df(spark)



# check work
get_stop_words_list()
print('\nExample data: ')
transformed_df.head()

In [None]:
# Create dataframe: a three-row dataset for tests.
# All rows share the same leading columns; only the text differs.
sample_texts = [
    "studying I don't want b",
    'not like :-) Deep learning is so difficult :-/',
    ':-) Spark is fun',
]
r1, r2, r3 = (
    Row(0, 123, 'Monday 2019', '@test', 'carocat', text) for text in sample_texts
)
row_Seq = [r1, r2, r3]
dframe = spark.createDataFrame(row_Seq, _get_schema_structure())

In [None]:
# Creates a single-row dataset for tests.
r3 = Row(0, 123, 'Monday 2019', '@test', 'carocat', ':-) Spark is fun')
row_Seq = [r3]
dframe_test = spark.createDataFrame(data=row_Seq, schema=_get_schema_structure())

In [None]:
# Add a "pre_proc" column computed from the tweet text.
# NOTE(review): `lemmatize_stem_stop_words_udf` is not defined in any visible
# cell -- confirm where it comes from.
first = dframe.withColumn("pre_proc", lemmatize_stem_stop_words_udf(dframe.text))

In [None]:
# Display the fixture rows together with the new "pre_proc" column.
first.show()

In [None]:
# Manual check on a sample sentence.
# NOTE(review): `pos_processing` is not defined in any visible cell -- confirm
# where it comes from.
pos_processing(" i don't want bla bla happy i am doing some ML! ")

In [None]:
# Manual check on a sample sentence.
# NOTE(review): in the pipeline cells this name is bound to a
# LemmatizeStemStopWords transformer instance, so calling it with a plain
# string presumably relies on a function definition not visible here -- confirm.
lemmatize_stem_stop_words(" I don't studying !!!! But I loved flowers programming programmers ")

In [None]:
from transformers.PreProcessingGeneral import PreProcessingGeneral
from transformers.LemmatizeStemStopWords import LemmatizeStemStopWords
from transformers.NegateSequence import NegateSequence
from transformers.PosProcessingGeneral import PosProcessingGeneral

# Re-create the four text-cleaning transformers. The first reads the raw
# "text" column; the other three use the same column for input and output.
prep_col = "data_prep"
pre_processing_general = PreProcessingGeneral(inputCol="text", outputCol=prep_col)
lemmatize_stem_stop_words = LemmatizeStemStopWords(inputCol=prep_col, outputCol=prep_col)
negate_sequence = NegateSequence(inputCol=prep_col, outputCol=prep_col)
pos_processing_general = PosProcessingGeneral(inputCol=prep_col, outputCol=prep_col)

