In [1]:
!pip install spark-nlp==1.7.3



In [2]:
from pyspark import SparkContext, SparkConf
from pyspark.sql.types import StringType, DataType
import pandas as pd
from pyspark.sql.functions import udf,col
from pyspark.sql import functions as F
pd.set_option('max_colwidth', 800)
import matplotlib
from pyspark.sql.types import TimestampType, StructType, StructField, StringType, FloatType, IntegerType, LongType
from pyspark.sql import types as T
from datetime import datetime

In [3]:
from pyspark.sql import SparkSession
spark = SparkSession.builder \
    .appName(" Sparkify Localy") \
    .config("spark.jars.packages", "JohnSnowLabs:spark-nlp:1.8.2") \
    .getOrCreate()
spark

In [4]:
# get parameters
spark.sparkContext.getConf().getAll()

[('spark.app.name', ' Sparkify Localy'),
 ('spark.jars.packages', 'JohnSnowLabs:spark-nlp:1.8.2'),
 ('spark.repl.local.jars',
  'file:///home/anthelix/.ivy2/jars/JohnSnowLabs_spark-nlp-1.8.2.jar,file:///home/anthelix/.ivy2/jars/com.typesafe_config-1.3.0.jar,file:///home/anthelix/.ivy2/jars/org.rocksdb_rocksdbjni-5.17.2.jar,file:///home/anthelix/.ivy2/jars/org.apache.hadoop_hadoop-aws-2.7.3.jar,file:///home/anthelix/.ivy2/jars/com.amazonaws_aws-java-sdk-1.7.4.jar,file:///home/anthelix/.ivy2/jars/org.tensorflow_tensorflow-1.8.0.jar,file:///home/anthelix/.ivy2/jars/com.github.universal-automata_liblevenshtein-3.0.0.jar,file:///home/anthelix/.ivy2/jars/com.navigamez_greex-1.0.jar,file:///home/anthelix/.ivy2/jars/net.sf.trove4j_trove4j-3.0.3.jar,file:///home/anthelix/.ivy2/jars/commons-logging_commons-logging-1.1.3.jar,file:///home/anthelix/.ivy2/jars/org.apache.httpcomponents_httpclient-4.2.5.jar,file:///home/anthelix/.ivy2/jars/joda-time_joda-time-2.10.5.jar,file:///home/anthelix/.ivy2/ja

In [5]:
log_data_path = "./log_data/*.json"
df_log = spark.read.json(log_data_path)
df_log
print('DataFrame rows: %d' % df_log.count())
df_log.printSchema()
print('DataFrame schema: %s' % df_log)
df_log.take(2)

DataFrame rows: 8056
root
 |-- artist: string (nullable = true)
 |-- auth: string (nullable = true)
 |-- firstName: string (nullable = true)
 |-- gender: string (nullable = true)
 |-- itemInSession: long (nullable = true)
 |-- lastName: string (nullable = true)
 |-- length: double (nullable = true)
 |-- level: string (nullable = true)
 |-- location: string (nullable = true)
 |-- method: string (nullable = true)
 |-- page: string (nullable = true)
 |-- registration: double (nullable = true)
 |-- sessionId: long (nullable = true)
 |-- song: string (nullable = true)
 |-- status: long (nullable = true)
 |-- ts: long (nullable = true)
 |-- userAgent: string (nullable = true)
 |-- userId: string (nullable = true)

DataFrame schema: DataFrame[artist: string, auth: string, firstName: string, gender: string, itemInSession: bigint, lastName: string, length: double, level: string, location: string, method: string, page: string, registration: double, sessionId: bigint, song: string, status: bigint

[Row(artist='Harmonia', auth='Logged In', firstName='Ryan', gender='M', itemInSession=0, lastName='Smith', length=655.77751, level='free', location='San Jose-Sunnyvale-Santa Clara, CA', method='PUT', page='NextSong', registration=1541016707796.0, sessionId=583, song='Sehr kosmisch', status=200, ts=1542241826796, userAgent='"Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Ubuntu Chromium/36.0.1985.125 Chrome/36.0.1985.125 Safari/537.36"', userId='26'),
 Row(artist='The Prodigy', auth='Logged In', firstName='Ryan', gender='M', itemInSession=1, lastName='Smith', length=260.07465, level='free', location='San Jose-Sunnyvale-Santa Clara, CA', method='PUT', page='NextSong', registration=1541016707796.0, sessionId=583, song='The Big Gundown', status=200, ts=1542242481796, userAgent='"Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Ubuntu Chromium/36.0.1985.125 Chrome/36.0.1985.125 Safari/537.36"', userId='26')]

In [6]:
log_data_path = "./log_data/*.json"
log_schema = StructType([
    StructField("artist", StringType()),
    StructField("auth", StringType()),
    StructField("firstName", StringType()),
    StructField("gender", StringType()),
    StructField("itemInSession", IntegerType()),
    StructField("lastName", StringType()),
    StructField("length", FloatType()),    
    StructField("level", StringType()),
    StructField("location", StringType()),
    StructField("method", StringType()),
    StructField("page", StringType()),
    StructField("registration", FloatType()),
    StructField("sessionId", StringType()),
    StructField("song", StringType()),
    StructField("status", IntegerType()),
    StructField("ts", LongType()),
    StructField("userAgent", StringType()),
    StructField("userId", IntegerType())
])
df_logS = spark.read.json(log_data_path, schema=log_schema)
df_logS
print('DataFrame rows: %d' % df_logS.count())
df_logS.printSchema()
print('DataFrame schema: %s' % df_logS)
df_logS.take(2)

DataFrame rows: 8056
root
 |-- artist: string (nullable = true)
 |-- auth: string (nullable = true)
 |-- firstName: string (nullable = true)
 |-- gender: string (nullable = true)
 |-- itemInSession: integer (nullable = true)
 |-- lastName: string (nullable = true)
 |-- length: float (nullable = true)
 |-- level: string (nullable = true)
 |-- location: string (nullable = true)
 |-- method: string (nullable = true)
 |-- page: string (nullable = true)
 |-- registration: float (nullable = true)
 |-- sessionId: string (nullable = true)
 |-- song: string (nullable = true)
 |-- status: integer (nullable = true)
 |-- ts: long (nullable = true)
 |-- userAgent: string (nullable = true)
 |-- userId: integer (nullable = true)

DataFrame schema: DataFrame[artist: string, auth: string, firstName: string, gender: string, itemInSession: int, lastName: string, length: float, level: string, location: string, method: string, page: string, registration: float, sessionId: string, song: string, status: int,

[Row(artist=None, auth=None, firstName=None, gender=None, itemInSession=None, lastName=None, length=None, level=None, location=None, method=None, page=None, registration=None, sessionId=None, song=None, status=None, ts=None, userAgent=None, userId=None),
 Row(artist=None, auth=None, firstName=None, gender=None, itemInSession=None, lastName=None, length=None, level=None, location=None, method=None, page=None, registration=None, sessionId=None, song=None, status=None, ts=None, userAgent=None, userId=None)]