In [8]:
from pyspark import SparkConf
from pyspark.sql import SparkSession
from pyspark.sql import functions as f
from pyspark.sql.types import *

In [9]:
# Tell the HDFS NameNode to leave safe mode.
# NOTE(review): presumably needed so the cells below can modify HDFS — confirm.
!hdfs dfsadmin -safemode leave

Safe mode is OFF


In [10]:
!hdfs dfs -rm -r ml-latest-small
!hdfs dfs -put ml-latest-small .

rm: `ml-latest-small': No such file or directory
put: `.': No such file or directory: `hdfs://namenode/user/root'


In [11]:
# Executor settings for the Spark application: 2 executors, 1 core and 1g of memory each.
conf = (
    SparkConf()
    .set("spark.executor.instances", "2")
    .set("spark.executor.cores", "1")
    .set("spark.executor.memory", "1g")
)

In [12]:
# Create (or reuse) the session on YARN using the executor settings from `conf`.
# The master is set after the conf is applied, so "yarn" always wins.
spark = (
    SparkSession.builder
    .config(conf=conf)
    .master("yarn")
    .appName("buyantuev_spark")
    .getOrCreate()
)

Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
23/12/13 00:38:27 WARN Client: Neither spark.yarn.jars nor spark.yarn.archive is set, falling back to uploading libraries under SPARK_HOME.


In [13]:
# Bare expression: renders the SparkSession object as this cell's output.
spark

### Скриншоты
Скриншоты находятся в папке /notebooks/images

## Playground

In [21]:
%%time
df = spark.range(int(1e6)).withColumn("value", f.lit("some value"))

CPU times: user 0 ns, sys: 5.24 ms, total: 5.24 ms
Wall time: 184 ms


In [22]:
%%time
df.show()

+---+----------+
| id|     value|
+---+----------+
|  0|some value|
|  1|some value|
|  2|some value|
|  3|some value|
|  4|some value|
|  5|some value|
|  6|some value|
|  7|some value|
|  8|some value|
|  9|some value|
| 10|some value|
| 11|some value|
| 12|some value|
| 13|some value|
| 14|some value|
| 15|some value|
| 16|some value|
| 17|some value|
| 18|some value|
| 19|some value|
+---+----------+
only showing top 20 rows

CPU times: user 2.51 ms, sys: 1.08 ms, total: 3.59 ms
Wall time: 123 ms


## Ratings and Tags

In [23]:
# Explicit column types for ratings.csv.
ratings_schema = (
    StructType()
    .add("userId", IntegerType())
    .add("movieId", IntegerType())
    .add("rating", DoubleType())
    .add("timestamp", LongType())
)

# Read the CSV with the header option enabled and the explicit schema applied.
ratings_df = (
    spark.read
    .format("csv")
    .option("header", "True")
    .schema(ratings_schema)
    .load("/ml-latest-small/ratings.csv")
)

In [24]:
%%time
ratings_df.show()

+------+-------+------+---------+
|userId|movieId|rating|timestamp|
+------+-------+------+---------+
|     1|      1|   4.0|964982703|
|     1|      3|   4.0|964981247|
|     1|      6|   4.0|964982224|
|     1|     47|   5.0|964983815|
|     1|     50|   5.0|964982931|
|     1|     70|   3.0|964982400|
|     1|    101|   5.0|964980868|
|     1|    110|   4.0|964982176|
|     1|    151|   5.0|964984041|
|     1|    157|   5.0|964984100|
|     1|    163|   5.0|964983650|
|     1|    216|   5.0|964981208|
|     1|    223|   3.0|964980985|
|     1|    231|   5.0|964981179|
|     1|    235|   4.0|964980908|
|     1|    260|   5.0|964981680|
|     1|    296|   3.0|964982967|
|     1|    316|   3.0|964982310|
|     1|    333|   5.0|964981179|
|     1|    349|   4.0|964982563|
+------+-------+------+---------+
only showing top 20 rows

CPU times: user 5.71 ms, sys: 0 ns, total: 5.71 ms
Wall time: 215 ms


In [27]:
# Explicit column types for tags.csv.
tags_schema = (
    StructType()
    .add("userId", IntegerType())
    .add("movieId", IntegerType())
    .add("tag", StringType())
    .add("timestamp", LongType())
)

# Read the CSV with the header option enabled and the explicit schema applied.
tags_df = (
    spark.read
    .format("csv")
    .option("header", "True")
    .schema(tags_schema)
    .load("/ml-latest-small/tags.csv")
)

In [28]:
%%time
tags_df.show()

[Stage 6:>                                                          (0 + 1) / 1]

+------+-------+-----------------+----------+
|userId|movieId|              tag| timestamp|
+------+-------+-----------------+----------+
|     2|  60756|            funny|1445714994|
|     2|  60756|  Highly quotable|1445714996|
|     2|  60756|     will ferrell|1445714992|
|     2|  89774|     Boxing story|1445715207|
|     2|  89774|              MMA|1445715200|
|     2|  89774|        Tom Hardy|1445715205|
|     2| 106782|            drugs|1445715054|
|     2| 106782|Leonardo DiCaprio|1445715051|
|     2| 106782|  Martin Scorsese|1445715056|
|     7|  48516|     way too long|1169687325|
|    18|    431|        Al Pacino|1462138765|
|    18|    431|         gangster|1462138749|
|    18|    431|            mafia|1462138755|
|    18|   1221|        Al Pacino|1461699306|
|    18|   1221|            Mafia|1461699303|
|    18|   5995|        holocaust|1455735472|
|    18|   5995|       true story|1455735479|
|    18|  44665|     twist ending|1456948283|
|    18|  52604|  Anthony Hopkins|

                                                                                

## Скриншоты для job-ов
Скриншоты находятся в папке /images.

Имена файлов имеют вид `spark_task1_*`.