In [1]:
from pyspark import SparkContext, SparkConf 
from pyspark.sql import SparkSession

In [2]:
# Spark configuration for this lab: application name and the master to run on.
# The SparkConf setters mutate the object in place, so they can be applied one per line.
conf = SparkConf()
conf.setAppName("Lab1_Nepryakhin")
conf.setMaster('yarn')

In [None]:
# Create the session through the builder so that re-running this cell reuses
# the already-active SparkContext/SparkSession. Constructing
# SparkContext(conf=conf) directly a second time in the same kernel raises
# ValueError, which breaks iterative work in the notebook.
spark = SparkSession.builder.config(conf=conf).getOrCreate()
# Keep the `sc` handle available for any cell that uses the low-level RDD API.
sc = spark.sparkContext

In [None]:
# Load the trips file as CSV: the first row is the header, column types are
# inferred, and timestamps are parsed with the 'M/d/y H:m' pattern.
tripData = (
    spark.read
    .options(header=True, inferSchema=True, timestampFormat='M/d/y H:m')
    .csv("trips.txt")
)

In [None]:
# Load the stations file as CSV: the first row is the header, column types are
# inferred, and timestamps are parsed with the date-only 'M/d/y' pattern.
stationData = (
    spark.read
    .options(header=True, inferSchema=True, timestampFormat='M/d/y')
    .csv("stations.txt")
)

In [None]:
# Register both DataFrames as temporary views so the Spark SQL queries in the
# following cells can refer to them as `stations` and `trips`.
for view_name, frame in (("stations", stationData), ("trips", tripData)):
    frame.createOrReplaceTempView(view_name)

#### Найти количество велосипедов в системе

In [None]:
# Count the distinct bike ids that appear in the trips view.
bike_count_query = """
SELECT COUNT(DISTINCT bike_id) AS Kol_vo_velosipedov_v_sisteme
FROM trips
"""
endTrips = spark.sql(bike_count_query)
endTrips.show()


#### Найти велосипед с максимальным временем пробега

In [None]:
# Sum trip durations per bike and keep the single bike with the largest total.
top_bike_query = """
SELECT bike_id, SUM(duration) AS duration_sum
FROM trips
    GROUP BY bike_id
    ORDER BY duration_sum DESC
    LIMIT 1 
"""
endTrips = spark.sql(top_bike_query)
endTrips.show()

#### Найти наибольшее геодезическое расстояние между станциями

In [None]:
# Largest great-circle distance between any two stations.
#
# The distance uses the spherical law of cosines: ACOS(...) gives the central
# angle, DEGREES converts it to degrees of arc, and 111.111 scales degrees to
# kilometres. LEAST(1.0, ...) guards ACOS against floating-point values
# slightly above 1.
#
# Each unordered pair is evaluated once (a.id < b.id) instead of twice
# (a.id <> b.id), and the rows are ordered directly: grouping by the name pair
# and taking MAX added a shuffle without changing the top distance.
# Output columns are unchanged: from_city, to_city, max_dist.
endTrips = spark.sql("""
SELECT a.name AS from_city,
       b.name AS to_city,
       111.111 *
        DEGREES(ACOS(LEAST(1.0, COS(RADIANS(a.lat))
             * COS(RADIANS(b.lat))
             * COS(RADIANS(a.long - b.long))
             + SIN(RADIANS(a.lat))
             * SIN(RADIANS(b.lat))))) AS max_dist
FROM stations AS a
JOIN stations AS b ON a.id < b.id
ORDER BY max_dist DESC
LIMIT 1
""")
endTrips.show()


#### Найти путь велосипеда с максимальным временем пробега через станции.

In [None]:
# Route of the bike with the largest total ride time: its trips listed in
# chronological order by start_date.
#
# The bike is selected by a scalar subquery rather than a hard-coded id, so
# the cell stays correct when the input data changes and does not rely on a
# value copied by hand from another cell's output.
endTrips = spark.sql("""
SELECT id, start_station_name, end_station_name, duration
FROM trips
    WHERE bike_id = (
        SELECT bike_id
        FROM (
            SELECT bike_id, SUM(duration) AS duration_sum
            FROM trips
                GROUP BY bike_id
                ORDER BY duration_sum DESC
                LIMIT 1
        ) AS top_bike
    )
    ORDER BY start_date
""")
endTrips.show()

#### Найти пользователей, потративших на поездки более 3 часов.

In [None]:
# Users (identified here by zip_code) whose total ride time exceeds 3 hours,
# ordered by total ride time ascending.
#
# NOTE(review): this assumes `duration` is stored in seconds, as in the
# Bay Area Bike Share trips export, so 3 hours = 3 * 60 * 60 = 10800.
# The previous threshold of 180 would correspond to only 3 minutes.
# Confirm the unit against the data source.
endTrips = spark.sql("""
SELECT 
    zip_code, 
    SUM (duration) AS sum_duration
FROM trips
    GROUP BY zip_code
    HAVING sum_duration > 3 * 60 * 60
    ORDER BY sum_duration
""")
endTrips.show()
