In [84]:
from pyspark.sql import SparkSession

# Create the notebook's Spark session, or reuse one that is already running.
spark = (
    SparkSession.builder
    .appName('third-query')
    .getOrCreate()
)

In [85]:
from functools import reduce
# Load the pipe-delimited movie catalogue. No header or schema is supplied, so
# Spark auto-names the columns (_c0, _c1, ...) and, per the CSV reader's
# defaults, leaves every value as a string.
items_df = spark.read.csv('u.item', sep='|')

# Auto-generated names, captured before the rename below.
item_old_columns = items_df.columns
item_new_columns = [
    'item-id', 'movie-title', 'release-date', 'video-release-data', 'IMDB-url',
    'unknown', 'action', 'adventure', 'animation', "children's", 'comedy',
    'crime', 'documentary', 'drama', 'fantasy', 'film-noir', 'horror',
    'musical', 'mystery', 'romance', 'sci-fi', 'thriller', 'war', 'western',
]

# Rename all columns positionally in one step. toDF() requires exactly one name
# per column, so a mismatch between the file layout and the list above fails
# here instead of leaving a partially renamed frame.
items_df = items_df.toDF(*item_new_columns)

In [86]:
# Preview the first 20 catalogue rows; long cell values are truncated.
items_df.show(n=20, truncate=True)

+-------+--------------------+------------+------------------+--------------------+-------+------+---------+---------+----------+------+-----+-----------+-----+-------+---------+------+-------+-------+-------+------+--------+---+-------+
|item-id|         movie-title|release-date|video-release-data|            IMDB-url|unknown|action|adventure|animation|children's|comedy|crime|documentary|drama|fantasy|film-noir|horror|musical|mystery|romance|sci-fi|thriller|war|western|
+-------+--------------------+------------+------------------+--------------------+-------+------+---------+---------+----------+------+-----+-----------+-----+-------+---------+------+-------+-------+-------+------+--------+---+-------+
|      1|    Toy Story (1995)| 01-Jan-1995|              null|http://us.imdb.co...|      0|     0|        0|        1|         1|     1|    0|          0|    0|      0|        0|     0|      0|      0|      0|     0|       0|  0|      0|
|      2|    GoldenEye (1995)| 01-Jan-1995|     

In [87]:
# Single-column lookup of occupations; the auto-named `_c0` becomes `occupation`.
occupation_df = (
    spark.read.csv('u.occupation')
    .withColumnRenamed("_c0", 'occupation')
)
# Preview the first 20 rows.
occupation_df.show(n=20, truncate=True)

+-------------+
|   occupation|
+-------------+
|administrator|
|       artist|
|       doctor|
|     educator|
|     engineer|
|entertainment|
|    executive|
|   healthcare|
|    homemaker|
|       lawyer|
|    librarian|
|    marketing|
|         none|
|        other|
|   programmer|
|      retired|
|     salesman|
|    scientist|
|      student|
|   technician|
+-------------+
only showing top 20 rows



In [88]:
# Column names for the pipe-delimited user file, in file order.
user_columns = ['user-id', 'age', 'gender', 'occupation', 'zip code']
user_df = spark.read.csv('u.user', sep='|')

# Auto-generated names (_c0, _c1, ...), captured before the rename below.
oldColumns = user_df.columns

# Rename all columns positionally in one step (toDF() fails fast if the file
# does not have exactly one column per name), then discard the zip code.
user_df = user_df.toDF(*user_columns).drop('zip code')

In [89]:
# Preview the first 20 users.
user_df.show(n=20, truncate=True)

+-------+---+------+-------------+
|user-id|age|gender|   occupation|
+-------+---+------+-------------+
|      1| 24|     M|   technician|
|      2| 53|     F|        other|
|      3| 23|     M|       writer|
|      4| 24|     M|   technician|
|      5| 33|     F|        other|
|      6| 42|     M|    executive|
|      7| 57|     M|administrator|
|      8| 36|     M|administrator|
|      9| 29|     M|      student|
|     10| 53|     M|       lawyer|
|     11| 39|     F|        other|
|     12| 28|     F|        other|
|     13| 47|     M|     educator|
|     14| 45|     M|    scientist|
|     15| 49|     F|     educator|
|     16| 21|     M|entertainment|
|     17| 30|     M|   programmer|
|     18| 35|     F|        other|
|     19| 40|     M|    librarian|
|     20| 42|     F|    homemaker|
+-------+---+------+-------------+
only showing top 20 rows



In [90]:
# Duplicate of the import in the items cell; hoisted to the top of this cell
# and kept so nothing that relies on `reduce` being in scope is affected.
from functools import reduce

# Column names for the tab-delimited ratings file, in file order.
user_data_columns = ['user-id', 'item-id', 'rating', 'timestamp']
user_data_df = spark.read.csv('u.data', sep="\t")

# Auto-generated names (_c0, _c1, ...), captured before the rename below.
oldColumns = user_data_df.columns

# Rename all columns positionally in one step (toDF() fails fast if the file
# does not have exactly one column per name), then discard the timestamp.
user_data_df = user_data_df.toDF(*user_data_columns).drop('timestamp')

In [91]:
# Preview the first 20 ratings.
user_data_df.show(n=20, truncate=True)

+-------+-------+------+
|user-id|item-id|rating|
+-------+-------+------+
|    196|    242|     3|
|    186|    302|     3|
|     22|    377|     1|
|    244|     51|     2|
|    166|    346|     1|
|    298|    474|     4|
|    115|    265|     2|
|    253|    465|     5|
|    305|    451|     3|
|      6|     86|     3|
|     62|    257|     2|
|    286|   1014|     5|
|    200|    222|     5|
|    210|     40|     3|
|    224|     29|     3|
|    303|    785|     3|
|    122|    387|     5|
|    194|    274|     2|
|    291|   1042|     4|
|    234|   1184|     2|
+-------+-------+------+
only showing top 20 rows



In [92]:
# Inner-join users to the occupation lookup on the shared `occupation` column.
user_occ_merge_df = user_df.join(
    occupation_df,
    on='occupation',
    how='inner',
)

In [93]:
# Preview the first 20 joined user/occupation rows.
user_occ_merge_df.show(n=20, truncate=True)

+-------------+-------+---+------+
|   occupation|user-id|age|gender|
+-------------+-------+---+------+
|   technician|      1| 24|     M|
|        other|      2| 53|     F|
|       writer|      3| 23|     M|
|   technician|      4| 24|     M|
|        other|      5| 33|     F|
|    executive|      6| 42|     M|
|administrator|      7| 57|     M|
|administrator|      8| 36|     M|
|      student|      9| 29|     M|
|       lawyer|     10| 53|     M|
|        other|     11| 39|     F|
|        other|     12| 28|     F|
|     educator|     13| 47|     M|
|    scientist|     14| 45|     M|
|     educator|     15| 49|     F|
|entertainment|     16| 21|     M|
|   programmer|     17| 30|     M|
|        other|     18| 35|     F|
|    librarian|     19| 40|     M|
|    homemaker|     20| 42|     F|
+-------------+-------+---+------+
only showing top 20 rows



In [94]:
# Attach each rating to its movie's catalogue row (inner join on `item-id`).
item_udata_merge_df = items_df.join(
    user_data_df,
    on='item-id',
    how='inner',
)


In [95]:
# Preview the first 20 movie/rating rows; long cell values are truncated.
item_udata_merge_df.show(n=20, truncate=True)

+-------+--------------------+------------+------------------+--------------------+-------+------+---------+---------+----------+------+-----+-----------+-----+-------+---------+------+-------+-------+-------+------+--------+---+-------+-------+------+
|item-id|         movie-title|release-date|video-release-data|            IMDB-url|unknown|action|adventure|animation|children's|comedy|crime|documentary|drama|fantasy|film-noir|horror|musical|mystery|romance|sci-fi|thriller|war|western|user-id|rating|
+-------+--------------------+------------+------------------+--------------------+-------+------+---------+---------+----------+------+-----+-----------+-----+-------+---------+------+-------+-------+-------+------+--------+---+-------+-------+------+
|    242|        Kolya (1996)| 24-Jan-1997|              null|http://us.imdb.co...|      0|     0|        0|        0|         0|     1|    0|          0|    0|      0|        0|     0|      0|      0|      0|     0|       0|  0|      0|    

In [96]:
# Add the rater's age, gender and occupation to every movie/rating row
# (inner join on `user-id`).
final_merge_df = item_udata_merge_df.join(
    user_occ_merge_df,
    on='user-id',
    how='inner',
)

In [97]:
# Preview the first 20 fully joined rows; long cell values are truncated.
final_merge_df.show(n=20, truncate=True)

+-------+-------+--------------------+------------+------------------+--------------------+-------+------+---------+---------+----------+------+-----+-----------+-----+-------+---------+------+-------+-------+-------+------+--------+---+-------+------+-------------+---+------+
|user-id|item-id|         movie-title|release-date|video-release-data|            IMDB-url|unknown|action|adventure|animation|children's|comedy|crime|documentary|drama|fantasy|film-noir|horror|musical|mystery|romance|sci-fi|thriller|war|western|rating|   occupation|age|gender|
+-------+-------+--------------------+------------+------------------+--------------------+-------+------+---------+---------+----------+------+-----+-----------+-----+-------+---------+------+-------+-------+-------+------+--------+---+-------+------+-------------+---+------+
|    196|    242|        Kolya (1996)| 24-Jan-1997|              null|http://us.imdb.co...|      0|     0|        0|        0|         0|     1|    0|          0|    

In [98]:
from pyspark.sql.functions import when

# Genre indicator columns carried through to the result, in catalogue order.
genre_columns = [
    'unknown', 'action', 'adventure', 'animation', "children's", 'comedy',
    'crime', 'documentary', 'drama', 'fantasy', 'film-noir', 'horror',
    'musical', 'mystery', 'romance', 'sci-fi', 'thriller', 'war', 'western',
]

# Take the age column from the frame being selected from (not from `user_df`)
# and cast it explicitly: the CSV was read without a schema, so it is a string.
age_col = final_merge_df['age'].cast('int')

# Bucket ages. Users younger than 20 get their own group; previously they fell
# through to the catch-all branch and were reported as "45+". An age that
# cannot be parsed matches no branch and yields a null age_group.
age_group_col = (
    when(age_col < 20, "under-20")
    .when((age_col >= 20) & (age_col <= 25), "20-25")
    .when((age_col >= 26) & (age_col <= 35), "26-35")
    .when((age_col >= 36) & (age_col <= 45), "36-45")
    .when(age_col > 45, "45+")
    .alias("age_group")
)

result_df = final_merge_df.select("occupation", age_group_col, *genre_columns, 'rating')

In [99]:
# Preview the first 20 rows of the projected result.
result_df.show(n=20, truncate=True)

+-------------+---------+-------+------+---------+---------+----------+------+-----+-----------+-----+-------+---------+------+-------+-------+-------+------+--------+---+-------+------+
|   occupation|age_group|unknown|action|adventure|animation|children's|comedy|crime|documentary|drama|fantasy|film-noir|horror|musical|mystery|romance|sci-fi|thriller|war|western|rating|
+-------------+---------+-------+------+---------+---------+----------+------+-----+-----------+-----+-------+---------+------+-------+-------+-------+------+--------+---+-------+------+
|       writer|      45+|      0|     0|        0|        0|         0|     1|    0|          0|    0|      0|        0|     0|      0|      0|      0|     0|       0|  0|      0|     3|
|    executive|    36-45|      0|     0|        0|        0|         0|     0|    1|          0|    0|      0|        1|     0|      0|      1|      0|     0|       1|  0|      0|     3|
|       writer|    20-25|      0|     0|        0|        0|     

In [100]:
# Grouping keys: occupation, age bucket, every genre flag and the rating value.
group_keys = [
    'occupation', 'age_group',
    'unknown', 'action', 'adventure', 'animation', "children's", 'comedy',
    'crime', 'documentary', 'drama', 'fantasy', 'film-noir', 'horror',
    'musical', 'mystery', 'romance', 'sci-fi', 'thriller', 'war', 'western',
    'rating',
]

# Count the ratings in each group; the aggregate column is named `count(rating)`.
result_df = (
    result_df
    .groupBy(*group_keys)
    .agg({'rating': 'count'})
)

In [101]:
# Preview the first 20 aggregated rows.
result_df.show(n=20, truncate=True)

+-------------+---------+-------+------+---------+---------+----------+------+-----+-----------+-----+-------+---------+------+-------+-------+-------+------+--------+---+-------+------+-------------+
|   occupation|age_group|unknown|action|adventure|animation|children's|comedy|crime|documentary|drama|fantasy|film-noir|horror|musical|mystery|romance|sci-fi|thriller|war|western|rating|count(rating)|
+-------------+---------+-------+------+---------+---------+----------+------+-----+-----------+-----+-------+---------+------+-------+-------+-------+------+--------+---+-------+------+-------------+
|   technician|    26-35|      0|     1|        0|        0|         0|     0|    0|          0|    0|      0|        0|     0|      0|      0|      0|     1|       1|  0|      0|     5|           10|
|administrator|      45+|      0|     0|        0|        0|         0|     0|    0|          0|    0|      0|        0|     0|      0|      1|      0|     0|       1|  0|      0|     4|          

In [102]:
# Sort by occupation, then age bucket, then largest rating count first.
sort_keys = [
    result_df['occupation'].asc(),
    result_df['age_group'].asc(),
    result_df['count(rating)'].desc(),
]
result_df = result_df.orderBy(*sort_keys)

In [103]:
# Preview the first 20 rows of the sorted result.
result_df.show(n=20, truncate=True)

+-------------+---------+-------+------+---------+---------+----------+------+-----+-----------+-----+-------+---------+------+-------+-------+-------+------+--------+---+-------+------+-------------+
|   occupation|age_group|unknown|action|adventure|animation|children's|comedy|crime|documentary|drama|fantasy|film-noir|horror|musical|mystery|romance|sci-fi|thriller|war|western|rating|count(rating)|
+-------------+---------+-------+------+---------+---------+----------+------+-----+-----------+-----+-------+---------+------+-------+-------+-------+------+--------+---+-------+------+-------------+
|administrator|    20-25|      0|     0|        0|        0|         0|     0|    0|          0|    1|      0|        0|     0|      0|      0|      0|     0|       0|  0|      0|     4|           20|
|administrator|    20-25|      0|     0|        0|        0|         0|     1|    0|          0|    0|      0|        0|     0|      0|      0|      0|     0|       0|  0|      0|     3|          