### Import Libs

In [1]:
import os
# Set before pyspark is imported below.
# NOTE(review): presumably this silences the PyArrow timezone warning raised by the
# pandas API on Spark (display_func calls pandas_api()) — confirm against the PySpark docs.
os.environ["PYARROW_IGNORE_TIMEZONE"] = "1"

import pyspark
from pyspark.sql import SparkSession
# Wildcard import: this is where col, split, explode_outer, broadcast, avg and length
# used in later cells come from. It also rebinds Python's built-in `max` to the Spark
# aggregate function, which a later cell relies on (max("numVotes")).
from pyspark.sql.functions import *

### Spark Session

In [2]:
# Configure a session named "TrabMod2" against the local[*] master, then obtain it
# through getOrCreate().
_session_builder = SparkSession.builder.master("local[*]").appName("TrabMod2")
spark = _session_builder.getOrCreate()

In [3]:
# Set the number of partitions Spark SQL uses for shuffles to 8.
# NOTE(review): presumably chosen to suit a single local machine — confirm.
spark.conf.set("spark.sql.shuffle.partitions", 8)

### Read File `title_basics`

In [4]:

# Read the title_basics TSV (header row, tab separator) and let Spark infer column types.
# NOTE(review): every column in the schema printed below is inferred as string, and the
# sample rows show the literal "\N" in endYear; presumably that marker blocks numeric
# inference — confirm whether a nullValue option is wanted before changing the schema.
_basics_reader = spark.read.format("csv").options(header="true", sep="\t", inferSchema="true")
df_title_basics = _basics_reader.load("D:/XPE/trabalhoMod2/dados/title_basics.tsv")

df_title_basics.printSchema()

root
 |-- tconst: string (nullable = true)
 |-- titleType: string (nullable = true)
 |-- primaryTitle: string (nullable = true)
 |-- originalTitle: string (nullable = true)
 |-- isAdult: string (nullable = true)
 |-- startYear: string (nullable = true)
 |-- endYear: string (nullable = true)
 |-- runtimeMinutes: string (nullable = true)
 |-- genres: string (nullable = true)



### Read File `title_ratings`

In [5]:


# Read the title_ratings TSV (header row, tab separator) and let Spark infer column types.
_ratings_reader = spark.read.format("csv").options(header="true", sep="\t", inferSchema="true")
df_title_ratings = _ratings_reader.load("D:/XPE/trabalhoMod2/dados/title_ratings.tsv")

df_title_ratings.printSchema()

root
 |-- tconst: string (nullable = true)
 |-- averageRating: double (nullable = true)
 |-- numVotes: integer (nullable = true)



#### Join entre os 2 dataframes

In [6]:
# Left join on the shared title id: every row of title_basics is kept and rating
# columns are attached where a match exists. The ratings frame carries a broadcast hint.
typeJoin, keyJoin = "left", "tconst"

df_join = df_title_basics.join(
    other=broadcast(df_title_ratings),
    on=keyJoin,
    how=typeJoin,
)

#### Print Schema

In [7]:
# Print the schema of the joined frame.
df_join.printSchema()

root
 |-- tconst: string (nullable = true)
 |-- titleType: string (nullable = true)
 |-- primaryTitle: string (nullable = true)
 |-- originalTitle: string (nullable = true)
 |-- isAdult: string (nullable = true)
 |-- startYear: string (nullable = true)
 |-- endYear: string (nullable = true)
 |-- runtimeMinutes: string (nullable = true)
 |-- genres: string (nullable = true)
 |-- averageRating: double (nullable = true)
 |-- numVotes: integer (nullable = true)



#### Count 

In [8]:
# Row count of the joined frame, printed with "." as the thousands separator.
_total_rows = df_join.count()
print(f"{_total_rows:,}".replace(",", "."))

8.203.690


### Função para display mais amigável

In [9]:
def display_func(df, rows):
    """Return the first ``rows`` rows of ``df`` for notebook display.

    Args:
        df: Object exposing ``pandas_api()`` (a Spark DataFrame in this notebook).
        rows: Number of leading rows to keep.

    Returns:
        The result of ``df.pandas_api().head(rows)``.
    """
    return df.pandas_api().head(rows)

#### Display limit 5 

In [10]:
# Show the first 5 rows of the joined frame.
display_func(df_join, 5)

Unnamed: 0,tconst,titleType,primaryTitle,originalTitle,isAdult,startYear,endYear,runtimeMinutes,genres,averageRating,numVotes
0,tt0000001,short,Carmencita,Carmencita,0,1894,\N,1,"Documentary,Short",5.7,1809
1,tt0000002,short,Le clown et ses chiens,Le clown et ses chiens,0,1892,\N,5,"Animation,Short",6.0,233
2,tt0000003,short,Pauvre Pierrot,Pauvre Pierrot,0,1892,\N,4,"Animation,Comedy,Romance",6.5,1560
3,tt0000004,short,Un bon bock,Un bon bock,0,1892,\N,12,"Animation,Short",6.1,152
4,tt0000005,short,Blacksmith Scene,Blacksmith Scene,0,1893,\N,1,"Comedy,Short",6.2,2383


### Pergunta 4
### Quantos filmes (incluindo os da televisão) foram lançados no ano de 2015?

In [11]:
# Number of distinct rows whose type is 'movie' or 'tvMovie' and whose start year is 2015.
# Despite the df_ prefix, df_ano_2015 holds an integer count.
_is_film = col("titleType").isin("tvMovie", "movie")
_released_2015 = col("startYear") == "2015"

df_ano_2015 = df_join.filter(_is_film & _released_2015).distinct().count()

print(f"{df_ano_2015:,}".replace(",", "."))

### 19.987

19.987


#### Pergunta 5
#### Qual o gênero de títulos mais frequente?

In [12]:
# Split the comma-separated genres string into one row per genre, count the rows
# per genre and sort by that count, largest first.
_genre_per_row = explode_outer(split("genres", ",")).alias("genres")
_genre_counts = df_join.select(_genre_per_row).groupBy("genres").count()
df_genres_mais_freq = _genre_counts.orderBy(col("count").desc())

display_func(df_genres_mais_freq, 1)

Unnamed: 0,genres,count
0,Drama,2247995


#### Pergunta 6
#### Qual o gênero com a melhor nota média de títulos?

In [13]:
# Average rating per genre, best first.
#
# Titles without a genre must be excluded before ranking. The file marks missing
# values with the literal string "\N" (visible in the endYear column of the sample
# rows), not with 'None', so the previous filter removed nothing and titles without
# a genre formed their own "\N" group in the ranking.
# 'None' is still excluded so that no row the old filter dropped comes back.
# The Column API is used so the backslash does not go through SQL string-literal
# escaping. As before, rows whose genres is null are also dropped by the comparison.
_has_genre = ~col("genres").isin("None", r"\N")

df_genres_avg = (df_join.filter(_has_genre)
                        .select("averageRating", explode_outer(split("genres", ",")).alias("genres"))
                        .groupBy("genres")
                        .agg(avg("averageRating").alias("averageRating"))
                        .orderBy(col("averageRating").desc()))

display_func(df_genres_avg, 1)

Unnamed: 0,genres,averageRating
0,History,7.35378


#### Pergunta 7
#### Qual o vídeo game do gênero aventura mais bem avaliado em 2020?

In [14]:
# One row per (title, genre), restricted to 2020 video games tagged 'Adventure'.
_one_genre_per_row = df_join.withColumn("genres", explode_outer(split("genres", ",")))
_is_2020_adventure_game = (
    (col("startYear") == "2020")
    & (col("genres") == "Adventure")
    & (col("titleType") == "videoGame")
)
df_genres_mais_avaliado = _one_genre_per_row.filter(_is_2020_adventure_game)

# Highest average rating first; only the title and its rating are kept.
_by_rating_desc = df_genres_mais_avaliado.orderBy(col("averageRating").desc())
df_resunt_fim = _by_rating_desc.select("primaryTitle", "averageRating")

display_func(df_resunt_fim, 1)

Unnamed: 0,primaryTitle,averageRating
0,Half-Life: Alyx,9.5


#### Pergunta 9
#### Quantos títulos de filmes diferentes existem? Use distinct() e count().

In [15]:
# Number of distinct primaryTitle values in the joined frame (an integer, despite the df_ prefix).
_distinct_titles = df_join.select("primaryTitle").distinct()
df_fimes_dif = _distinct_titles.count()

print(f"{df_fimes_dif:,}".replace(",", "."))

3.931.670


#### Pergunta 10
#### Qual a duração média dos filmes com conteúdo adulto?
#### Use uma combinação de filter() e describe()

In [16]:
# describe() summary of runtimeMinutes for rows flagged isAdult == '1',
# keeping only the 'mean' row of that summary.
_adult_runtime_stats = (
    df_join.filter(col("isAdult") == "1")
           .select("runtimeMinutes")
           .describe()
)
df_adult = _adult_runtime_stats.filter(col("summary") == "mean")

display_func(df_adult, 10)

Unnamed: 0,summary,runtimeMinutes
0,mean,92.79938555059914


#### Pergunta 11
#### Quantos filmes têm o título atual (“primary”) diferente do título original? 
#### Use uma combinação de filter, e count()

In [17]:
# Count of rows whose primaryTitle differs from originalTitle (an integer, despite the df_ prefix).
_titles_differ = col("primaryTitle") != col("originalTitle")
df_dif = df_join.filter(_titles_differ).count()

print(f"{df_dif:,}".replace(",", "."))

125.056


#### Pergunta 12
#### Qual o filme que tem o nome mais longo?
#### Dica: consulte https://sparkbyexamples.com/spark/spark-using-length-size-of-a-dataframe-column/ e use algo como
#### df_titles.orderBy(length(col("primaryTitle")).desc()).

In [18]:
# Sort titles by the character length of primaryTitle, longest first,
# and show the id (tconst) of the top row.
_longest_title_first = (
    df_join.select("tconst", "primaryTitle")
           .orderBy(length("primaryTitle").desc())
)
display_func(_longest_title_first.select("tconst"), 1)

Unnamed: 0,tconst
0,tt12985206


#### Pergunta 13
#### Qual filme tem a maior quantidade de votos? Dica: Use describe().


In [19]:
# Title with the largest vote count.
#
# The previous version grouped by ("tconst", "numVotes") and then took max("numVotes").
# Because numVotes was itself a grouping key, the aggregate always equalled the key,
# so the groupBy only added a full shuffle (and depended on the wildcard-imported
# Spark `max` shadowing Python's built-in). Sorting the existing column directly
# returns the same top row with the same output columns (tconst, max_numVotes).
# Rows without a rating have a null numVotes and sort last in descending order.
# NOTE(review): unlike the groupBy, this does not collapse duplicate (tconst, numVotes)
# pairs; that cannot change the top row shown here.
(
    display_func(df_join.select("tconst", col("numVotes").alias("max_numVotes"))
                        .orderBy(col("max_numVotes").desc())
                 , 1)
)

Unnamed: 0,tconst,max_numVotes
0,tt0111161,2449517


#### Pergunta 15
#### Qual é a menor nota média de um filme? Use describe().

In [20]:
# describe() summary of averageRating, keeping only its 'min' row.
_rating_stats = df_join.select("averageRating").describe()
display_func(_rating_stats.filter(col("summary") == "min"), 10)

Unnamed: 0,summary,averageRating
0,min,1.0
