<a href="https://colab.research.google.com/github/ccheitor/IntroPySpark/blob/main/Nbk_Spark.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

Preparação do Ambiente

In [None]:
pip install pyspark & pip install wget



In [None]:
from pyspark import SparkContext
from pyspark.sql import SparkSession

In [None]:
spark = SparkSession \
  .builder \
  .appName(" Execução do trabalho inyterativo")\
  .getOrCreate()

spark.version

'3.5.0'

In [None]:
df_titles = spark.read.csv('/content/drive/MyDrive/Colab Notebooks/Spark/title_basics.tsv', header=True,
inferSchema=True, sep='\t')


In [None]:
#df_titles.printSchema()
df_titles.show()

+---------+---------+--------------------+--------------------+-------+---------+-------+--------------+--------------------+
|   tconst|titleType|        primaryTitle|       originalTitle|isAdult|startYear|endYear|runtimeMinutes|              genres|
+---------+---------+--------------------+--------------------+-------+---------+-------+--------------+--------------------+
|tt0000001|    short|          Carmencita|          Carmencita|      0|     1894|     \N|             1|   Documentary,Short|
|tt0000002|    short|Le clown et ses c...|Le clown et ses c...|      0|     1892|     \N|             5|     Animation,Short|
|tt0000003|    short|      Pauvre Pierrot|      Pauvre Pierrot|      0|     1892|     \N|             4|Animation,Comedy,...|
|tt0000004|    short|         Un bon bock|         Un bon bock|      0|     1892|     \N|            12|     Animation,Short|
|tt0000005|    short|    Blacksmith Scene|    Blacksmith Scene|      0|     1893|     \N|             1|        Comedy

In [None]:
df_ratings = spark.read.csv('/content/drive/MyDrive/Colab Notebooks/Spark/title_ratings.tsv', header=True,
inferSchema=True, sep='\t')

In [None]:
df_ratings.printSchema()

root
 |-- tconst: string (nullable = true)
 |-- averageRating: double (nullable = true)
 |-- numVotes: integer (nullable = true)



In [None]:
df_ratings.show()

+---------+-------------+--------+
|   tconst|averageRating|numVotes|
+---------+-------------+--------+
|tt0000001|          5.7|    1809|
|tt0000002|          6.0|     233|
|tt0000003|          6.5|    1560|
|tt0000004|          6.1|     152|
|tt0000005|          6.2|    2383|
|tt0000006|          5.1|     157|
|tt0000007|          5.4|     746|
|tt0000008|          5.5|    1965|
|tt0000009|          5.8|     189|
|tt0000010|          6.9|    6530|
|tt0000011|          5.2|     323|
|tt0000012|          7.4|   11188|
|tt0000013|          5.8|    1726|
|tt0000014|          7.1|    5023|
|tt0000015|          6.2|     947|
|tt0000016|          5.9|    1325|
|tt0000017|          4.6|     292|
|tt0000018|          5.3|     537|
|tt0000019|          5.3|      28|
|tt0000020|          5.0|     315|
+---------+-------------+--------+
only showing top 20 rows



In [None]:
# Quantos filmes (incluindo os da televisão) foram lançados no ano de 2015?
#df_titles[df_titles.startYear.isin("2015")].show()
df_titles.filter(df_titles.startYear =="2015").distinct().count()

358054

In [None]:
df_titles.filter(df_titles.startYear =="2015").count()

358054

In [None]:
df_filtrado =df_titles.select("tconst","startYear").filter(df_titles.startYear =="2015")

In [None]:
df_filtrado.distinct().count()

358054

In [None]:
df_joined = df_titles.join(df_ratings, on='tconst', how='inner')

In [None]:
df_joined.show()

+---------+---------+--------------------+--------------------+-------+---------+-------+--------------+--------------------+-------------+--------+
|   tconst|titleType|        primaryTitle|       originalTitle|isAdult|startYear|endYear|runtimeMinutes|              genres|averageRating|numVotes|
+---------+---------+--------------------+--------------------+-------+---------+-------+--------------+--------------------+-------------+--------+
|tt0000008|    short|Edison Kinetoscop...|Edison Kinetoscop...|      0|     1894|     \N|             1|   Documentary,Short|          5.5|    1965|
|tt0000015|    short| Autour d'une cabine| Autour d'une cabine|      0|     1894|     \N|             2|     Animation,Short|          6.2|     947|
|tt0000019|    short|    The Clown Barber|    The Clown Barber|      0|     1898|     \N|            \N|        Comedy,Short|          5.3|      28|
|tt0000051|    short|The Bohemian Enca...|Campement de bohé...|      0|     1896|     \N|            \N|  

In [None]:
df_joined.filter(df_joined.startYear=="2015").distinct().count()

48810

In [None]:
df_titles.select('primaryTitle').distinct().count()

3931670

In [None]:
df_titles.filter((df_titles.titleType == 'movie') & (df_titles.startYear == 2015)).count()

16429

In [None]:
# Filtra os registros para incluir apenas filmes com conteúdo adulto (isAdult == 1)
df_adult_movies = df_titles.filter(df_titles.isAdult == 1)

# Calcula as estatísticas descritivas, incluindo a média da coluna runtimeMinutes
avg_runtime = df_adult_movies.describe(['runtimeMinutes']).filter("summary == 'mean'").select('runtimeMinutes').collect()[0][0]

print("A duração média dos filmes com conteúdo adulto é:", avg_runtime, "minutos.")


A duração média dos filmes com conteúdo adulto é: 92.79938555059914 minutos.


In [None]:

# Filtra os registros para incluir apenas os filmes com títulos diferentes
df_diff_titles = df_titles.filter(df_titles.primaryTitle != df_titles.originalTitle)

# Conta o número de filmes com títulos diferentes
count_diff_titles = df_diff_titles.count()

print("O número de filmes com título atual diferente do título original é:", count_diff_titles)


O número de filmes com título atual diferente do título original é: 125056


In [None]:
from pyspark.sql.functions import length,col

# Adicione uma nova coluna 'titleLength' para armazenar o comprimento do título
df_titles_with_length = df_titles.withColumn('titleLength', length(col('primaryTitle')))

# Ordene o DataFrame em ordem decrescente de comprimento do título
df_longest_title = df_titles_with_length.orderBy('titleLength', ascending=False).limit(1)

# Exiba o filme com o nome mais longo
df_longest_title.show()


+----------+---------+--------------------+--------------------+-------+---------+-------+--------------+------+-----------+
|    tconst|titleType|        primaryTitle|       originalTitle|isAdult|startYear|endYear|runtimeMinutes|genres|titleLength|
+----------+---------+--------------------+--------------------+-------+---------+-------+--------------+------+-----------+
|tt12985206|    video|Otoko wa chi _ ko...|Otoko wa chi _ ko...|      1|     2020|     \N|            99| Adult|        419|
+----------+---------+--------------------+--------------------+-------+---------+-------+--------------+------+-----------+



In [None]:
# Ordene o DataFrame em ordem decrescente pela quantidade de votos e pegue o primeiro registro
df_most_voted = df_ratings.orderBy('numVotes', ascending=False).limit(1)

# Exiba o filme com a maior quantidade de votos
df_most_voted.show()


+---------+-------------+--------+
|   tconst|averageRating|numVotes|
+---------+-------------+--------+
|tt0111161|          9.3| 2449517|
+---------+-------------+--------+



In [None]:
# Ordene o DataFrame em ordem decrescente pela quantidade de votos e pegue o primeiro registro
df_most_voted = df_ratings.orderBy('averageRating', ascending=True).limit(1)

# Exiba o filme com a maior quantidade de votos
df_most_voted.show()


+---------+-------------+--------+
|   tconst|averageRating|numVotes|
+---------+-------------+--------+
|tt1794292|          1.0|      82|
+---------+-------------+--------+

