In [1]:
!pip install pyspark



In [29]:
import pyspark
from pyspark.sql import SparkSession
from pyspark.sql.functions import col, to_date, date_format, count, when

In [3]:
spark = SparkSession.builder.getOrCreate()

In [4]:
df_video = spark.read.csv('videos-stats.csv', header = True, inferSchema= True)

In [5]:
colunas = ['Likes', 'Comments', 'Views']

for coluna in colunas:
  df_video = df_video.na.fill({coluna:0})


In [6]:
df_comentario = spark.read.csv('comments.csv', header = True, inferSchema= True, escape='"', quote='"', multiLine=True)


In [7]:
print('Quantidade de registros em df_video:', df_video.count())
print('Quantidade de registros em df_comentario:',df_comentario.count())

Quantidade de registros em df_video: 1881
Quantidade de registros em df_comentario: 18409


In [8]:
df_video_IDvazio = df_video.filter(col('Video ID').isNull())
print('Quantidade de registros em df_video com video_id vazio:', df_video_IDvazio.count())

df_comentario_IDvazio = df_comentario.filter(col('Video ID').isNull())
print('Quantidade de registros em df_comentario com video_id vazio:', df_comentario_IDvazio.count())

Quantidade de registros em df_video com video_id vazio: 0
Quantidade de registros em df_comentario com video_id vazio: 0


In [9]:
print('Quantidade de registros em df_video:', df_video.count())
print('Quantidade de registros em df_comentario:',df_comentario.count())

Quantidade de registros em df_video: 1881
Quantidade de registros em df_comentario: 18409


In [11]:
df_video = df_video.dropDuplicates(subset=['Video ID'])
print('Quantidade de registros que restou após a retirada dos duplicados em Video_ID em df_video: ', df_video.count())

Quantidade de registros que restou após a retirada dos duplicados em Video_ID em df_video:  1869


In [12]:
df_video.printSchema()

root
 |-- _c0: integer (nullable = true)
 |-- Title: string (nullable = true)
 |-- Video ID: string (nullable = true)
 |-- Published At: date (nullable = true)
 |-- Keyword: string (nullable = true)
 |-- Likes: double (nullable = false)
 |-- Comments: double (nullable = false)
 |-- Views: double (nullable = false)



In [13]:
df_video = df_video.withColumn('Likes', col('Likes').cast('int'))
df_video.printSchema()

root
 |-- _c0: integer (nullable = true)
 |-- Title: string (nullable = true)
 |-- Video ID: string (nullable = true)
 |-- Published At: date (nullable = true)
 |-- Keyword: string (nullable = true)
 |-- Likes: integer (nullable = true)
 |-- Comments: double (nullable = false)
 |-- Views: double (nullable = false)



In [14]:
df_video = df_video.withColumn('Comments', col('Comments').cast('int'))
df_video.printSchema()

root
 |-- _c0: integer (nullable = true)
 |-- Title: string (nullable = true)
 |-- Video ID: string (nullable = true)
 |-- Published At: date (nullable = true)
 |-- Keyword: string (nullable = true)
 |-- Likes: integer (nullable = true)
 |-- Comments: integer (nullable = true)
 |-- Views: double (nullable = false)



In [15]:
df_video = df_video.withColumn('Views', col('Views').cast('int'))
df_video.printSchema()

root
 |-- _c0: integer (nullable = true)
 |-- Title: string (nullable = true)
 |-- Video ID: string (nullable = true)
 |-- Published At: date (nullable = true)
 |-- Keyword: string (nullable = true)
 |-- Likes: integer (nullable = true)
 |-- Comments: integer (nullable = true)
 |-- Views: integer (nullable = true)



In [16]:
df_comentario.printSchema()
df_comentario = df_comentario.withColumn('Likes', col('Likes').cast('int'))
df_comentario.printSchema()

root
 |-- _c0: integer (nullable = true)
 |-- Video ID: string (nullable = true)
 |-- Comment: string (nullable = true)
 |-- Likes: double (nullable = true)
 |-- Sentiment: double (nullable = true)

root
 |-- _c0: integer (nullable = true)
 |-- Video ID: string (nullable = true)
 |-- Comment: string (nullable = true)
 |-- Likes: integer (nullable = true)
 |-- Sentiment: double (nullable = true)



In [17]:
df_comentario = df_comentario.withColumn('Sentiment', col('Sentiment').cast('int'))
df_comentario.printSchema()

root
 |-- _c0: integer (nullable = true)
 |-- Video ID: string (nullable = true)
 |-- Comment: string (nullable = true)
 |-- Likes: integer (nullable = true)
 |-- Sentiment: integer (nullable = true)



In [18]:
df_comentario = df_comentario.withColumnsRenamed({'Likes':'Likes Comment'})
df_comentario.printSchema()

root
 |-- _c0: integer (nullable = true)
 |-- Video ID: string (nullable = true)
 |-- Comment: string (nullable = true)
 |-- Likes Comment: integer (nullable = true)
 |-- Sentiment: integer (nullable = true)



In [19]:
df_video = df_video.withColumn('Interaction', col('Likes') + col('Comments') + col('Views'))
df_video.printSchema()

root
 |-- _c0: integer (nullable = true)
 |-- Title: string (nullable = true)
 |-- Video ID: string (nullable = true)
 |-- Published At: date (nullable = true)
 |-- Keyword: string (nullable = true)
 |-- Likes: integer (nullable = true)
 |-- Comments: integer (nullable = true)
 |-- Views: integer (nullable = true)
 |-- Interaction: integer (nullable = true)



In [20]:
df_video.show()
df_video.printSchema()

+----+--------------------+-----------+------------+----------------+------+--------+---------+-----------+
| _c0|               Title|   Video ID|Published At|         Keyword| Likes|Comments|    Views|Interaction|
+----+--------------------+-----------+------------+----------------+------+--------+---------+-----------+
| 986|ASMR MUKBANG DOUB...|--ZI0dSbbNU|  2020-04-18|         mukbang|378858|   18860| 17975269|   18372987|
|  71|Deadly car bomb d...|--hxd1CrOqg|  2022-08-22|            news|  6379|    4853|   808787|     820019|
|  48|How Biden&#39;s s...|--ixiTypG8g|  2022-08-24|            news|  1029|    2347|    97434|     100810|
| 993|Celebrating My 40...|-64r1hcxtV4|  2022-05-30|         mukbang| 45628|   17264|  5283664|    5346556|
|1456|Physics Review - ...|-6IgkG5yZfo|  2017-01-02|         physics| 10959|     525|   844015|     855499|
| 949|Eating ONLY KOREA...|-7hzaGya86g|  2022-07-17|            food| 18957|     566|   702174|     721697|
| 240|19 Year-Old Start...|-

In [21]:
df_video = df_video.withColumn("Year", date_format(col("Published At"), "yyyy"))
df_video.printSchema()

root
 |-- _c0: integer (nullable = true)
 |-- Title: string (nullable = true)
 |-- Video ID: string (nullable = true)
 |-- Published At: date (nullable = true)
 |-- Keyword: string (nullable = true)
 |-- Likes: integer (nullable = true)
 |-- Comments: integer (nullable = true)
 |-- Views: integer (nullable = true)
 |-- Interaction: integer (nullable = true)
 |-- Year: string (nullable = true)



In [22]:
join_df_video_comments = df_video.join(df_comentario, 'Video ID')
join_df_video_comments.show()

+-----------+---+--------------------+------------+-------+-----+--------+-------+-----------+----+---+--------------------+-------------+---------+
|   Video ID|_c0|               Title|Published At|Keyword|Likes|Comments|  Views|Interaction|Year|_c0|             Comment|Likes Comment|Sentiment|
+-----------+---+--------------------+------------+-------+-----+--------+-------+-----------+----+---+--------------------+-------------+---------+
|wAZZ-UWGVHI|  0|Apple Pay Is Kill...|  2022-08-23|   tech| 3407|     672| 135612|     139691|2022|  0|Let's not forget ...|           95|        1|
|wAZZ-UWGVHI|  0|Apple Pay Is Kill...|  2022-08-23|   tech| 3407|     672| 135612|     139691|2022|  1|Here in NZ 50% of...|           19|        0|
|wAZZ-UWGVHI|  0|Apple Pay Is Kill...|  2022-08-23|   tech| 3407|     672| 135612|     139691|2022|  2|I will forever ac...|          161|        2|
|wAZZ-UWGVHI|  0|Apple Pay Is Kill...|  2022-08-23|   tech| 3407|     672| 135612|     139691|2022|  3|Whe

In [23]:
df_us_videos = spark.read.csv('/content/USvideos.csv', header = True, inferSchema= True)
df_us_videos.printSchema()
df_us_videos.show(10)

root
 |-- video_id: string (nullable = true)
 |-- trending_date: string (nullable = true)
 |-- title: string (nullable = true)
 |-- channel_title: string (nullable = true)
 |-- category_id: string (nullable = true)
 |-- publish_time: string (nullable = true)
 |-- tags: string (nullable = true)
 |-- views: string (nullable = true)
 |-- likes: string (nullable = true)
 |-- dislikes: string (nullable = true)
 |-- comment_count: string (nullable = true)
 |-- thumbnail_link: string (nullable = true)
 |-- comments_disabled: string (nullable = true)
 |-- ratings_disabled: string (nullable = true)
 |-- video_error_or_removed: string (nullable = true)
 |-- description: string (nullable = true)

+-----------+-------------+--------------------+--------------------+-----------+--------------------+--------------------+-------+------+--------+-------------+--------------------+-----------------+----------------+----------------------+--------------------+
|   video_id|trending_date|               t

In [24]:
df_join_video_usvideos = df_video.join(df_us_videos, 'title')
df_join_video_usvideos.show()

+--------------------+----+-----------+------------+----------------+------+--------+---------+-----------+----+-----------+-------------+-----------------+-----------+--------------------+--------------------+--------+------+--------+-------------+--------------------+-----------------+----------------+----------------------+--------------------+
|               Title| _c0|   Video ID|Published At|         Keyword| Likes|Comments|    Views|Interaction|Year|   video_id|trending_date|    channel_title|category_id|        publish_time|                tags|   views| likes|dislikes|comment_count|      thumbnail_link|comments_disabled|ratings_disabled|video_error_or_removed|         description|
+--------------------+----+-----------+------------+----------------+------+--------+---------+-----------+----+-----------+-------------+-----------------+-----------+--------------------+--------------------+--------+------+--------+-------------+--------------------+-----------------+------------

In [25]:
df_video.count()

1869

In [30]:
df_video_contanulos = df_video.select([count(when(col(c).isNull(), c)).alias(c) for c in df_video.columns])
df_video_contanulos.show()

+---+-----+--------+------------+-------+-----+--------+-----+-----------+----+
|_c0|Title|Video ID|Published At|Keyword|Likes|Comments|Views|Interaction|Year|
+---+-----+--------+------------+-------+-----+--------+-----+-----------+----+
|  0|    0|       0|           0|      0|    0|       0|    0|          0|   0|
+---+-----+--------+------------+-------+-----+--------+-----+-----------+----+



In [None]:
df_video = df_video.drop("_c0")
df_video.printSchema()

root
 |-- Title: string (nullable = true)
 |-- Video ID: string (nullable = true)
 |-- Published At: date (nullable = true)
 |-- Keyword: string (nullable = true)
 |-- Likes: integer (nullable = true)
 |-- Comments: integer (nullable = true)
 |-- Views: integer (nullable = true)
 |-- Interaction: integer (nullable = true)
 |-- Year: string (nullable = true)



In [None]:
join_df_video_comments = join_df_video_comments.drop("_c0")
join_df_video_comments.printSchema()

root
 |-- Video ID: string (nullable = true)
 |-- Title: string (nullable = true)
 |-- Published At: date (nullable = true)
 |-- Keyword: string (nullable = true)
 |-- Likes: integer (nullable = true)
 |-- Comments: integer (nullable = true)
 |-- Views: integer (nullable = true)
 |-- Interaction: integer (nullable = true)
 |-- Year: string (nullable = true)
 |-- Comment: string (nullable = true)
 |-- Likes Comment: integer (nullable = true)
 |-- Sentiment: integer (nullable = true)



In [None]:
join_df_video_comments.write.mode('overwrite').option('header', 'true').parquet('videos-comments-tratados-parquet')

In [None]:
df_video.write.mode('overwrite').option('header', 'true').parquet('videos-tratados-parquet')