In [1]:
import os

from pyspark import SparkConf
from pyspark.sql import functions as f
from pyspark.sql import SparkSession
from pyspark.sql.types import StringType, IntegerType, StructType, StructField, TimestampType

from core.csv_data_manager import CSVDataManager

from metadata.columns import StackExchangeColumns as columns
from metadata.schemas import posts_schema, comments_schema, tag_synonyms_schema, tags_schema

In [2]:
# Data location handed to CSVDataManager below. Set the BD_2024_DATA_DIR
# environment variable to run the notebook against another directory; when it
# is unset, the original development-machine path is used, so existing setups
# keep working unchanged.
DATA_DIRECTORY_PATH = os.environ.get("BD_2024_DATA_DIR", "D:/PyCharmStorage/BD_2024/data")

In [3]:
# Configure a local-mode Spark application named "BD_2024" on top of a default
# SparkConf, then fetch the matching session (getOrCreate reuses a live one).
session_builder = SparkSession.builder.master("local").appName("BD_2024")
spark_session = session_builder.config(conf=SparkConf()).getOrCreate()

# Every CSV load in this notebook goes through this manager.
# NOTE(review): presumably file names passed to read() are resolved against
# DATA_DIRECTORY_PATH - confirm in core.csv_data_manager.
csv_data_manager = CSVDataManager(spark_session, DATA_DIRECTORY_PATH)

In [4]:
# Load the four source tables with their explicit schemas. Comments and tags
# are each read from two CSV parts in a single read() call.
posts_dataframe = csv_data_manager.read(
    ["/Posts_1.csv"],
    schema=posts_schema,
)
comments_dataframe = csv_data_manager.read(
    ["/Comments_1.csv", "/Comments_2.csv"],
    schema=comments_schema,
)
tag_synonyms_dataframe = csv_data_manager.read(
    ["/TagSynonyms.csv"],
    schema=tag_synonyms_schema,
)
tags_dataframe = csv_data_manager.read(
    ["/Tags_1.csv", "/Tags_2.csv"],
    schema=tags_schema,
)

In [5]:
# Row count of the posts frame as loaded, before any cleaning step.
posts_dataframe.count()

38912

In [6]:
# Preview the first rows of the posts frame (long cell values are truncated).
posts_dataframe.show()

+--------+----------+----------------+--------+-------------------+------------+-----+---------+--------------------+-----------+----------------+----------------+---------------------+-------------------+-------------------+--------------------+--------------------+-----------+------------+-------------+-------------------+------------------+--------------+
|      Id|PostTypeId|AcceptedAnswerId|ParentId|       CreationDate|DeletionDate|Score|ViewCount|                Body|OwnerUserId|OwnerDisplayName|LastEditorUserId|LastEditorDisplayName|       LastEditDate|   LastActivityDate|               Title|                Tags|AnswerCount|CommentCount|FavoriteCount|         ClosedDate|CommunityOwnedDate|ContentLicense|
+--------+----------+----------------+--------+-------------------+------------+-----+---------+--------------------+-----------+----------------+----------------+---------------------+-------------------+-------------------+--------------------+--------------------+-----------

In [7]:
# Per-column descriptive statistics for the posts frame.
posts_dataframe.describe().show()

+-------+-------------------+------------------+-------------------+--------------------+------------------+-----------------+--------------------+--------------------+----------------+-----------------+---------------------+--------------------+--------------------+------------------+------------------+-------------+--------------+
|summary|                 Id|        PostTypeId|   AcceptedAnswerId|            ParentId|             Score|        ViewCount|                Body|         OwnerUserId|OwnerDisplayName| LastEditorUserId|LastEditorDisplayName|               Title|                Tags|       AnswerCount|      CommentCount|FavoriteCount|ContentLicense|
+-------+-------------------+------------------+-------------------+--------------------+------------------+-----------------+--------------------+--------------------+----------------+-----------------+---------------------+--------------------+--------------------+------------------+------------------+-------------+-----------

In [8]:
# Per-column statistics via summary(), which also reports percentile rows.
posts_dataframe.summary().show()

+-------+-------------------+------------------+-------------------+--------------------+------------------+-----------------+--------------------+--------------------+----------------+-----------------+---------------------+--------------------+--------------------+------------------+------------------+-------------+--------------+
|summary|                 Id|        PostTypeId|   AcceptedAnswerId|            ParentId|             Score|        ViewCount|                Body|         OwnerUserId|OwnerDisplayName| LastEditorUserId|LastEditorDisplayName|               Title|                Tags|       AnswerCount|      CommentCount|FavoriteCount|ContentLicense|
+-------+-------------------+------------------+-------------------+--------------------+------------------+-----------------+--------------------+--------------------+----------------+-----------------+---------------------+--------------------+--------------------+------------------+------------------+-------------+-----------

In [9]:
# Posts of type 1 (questions) whose Tags value is missing.
question_without_tags = posts_dataframe.where(
    f.col(columns.tags).isNull() & (f.col(columns.post_type_id) == 1)
)

In [10]:
# How many questions have no tags.
question_without_tags.count()

9

In [11]:
# Drop rows that are exact duplicates across all columns, then re-count to see
# how many were removed.
posts_dataframe = posts_dataframe.dropDuplicates()
posts_dataframe.count()

38841

In [12]:
# Drop rows in which every column is null, then re-count.
posts_dataframe = posts_dataframe.na.drop(how="all")
posts_dataframe.count()

38841

In [13]:
# Post type ids 1 and 2 are the ones treated as questions and answers in the
# cells below; everything else is collected here and counted.
question_or_answer_type_ids = [1, 2]
not_question_or_answer = posts_dataframe.where(
    ~f.col(columns.post_type_id).isin(question_or_answer_type_ids)
)
not_question_or_answer.count()

34

In [14]:
# Questions: posts of type 1 that have a Tags value.
is_tagged_question = (f.col(columns.post_type_id) == 1) & f.col(columns.tags).isNotNull()
questions_dataframe = posts_dataframe.where(is_tagged_question)
questions_dataframe.count()

18521

In [15]:
# Answers: posts of type 2 whose Tags value is null.
is_untagged_answer = (f.col(columns.post_type_id) == 2) & f.col(columns.tags).isNull()
answers_dataframe = posts_dataframe.where(is_untagged_answer)
answers_dataframe.count()

20277

In [16]:
# Row count of the comments frame as loaded.
comments_dataframe.count()

58318

In [17]:
# Preview the first rows of the comments frame (long text is truncated).
comments_dataframe.show()

+---------+--------+-----+--------------------+-------------------+---------------+--------+--------------+
|       Id|  PostId|Score|                Text|       CreationDate|UserDisplayName|  UserId|ContentLicense|
+---------+--------+-----+--------------------+-------------------+---------------+--------+--------------+
|137056542|77741822|    0|@ezw yes i tried,...|2024-01-01 16:04:54|           null|23179735|  CC BY-SA 4.0|
|137056543|77740525|    0|Thank you very mu...|2024-01-01 16:05:02|           null|16114382|  CC BY-SA 4.0|
|137056544|77742261|    0|"I am able to see...|2024-01-01 16:05:20|           null|  861716|  CC BY-SA 4.0|
|137056546|72658808|    0|Would it be bette...|2024-01-01 16:05:56|           null| 3260008|  CC BY-SA 4.0|
|137056547|77742464|    0|I found out simil...|2024-01-01 16:06:35|           null|12018173|  CC BY-SA 4.0|
|137056548|77742409|    1|It won't retain t...|2024-01-01 16:06:37|           null| 7582247|  CC BY-SA 4.0|
|137056549|77740912|    0|Yo

In [18]:
# Per-column descriptive statistics (count, mean, stddev, min, ...) for comments.
comments_dataframe.describe().show()

+-------+--------------------+--------------------+-------------------+--------------------+---------------+-----------------+--------------+
|summary|                  Id|              PostId|              Score|                Text|UserDisplayName|           UserId|ContentLicense|
+-------+--------------------+--------------------+-------------------+--------------------+---------------+-----------------+--------------+
|  count|               58318|               58318|              58318|               58318|            220|            58098|         58318|
|   mean|1.3710182760706815E8| 7.374122717363422E7|0.27082547412462704|                null|           null|8190080.357000241|          null|
| stddev|   28219.87327927787|1.2896410434211098E7| 0.7203696474689596|                null|           null| 7584051.04121727|          null|
|    min|           137053436|                 623|                  0|!Now I disable a ...|   user11631308|               -1|  CC BY-SA 4.0|
|    m

In [19]:
# Per-column statistics for comments via summary(), which also reports
# percentile rows.
comments_dataframe.summary().show()

+-------+--------------------+--------------------+-------------------+--------------------+---------------+-----------------+--------------+
|summary|                  Id|              PostId|              Score|                Text|UserDisplayName|           UserId|ContentLicense|
+-------+--------------------+--------------------+-------------------+--------------------+---------------+-----------------+--------------+
|  count|               58318|               58318|              58318|               58318|            220|            58098|         58318|
|   mean|1.3710182760706815E8| 7.374122717363422E7|0.27082547412462704|                null|           null|8190080.357000241|          null|
| stddev|   28219.87327927787|1.2896410434211098E7| 0.7203696474689596|                null|           null| 7584051.04121727|          null|
|    min|           137053436|                 623|                  0|!Now I disable a ...|   user11631308|               -1|  CC BY-SA 4.0|
|    2

In [20]:
# Row count of the tag-synonyms frame as loaded.
tag_synonyms_dataframe.count()

5895

In [21]:
# Per-column descriptive statistics (count, mean, stddev, min, max) for tag
# synonyms.
tag_synonyms_dataframe.describe().show()

+-------+------------------+------------------+-------------+-----------------+------------------+------------------+------------------+
|summary|                Id|     SourceTagName|TargetTagName|      OwnerUserId|   AutoRenameCount|             Score|  ApprovedByUserId|
+-------+------------------+------------------+-------------+-----------------+------------------+------------------+------------------+
|  count|              5895|              5895|         5895|             5779|              5895|              5895|              5269|
|   mean| 5221.687362171331|          4584.185|          3.0| 1944282.62502163|460.77319762510604|1.1888040712468193|1541889.5312203455|
| stddev|2539.4878271606967|3874.7964273429902|         null|3145492.907472555|2751.4532084008333|1.6882425947780475|2882228.6606160607|
|    min|                 3|             .ajax|    .htaccess|               -1|                 0|                -1|                -1|
|    max|              9133|             

In [22]:
# Per-column statistics for tag synonyms via summary(), which also reports
# percentile rows such as 25%.
tag_synonyms_dataframe.summary().show()

+-------+------------------+------------------+-------------+-----------------+------------------+------------------+------------------+
|summary|                Id|     SourceTagName|TargetTagName|      OwnerUserId|   AutoRenameCount|             Score|  ApprovedByUserId|
+-------+------------------+------------------+-------------+-----------------+------------------+------------------+------------------+
|  count|              5895|              5895|         5895|             5779|              5895|              5895|              5269|
|   mean| 5221.687362171331|          4584.185|          3.0| 1944282.62502163|460.77319762510604|1.1888040712468193|1541889.5312203455|
| stddev|2539.4878271606967|3874.7964273429902|         null|3145492.907472555|2751.4532084008333|1.6882425947780475|2882228.6606160607|
|    min|                 3|             .ajax|    .htaccess|               -1|                 0|                -1|                -1|
|    25%|              3215|            8

In [23]:
# Row count of the tags frame as loaded (both CSV parts combined).
tags_dataframe.count()

65994

In [24]:
# Preview the first rows of the tags frame (long tag names are truncated).
tags_dataframe.show()

+------+--------------------+-----+-------------+----------+---------------+----------+
|    Id|             TagName|Count|ExcerptPostId|WikiPostId|IsModeratorOnly|IsRequired|
+------+--------------------+-----+-------------+----------+---------------+----------+
|100002|          glowscript|   17|     21239508|  21239507|           null|      null|
|100005|bootstrap-datetim...|  747|     29579877|  29579876|           null|      null|
|100007|       computercraft|  145|     21193895|  21193894|           null|      null|
|100008|         dapper-lite|    1|     21234038|  21234037|           null|      null|
|100010|          repast-hpc|   18|     21206486|  21206485|           null|      null|
|100011|               weber|    1|     21234197|  21234196|           null|      null|
|100012|       fitnesse-slim|  172|     21207710|  21207709|           null|      null|
|100013|              utfcpp|    2|     21197326|  21197325|           null|      null|
|100014| hyperbolic-function|   