In [0]:
!apt-get install openjdk-8-jdk-headless -qq > /dev/null
!wget -q https://www-us.apache.org/dist/spark/spark-2.4.4/spark-2.4.4-bin-hadoop2.7.tgz
!tar xf spark-2.4.4-bin-hadoop2.7.tgz
!pip install -q findspark

In [0]:
import os
# Point the JVM and Spark tooling at the locations installed by the setup cell.
os.environ.update(
    {
        "JAVA_HOME": "/usr/lib/jvm/java-8-openjdk-amd64",
        "SPARK_HOME": "/content/spark-2.4.4-bin-hadoop2.7",
    }
)

In [0]:
import findspark
# Must run before any pyspark import: findspark.init() locates the Spark
# install (via SPARK_HOME, set in the previous cell) and adds its Python
# libraries to sys.path.
findspark.init()
from pyspark.sql import SparkSession
# NOTE(review): the wildcard imports below supply the names later cells use
# unqualified (col, lit, explode, arrays_zip, regexp_replace, ArrayType,
# StructType). pyspark.sql.functions also exports names that shadow Python
# builtins (e.g. sum, min, max, round) for the rest of the notebook.
from pyspark.sql.functions import *
from pyspark.sql.types import *

In [0]:
# Colab-only: opens a browser file picker and blocks until the upload finishes.
# Later cells read HashtagTwitterPOS.csv, AkunTwitterPOS.csv and
# posindonesiaIG.json by relative path, so presumably those three files are
# what should be uploaded here.
from google.colab import files
uploaded = files.upload()

In [0]:
# Create (or reuse) a local Spark session that uses every available core.
session_builder = SparkSession.builder.master("local[*]").appName("test")
spark = session_builder.getOrCreate()

In [0]:
# Both Twitter exports are comma-delimited files with a header row, so they are
# read with one shared set of reader options.
csv_reader_options = {
    "parserLib": "univocity",
    "header": "true",
    "delimiter": ",",
}
df_twit_h = spark.read.format("csv").options(**csv_reader_options).load("HashtagTwitterPOS.csv")
df_twit_a = spark.read.format("csv").options(**csv_reader_options).load("AkunTwitterPOS.csv")

In [0]:
# Load the Instagram export; Spark infers the schema from the JSON records.
df_ig = spark.read.json("posindonesiaIG.json")

In [0]:
# Inspect the inferred schema of the raw Instagram data before flattening it.
df_ig.printSchema()

In [0]:
def FlatDF(schema, prefix=None):
    """Return the dotted column paths of every leaf field in ``schema``.

    Struct fields are descended into recursively. An array field is looked
    through one level: an array of structs contributes one path per struct
    member, while any other array is reported as a single leaf.

    A field name that itself contains a ``.`` is wrapped in backticks so that
    Spark resolves it as one name instead of as a nested path.

    Args:
        schema: A ``StructType`` (e.g. ``df.schema``) whose fields are walked.
        prefix: Path of the enclosing struct, prepended to every name. Used by
            the recursion; callers normally leave it as ``None``.

    Returns:
        A list of column-path strings suitable for ``DataFrame.select``.

    Note:
        Selecting a dotted path produces a column named after its last
        segment, so leaves that share a name under different parents end up
        as same-named columns in the selected frame.
    """
    fields = []
    for field in schema.fields:
        part = field.name
        # A literal dot would be parsed as a struct-member access; quote it.
        # Names that also contain a backtick are left untouched because they
        # cannot be quoted unambiguously.
        if "." in part and "`" not in part:
            part = "`" + part + "`"
        name = prefix + "." + part if prefix else part

        dtype = field.dataType
        if isinstance(dtype, ArrayType):
            # Look at what the array holds; only one level is unwrapped.
            dtype = dtype.elementType

        if isinstance(dtype, StructType):
            fields += FlatDF(dtype, prefix=name)
        else:
            fields.append(name)

    return fields

In [0]:
# Replace the nested frame with one whose columns are the flattened leaf fields.
flat_column_names = FlatDF(df_ig.schema)
df_ig = df_ig.select(*flat_column_names)

In [0]:
# Check the flattened schema and preview the first rows (show() prints 20 by default).
df_ig.printSchema()
df_ig.show()

In [0]:
# Peek at the element at index 1 of the author and comment columns
# (presumably parallel arrays, one entry per comment).
author_at_1 = col("author").getItem(1)
comment_at_1 = col("comment").getItem(1)
df_ig.select(author_at_1, comment_at_1).show()

In [0]:
# Pair each author with the comment at the same array position, then emit one
# row per (author, comment) pair.
zipped_pairs = arrays_zip("author", "comment")
df_ig_clean = (
    df_ig
    .withColumn("tmp", zipped_pairs)
    .withColumn("tmp", explode("tmp"))
    .select("tmp.author", "tmp.comment")
)

In [0]:
# Tag every Instagram row with its origin so it stays identifiable once the
# sources are combined.
df_ig_clean = df_ig_clean.withColumn("source", lit("Instagram"))
df_ig_clean.show()

In [0]:
# Keep only the author and text of each account tweet and label the origin.
df_twit_a_clean = (
    df_twit_a
    .select("username", "tweet")
    .withColumn("source", lit("Twitter"))
)
df_twit_a_clean.show()

In [0]:
# Keep only the author and text of each hashtag tweet and label the origin.
df_twit_h_clean = (
    df_twit_h
    .select("username", "tweet")
    .withColumn("source", lit("Twitter"))
)
df_twit_h_clean.show()

In [0]:
# Project all three frames onto the same columns, in the same order:
# (username, content, source). The order matters because the union in the next
# cell matches columns by position, not by name.
df_ig_clean = df_ig_clean.select(col("author").alias("username"), col("comment").alias("content"), "source")
df_twit_a_clean = df_twit_a_clean.select("username", col("tweet").alias("content"), "source")
df_twit_h_clean = df_twit_h_clean.select("username", col("tweet").alias("content"), "source")

In [0]:
from functools import reduce
from pyspark.sql import DataFrame

# Each frame was projected to (username, content, source) in the previous cell.
dfs = [df_ig_clean, df_twit_a_clean, df_twit_h_clean]

# Fold the list into one frame. DataFrame.union replaces unionAll, which is
# deprecated in Spark 2.x; semantics are the same: columns are matched by
# position and duplicate rows are kept.
df_union = reduce(DataFrame.union, dfs)

In [0]:
# Remove every character that is neither a word character nor whitespace from
# all string columns (this includes username and source, not just content).
# The pattern is a raw string so that \w and \s reach the regex engine intact
# instead of being treated as (invalid) Python escape sequences.
for c, t in df_union.dtypes:
    if t == "string":
        df_union = df_union.withColumn(c, regexp_replace(c, r"[^\w\s]", ""))

In [0]:
# Collapse to one partition so the output contains a single CSV part file,
# then write it with a header row, replacing any previous output.
single_partition = df_union.coalesce(1)
(
    single_partition.write
    .format("csv")
    .mode("overwrite")
    .option("header", "true")
    .save('union_data.csv')
)


In [0]:
# Preview the cleaned, combined data.
df_union.show()