In [1]:
# Importing the libraries needed
import pandas as pd
import numpy as np
import json
import scipy 
import torch
from torch.utils.data import Dataset, DataLoader
import logging
import os
# Choose the Spark release to install. Available releases are listed at
# http://www.apache.org/dist/spark/ ; the value must be the release directory
# name, for example:
# spark_version = 'spark-3.0.3'
spark_version = 'spark-3.2.2'
# Exported to the environment so the `!` shell commands below can expand
# $SPARK_VERSION.
os.environ['SPARK_VERSION']=spark_version

# Install Java 11 (headless), download and unpack the Spark distribution built
# for Hadoop 2.7, and install findspark.
!apt-get update
!apt-get install openjdk-11-jdk-headless -qq > /dev/null
!wget -q http://www.apache.org/dist/spark/$SPARK_VERSION/$SPARK_VERSION-bin-hadoop2.7.tgz
!tar xf $SPARK_VERSION-bin-hadoop2.7.tgz
!pip install -q findspark

# Point JAVA_HOME and SPARK_HOME at the installations above.
# NOTE(review): SPARK_HOME assumes the tarball was extracted under /content
# (presumably the notebook's working directory) - confirm for other environments.
import os
os.environ["JAVA_HOME"] = "/usr/lib/jvm/java-11-openjdk-amd64"
os.environ["SPARK_HOME"] = f"/content/{spark_version}-bin-hadoop2.7"

# Initialise findspark. This does not start a SparkSession; the session is
# created in a later cell via SparkSession.builder.
import findspark
findspark.init()

0% [Working]            Hit:1 https://cloud.r-project.org/bin/linux/ubuntu bionic-cran40/ InRelease
0% [Waiting for headers] [Waiting for headers] [Waiting for headers] [Waiting f0% [1 InRelease gpgv 3,626 B] [Waiting for headers] [Waiting for headers] [Wait                                                                               Ign:2 https://developer.download.nvidia.com/compute/machine-learning/repos/ubuntu1804/x86_64  InRelease
0% [1 InRelease gpgv 3,626 B] [Waiting for headers] [Waiting for headers] [Wait                                                                               Hit:3 http://archive.ubuntu.com/ubuntu bionic InRelease
0% [1 InRelease gpgv 3,626 B] [Waiting for headers] [Waiting for headers] [Wait                                                                               Hit:4 http://security.ubuntu.com/ubuntu bionic-security InRelease
                                                                               Hit:5 http://ppa.launchpad.

In [2]:
# Download the PostgreSQL JDBC driver jar; the SparkSession cell puts
# /content/postgresql-42.2.16.jar on the driver class path.
!wget https://jdbc.postgresql.org/download/postgresql-42.2.16.jar

--2022-08-30 01:30:52--  https://jdbc.postgresql.org/download/postgresql-42.2.16.jar
Resolving jdbc.postgresql.org (jdbc.postgresql.org)... 72.32.157.228, 2001:4800:3e1:1::228
Connecting to jdbc.postgresql.org (jdbc.postgresql.org)|72.32.157.228|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 1002883 (979K) [application/java-archive]
Saving to: ‘postgresql-42.2.16.jar’


2022-08-30 01:30:52 (5.19 MB/s) - ‘postgresql-42.2.16.jar’ saved [1002883/1002883]



In [3]:
from pyspark.sql import SparkSession

# Create (or reuse) the "UA_War" Spark session with the PostgreSQL JDBC driver
# jar on the driver class path.
spark = (
    SparkSession.builder
    .appName("UA_War")
    .config("spark.driver.extraClassPath", "/content/postgresql-42.2.16.jar")
    .getOrCreate()
)

In [4]:
# Load one day of tweets from the AWS S3 bucket. This DataFrame is the starting
# point for the data that will populate the database.

from pyspark import SparkFiles

start_date = "0401"
url = f"https://databootcamps3bucket.s3.us-west-2.amazonaws.com/ua_war/UkraineWar/{start_date}_UATweets.csv.gz"

# Register the remote file with Spark so it can be located through SparkFiles.
spark.sparkContext.addFile(url)

# Read the gzipped CSV: comma-delimited, UTF-8, multi-line fields allowed,
# '"' as the escape character, header row present, column types inferred.
df_start_date = (
    spark.read
    .option("delimiter", ",")
    .option("encoding", "UTF-8")
    .option("multiLine", True)
    .option("escape", '"')
    .csv(
        SparkFiles.get(f"{start_date}_UATweets.csv.gz"),
        header=True,
        inferSchema=True,
    )
)

# Replace nulls with the "<empty>" placeholder (a string fill value).
df_start_date_2 = df_start_date.na.fill("<empty>")

df_start_date_2.show(5)

+---+-------------------+---------------+--------------------+--------------------+---------+---------+-----------+--------------------+-------------------+--------------------+------------+--------------------+--------------------+--------+-----------+--------------+--------------------+
|_c0|             userid|       username|            acctdesc|            location|following|followers|totaltweets|       usercreatedts|            tweetid|      tweetcreatedts|retweetcount|                text|            hashtags|language|coordinates|favorite_count|         extractedts|
+---+-------------------+---------------+--------------------+--------------------+---------+---------+-----------+--------------------+-------------------+--------------------+------------+--------------------+--------------------+--------+-----------+--------------+--------------------+
|  0|           16882774|        Yaniela|Animal lover, sup...|              Hawaii|     1158|      392|      88366|2008-10-21 07:3

In [5]:
# Inspect the column data types Spark inferred for the start-date DataFrame.
df_start_date.dtypes

[('_c0', 'int'),
 ('userid', 'bigint'),
 ('username', 'string'),
 ('acctdesc', 'string'),
 ('location', 'string'),
 ('following', 'int'),
 ('followers', 'int'),
 ('totaltweets', 'int'),
 ('usercreatedts', 'string'),
 ('tweetid', 'bigint'),
 ('tweetcreatedts', 'string'),
 ('retweetcount', 'int'),
 ('text', 'string'),
 ('hashtags', 'string'),
 ('language', 'string'),
 ('coordinates', 'string'),
 ('favorite_count', 'int'),
 ('extractedts', 'string')]

In [6]:
# The column count determines which drop/clean-up path is needed: not all dates
# have the same number of columns.
len(df_start_date.columns)

18

In [7]:
from pyspark import SparkFiles
from pyspark.sql.functions import *

# Daily files do not all share one schema. The 18-column layout has no
# is_retweet / is_quote_status columns, so add both as null booleans to keep
# the column set consistent across days.
df_start_date_count = len(df_start_date.columns)
if df_start_date_count == 18:
    df_start_date_2 = (
        df_start_date_2
        .withColumn("is_retweet", lit(None).cast('boolean'))
        .withColumn("is_quote_status", lit(None).cast('boolean'))
    )

len(df_start_date_2.columns)

20

In [19]:
from pyspark.sql.functions import *

# Build the cleaned starting DataFrame:
#   1. Drop unnecessary columns (the drop list uses the 08-18-2022 data set as
#      its starting point, per the original note).
#   2. Convert the timestamp string columns to year-month-day dates.
#   3. Keep only English tweets from accounts created before 2009-01-01.
cleaned_df = (
    df_start_date_2
    .drop(
        "userid", "_c0", "acctdesc", "location", "tweetid", "coordinates",
        "original_tweet_id", "original_tweet_userid", "original_tweet_username",
        "in_reply_to_status_id", "in_reply_to_screen_name", 'in_reply_to_user_id',
        "quoted_status_id", "quoted_status_username", "quoted_status_userid",
    )
    .withColumn("usercreatedts", to_date("usercreatedts"))
    .withColumn("tweetcreatedts", to_date("tweetcreatedts"))
    .withColumn("extractedts", to_date("extractedts"))
    .filter(col("language") == "en")
    .filter(col("usercreatedts") < "2009-01-01")
)

cleaned_df.show(5)

+-----------+---------+---------+-----------+-------------+--------------+------------+--------------------+--------------------+--------+--------------+-----------+----------+---------------+
|   username|following|followers|totaltweets|usercreatedts|tweetcreatedts|retweetcount|                text|            hashtags|language|favorite_count|extractedts|is_retweet|is_quote_status|
+-----------+---------+---------+-----------+-------------+--------------+------------+--------------------+--------------------+--------+--------------+-----------+----------+---------------+
|    Yaniela|     1158|      392|      88366|   2008-10-21|    2022-04-01|        3412|⚡The Ukrainian Ai...|                  []|      en|             0| 2022-04-01|      null|           null|
|   livemint|       70|  2033266|     394746|   2008-11-27|    2022-04-01|           2|India's purchase ...|[{'text': 'Russia...|      en|             7| 2022-04-01|      null|           null|
|   helchose|     2693|      818|  

In [20]:
# Number of rows left in cleaned_df after the column drop and filters.
cleaned_df_count = cleaned_df.count()
cleaned_df_count

6199

In [21]:
# "MMDD" keys for April 2nd through April 30th (April 1st is loaded separately
# as the starting DataFrame).
date_arr_april = [f"04{day_of_month:02d}" for day_of_month in range(2, 31)]

In [22]:
from pyspark import SparkFiles
from pyspark.sql.functions import *

url = "https://databootcamps3bucket.s3.us-west-2.amazonaws.com/ua_war/UkraineWar"

# Load each remaining April day, apply the same cleaning as the starting
# DataFrame, and append the result to cleaned_df. A running row count is
# printed after every day that is appended.
for day in date_arr_april:
    # Load the day's data from AWS.
    aws_url = f"{url}/{day}_UATweets.csv.gz"
    spark.sparkContext.addFile(aws_url)
    temp_df = (
        spark.read
        .option("delimiter", ",")
        .option("encoding", "UTF-8")
        .option("multiLine", True)
        .option("escape", '"')
        .csv(SparkFiles.get(f"{day}_UATweets.csv.gz"), header=True, inferSchema=True)
    )

    # The raw column count decides which schema branch below applies.
    temp_count = len(temp_df.columns)

    # Change columns from string to year-month-day date format.
    temp_df = temp_df.withColumn("usercreatedts", to_date("usercreatedts"))
    temp_df = temp_df.withColumn("tweetcreatedts", to_date("tweetcreatedts"))
    temp_df = temp_df.withColumn("extractedts", to_date("extractedts"))

    # Keep English tweets only.
    temp_df = temp_df.filter(temp_df["language"] == "en")
    # Keep accounts created before 2009.
    temp_df = temp_df.filter(temp_df["usercreatedts"] < "2009-01-01")

    # Fill in null values.
    temp_df_2 = temp_df.na.fill("<empty>")

    # Some days have fewer columns than others.
    if temp_count == 18:
        # 18-column layout: is_retweet / is_quote_status are absent, so add
        # them as null booleans to match cleaned_df's columns.
        temp_df_3 = temp_df_2.drop("userid", "_c0", "acctdesc", "location", "tweetid", "coordinates")
        temp_df_3 = temp_df_3.withColumn("is_retweet", lit(None).cast('boolean'))
        temp_df_3 = temp_df_3.withColumn("is_quote_status", lit(None).cast('boolean'))

    elif temp_count == 29:
        # 29-column layout: drop the extra reply/quote/original-tweet columns.
        temp_df_3 = temp_df_2.drop("userid", "_c0", "acctdesc", "location", "tweetid", "coordinates", "original_tweet_id", "original_tweet_userid", "original_tweet_username", "in_reply_to_status_id", "in_reply_to_screen_name", 'in_reply_to_user_id', "quoted_status_id", "quoted_status_username", "quoted_status_userid")

    else:
        # Unknown layout: report it and skip this day. Without the skip, the
        # union below would re-append the previous day's temp_df_3 (or raise
        # NameError on the first iteration).
        print(f"Error on {day}_UATweets.csv.gz and column count {temp_count}")
        continue

    cleaned_df = cleaned_df.unionByName(temp_df_3)
    print(f"{day} - {cleaned_df.count()}")


0402 - 12510
0403 - 19637
0404 - 26082
0405 - 32584
0406 - 39004
0407 - 44668
0408 - 50255
0409 - 56154
0410 - 61791
0411 - 67322
0412 - 72976
0413 - 78634
0414 - 83715
0415 - 89036
0416 - 93866
0417 - 98592
0418 - 103709
0419 - 109379
0420 - 114341
0421 - 119197
0422 - 123636
0423 - 128379
0424 - 133181
0425 - 137330
0426 - 142114
0427 - 146439
0428 - 150627
0429 - 154985
0430 - 159841


In [23]:
# Preview the first 5 rows of cleaned_df.
cleaned_df.show(5)

+-----------+---------+---------+-----------+-------------+--------------+------------+--------------------+--------------------+--------+--------------+-----------+----------+---------------+
|   username|following|followers|totaltweets|usercreatedts|tweetcreatedts|retweetcount|                text|            hashtags|language|favorite_count|extractedts|is_retweet|is_quote_status|
+-----------+---------+---------+-----------+-------------+--------------+------------+--------------------+--------------------+--------+--------------+-----------+----------+---------------+
|    Yaniela|     1158|      392|      88366|   2008-10-21|    2022-04-01|        3412|⚡The Ukrainian Ai...|                  []|      en|             0| 2022-04-01|      null|           null|
|   livemint|       70|  2033266|     394746|   2008-11-27|    2022-04-01|           2|India's purchase ...|[{'text': 'Russia...|      en|             7| 2022-04-01|      null|           null|
|   helchose|     2693|      818|  

In [24]:
# Add a "weekofyear" column holding the week number of each tweet's creation
# date ("tweetcreatedts").
cleaned_df = cleaned_df.withColumn("weekofyear",weekofyear("tweetcreatedts"))

In [25]:
# Preview the first 5 rows to check the new weekofyear column.
cleaned_df.show(5)

+-----------+---------+---------+-----------+-------------+--------------+------------+--------------------+--------------------+--------+--------------+-----------+----------+---------------+----------+
|   username|following|followers|totaltweets|usercreatedts|tweetcreatedts|retweetcount|                text|            hashtags|language|favorite_count|extractedts|is_retweet|is_quote_status|weekofyear|
+-----------+---------+---------+-----------+-------------+--------------+------------+--------------------+--------------------+--------+--------------+-----------+----------+---------------+----------+
|    Yaniela|     1158|      392|      88366|   2008-10-21|    2022-04-01|        3412|⚡The Ukrainian Ai...|                  []|      en|             0| 2022-04-01|      null|           null|        13|
|   livemint|       70|  2033266|     394746|   2008-11-27|    2022-04-01|           2|India's purchase ...|[{'text': 'Russia...|      en|             7| 2022-04-01|      null|        

In [26]:
# Check the final column types before writing cleaned_df to the database.
cleaned_df.dtypes

[('username', 'string'),
 ('following', 'int'),
 ('followers', 'int'),
 ('totaltweets', 'int'),
 ('usercreatedts', 'date'),
 ('tweetcreatedts', 'date'),
 ('retweetcount', 'int'),
 ('text', 'string'),
 ('hashtags', 'string'),
 ('language', 'string'),
 ('favorite_count', 'int'),
 ('extractedts', 'date'),
 ('is_retweet', 'boolean'),
 ('is_quote_status', 'boolean'),
 ('weekofyear', 'int')]

### Connect to the AWS RDS instance and write the cleaned DataFrame to its table.

In [27]:
from getpass import getpass

# Prompt for the database password so it is never stored in the notebook.
password = getpass('Enter database password')

# Connection settings for the RDS PostgreSQL instance.
jdbc_url = "jdbc:postgresql://tweets.cnzbbvrrhst7.us-west-1.rds.amazonaws.com:5432/ua_data"
mode = "append"
config = dict(
    user="uatweets",
    password=password,
    driver="org.postgresql.Driver",
)


Enter database password··········


In [28]:
# Write cleaned_df to the 'tweets_table' table in RDS over JDBC.
# NOTE(review): mode is "append" (set in the configuration cell), so re-running
# this cell presumably inserts the same rows again - confirm before re-running.
cleaned_df.write.jdbc(url=jdbc_url, table='tweets_table', mode=mode, properties=config)