In [1]:
# Importing the libraries needed
import pandas as pd
import numpy as np
import json
import scipy 
import torch
from torch.utils.data import Dataset, DataLoader
import logging
import os
# Pick a Spark release listed at http://www.apache.org/dist/spark/ and set it below.
# The value must match the directory/tarball name used in the download URL,
# for example:
# spark_version = 'spark-3.0.3'
spark_version = 'spark-3.2.2'
# Exported so the shell commands below can expand $SPARK_VERSION.
os.environ['SPARK_VERSION']=spark_version

# Install Java 11, download and unpack the Spark tarball (hadoop2.7 build), and install findspark
!apt-get update
!apt-get install openjdk-11-jdk-headless -qq > /dev/null
!wget -q http://www.apache.org/dist/spark/$SPARK_VERSION/$SPARK_VERSION-bin-hadoop2.7.tgz
!tar xf $SPARK_VERSION-bin-hadoop2.7.tgz
!pip install -q findspark

# Set Environment Variables
# SPARK_HOME points at the directory unpacked above; the /content prefix assumes
# the notebook's working directory is /content (Colab) — TODO confirm elsewhere.
import os
os.environ["JAVA_HOME"] = "/usr/lib/jvm/java-11-openjdk-amd64"
os.environ["SPARK_HOME"] = f"/content/{spark_version}-bin-hadoop2.7"

# Initialise findspark so that pyspark can be imported (presumably located via the
# SPARK_HOME set above). The SparkSession itself is created in a later cell.
import findspark
findspark.init()

0% [Working]            Hit:1 http://ppa.launchpad.net/c2d4u.team/c2d4u4.0+/ubuntu bionic InRelease
0% [Connecting to archive.ubuntu.com (91.189.91.39)] [Connecting to security.ub                                                                               Hit:2 http://ppa.launchpad.net/cran/libgit2/ubuntu bionic InRelease
0% [Connecting to archive.ubuntu.com (91.189.91.39)] [Connecting to security.ub0% [1 InRelease gpgv 15.9 kB] [Connecting to archive.ubuntu.com (91.189.91.39)]                                                                               Hit:3 http://ppa.launchpad.net/deadsnakes/ppa/ubuntu bionic InRelease
0% [1 InRelease gpgv 15.9 kB] [Connecting to archive.ubuntu.com (91.189.91.39)]                                                                               Hit:4 https://cloud.r-project.org/bin/linux/ubuntu bionic-cran40/ InRelease
Hit:5 http://ppa.launchpad.net/graphics-drivers/ppa/ubuntu bionic InRelease
Get:6 http://security.ubuntu.com/ubuntu bion

In [2]:
!wget https://jdbc.postgresql.org/download/postgresql-42.2.16.jar

--2022-08-30 03:00:56--  https://jdbc.postgresql.org/download/postgresql-42.2.16.jar
Resolving jdbc.postgresql.org (jdbc.postgresql.org)... 72.32.157.228, 2001:4800:3e1:1::228
Connecting to jdbc.postgresql.org (jdbc.postgresql.org)|72.32.157.228|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 1002883 (979K) [application/java-archive]
Saving to: ‘postgresql-42.2.16.jar.2’


2022-08-30 03:00:57 (1.59 MB/s) - ‘postgresql-42.2.16.jar.2’ saved [1002883/1002883]



In [3]:
from pyspark.sql import SparkSession

# Build (or reuse) the notebook's Spark session, putting the PostgreSQL JDBC jar
# on the driver classpath.
spark = (
    SparkSession.builder
    .appName("UA_War")
    .config("spark.driver.extraClassPath", "/content/postgresql-42.2.16.jar")
    .getOrCreate()
)

In [4]:
# Import one day of data from the AWS S3 bucket. This is the starting DataFrame
# used to populate the DB.

from pyspark import SparkFiles

start_date = "0301"
url = f"https://databootcamps3bucket.s3.us-west-2.amazonaws.com/ua_war/UkraineWar/{start_date}_UATweets.csv.gz"

# Register the remote file with Spark, then read it through the SparkFiles path.
spark.sparkContext.addFile(url)
start_date_path = SparkFiles.get(f"{start_date}_UATweets.csv.gz")

# Tweet text can span several lines and contain quotes, hence multiLine and the
# '"' escape character.
csv_reader = (
    spark.read
    .option("delimiter", ",")
    .option("encoding", "UTF-8")
    .option("multiLine", True)
    .option("escape", '"')
)
df_start_date = csv_reader.csv(start_date_path, header=True, inferSchema=True)

# Fill missing values with the "<empty>" placeholder.
df_start_date_2 = df_start_date.na.fill("<empty>")

df_start_date_2.show(5)

+------+----------+--------------+--------------------+--------------------+---------+---------+-----------+--------------------+-------------------+-------------------+------------+--------------------+--------------------+--------+-----------+--------------+--------------------+
|   _c0|    userid|      username|            acctdesc|            location|following|followers|totaltweets|       usercreatedts|            tweetid|     tweetcreatedts|retweetcount|                text|            hashtags|language|coordinates|favorite_count|         extractedts|
+------+----------+--------------+--------------------+--------------------+---------+---------+-----------+--------------------+-------------------+-------------------+------------+--------------------+--------------------+--------+-----------+--------------+--------------------+
|149212|  37334464|DownandOut1489|Because He lives ...|             <empty>|      914|      759|      40561|2009-05-03 01:43:...|1498447926531887104|2022-

In [5]:
# Inspect the column types Spark inferred for the start-date DataFrame (df_start_date).
df_start_date.dtypes

[('_c0', 'int'),
 ('userid', 'bigint'),
 ('username', 'string'),
 ('acctdesc', 'string'),
 ('location', 'string'),
 ('following', 'int'),
 ('followers', 'int'),
 ('totaltweets', 'int'),
 ('usercreatedts', 'string'),
 ('tweetid', 'bigint'),
 ('tweetcreatedts', 'string'),
 ('retweetcount', 'int'),
 ('text', 'string'),
 ('hashtags', 'string'),
 ('language', 'string'),
 ('coordinates', 'string'),
 ('favorite_count', 'int'),
 ('extractedts', 'string')]

In [6]:
# Count the columns: not all dates have the same number of columns, and the count
# decides which columns have to be dropped or added for a given day.
len(df_start_date.columns)

18

In [7]:
from pyspark import SparkFiles
from pyspark.sql.functions import *

# Not all dates have the same number of columns. The 18-column layout has no
# is_retweet / is_quote_status columns, so add both as null booleans to line the
# frame up with the wider layout.
df_start_date_count = len(df_start_date.columns)
if df_start_date_count == 18:
    df_start_date_2 = (
        df_start_date_2
        .withColumn("is_retweet", lit(None).cast('boolean'))
        .withColumn("is_quote_status", lit(None).cast('boolean'))
    )

len(df_start_date_2.columns)

20

In [8]:
from pyspark.sql.functions import *

#1. Drop unnecessary columns (list based on the 08-18-2022 data set layout).
unused_columns = [
    "userid", "_c0", "acctdesc", "location", "tweetid", "coordinates",
    "original_tweet_id", "original_tweet_userid", "original_tweet_username",
    "in_reply_to_status_id", "in_reply_to_screen_name", 'in_reply_to_user_id',
    "quoted_status_id", "quoted_status_username", "quoted_status_userid",
]
trimmed_df = df_start_date_2.drop(*unused_columns)

#2. Convert the timestamp columns from string to year-month-day dates.
dated_df = trimmed_df
for ts_column in ("usercreatedts", "tweetcreatedts", "extractedts"):
    dated_df = dated_df.withColumn(ts_column, to_date(ts_column))

#3. Keep only English-language tweets from accounts created before 2009-01-01.
is_english = dated_df["language"] == "en"
created_before_2009 = dated_df["usercreatedts"] < "2009-01-01"
cleaned_df = dated_df.filter(is_english & created_before_2009)

cleaned_df.show(5)

+-------------+---------+---------+-----------+-------------+--------------+------------+--------------------+--------------------+--------+--------------+-----------+----------+---------------+
|     username|following|followers|totaltweets|usercreatedts|tweetcreatedts|retweetcount|                text|            hashtags|language|favorite_count|extractedts|is_retweet|is_quote_status|
+-------------+---------+---------+-----------+-------------+--------------+------------+--------------------+--------------------+--------+--------------+-----------+----------+---------------+
|          dna|       24|  2264852|     647718|   2008-11-28|    2022-03-01|           0|.@peta urges gove...|[{'text': 'Ukrain...|      en|             0| 2022-03-01|      null|           null|
|       donnyd|     1269|     1668|      22199|   2008-03-10|    2022-03-01|       20587|Moscow undergroun...|[{'text': 'Ukrain...|      en|             0| 2022-03-01|      null|           null|
|     ABCNews4|     6527|

In [9]:
# Row count of the cleaned start-date DataFrame.
cleaned_df_count = cleaned_df.count()
cleaned_df_count

5819

In [10]:
# File-name date keys for the remaining March files: one key per day, except that
# the 27th and 28th share the single key "0327_to_28".
date_arr_march = (
    [f"03{day_of_month:02d}" for day_of_month in range(2, 27)]
    + ["0327_to_28"]
    + [f"03{day_of_month:02d}" for day_of_month in range(29, 32)]
)

In [11]:
# Reversed copy of the date keys (latest day first). A copy is built on purpose:
# list.reverse() works in place and returns None, so its result cannot be assigned.
reverse_march_arr = list(reversed(date_arr_march))

In [12]:
# Show the order in which the daily files will be processed.
print(reverse_march_arr)

['0331', '0330', '0329', '0327_to_28', '0326', '0325', '0324', '0323', '0322', '0321', '0320', '0319', '0318', '0317', '0316', '0315', '0314', '0313', '0312', '0311', '0310', '0309', '0308', '0307', '0306', '0305', '0304', '0303', '0302']


In [13]:
from pyspark import SparkFiles
from pyspark.sql.functions import *

# Load every remaining March file, clean it the same way as the start-date frame,
# and append it to cleaned_df.
# NOTE: not idempotent. Re-running this cell appends the same days to cleaned_df again.
url = "https://databootcamps3bucket.s3.us-west-2.amazonaws.com/ua_war/UkraineWar"

# Columns that are not needed downstream. The 18-column files only contain the
# first six of these; DataFrame.drop ignores names that are not present, so one
# list serves both layouts.
columns_to_drop = [
    "userid", "_c0", "acctdesc", "location", "tweetid", "coordinates",
    "original_tweet_id", "original_tweet_userid", "original_tweet_username",
    "in_reply_to_status_id", "in_reply_to_screen_name", "in_reply_to_user_id",
    "quoted_status_id", "quoted_status_username", "quoted_status_userid",
]

# Running row total. Counting only each new day's rows avoids re-scanning the
# whole (growing) union on every iteration while printing the same numbers.
running_total = cleaned_df.count()

for day in reverse_march_arr:
    # load the data from aws
    aws_url = f"{url}/{day}_UATweets.csv.gz"
    spark.sparkContext.addFile(aws_url)
    temp_df = (
        spark.read
        .option("delimiter", ",")
        .option("encoding", "UTF-8")
        .option("multiLine", True)
        .option("escape", '"')
        .csv(SparkFiles.get(f"{day}_UATweets.csv.gz"), header=True, inferSchema=True)
    )

    # the column count tells which file layout this day uses
    temp_count = len(temp_df.columns)
    if temp_count not in (18, 29):
        # Unknown layout: report it and skip the day. Falling through would either
        # raise NameError (first iteration) or append the previous day's
        # temp_df_3 a second time.
        print(f"Error on {day}_UATweets.csv.gz and column count {temp_count}")
        continue

    # change columns from string to year-month-day date format
    temp_df = temp_df.withColumn("usercreatedts", to_date("usercreatedts"))
    temp_df = temp_df.withColumn("tweetcreatedts", to_date("tweetcreatedts"))
    temp_df = temp_df.withColumn("extractedts", to_date("extractedts"))

    # keep English tweets only
    temp_df = temp_df.filter(temp_df["language"] == "en")
    # keep accounts created before 2009-01-01
    temp_df = temp_df.filter(temp_df["usercreatedts"] < "2009-01-01")

    # fill in null values
    temp_df_2 = temp_df.na.fill("<empty>")

    temp_df_3 = temp_df_2.drop(*columns_to_drop)
    if temp_count == 18:
        # the 18-column layout lacks these two flags; add them as null booleans
        # so the schemas line up for unionByName
        temp_df_3 = temp_df_3.withColumn("is_retweet", lit(None).cast('boolean'))
        temp_df_3 = temp_df_3.withColumn("is_quote_status", lit(None).cast('boolean'))

    cleaned_df = cleaned_df.unionByName(temp_df_3)
    running_total += temp_df_3.count()
    print(f"{day} - {running_total}")


0331 - 11220
0330 - 16574
0329 - 22351
0327_to_28 - 34582
0326 - 41512
0325 - 48762
0324 - 55175
0323 - 62027
0322 - 68907
0321 - 77402
0320 - 84582
0319 - 91720
0318 - 100011
0317 - 108256
0316 - 115944
0315 - 124412
0314 - 131508
0313 - 138839
0312 - 145564
0311 - 152725
0310 - 160400
0309 - 168261
0308 - 177108
0307 - 187120
0306 - 196191
0305 - 204519
0304 - 211474
0303 - 217086
0302 - 223023


In [14]:
# Preview the first five rows of cleaned_df.
cleaned_df.show(5)

+-------------+---------+---------+-----------+-------------+--------------+------------+--------------------+--------------------+--------+--------------+-----------+----------+---------------+
|     username|following|followers|totaltweets|usercreatedts|tweetcreatedts|retweetcount|                text|            hashtags|language|favorite_count|extractedts|is_retweet|is_quote_status|
+-------------+---------+---------+-----------+-------------+--------------+------------+--------------------+--------------------+--------+--------------+-----------+----------+---------------+
|          dna|       24|  2264852|     647718|   2008-11-28|    2022-03-01|           0|.@peta urges gove...|[{'text': 'Ukrain...|      en|             0| 2022-03-01|      null|           null|
|       donnyd|     1269|     1668|      22199|   2008-03-10|    2022-03-01|       20587|Moscow undergroun...|[{'text': 'Ukrain...|      en|             0| 2022-03-01|      null|           null|
|     ABCNews4|     6527|

In [15]:
# Add a "weekofyear" column derived from the tweet creation date ("tweetcreatedts").
# NOTE: weekofyear is not imported in this cell; it is only in scope through the
# `from pyspark.sql.functions import *` in earlier cells.
cleaned_df = cleaned_df.withColumn("weekofyear",weekofyear("tweetcreatedts"))

In [16]:
# Preview the first five rows to check the new weekofyear column.
cleaned_df.show(5)

+-------------+---------+---------+-----------+-------------+--------------+------------+--------------------+--------------------+--------+--------------+-----------+----------+---------------+----------+
|     username|following|followers|totaltweets|usercreatedts|tweetcreatedts|retweetcount|                text|            hashtags|language|favorite_count|extractedts|is_retweet|is_quote_status|weekofyear|
+-------------+---------+---------+-----------+-------------+--------------+------------+--------------------+--------------------+--------+--------------+-----------+----------+---------------+----------+
|          dna|       24|  2264852|     647718|   2008-11-28|    2022-03-01|           0|.@peta urges gove...|[{'text': 'Ukrain...|      en|             0| 2022-03-01|      null|           null|         9|
|       donnyd|     1269|     1668|      22199|   2008-03-10|    2022-03-01|       20587|Moscow undergroun...|[{'text': 'Ukrain...|      en|             0| 2022-03-01|      nul

In [17]:
# Check the final column types before writing to the database.
cleaned_df.dtypes

[('username', 'string'),
 ('following', 'int'),
 ('followers', 'int'),
 ('totaltweets', 'int'),
 ('usercreatedts', 'date'),
 ('tweetcreatedts', 'date'),
 ('retweetcount', 'int'),
 ('text', 'string'),
 ('hashtags', 'string'),
 ('language', 'string'),
 ('favorite_count', 'int'),
 ('extractedts', 'date'),
 ('is_retweet', 'boolean'),
 ('is_quote_status', 'boolean'),
 ('weekofyear', 'int')]

In [18]:
# Total number of rows that will be written to the database.
cleaned_df.count()

223023

### Connect to the AWS RDS instance and write the cleaned DataFrame to `tweets_table`.

In [21]:
from getpass import getpass

# Prompt for the database password so it is never stored in the notebook.
password = getpass('Enter database password')

# JDBC connection settings for the RDS PostgreSQL instance.
jdbc_url = "jdbc:postgresql://tweets.cnzbbvrrhst7.us-west-1.rds.amazonaws.com:5432/ua_data"
mode = "append"
config = dict(
    user="uatweets",
    password=password,
    driver="org.postgresql.Driver",
)


Enter database password··········


In [22]:
# Write cleaned_df to the tweets_table table in RDS over JDBC, using the write
# mode ("append") and connection properties configured in the previous cell.
cleaned_df.write.jdbc(url=jdbc_url, table='tweets_table', mode=mode, properties=config)