In [1]:
from cassandra.cluster import Cluster
import csv

In [3]:
from pyspark.sql import SparkSession

In [4]:

# Start (or reuse) the Spark session named "ca2_nov" that the following cells use.
spark = (
    SparkSession.builder
    .appName("ca2_nov")
    .getOrCreate()
)

In [5]:
# HDFS URI of the raw tweets CSV (a single .csv file, despite the variable name).
folder = "hdfs://localhost:9000/user1/ProjectTweets.csv"

In [6]:
# Load the CSV without a header row and let Spark infer the column types.
df = (
    spark.read
    .option("header", False)
    .option("inferSchema", True)
    .csv(folder)
)

                                                                                

In [7]:
# Give the positional CSV columns meaningful names.
tweet_columns = ["id", "indice", "date", "query", "user", "tweet"]
df = df.toDF(*tweet_columns)

In [8]:
df.head()

Row(id=0, indice=1467810369, date='Mon Apr 06 22:19:45 PDT 2009', query='NO_QUERY', user='_TheSpecialOne_', tweet="@switchfoot http://twitpic.com/2y1zl - Awww, that's a bummer.  You shoulda got David Carr of Third Day to do it. ;D")

In [9]:
df.printSchema()


root
 |-- id: integer (nullable = true)
 |-- indice: long (nullable = true)
 |-- date: string (nullable = true)
 |-- query: string (nullable = true)
 |-- user: string (nullable = true)
 |-- tweet: string (nullable = true)



In [10]:
# Keep a single row per distinct tweet text.
# NOTE(review): only the "tweet" column is compared, so identical texts posted
# by different users are collapsed into one row — confirm this is intended.
df = df.drop_duplicates(subset=["tweet"])



In [11]:
# Switch Spark SQL to the legacy datetime parser before the date column is parsed.
# NOTE(review): presumably needed because the pattern used in the parsing cell
# contains 'E' (day-of-week), which Spark 3's default parser does not accept
# when parsing — confirm against the Spark datetime-pattern documentation.
spark.conf.set("spark.sql.legacy.timeParserPolicy", "LEGACY")



In [12]:
from pyspark.sql.functions import to_date

In [13]:
# Transform the "date" column from a string such as
# 'Mon Apr 06 22:19:45 PDT 2009' into a DateType value (the time of day is not kept).
tweet_date_pattern = "E MMM dd HH:mm:ss z yyyy"
df_con_data = df.withColumn("date", to_date("date", tweet_date_pattern))


In [14]:
df_con_data.show(1)

[Stage 5:>                                                          (0 + 1) / 1]

+-------+----------+----------+--------+--------+--------------------+
|     id|    indice|      date|   query|    user|               tweet|
+-------+----------+----------+--------+--------+--------------------+
|1429354|2059729483|2009-06-07|NO_QUERY|mtiishaw|       Hardest wo...|
+-------+----------+----------+--------+--------+--------------------+
only showing top 1 row



                                                                                

In [15]:
# Summary statistics. describe() only reports numeric and string columns, so
# the parsed DateType column "date" is absent from this table and dates that
# failed to parse (null) would go unnoticed here.
from pyspark.sql import functions as F

df_con_data.describe().show()

# Explicit null count for every column, including "date".
df_con_data.select(
    [
        F.sum(F.col(name).isNull().cast("int")).alias(name)
        for name in df_con_data.columns
    ]
).show()




+-------+-----------------+--------------------+--------+--------------------+--------------------+
|summary|               id|              indice|   query|                user|               tweet|
+-------+-----------------+--------------------+--------+--------------------+--------------------+
|  count|          1581466|             1581466| 1581466|             1581466|             1581466|
|   mean|799948.3493486423|1.9985071970856838E9|    null| 4.418920826240876E9|                null|
| stddev|462023.2915345735|1.9365969167076474E8|    null|5.218769198801353...|                null|
|    min|                0|          1467810369|NO_QUERY|        000catnap000|                 ...|
|    max|          1599999|          2329205794|NO_QUERY|          zzzzeus111|ï¿½ï¿½ï¿½ï¿½ï¿½ß§...|
+-------+-----------------+--------------------+--------+--------------------+--------------------+



                                                                                

In [16]:
df_con_data.printSchema()

root
 |-- id: integer (nullable = true)
 |-- indice: long (nullable = true)
 |-- date: date (nullable = true)
 |-- query: string (nullable = true)
 |-- user: string (nullable = true)
 |-- tweet: string (nullable = true)



# From Spark to MySQL

In [17]:
import mysql.connector


In [18]:
# MySQL connection settings.
# Values are read from environment variables so that no secret is stored in
# the notebook. The non-secret settings fall back to the local-development
# defaults; the password is prompted for when MYSQL_PASSWORD is not set.
import getpass
import os

hostname = os.environ.get("MYSQL_HOST", "127.0.0.1")
username = os.environ.get("MYSQL_USER", "root")
password = os.environ.get("MYSQL_PASSWORD") or getpass.getpass("MySQL password: ")
database_name = os.environ.get("MYSQL_DATABASE", "ca2")

# Open the connection with the database.
# NOTE(review): `conn` is not closed in the visible cells — call conn.close()
# once it is no longer needed.
conn = mysql.connector.connect(
    host=hostname,
    user=username,
    password=password,
    database=database_name,
)


In [19]:
from pyspark.sql import SparkSession
# Request a session configured with the MySQL JDBC driver jar.
# NOTE(review): a session was already created earlier in this notebook, and
# getOrCreate() returns the existing session when there is one, so the
# "spark.jars" setting below may not take effect — confirm, or move the
# setting to the cell where the session is first built.
spark = SparkSession.builder \
    .appName("ca2_nov") \
    .config("spark.jars", "hdfs:///mysql-connector-j-8.2.0.jar") \
    .getOrCreate()
# get the Spark session used for the JDBC write

In [20]:
# JDBC settings for Spark's MySQL writer.
# They are derived from the connection settings defined in the MySQL
# connection cell (hostname, username, password, database_name) instead of
# repeating the same literals, so the two cannot drift apart.
url = f"jdbc:mysql://{hostname}:3306/{database_name}"
properties = {
    "user": username,
    "password": password,
    "driver": "com.mysql.cj.jdbc.Driver"
}





In [21]:
# Copy the date-parsed DataFrame from Spark into the MySQL table "tweet".
# NOTE(review): with mode "append", re-running this cell inserts the rows again.
(
    df_con_data.write
    .mode("append")
    .jdbc(url, "tweet", properties=properties)
)

                                                                                

In [23]:
# Read a few rows back through the mysql-connector connection to confirm that
# the JDBC write reached the "tweet" table (the query was previously only
# defined, never executed).
query = "SELECT * FROM tweet LIMIT 10"

cursor = conn.cursor()
try:
    cursor.execute(query)
    sample_rows = cursor.fetchall()
finally:
    # Release the cursor even if the query fails.
    cursor.close()

sample_rows



In [25]:
query

'SELECT * FROM tweet LIMIT 10'

# From Spark to Cassandra

In [5]:
# Address of the Cassandra node used as the contact point.
indirizzo_del_nodo_di_contatto = '127.0.0.1'

# Driver-side cluster object built from that single contact point.
cluster = Cluster(contact_points=[indirizzo_del_nodo_di_contatto])

In [6]:
# Open a Cassandra session on the cluster (no keyspace is passed to connect()).
session = cluster.connect()

# Make my_ca3 the working keyspace for statements run through this session.
session.execute("USE my_ca3")

<cassandra.cluster.ResultSet at 0x7f28abb324a0>

In [38]:
# CQL statement that creates the target table in keyspace my_ca3 when it does
# not exist yet. `id` is the primary key; the remaining columns mirror the
# columns of the date-parsed Spark DataFrame (indice, date, query, user, tweet).
tweet_ca2 = """


CREATE TABLE IF NOT EXISTS my_ca3.tweet_ca2 (
    id INT PRIMARY KEY,
    indice BIGINT,
    date DATE,
    query TEXT,
    user TEXT,
    tweet TEXT
)



"""

# Run the DDL through the open Cassandra session.
session.execute(tweet_ca2)

<cassandra.cluster.ResultSet at 0x7f58462145e0>

In [39]:
# Name of the Cassandra table that receives the data.
table_name = "tweet_ca2"

# Write the date-parsed DataFrame through the Spark Cassandra connector in
# "overwrite" mode, with the truncate confirmation option set.
(
    df_con_data.write
    .format("org.apache.spark.sql.cassandra")
    .option("table", table_name)
    .option("keyspace", "my_ca3")
    .option("confirm.truncate", "true")
    .mode("overwrite")
    .save()
)

                                                                                

In [43]:
# Count the rows stored in the Cassandra table, using the driver session.
query = "SELECT COUNT(*) FROM tweet_ca2"
result = session.execute(query)


In [44]:
# Take the single value out of the one-row COUNT(*) result and report it.
count_row = result.one()
count = count_row[0]
print(f"Total number of rows is : {count}")

Total number of rows is : 1581466


In [7]:
# Run an unrestricted SELECT over the table through the driver session.
# NOTE(review): `result` is not consumed in the visible cells (the table is
# read again through Spark afterwards) — this cell may be removable; confirm.
query = "SELECT * FROM tweet_ca2"
result = session.execute(query)

In [7]:
from pyspark.sql import SparkSession
from pyspark.sql import DataFrame


In [8]:
keyspace = "my_ca3"  # keyspace that holds the table
table_name = "tweet_ca2"  # name of the table to read

# Load the Cassandra table into a Spark DataFrame (Cassandra -> Spark).
# NOTE(review): this rebinds `df`, which earlier cells bound to the CSV-based frame.
df = spark.read.format("org.apache.spark.sql.cassandra") \
    .options(table=table_name, keyspace=keyspace) \
    .load()



In [9]:
df.printSchema()

root
 |-- id: integer (nullable = false)
 |-- date: date (nullable = true)
 |-- indice: long (nullable = true)
 |-- query: string (nullable = true)
 |-- tweet: string (nullable = true)
 |-- user: string (nullable = true)



In [18]:
df.show(1)

+------+----------+----------+--------+--------------------+----------+
|    id|      date|    indice|   query|               tweet|      user|
+------+----------+----------+--------+--------------------+----------+
|584570|2009-06-18|2215319563|NO_QUERY|NOOOOOO!  floodin...|rchlwatson|
+------+----------+----------+--------+--------------------+----------+
only showing top 1 row



                                                                                

# Sentiment analysis using VADER

In [10]:
from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer

In [11]:
from pyspark.sql.functions import udf

In [14]:
from pyspark.sql.types import FloatType

In [18]:
_vader_analyzer = None  # created on first use by sentiment_function, then reused


def sentiment_function(text):
    """Return the VADER compound sentiment score for ``text``.

    Args:
        text: Tweet text, or None for a missing value.

    Returns:
        The 'compound' value produced by ``polarity_scores`` as a float, or
        None when ``text`` is None, so that a null tweet yields a null score
        instead of raising inside the UDF.
    """
    global _vader_analyzer

    if text is None:
        return None

    # Build the analyzer once and reuse it rather than rebuilding it per row.
    if _vader_analyzer is None:
        _vader_analyzer = SentimentIntensityAnalyzer()

    scores = _vader_analyzer.polarity_scores(text)
    return float(scores['compound'])


In [19]:

# Register the VADER scorer as a Spark UDF that returns a float
sentiment_udf = udf(sentiment_function, returnType=FloatType())

# Apply the UDF to the tweet text and store the result in a new "sentiment" column
df_sent = df.withColumn("sentiment", sentiment_udf("tweet"))


In [20]:
df_sent.printSchema()

root
 |-- id: integer (nullable = false)
 |-- date: date (nullable = true)
 |-- indice: long (nullable = true)
 |-- query: string (nullable = true)
 |-- tweet: string (nullable = true)
 |-- user: string (nullable = true)
 |-- sentiment: float (nullable = true)



In [21]:
df_sent.show(1)

[Stage 1:>                                                          (0 + 1) / 1]

+------+----------+----------+--------+--------------------+-------+---------+
|    id|      date|    indice|   query|               tweet|   user|sentiment|
+------+----------+----------+--------+--------------------+-------+---------+
|200596|2009-05-30|1971704740|NO_QUERY|lost my phone...a...|Nyabama|  -0.3182|
+------+----------+----------+--------+--------------------+-------+---------+
only showing top 1 row



                                                                                

In [22]:
from pyspark.sql.functions import when, col


In [23]:
# new column 'label'
# Three-way label from the score: 1 above 0.2, -1 below -0.2, 0 otherwise.
vader_score = col("sentiment")
vader_label = (
    when(vader_score > 0.2, 1)
    .when(vader_score < -0.2, -1)
    .otherwise(0)
)
df_sent = df_sent.withColumn("label_vader", vader_label)




# Sentiment analysis with TextBlob

In [24]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import udf
from pyspark.sql.types import StringType, FloatType
from textblob import TextBlob


In [29]:
def analyze_sentiment(text):
    """Return the TextBlob polarity of ``text`` as a float.

    Args:
        text: Tweet text, or None for a missing value.

    Returns:
        ``TextBlob(text).sentiment.polarity`` converted to float, or None when
        ``text`` is None, so that a null tweet yields a null score instead of
        raising inside the UDF.
    """
    if text is None:
        return None

    polarity = TextBlob(text).sentiment.polarity
    return float(polarity)

# Register the TextBlob scorer as a float-returning Spark UDF.
# NOTE(review): this rebinds `sentiment_udf`, which an earlier cell bound to
# the VADER-based UDF; from here on the name refers to the TextBlob scorer.
sentiment_udf = udf(analyze_sentiment, FloatType())



In [30]:
# Score every tweet with the UDF currently bound to `sentiment_udf` and keep
# the result in a "sentiment" column of a separate frame.
df_blob = df.withColumn("sentiment", sentiment_udf("tweet"))


In [31]:
from pyspark.sql.functions import struct

# New column with the sentiment score.
# NOTE(review): the previous cell already assigned this same UDF expression to
# "sentiment", so this looks like a redundant repeat; `struct` is not used in
# the visible cells — confirm and consider removing both.
df_blob = df_blob.withColumn("sentiment", sentiment_udf(df_blob["tweet"]))



In [32]:
df_blob.printSchema()

root
 |-- id: integer (nullable = false)
 |-- date: date (nullable = true)
 |-- indice: long (nullable = true)
 |-- query: string (nullable = true)
 |-- tweet: string (nullable = true)
 |-- user: string (nullable = true)
 |-- sentiment: float (nullable = true)



In [34]:
from pyspark.sql.functions import when, col

# Three-way label from the score: 1 above 0.2, -1 below -0.2, 0 otherwise.
blob_score = col("sentiment")
blob_label = (
    when(blob_score > 0.2, 1)
    .when(blob_score < -0.2, -1)
    .otherwise(0)
)
df_blob = df_blob.withColumn("label_blob", blob_label)


In [35]:
df_blob.show(1)

[Stage 2:>                                                          (0 + 1) / 1]

+------+----------+----------+--------+--------------------+--------------+---------+----------+
|    id|      date|    indice|   query|               tweet|          user|sentiment|label_blob|
+------+----------+----------+--------+--------------------+--------------+---------+----------+
|958144|2009-05-17|1825636460|NO_QUERY|All I hear people...|iPhoneFuzzball|      0.0|         0|
+------+----------+----------+--------+--------------------+--------------+---------+----------+
only showing top 1 row



                                                                                