In [1]:
from cassandra.cluster import Cluster
import csv

In [2]:
from pyspark.sql import SparkSession

In [3]:

spark = SparkSession.builder.appName("ca2_nov").getOrCreate()

In [4]:
folder = "hdfs://localhost:9000/user1/ProjectTweets.csv"

In [51]:
df = spark.read.csv(folder, header=False, inferSchema=True)

                                                                                

In [52]:
df = df.toDF("id", "indice", "date", "query", "user", "tweet")

In [53]:
df.head()

Row(id=0, indice=1467810369, date='Mon Apr 06 22:19:45 PDT 2009', query='NO_QUERY', user='_TheSpecialOne_', tweet="@switchfoot http://twitpic.com/2y1zl - Awww, that's a bummer.  You shoulda got David Carr of Third Day to do it. ;D")

In [59]:
df.printSchema()


root
 |-- id: integer (nullable = true)
 |-- indice: long (nullable = true)
 |-- date: string (nullable = true)
 |-- query: string (nullable = true)
 |-- user: string (nullable = true)
 |-- tweet: string (nullable = true)



In [55]:
df = df.dropDuplicates(["tweet"])



In [56]:
# using legacy for date parsing
spark.conf.set("spark.sql.legacy.timeParserPolicy", "LEGACY")



In [57]:
# trasform date column from string to date
df_con_data = df.withColumn("date", to_date(df["date"], "E MMM dd HH:mm:ss z yyyy"))


In [58]:
df_con_data.show(1)



+-------+----------+----------+--------+--------+--------------------+
|     id|    indice|      date|   query|    user|               tweet|
+-------+----------+----------+--------+--------+--------------------+
|1429354|2059729483|2009-06-07|NO_QUERY|mtiishaw|       Hardest wo...|
+-------+----------+----------+--------+--------+--------------------+
only showing top 1 row



                                                                                

In [61]:
# check on null values
df_con_data.describe().show()




+-------+-----------------+--------------------+--------+--------------------+--------------------+
|summary|               id|              indice|   query|                user|               tweet|
+-------+-----------------+--------------------+--------+--------------------+--------------------+
|  count|          1581466|             1581466| 1581466|             1581466|             1581466|
|   mean|799948.3493486423|1.9985071970856838E9|    null| 4.418920826240876E9|                null|
| stddev|462023.2915345735|1.9365969167076474E8|    null|5.218769198801353...|                null|
|    min|                0|          1467810369|NO_QUERY|        000catnap000|                 ...|
|    max|          1599999|          2329205794|NO_QUERY|          zzzzeus111|ï¿½ï¿½ï¿½ï¿½ï¿½ß§...|
+-------+-----------------+--------------------+--------+--------------------+--------------------+



                                                                                

In [None]:
# sentiment analysis using Vader

In [62]:
from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer

In [63]:
def sentiment_function(text):
    analyzer = SentimentIntensityAnalyzer()
    sent = analyzer.polarity_scores(text)
    return sentiment


In [97]:
from pyspark.sql.types import StructType, StructField, StringType, FloatType

# define data type 
sentiment_schema = StructType([
    StructField("compound", FloatType())
])

# UDF (User-Defined Function) for sentiment analysis
sentiment_udf = udf(sentiment_function, sentiment_schema)

# new column with results 
df_sent = df_con_data.withColumn("sentiment_v", sentiment_udf(df_con_data["tweet"]))


In [98]:
df_sent.show(1)

ERROR:root:KeyboardInterrupt while sending command.][Stage 55:>   (0 + 0) / 2]
Traceback (most recent call last):
  File "/home/hduser/.local/lib/python3.10/site-packages/py4j/java_gateway.py", line 1038, in send_command
    response = connection.send_command(command)
  File "/home/hduser/.local/lib/python3.10/site-packages/py4j/clientserver.py", line 475, in send_command
    answer = smart_decode(self.stream.readline()[:-1])
  File "/usr/lib/python3.10/socket.py", line 705, in readinto
    return self._sock.recv_into(b)
KeyboardInterrupt


KeyboardInterrupt: 

In [99]:
from pyspark.sql.functions import when, col


In [100]:
# new column 'label'
df_sent = df_sent.withColumn("label_vader", 
    when(col("sentiment_v.compound") > 0.2, 1)
    .when(col("sentiment_v.compound") < -0.2, -1)
    .otherwise(0)
)




In [75]:
df_sent.show(1)

[Stage 52:>                                                         (0 + 1) / 1]

+-------+----------+----------+--------+--------+--------------------+---------+-----+
|     id|    indice|      date|   query|    user|               tweet|sentiment|label|
+-------+----------+----------+--------+--------+--------------------+---------+-----+
|1429354|2059729483|2009-06-07|NO_QUERY|mtiishaw|       Hardest wo...| {0.2732}|    1|
+-------+----------+----------+--------+--------+--------------------+---------+-----+
only showing top 1 row



2023-11-07 23:26:26,584 WARN python.PythonUDFRunner: Detected deadlock while completing task 0.0 in stage 52 (TID 64): Attempting to kill Python Worker
                                                                                

In [102]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import udf
from pyspark.sql.types import StringType, FloatType
from textblob import TextBlob


In [110]:
def analyze_sentiment(text):
    analysis = TextBlob(text)
    sentiment_score = analysis.sentiment.polarity
    return float(sentiment_score)

sentiment_udf = udf(analyze_sentiment, FloatType())


In [111]:
df_sent = df_sent.withColumn("sentiment_blob", sentiment_udf(df_sent["tweet"]))


In [115]:
from pyspark.sql.functions import struct

# new column with score 
df_sent = df_sent.withColumn("sentiment_blob", sentiment_udf(df_sent["tweet"]))



In [118]:
from pyspark.sql.functions import struct, col

# new column structured using float column
df_sent = df_sent.withColumn("structured_sentiment", struct(col("sentiment_blob").alias("compound")))


In [135]:
df_sent.printSchema()

root
 |-- id: integer (nullable = true)
 |-- indice: long (nullable = true)
 |-- date: date (nullable = true)
 |-- query: string (nullable = true)
 |-- user: string (nullable = true)
 |-- tweet: string (nullable = true)
 |-- label_vader: integer (nullable = false)
 |-- sentiment_blob: float (nullable = true)
 |-- label_blob: integer (nullable = false)
 |-- compound_vader: float (nullable = true)
 |-- compound_blob: float (nullable = true)



[Stage 53:>   (0 + 2) / 2][Stage 54:>   (0 + 0) / 2][Stage 55:>   (0 + 0) / 2]

In [122]:
from pyspark.sql.functions import when, col

df_sent = df_sent.withColumn("label_blob", 
    when(col("structured_sentiment.compound") > 0.2, 1)
    .when(col("structured_sentiment.compound") < -0.2, -1)
    .otherwise(0)
)


In [134]:
from pyspark.sql.functions import col

# extract compound from sentiment vader
df_sent = df_sent.withColumn("compound_vader", col("sentiment_v.compound"))

# extract compound from sentiment blob
df_sent = df_sent.withColumn("compound_blob", col("structured_sentiment.compound"))

df_sent = df_sent.drop("sentiment_v", "structured_sentiment")


# cassandra

In [124]:
# node address
indirizzo_del_nodo_di_contatto = '127.0.0.1'

# cluster object
cluster = Cluster([indirizzo_del_nodo_di_contatto])



In [125]:
# cassandra session
session = cluster.connect()

# keyspace
session.execute("USE my_ca3")

ERROR:cassandra.connection:Closing connection <AsyncoreConnection(140136566225936) 127.0.0.1:9042> due to protocol error: Error from server: code=000a [Protocol error] message="Beta version of the protocol used (5/v5-beta), but USE_BETA flag is unset"


<cassandra.cluster.ResultSet at 0x7f7401322380>

[Stage 53:>   (0 + 2) / 2][Stage 54:>   (0 + 0) / 2][Stage 55:>   (0 + 0) / 2]

In [138]:
tweet_ca = """


CREATE TABLE IF NOT EXISTS tweet_ca (
    id INT PRIMARY KEY,
    indice BIGINT,
    date DATE,
    query TEXT,
    user TEXT,
    tweet TEXT,
    label_vader INT,
    sentiment_blob FLOAT,
    label_blob INT,
    compound_vader FLOAT,
    compound_blob FLOAT
)



"""

session.execute(tweet_ca)


<cassandra.cluster.ResultSet at 0x7f7401c8bf70>

In [None]:
table_name = "tweet_ca"

df_sent.write \
    .format("org.apache.spark.sql.cassandra") \
    .options(table=table_name, keyspace="my_ca3") \
    .option("confirm.truncate", "true") \
    .mode("overwrite") \
    .save()



[Stage 53:>   (0 + 2) / 2][Stage 54:>   (0 + 0) / 2][Stage 55:>   (0 + 0) / 2]

In [6]:
query = "SELECT COUNT(*) FROM tweet_ca"
result = session.execute(query)



In [7]:
count = result.one()[0]
print(f"Il conteggio delle righe nella tabella 'tweets' è: {count}")

Il conteggio delle righe nella tabella 'tweets' è: 1600000


In [8]:
query = "SELECT * FROM tweets"
result = session.execute(query)


In [9]:
data = list(result)

In [11]:
import pandas as pd

In [12]:
df = pd.DataFrame(data)

In [13]:
df.head()

Unnamed: 0,id,date,indice,query,tweet,user
0,302602,Mon Jun 01 19:37:16 PDT 2009,1999088121,NO_QUERY,Had problems with my eyes all day. This sux!,Bobina11
1,531141,Tue Jun 16 12:10:07 PDT 2009,2196209437,NO_QUERY,don't fall unless someone is willing to catch ...,rosie_carter427
2,1416569,Sat Jun 06 13:00:30 PDT 2009,2057439493,NO_QUERY,"Nice afternoon for a Boil here - carrots, pota...",zuluranger
3,693077,Sat Jun 20 06:42:25 PDT 2009,2252644159,NO_QUERY,miss you batang KYUT na BOUNCE...huhu..,chocomallows19
4,4317,Tue Apr 07 03:45:16 PDT 2009,1468791098,NO_QUERY,"Oh no, more babysitting! I love how she sprin...",tynie626
