In [2]:
# Start (or reuse) the Spark session used throughout this notebook.
spark = SparkSession.builder.appName("session").getOrCreate()

# Load the raw tweets CSV and let Spark infer the column types.
# No header option is set, so columns get the default names _c0, _c1, ...
data = spark.read.option("inferSchema", True).csv("/user1/ProjectTweets.csv")

                                                                                

In [3]:
# Echo the SparkSession object as this cell's output.
spark

In [4]:
# Register the DataFrame as a temporary view named "data" so it can be queried by that name.
data.createOrReplaceTempView("data")

# Peek at a single row to see what the columns look like.
data.show(n=1)

+---+----------+--------------------+--------+---------------+--------------------+
|_c0|       _c1|                 _c2|     _c3|            _c4|                 _c5|
+---+----------+--------------------+--------+---------------+--------------------+
|  0|1467810369|Mon Apr 06 22:19:...|NO_QUERY|_TheSpecialOne_|@switchfoot http:...|
+---+----------+--------------------+--------+---------------+--------------------+
only showing top 1 row



In [5]:
# Count the rows once and reuse the result: each DataFrame.count() call is an
# action that launches its own Spark job over the whole dataset, so calling it
# in both the condition and the message doubles the work.
row_count = data.count()
if row_count > 1000000:
    print(f"{row_count} ... That's a lot of Data!!")

                                                                                

1600000 ... That's a lot of Data!!




In [6]:
# Collect every distinct value of _c3 to see how much information the column carries.
(
    data
    .select("_c3")
    .distinct()
    .collect()
)

                                                                                

[Row(_c3='NO_QUERY')]

In [7]:
# _c3 holds a single value ('NO_QUERY') for every row, so it carries no information.
# Drop it by name rather than via `data._c3`: dropping by name is a no-op when the
# column is already gone, so re-running this cell does not raise.
data = data.drop("_c3")

In [8]:
# Check for duplicates: list the _c1 values that occur on more than one row.
(
    data
    .groupBy("_c1")
    .count()
    .filter("count > 1")
    .show()
)



+----------+-----+
|       _c1|count|
+----------+-----+
|1468544973|    2|
|1690908358|    2|
|1834777946|    2|
|1882160717|    2|
|1965601765|    2|
|1982434182|    2|
|2002309001|    2|
|2190980212|    2|
|1685304801|    2|
|1686371908|    2|
|1957194329|    2|
|1969964899|    2|
|1974268607|    2|
|2056807406|    2|
|2063670799|    2|
|1556266702|    2|
|1752414405|    2|
|1824843992|    2|
|1881996107|    2|
|1983726537|    2|
+----------+-----+
only showing top 20 rows



                                                                                

In [9]:
# One row per _c1 value that appears more than once; only the id column is kept.
duplicates = (
    data
    .groupBy("_c1")
    .count()
    .filter("count > 1")
    .select("_c1")
)
print(f"Number of duplicates: {duplicates.count()}")



Number of duplicates: 1685


                                                                                

Visualizing duplicates

In [24]:
# Duplicate example: show every row sharing one of the repeated _c1 values,
# without truncating the text columns.
data.filter(data["_c1"] == 1983726537).show(truncate=False)

[Stage 28:>                                                         (0 + 1) / 1]

+-------+----------+----------------------------+-------+---------------------------------------------------------------------------------------------+
|_c0    |_c1       |_c2                         |_c4    |_c5                                                                                          |
+-------+----------+----------------------------+-------+---------------------------------------------------------------------------------------------+
|252393 |1983726537|Sun May 31 13:42:57 PDT 2009|iargent|Should have gone on a bike ride today but never quite happened  Still enjoyed the sun though |
|1190503|1983726537|Sun May 31 13:42:57 PDT 2009|iargent|Should have gone on a bike ride today but never quite happened  Still enjoyed the sun though |
+-------+----------+----------------------------+-------+---------------------------------------------------------------------------------------------+



                                                                                

In [26]:
# Keep a single row per _c1 value, then report how many rows remain.
data = data.drop_duplicates(subset=["_c1"])
data.count()

                                                                                

1598315

In [102]:
# Print each column's name, inferred type and nullability.
data.printSchema()

root
 |-- _c0: integer (nullable = true)
 |-- _c1: long (nullable = true)
 |-- _c2: string (nullable = true)
 |-- _c4: string (nullable = true)
 |-- _c5: string (nullable = true)



Dealing with timestamps in PySpark

In [103]:
# Example of the raw timestamp string stored in _c2.
# "PDT" in the string stands for the Pacific time zone.
data.head()["_c2"]

'Mon Apr 06 22:19:45 PDT 2009'

In [104]:
# Check that every timestamp string is expressed in PDT.
# If all of them contain "PDT", the returned list is empty.
#
# The filter runs inside Spark, so only offending rows (if any) are sent to the
# driver instead of streaming every row through a Python loop. Rows whose _c2 is
# null are reported as offending rather than raising a TypeError on `in`.
data.filter(data["_c2"].isNull() | ~data["_c2"].contains("PDT")).collect()

[Stage 130:>                                                        (0 + 1) / 1]

[]

In [107]:
from pyspark.sql.functions import to_timestamp

# The legacy time parser is needed because parsing with the pattern below raised
# an error under the default policy.
# NOTE(review): presumably the default parser rejects the 'E' (day-of-week) letter
# when parsing — confirm against the Spark datetime-pattern documentation.
spark.conf.set("spark.sql.legacy.timeParserPolicy", "LEGACY")
# To return to the standard time-parser policy afterwards:
# spark.conf.set("spark.sql.legacy.timeParserPolicy", "CORRECTED")

# Pattern matching strings such as 'Mon Apr 06 22:19:45 PDT 2009'.
Time_Format = "E MMM d HH:mm:ss z yyyy"

# Parse _c2 into a proper timestamp column, then discard the raw string (_c2)
# and the _c0 column.
data = (
    data
    .withColumn("Timestamp", to_timestamp("_c2", Time_Format))
    .drop("_c2", "_c0")
)
data.show()

+----------+---------------+--------------------+-------------------+
|       _c1|            _c4|                 _c5|          Timestamp|
+----------+---------------+--------------------+-------------------+
|1467810369|_TheSpecialOne_|@switchfoot http:...|2009-04-07 06:19:45|
|1467810672|  scotthamilton|is upset that he ...|2009-04-07 06:19:49|
|1467810917|       mattycus|@Kenichan I dived...|2009-04-07 06:19:53|
|1467811184|        ElleCTF|my whole body fee...|2009-04-07 06:19:57|
|1467811193|         Karoli|@nationwideclass ...|2009-04-07 06:19:57|
|1467811372|       joy_wolf|@Kwesidei not the...|2009-04-07 06:20:00|
|1467811592|        mybirch|         Need a hug |2009-04-07 06:20:03|
|1467811594|           coZZ|@LOLTrish hey  lo...|2009-04-07 06:20:03|
|1467811795|2Hood4Hollywood|@Tatiana_K nope t...|2009-04-07 06:20:05|
|1467812025|        mimismo|@twittera que me ...|2009-04-07 06:20:09|
|1467812416| erinx3leannexo|spring break in p...|2009-04-07 06:20:16|
|1467812579|   pardo

                                                                                

In [None]:
# Preview the current contents of `data` (this cell has not been executed in the saved notebook).
data.show()