In [1]:
## libraries
import os
import sys

# Set both PySpark interpreter variables to the Python executable running this
# kernel (presumably so the Spark driver and workers use the same interpreter
# and do not hit a Python version mismatch -- confirm against the cluster setup).
os.environ['PYSPARK_PYTHON'] = sys.executable
os.environ['PYSPARK_DRIVER_PYTHON'] = sys.executable

import numpy as np
import pandas as pd

from pyspark import SparkContext
from pyspark.sql import SparkSession

from pyspark.sql.functions import col, trim, first ,when,to_timestamp,udf, from_unixtime,regexp_replace

from pyspark.sql.types import ArrayType, IntegerType,DoubleType
import io
import matplotlib.pyplot as plt

# Suppress warning messages
import warnings
warnings.filterwarnings("ignore")

In [2]:
# Get (or create) the SparkSession used by the rest of the notebook:
# "local" master, driver memory set to 15g, application name "CsvReader".
spark = (
    SparkSession.builder
    .master("local")
    .config("spark.driver.memory", "15g")
    .appName("CsvReader")
    .getOrCreate()
)


In [3]:
## Load the raw tweets CSV into a Spark DataFrame.
# No header or schema options are passed, so the columns come back with the
# default names (_c0, _c1, ...).
csv_reader = spark.read.format("csv")
twitter_data = csv_reader.load("file:///home/hduser/Desktop/CA2_TweetAnalysis/ProjectTweets.csv")

[Stage 0:>                                                          (0 + 1) / 1]                                                                                

In [4]:
# Preview the first 10 rows of the freshly loaded DataFrame.
twitter_data.show(n=10)

+---+----------+--------------------+--------+---------------+--------------------+
|_c0|       _c1|                 _c2|     _c3|            _c4|                 _c5|
+---+----------+--------------------+--------+---------------+--------------------+
|  0|1467810369|Mon Apr 06 22:19:...|NO_QUERY|_TheSpecialOne_|@switchfoot http:...|
|  1|1467810672|Mon Apr 06 22:19:...|NO_QUERY|  scotthamilton|is upset that he ...|
|  2|1467810917|Mon Apr 06 22:19:...|NO_QUERY|       mattycus|@Kenichan I dived...|
|  3|1467811184|Mon Apr 06 22:19:...|NO_QUERY|        ElleCTF|my whole body fee...|
|  4|1467811193|Mon Apr 06 22:19:...|NO_QUERY|         Karoli|@nationwideclass ...|
|  5|1467811372|Mon Apr 06 22:20:...|NO_QUERY|       joy_wolf|@Kwesidei not the...|
|  6|1467811592|Mon Apr 06 22:20:...|NO_QUERY|        mybirch|         Need a hug |
|  7|1467811594|Mon Apr 06 22:20:...|NO_QUERY|           coZZ|@LOLTrish hey  lo...|
|  8|1467811795|Mon Apr 06 22:20:...|NO_QUERY|2Hood4Hollywood|@Tatiana_K nop

In [5]:
# List the DataFrame's current column names before renaming them.
print(twitter_data.columns)

['_c0', '_c1', '_c2', '_c3', '_c4', '_c5']


In [6]:
# Give the positional CSV columns meaningful names.
# Only _c1.._c5 are selected, so the leading _c0 column is dropped here.
column_aliases = [
    ("_c1", "id"),
    ("_c2", "date"),
    ("_c3", "flag"),
    ("_c4", "user"),
    ("_c5", "text"),
]
twitter_data = twitter_data.selectExpr(
    *[f"{old_name} as {new_name}" for old_name, new_name in column_aliases]
)

In [7]:
# Preview the first 10 rows again to check the renamed columns.
twitter_data.show(n=10)

+----------+--------------------+--------+---------------+--------------------+
|        id|                date|    flag|           user|                text|
+----------+--------------------+--------+---------------+--------------------+
|1467810369|Mon Apr 06 22:19:...|NO_QUERY|_TheSpecialOne_|@switchfoot http:...|
|1467810672|Mon Apr 06 22:19:...|NO_QUERY|  scotthamilton|is upset that he ...|
|1467810917|Mon Apr 06 22:19:...|NO_QUERY|       mattycus|@Kenichan I dived...|
|1467811184|Mon Apr 06 22:19:...|NO_QUERY|        ElleCTF|my whole body fee...|
|1467811193|Mon Apr 06 22:19:...|NO_QUERY|         Karoli|@nationwideclass ...|
|1467811372|Mon Apr 06 22:20:...|NO_QUERY|       joy_wolf|@Kwesidei not the...|
|1467811592|Mon Apr 06 22:20:...|NO_QUERY|        mybirch|         Need a hug |
|1467811594|Mon Apr 06 22:20:...|NO_QUERY|           coZZ|@LOLTrish hey  lo...|
|1467811795|Mon Apr 06 22:20:...|NO_QUERY|2Hood4Hollywood|@Tatiana_K nope t...|
|1467812025|Mon Apr 06 22:20:...|NO_QUER

In [8]:
# Total number of rows in the DataFrame (triggers a full Spark job).
twitter_data.count()



1600000

In [9]:
# Look at one untruncated tweet and its date string to see their raw format.
first_row = twitter_data.first()
for field_name in ('text', 'date'):
    print(first_row[field_name])

@switchfoot http://twitpic.com/2y1zl - Awww, that's a bummer.  You shoulda got David Carr of Third Day to do it. ;D
Mon Apr 06 22:19:45 PDT 2009


In [10]:
# Strip the " PDT" timezone marker from every value in the date column.
date_without_tz = regexp_replace(col("date"), " PDT", "")
twitter_data = twitter_data.withColumn("date", date_without_tz)

# Re-inspect the first row to confirm the date no longer carries " PDT".
first_row = twitter_data.first()
for field_name in ('text', 'date'):
    print(first_row[field_name])

@switchfoot http://twitpic.com/2y1zl - Awww, that's a bummer.  You shoulda got David Carr of Third Day to do it. ;D
Mon Apr 06 22:19:45 2009


In [11]:
!pip install pymongo


Defaulting to user installation because normal site-packages is not writeable


## MongoDB

In [12]:
## MongoDB connection
import pymongo

# Connect to the MongoDB server on localhost:27017, then take handles to the
# "twitter_data" database and its "tweets" collection.
client = pymongo.MongoClient("mongodb://localhost:27017/")
db = client.get_database("twitter_data")
collection = db.get_collection("tweets")

In [13]:
# Insert every tweet from the Spark DataFrame into the MongoDB collection.
# collect() materialises all rows on the driver; each Row is then converted to
# a plain dict so PyMongo can store it as a document.
rows = twitter_data.collect()
data_list = [row.asDict() for row in rows]

# Bulk insert: a single insert_many call lets the driver batch the documents,
# instead of paying one network round trip per document with insert_one.
# insert_many rejects an empty list, so skip the call when there is no data.
# NOTE(review): this cell only appends -- it never clears the collection, so
# re-running it stores another full copy of the data alongside earlier runs.
if data_list:
    collection.insert_many(data_list)


                                                                                

In [14]:
# Read back a small sample from MongoDB: the first five documents returned by
# an unfiltered find() on the collection.
sample_cursor = collection.find().limit(5)
mongo_data = list(sample_cursor)
mongo_data

[{'_id': ObjectId('654aaba9cba3cae74a0100a9'),
  'id': '1467810369',
  'date': 'Mon Apr 06 22:19:45 PDT 2009',
  'flag': 'NO_QUERY',
  'user': '_TheSpecialOne_',
  'text': "@switchfoot http://twitpic.com/2y1zl - Awww, that's a bummer.  You shoulda got David Carr of Third Day to do it. ;D"},
 {'_id': ObjectId('654aabaacba3cae74a0100aa'),
  'id': '1467810672',
  'date': 'Mon Apr 06 22:19:49 PDT 2009',
  'flag': 'NO_QUERY',
  'user': 'scotthamilton',
  'text': "is upset that he can't update his Facebook by texting it... and might cry as a result  School today also. Blah!"},
 {'_id': ObjectId('654aabaacba3cae74a0100ab'),
  'id': '1467810917',
  'date': 'Mon Apr 06 22:19:53 PDT 2009',
  'flag': 'NO_QUERY',
  'user': 'mattycus',
  'text': '@Kenichan I dived many times for the ball. Managed to save 50%  The rest go out of bounds'},
 {'_id': ObjectId('654aabaacba3cae74a0100ac'),
  'id': '1467811184',
  'date': 'Mon Apr 06 22:19:57 PDT 2009',
  'flag': 'NO_QUERY',
  'user': 'ElleCTF',
  'text':

## Spark SQL

In [None]:
## Spark SQL
# Register the DataFrame as a temporary view so it can be queried by name.
twitter_data.createOrReplaceTempView("twitter_data")

# Pull every row and column back through the SQL interface and preview it.
select_all_query = "SELECT * FROM twitter_data"
result = spark.sql(select_all_query)
result.show()

In [None]:
# Number of rows returned by the Spark SQL query.
result.count()
