## Check PySpark and EMR cluster environment

In [1]:
sc.version

VBox()

Starting Spark application


ID,YARN Application ID,Kind,State,Spark UI,Driver log,Current session?
0,application_1604436459987_0001,pyspark,idle,Link,Link,✔


FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

SparkSession available as 'spark'.


FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

'2.4.6-amzn-0'

In [2]:
sc.master

VBox()

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

'yarn'

## Read data into spark dataframe

In [3]:
path = 's3://bauka-big-tweets/text.csv'
df = spark.read.csv(path, header=True, inferSchema=True)

VBox()

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

In [4]:
df.show(3)

VBox()

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

+---+--------------------+
|_c0| tokens_back_to_text|
+---+--------------------+
|  0|rudygiuliani comp...|
|  1|      trump machismo|
|  2|briantylercohen b...|
+---+--------------------+
only showing top 3 rows

In [5]:
df.printSchema()

VBox()

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

root
 |-- _c0: integer (nullable = true)
 |-- tokens_back_to_text: string (nullable = true)

## Pre-processing

In [6]:
# Convert the data type of tweets into an array type as this is what is required for creating word2vec
from pyspark.sql.functions import array
df_array = df.withColumn("text_array", array("tokens_back_to_text"))

VBox()

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

In [7]:
df_array.show(3)

VBox()

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

+---+--------------------+--------------------+
|_c0| tokens_back_to_text|          text_array|
+---+--------------------+--------------------+
|  0|rudygiuliani comp...|[rudygiuliani com...|
|  1|      trump machismo|    [trump machismo]|
|  2|briantylercohen b...|[briantylercohen ...|
+---+--------------------+--------------------+
only showing top 3 rows

In [8]:
# TF with CountVectorizer
from pyspark.ml.feature import CountVectorizer
cv = CountVectorizer(inputCol="text_array", outputCol="features")
model = cv.fit(df_array)
result = model.transform(df_array)
result.show(3)

VBox()

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

+---+--------------------+--------------------+--------------------+
|_c0| tokens_back_to_text|          text_array|            features|
+---+--------------------+--------------------+--------------------+
|  0|rudygiuliani comp...|[rudygiuliani com...| (118918,[57],[1.0])|
|  1|      trump machismo|    [trump machismo]|(118918,[23536],[...|
|  2|briantylercohen b...|[briantylercohen ...|(118918,[290],[1.0])|
+---+--------------------+--------------------+--------------------+
only showing top 3 rows

In [9]:
# IDF
from pyspark.ml.feature import IDF
idf = IDF(inputCol='features', outputCol='tfidf')
idfModel = idf.fit(result)
result_tfidf = idfModel.transform(result)

VBox()

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

In [10]:
result_tfidf.show(3)

VBox()

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

+---+--------------------+--------------------+--------------------+--------------------+
|_c0| tokens_back_to_text|          text_array|            features|               tfidf|
+---+--------------------+--------------------+--------------------+--------------------+
|  0|rudygiuliani comp...|[rudygiuliani com...| (118918,[57],[1.0])|(118918,[57],[6.8...|
|  1|      trump machismo|    [trump machismo]|(118918,[23536],[...|(118918,[23536],[...|
|  2|briantylercohen b...|[briantylercohen ...|(118918,[290],[1.0])|(118918,[290],[7....|
+---+--------------------+--------------------+--------------------+--------------------+
only showing top 3 rows

## Modeling - KMeans Clustering

Create labels for tweets using KMeans clustering. Once the data is labeled, we can further apply deep learning model to predict labels for tweets.

In [24]:
# Import Kmeans from MLib
from pyspark.ml.clustering import KMeans
kmeans = KMeans(k=2, seed=1) # instantiate KMeans with the desired number of clusters
model = kmeans.fit(result.select('features'))

VBox()

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

In [25]:
# Combine lables with the data set
transformed = model.transform(result)
transformed.show(10)    

VBox()

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

+---+--------------------+--------------------+--------------------+----------+
|_c0| tokens_back_to_text|          text_array|            features|prediction|
+---+--------------------+--------------------+--------------------+----------+
|  0|rudygiuliani comp...|[rudygiuliani com...| (118918,[57],[1.0])|         0|
|  1|      trump machismo|    [trump machismo]|(118918,[23536],[...|         0|
|  2|briantylercohen b...|[briantylercohen ...|(118918,[290],[1.0])|         0|
|  3|bradleywhitford y...|[bradleywhitford ...|(118918,[5929],[1...|         0|
|  4|actbrigitte presi...|[actbrigitte pres...| (118918,[92],[1.0])|         0|
|  5|timcast come neve...|[timcast come nev...|(118918,[1067],[1...|         0|
|  6|bkbaguley afcoory...|[bkbaguley afcoor...|(118918,[20298],[...|         0|
|  8|tedlieu dear real...|[tedlieu dear rea...|(118918,[409],[1.0])|         0|
|  9|realjameswoods pr...|[realjameswoods p...| (118918,[32],[1.0])|         0|
| 10|break abc town ha...|[break abc tow

In [14]:
transformed.groupby('prediction').count()

VBox()

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

DataFrame[prediction: int, count: bigint]

In [13]:
sc.install_pypi_package("pandas==0.25.1")

VBox()

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

Collecting pandas==0.25.1
  Downloading https://files.pythonhosted.org/packages/7e/ab/ea76361f9d3e732e114adcd801d2820d5319c23d0ac5482fa3b412db217e/pandas-0.25.1-cp37-cp37m-manylinux1_x86_64.whl (10.4MB)
Collecting python-dateutil>=2.6.1 (from pandas==0.25.1)
  Downloading https://files.pythonhosted.org/packages/d4/70/d60450c3dd48ef87586924207ae8907090de0b306af2bce5d134d78615cb/python_dateutil-2.8.1-py2.py3-none-any.whl (227kB)
Installing collected packages: python-dateutil, pandas
Successfully installed pandas-0.25.1 python-dateutil-2.8.1

In [14]:
import pandas as pd

VBox()

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

In [26]:
data = transformed.toPandas()

VBox()

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

In [27]:
data.head()

VBox()

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

   _c0  ... prediction
0    0  ...          0
1    1  ...          0
2    2  ...          0
3    3  ...          0
4    4  ...          0

[5 rows x 5 columns]

In [28]:
data.columns

VBox()

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

Index(['_c0', 'tokens_back_to_text', 'text_array', 'features', 'prediction'], dtype='object')

In [29]:
data['prediction'].value_counts()

VBox()

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

0    407662
1       104
Name: prediction, dtype: int64

In [30]:
data['features'].head()

VBox()

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

0    (0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...
1    (0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...
2    (0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...
3    (0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...
4    (0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...
Name: features, dtype: object

In [32]:
# Export the data into S3 in parqquet format to be further used for deep learning model in sagemaker
transformed.write.parquet("s3a://bauka-big-tweets/tweets_2class.parquet")

VBox()

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…