## Check PySpark and EMR cluster environment

In [1]:
sc.version

VBox()

Starting Spark application


ID,YARN Application ID,Kind,State,Spark UI,Driver log,Current session?
0,application_1603922407613_0001,pyspark,idle,Link,Link,✔


FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

SparkSession available as 'spark'.


FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

'2.4.6-amzn-0'

In [2]:
sc.master

VBox()

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

'yarn'

## Read data into spark dataframe

In [15]:
path = 's3://bauka-big-tweets/text.csv'
df = spark.read.csv(path, header=True, inferSchema=True)

VBox()

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

In [16]:
df.show(3)

VBox()

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

+---+--------------------+
|_c0| tokens_back_to_text|
+---+--------------------+
|  0|rudygiuliani comp...|
|  1|      trump machismo|
|  2|briantylercohen b...|
+---+--------------------+
only showing top 3 rows

In [24]:
df.printSchema()

VBox()

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

root
 |-- _c0: integer (nullable = true)
 |-- tokens_back_to_text: string (nullable = true)

## Pre-processing

In [55]:
# Convert the data type of tweets into an array type as this is what is required for creating word2vec
from pyspark.sql.functions import array
df_array = df.withColumn("text_array", array("tokens_back_to_text"))

VBox()

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

In [56]:
df_array.show(3)

VBox()

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

+---+--------------------+--------------------+
|_c0| tokens_back_to_text|          text_array|
+---+--------------------+--------------------+
|  0|rudygiuliani comp...|[rudygiuliani com...|
|  1|      trump machismo|    [trump machismo]|
|  2|briantylercohen b...|[briantylercohen ...|
+---+--------------------+--------------------+
only showing top 3 rows

In [57]:
# Fit a CountVectorizerModel from the corpus
from pyspark.ml.feature import CountVectorizer
cv = CountVectorizer(inputCol="text_array", outputCol="features")
model = cv.fit(df_array)
result = model.transform(df_array)
result.show(3)

VBox()

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

+---+--------------------+--------------------+--------------------+
|_c0| tokens_back_to_text|          text_array|            features|
+---+--------------------+--------------------+--------------------+
|  0|rudygiuliani comp...|[rudygiuliani com...| (118918,[57],[1.0])|
|  1|      trump machismo|    [trump machismo]|(118918,[14867],[...|
|  2|briantylercohen b...|[briantylercohen ...|(118918,[289],[1.0])|
+---+--------------------+--------------------+--------------------+
only showing top 3 rows

## Modeling - KMeans Clustering

Create labels for tweets using KMeans clustering. Once the data is labeled, we can further apply deep learning model to predict labels for tweets.

In [39]:
# Import Kmeans from MLib
from pyspark.ml.clustering import KMeans
kmeans = KMeans(k=5, seed=1) # instantiate KMeans with the desired number of clusters
model = kmeans.fit(result.select('features'))

VBox()

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

In [40]:
transformed = model.transform(result)
transformed.show(10)    

VBox()

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

+---+--------------------+--------------------+--------------------+----------+
|_c0| tokens_back_to_text|          text_array|            features|prediction|
+---+--------------------+--------------------+--------------------+----------+
|  0|rudygiuliani comp...|[rudygiuliani com...| (118918,[57],[1.0])|         0|
|  1|      trump machismo|    [trump machismo]|(118918,[22919],[...|         0|
|  2|briantylercohen b...|[briantylercohen ...|(118918,[290],[1.0])|         0|
|  3|bradleywhitford y...|[bradleywhitford ...|(118918,[6009],[1...|         0|
|  4|actbrigitte presi...|[actbrigitte pres...| (118918,[92],[1.0])|         0|
|  5|timcast come neve...|[timcast come nev...|(118918,[1071],[1...|         0|
|  6|bkbaguley afcoory...|[bkbaguley afcoor...|(118918,[24396],[...|         0|
|  8|tedlieu dear real...|[tedlieu dear rea...|(118918,[409],[1.0])|         0|
|  9|realjameswoods pr...|[realjameswoods p...| (118918,[32],[1.0])|         0|
| 10|break abc town ha...|[break abc tow

In [41]:
transformed.groupby('prediction').count()

VBox()

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

DataFrame[prediction: int, count: bigint]

In [44]:
sc.install_pypi_package("pandas==0.25.1")

VBox()

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

Collecting pandas==0.25.1
  Downloading https://files.pythonhosted.org/packages/7e/ab/ea76361f9d3e732e114adcd801d2820d5319c23d0ac5482fa3b412db217e/pandas-0.25.1-cp37-cp37m-manylinux1_x86_64.whl (10.4MB)
Collecting python-dateutil>=2.6.1 (from pandas==0.25.1)
  Downloading https://files.pythonhosted.org/packages/d4/70/d60450c3dd48ef87586924207ae8907090de0b306af2bce5d134d78615cb/python_dateutil-2.8.1-py2.py3-none-any.whl (227kB)
Installing collected packages: python-dateutil, pandas
Successfully installed pandas-0.25.1 python-dateutil-2.8.1

In [45]:
import pandas as pd

VBox()

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

In [46]:
data = transformed.toPandas()

VBox()

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

In [48]:
data.head()

VBox()

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

   _c0  ... prediction
0    0  ...          0
1    1  ...          0
2    2  ...          0
3    3  ...          0
4    4  ...          0

[5 rows x 5 columns]

In [50]:
data.columns

VBox()

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

Index(['_c0', 'tokens_back_to_text', 'text_array', 'features', 'prediction'], dtype='object')

In [51]:
data['prediction'].value_counts()

VBox()

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

0    405127
4       787
2       769
3       690
1       393
Name: prediction, dtype: int64

In [54]:
data.to_json('labeled_data.json', orient='table')

VBox()

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

[Errno 13] Permission denied: 'labeled_data.json'
Traceback (most recent call last):
  File "/tmp/1603922840811-0/lib/python3.7/site-packages/pandas/core/generic.py", line 2424, in to_json
    index=index,
  File "/tmp/1603922840811-0/lib/python3.7/site-packages/pandas/io/json/_json.py", line 85, in to_json
    fh, handles = _get_handle(path_or_buf, "w", compression=compression)
  File "/tmp/1603922840811-0/lib/python3.7/site-packages/pandas/io/common.py", line 402, in _get_handle
    f = open(path_or_buf, mode, errors="replace", newline="")
PermissionError: [Errno 13] Permission denied: 'labeled_data.json'



In [8]:
print(sc.list_packages())

VBox()

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

Package                    Version  
-------------------------- ---------
beautifulsoup4             4.9.1    
boto                       2.49.0   
click                      7.1.2    
jmespath                   0.10.0   
joblib                     0.16.0   
lxml                       4.5.2    
mysqlclient                1.4.2    
nltk                       3.5      
nose                       1.3.4    
numpy                      1.16.5   
pip                        9.0.1    
py-dateutil                2.2      
python37-sagemaker-pyspark 1.4.0    
pytz                       2020.1   
PyYAML                     5.3.1    
regex                      2020.7.14
setuptools                 28.8.0   
six                        1.13.0   
soupsieve                  1.9.5    
tqdm                       4.48.2   
wheel                      0.29.0   
windmill                   1.6      

None

In [13]:
sc.install_pypi_package("tensorflow")

VBox()

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

Collecting tensorflow
  Downloading https://files.pythonhosted.org/packages/f4/28/96efba1a516cdacc2e2d6d081f699c001d414cc8ca3250e6d59ae657eb2b/tensorflow-1.14.0-cp37-cp37m-manylinux1_x86_64.whl (109.3MB)
Collecting absl-py>=0.7.0 (from tensorflow)
  Downloading https://files.pythonhosted.org/packages/bc/58/0aa6fb779dc69cfc811df3398fcbeaeefbf18561b6e36b185df0782781cc/absl_py-0.11.0-py3-none-any.whl (127kB)
Collecting protobuf>=3.6.1 (from tensorflow)
  Downloading https://files.pythonhosted.org/packages/71/dc/5ba56eab7440c62c5f808b4267e2a1d6c136e90293b43fefb1b493c6d704/protobuf-3.13.0-cp37-cp37m-manylinux1_x86_64.whl (1.3MB)
Collecting keras-applications>=1.0.6 (from tensorflow)
  Downloading https://files.pythonhosted.org/packages/71/e3/19762fdfc62877ae9102edf6342d71b28fbfd9dea3d2f96a882ce099b03f/Keras_Applications-1.0.8-py3-none-any.whl (50kB)
Collecting grpcio>=1.8.6 (from tensorflow)
  Downloading https://files.pythonhosted.org/packages/c3/4b/b8a3e1951c90c3e14a657f3682b705840c146c9c

In [14]:
import tensorflow as tf

VBox()

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

No module named 'tensorflow'
Traceback (most recent call last):
ModuleNotFoundError: No module named 'tensorflow'

