In [1]:
import findspark
findspark.init()
from pyspark.sql import SparkSession

In [2]:
spark = SparkSession.builder.getOrCreate()

In [3]:
spark

In [4]:
from pyspark.ml.feature import OneHotEncoder, StringIndexer, IndexToString, VectorAssembler
from pyspark.ml import Pipeline
from pyspark.ml.classification import LogisticRegression
from pyspark.ml.evaluation import MulticlassClassificationEvaluator
from pyspark.ml.evaluation import BinaryClassificationEvaluator
from pyspark.sql.functions import col, when
from pyspark.sql import functions as F

In [6]:
df = spark.read.parquet("s3://miaowang1009/ANLY502_Final/df.parquet/*")

In [13]:
df_feat = df.select(df.AvgTone, df.NumArticles,df.NumMentions, df.ActionGeo_Type, df.EventRootCode, df.QuadClass)

In [14]:
df_feat.printSchema()

root
 |-- AvgTone: float (nullable = true)
 |-- NumArticles: integer (nullable = true)
 |-- NumMentions: integer (nullable = true)
 |-- ActionGeo_Type: string (nullable = true)
 |-- EventRootCode: string (nullable = true)
 |-- QuadClass: string (nullable = true)



In [15]:
# Create add new column to the dataset

from pyspark.ml.feature import Bucketizer
bucketizer = Bucketizer(splits=[ -100, 0, 100, float('Inf') ],inputCol="AvgTone", outputCol="AvgToneBin")
df_feat = bucketizer.setHandleInvalid("keep").transform(df_feat)

#df_buck.show()

from pyspark.sql.functions import udf
from pyspark.sql.types import *

t = {0.0:"Negtive", 1.0:"Positive"}
udf_foo = udf(lambda x: t[x], StringType())
df_feat = df_feat.withColumn("AvgToneTag", udf_foo("AvgToneBin"))
df_feat.show()

+-----------+-----------+-----------+--------------+-------------+---------+----------+----------+
|    AvgTone|NumArticles|NumMentions|ActionGeo_Type|EventRootCode|QuadClass|AvgToneBin|AvgToneTag|
+-----------+-----------+-----------+--------------+-------------+---------+----------+----------+
|  0.7025761|         10|         10|             4|           04|        1|       1.0|  Positive|
|-0.73937154|          6|          6|             5|           01|        1|       0.0|   Negtive|
|-0.73937154|          2|          2|             4|           01|        1|       0.0|   Negtive|
|-0.73937154|          2|          2|             1|           04|        1|       0.0|   Negtive|
|  0.7025761|          2|          2|             4|           04|        1|       1.0|  Positive|
|-0.73937154|          6|          6|             4|           03|        1|       0.0|   Negtive|
|  0.7025761|         10|         10|             4|           04|        1|       1.0|  Positive|
| -2.13114

In [16]:
df_features = df_feat.drop('AvgTone').drop('AvgToneTag')

In [17]:
df_features.columns

['NumArticles',
 'NumMentions',
 'ActionGeo_Type',
 'EventRootCode',
 'QuadClass',
 'AvgToneBin']

In [18]:
df_features.printSchema()

root
 |-- NumArticles: integer (nullable = true)
 |-- NumMentions: integer (nullable = true)
 |-- ActionGeo_Type: string (nullable = true)
 |-- EventRootCode: string (nullable = true)
 |-- QuadClass: string (nullable = true)
 |-- AvgToneBin: double (nullable = true)



In [19]:
from pyspark.ml import Pipeline
from pyspark.ml.feature import StringIndexer, OneHotEncoder, VectorAssembler
from pyspark.ml.classification import (DecisionTreeClassifier, RandomForestClassifier, GBTClassifier, LogisticRegression)
from pyspark.ml.evaluation import BinaryClassificationEvaluator

In [20]:
# Selecting categorical features
categorical_columns = ['ActionGeo_Type', 'EventRootCode', 'QuadClass']

In [21]:
# The index of string values multiple columns
indexers = [
    StringIndexer(inputCol=c, outputCol="{0}_indexed".format(c))
    for c in categorical_columns]
# The encode of indexed values multiple columns
encoders = [OneHotEncoder(dropLast=False,inputCol=indexer.getOutputCol(),
            outputCol="{0}_encoded".format(indexer.getOutputCol())) 
    for indexer in indexers]

In [22]:
# Vectorizing encoded values
categorical_encoded = [encoder.getOutputCol() for encoder in encoders]
numerical_columns = ['NumArticles', 'NumMentions']
inputcols = categorical_encoded + numerical_columns
assembler = VectorAssembler(inputCols=inputcols, outputCol="features")

In [23]:
# Now, we will set up a pipeline to automatize this stages.

pipeline = Pipeline(stages=indexers + encoders+[assembler])
model = pipeline.fit(df_features)
# Transform data
transformed = model.transform(df_features)
display(transformed)
transformed.take(1)

DataFrame[NumArticles: int, NumMentions: int, ActionGeo_Type: string, EventRootCode: string, QuadClass: string, AvgToneBin: double, ActionGeo_Type_indexed: double, EventRootCode_indexed: double, QuadClass_indexed: double, ActionGeo_Type_indexed_encoded: vector, EventRootCode_indexed_encoded: vector, QuadClass_indexed_encoded: vector, features: vector]

[Row(NumArticles=10, NumMentions=10, ActionGeo_Type='4', EventRootCode='04', QuadClass='1', AvgToneBin=1.0, ActionGeo_Type_indexed=0.0, EventRootCode_indexed=0.0, QuadClass_indexed=0.0, ActionGeo_Type_indexed_encoded=SparseVector(6, {0: 1.0}), EventRootCode_indexed_encoded=SparseVector(21, {0: 1.0}), QuadClass_indexed_encoded=SparseVector(4, {0: 1.0}), features=SparseVector(33, {0: 1.0, 6: 1.0, 27: 1.0, 31: 10.0, 32: 10.0}))]

In [24]:
# Transform data
final_data = transformed.select('features', 'AvgToneBin')

In [25]:
# Initialize the classification models
dtc = DecisionTreeClassifier(labelCol='AvgToneBin', featuresCol='features')
#rfc = RandomForestClassifier(numTrees=50, labelCol='AvgToneBi', featuresCol='features')
#gbt = GBTClassifier(labelCol='AvgToneBi', featuresCol='features', maxIter=10)

In [26]:
# We will perform a classic 80/20 split between training and testing data.
train_data, test_data = final_data.randomSplit([0.8,0.2])

In [27]:
# Training the Models
dtc_model = dtc.fit(train_data)

In [28]:
# Obtaining Predictions
dtc_preds = dtc_model.transform(test_data)
#rfc_preds = rfc_model.transform(test_data)
#gbt_preds = gbt_model.transform(test_data)

In [29]:
# Evaluating Model’s Performance
## Our evaluator will be the ROC. We will initialize its class and pass it the predicitons in order to obtain the value.
my_eval = BinaryClassificationEvaluator(labelCol='AvgToneBin')

In [30]:
# Display Decision Tree evaluation metric
print('DTC')
print(my_eval.evaluate(dtc_preds))

DTC
0.5057174142351178


In [52]:
dtc_model.write().save('s3://miaowang1009/ANLY502_Final/model_dt')

In [38]:
from io import StringIO
import os
from sklearn.tree import DecisionTreeClassifier, export_graphviz

dot_data = StringIO()
export_graphviz(dtc_model, out_file=dot_data,  
                filled=True, rounded=True,
                special_characters=True, feature_names=['0','1','2','3','4','5','6','7','8','9','10'], 
                class_names = ['0','1','2','3','4','5','6','7','8','9','10'])

graph = pydotplus.graph_from_dot_data(dot_data.getvalue())  
#graph05.write_png("dtree_5_best.png")
Image(graph.create_png())   

TypeError: DecisionTreeClassificationModel (uid=DecisionTreeClassifier_18396dd71f57) of depth 5 with 11 nodes is not an estimator instance.

In [None]:
dtc_model.