In [1]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import udf

In [73]:
from pyspark.ml.classification import LogisticRegression, DecisionTreeClassifier, RandomForestClassifier, RandomForestClassificationSummary, RandomForestClassificationModel, NaiveBayes, GBTClassifier,FMClassifier
from pyspark.ml.feature import StringIndexer, OneHotEncoder, VectorAssembler, StandardScaler
from pyspark.ml.evaluation import BinaryClassificationEvaluator, MulticlassClassificationEvaluator
from pyspark.mllib.evaluation import MulticlassMetrics
from pyspark.ml import Pipeline
from pyspark.ml.tuning import ParamGridBuilder, CrossValidator


In [3]:
import warnings
warnings.filterwarnings('ignore')

In [4]:
spark = SparkSession.builder.appName("Travel").config("spark.jars.packages","com.databricks:spark-xml_2.12:0.17.0").getOrCreate()

3 data frames from Stack Travel

In [5]:
df_posts = spark.read.format("com.databricks.spark.xml").option("rowTag", "row").option("rootTag", "posts").load("../data/posts.xml")
df_users = spark.read.format("com.databricks.spark.xml").option("rowTag", "row").option("rootTag", "users").load("../data/users.xml")
df_tags = spark.read.format("com.databricks.spark.xml").option("rowTag", "row").option("rootTag", "tags").load("../data/tags.xml")

Function if NaN return 0, if not NaN return 1

In [6]:
@udf(returnType='int')
def if_NaN(x):
    if x is not None:
        return 1
    else:
        return 0

Function counting length of string

In [7]:
@udf(returnType='int') 
def slen(s):
    return len(s)

Creating functions finding 5 popular tag in _Tags

In [8]:
@udf(returnType='int')
def if_Tag1(x):
    tag = "<" + popular_tags[0][0] + ">"
    if tag in x:
        return 1
    else:
        return 0
@udf(returnType='int')
def if_Tag2(x):
    tag = "<" + popular_tags[1][0] + ">"
    if tag in x:
        return 1
    else:
        return 0

@udf(returnType='int')
def if_Tag3(x):
    tag = "<" + popular_tags[2][0] + ">"
    if tag in x:
        return 1
    else:
        return 0

@udf(returnType='int')
def if_Tag4(x):
    tag = "<" + popular_tags[3][0] + ">"
    if tag in x:
        return 1
    else:
        return 0

@udf(returnType='int')
def if_Tag5(x):
    tag = "<" + popular_tags[4][0] + ">"
    if tag in x:
        return 1
    else:
        return 0

Finding most popular tags from Tags table

In [9]:
popular_tags = df_tags.select(df_tags._TagName).sort("_Count", ascending = False).limit(5).collect()

Upgraded data frame Posts:
- questions only (PostTypeId = 1)
- boolean AcceptedAnswerExist (1 if exist, 0 if NaN)
- length of Body and Title
- boolean IfClosed (1 if question is closed, 0 if NaT - not closed yet)
- new columns (bool) defining if question has accurate tag

In [10]:
df_posts_updated = df_posts.select(if_NaN("_AcceptedAnswerId").alias("AcceptedAnswerExist"), df_posts._AnswerCount,
slen("_Body").alias("BodyLen"), df_posts._CommentCount, df_posts._CreationDate, if_NaN("_ClosedDate").alias("IfClosed"),
df_posts._Id, df_posts._OwnerUserId, df_posts._PostTypeId, df_posts._Score, df_posts._Tags, slen("_Title").alias("TitleLen"),
df_posts._ViewCount) \
.withColumn("TagVisas",if_Tag1("_Tags")) \
.withColumn("TagUsa", if_Tag2("_Tags")) \
.withColumn("TagUk", if_Tag3("_Tags")) \
.withColumn("TagAir-travel", if_Tag4("_Tags")) \
.withColumn("TagCustoms-and-immigration", if_Tag5("_Tags")) \
.filter("_PostTypeId == 1") \
.filter("_OwnerUserId NOT LIKE 'NaN' ")

Upgraded data frame Users:
- columns Id, Location, UpVotes and Views only taken

In [11]:
df_users_new = df_users.select("_Id", "_Location", "_UpVotes", "_Views")

Joining table posts and users

In [12]:
df_main = df_posts_updated.join(df_users_new, df_posts_updated._OwnerUserId == df_users_new._Id, 'left_outer')

### New prepared data frame

In [13]:
df_main.limit(5).toPandas()

Unnamed: 0,AcceptedAnswerExist,_AnswerCount,BodyLen,_CommentCount,_CreationDate,IfClosed,_Id,_OwnerUserId,_PostTypeId,_Score,...,_ViewCount,TagVisas,TagUsa,TagUk,TagAir-travel,TagCustoms-and-immigration,_Id.1,_Location,_UpVotes,_Views
0,1,4,344,4,2011-06-21 22:19:34.730,1,1,9,1,8,...,627,0,0,0,0,0,9,"Toronto, Canada",121,60
1,0,8,579,4,2011-06-21 22:22:33.760,0,2,13,1,43,...,3659,0,0,0,0,0,13,"New York, NY, United States",98,192
2,1,5,348,0,2011-06-21 22:25:56.787,0,5,13,1,14,...,520,0,0,0,0,0,13,"New York, NY, United States",98,192
3,1,6,274,9,2011-06-21 22:26:53.323,0,6,19,1,89,...,8093,0,0,0,0,0,19,,6663,2412
4,0,1,173,1,2011-06-21 22:24:57.160,0,4,24,1,8,...,301,0,0,0,0,0,24,"New York, NY",20,71


In [14]:
df_classification = df_main.drop("_CreationDate","_Id","_OwnerUserId","_PostTypeId","_Tags","_Location")

In [17]:
df_classification.limit(5).show()

+-------------------+------------+-------+-------------+--------+------+--------+----------+--------+------+-----+-------------+--------------------------+--------+------+
|AcceptedAnswerExist|_AnswerCount|BodyLen|_CommentCount|IfClosed|_Score|TitleLen|_ViewCount|TagVisas|TagUsa|TagUk|TagAir-travel|TagCustoms-and-immigration|_UpVotes|_Views|
+-------------------+------------+-------+-------------+--------+------+--------+----------+--------+------+-----+-------------+--------------------------+--------+------+
|                  1|           4|    344|            4|       1|     8|      44|       627|       0|     0|    0|            0|                         0|     121|    60|
|                  0|           8|    579|            4|       0|    43|      73|      3659|       0|     0|    0|            0|                         0|      98|   192|
|                  1|           5|    348|            0|       0|    14|      77|       520|       0|     0|    0|            0|            

## Preoprocessing

In [67]:
#Train/Test split
train, test = df_classification.randomSplit([0.8, 0.2], seed=12345)

In [19]:
#Columns normalization

columns_to_normalize = ["_AnswerCount", "_CommentCount", "BodyLen", "_Score", "TitleLen", "_ViewCount", "_UpVotes", "_Views"]
num_assembler = VectorAssembler(inputCols=columns_to_normalize, outputCol="numeric_features")
scaler = StandardScaler(inputCol="numeric_features", outputCol="scaled_features")

#Merging with other columns

final_columns = ["scaled_features", "TagVisas", "TagUsa", "TagUk", "TagAir-travel", "TagCustoms-and-immigration", "IfClosed"]
final_assembler = VectorAssembler(inputCols=final_columns, outputCol="final_features")

In [81]:
def buildAndEvaluateModel(classifierType, df, *args, **kwargs):
    classifier = classifierType(featuresCol="final_features", labelCol="AcceptedAnswerExist", *args, **kwargs)
    pipeline = Pipeline(stages=[num_assembler, scaler, final_assembler, classifier])
    
    binaryEvaluator = BinaryClassificationEvaluator(labelCol="AcceptedAnswerExist", rawPredictionCol="rawPrediction",
                                                    metricName="areaUnderROC")
    multiEvaluator = MulticlassClassificationEvaluator(labelCol="AcceptedAnswerExist", predictionCol="prediction",
                                                       metricName="accuracy")
    paramGrid = ParamGridBuilder().build() 
    crossval = CrossValidator(estimator=pipeline,
                              estimatorParamMaps=paramGrid,
                              evaluator=multiEvaluator,  
                              numFolds=5) 

    cvModel = crossval.fit(df)
    predictions = cvModel.transform(test)
    
    auc = binaryEvaluator.evaluate(predictions)
    print("AUC:", auc)
    accuracy = multiEvaluator.evaluate(predictions)
    print("Skuteczność:", accuracy)
    predictionAndLabels = predictions.select("prediction", "AcceptedAnswerExist").rdd.map(
        lambda row: (float(row[0]), float(row[1])))
    metrics = MulticlassMetrics(predictionAndLabels)
    confusionMatrix = metrics.confusionMatrix().toArray()
    print("Macierz pomyłek:\n", confusionMatrix)
    
    TP = confusionMatrix[0, 0]
    FN = confusionMatrix[0, 1]
    FP = confusionMatrix[1, 0]
    TN = confusionMatrix[1, 1]
    
    TPR = TP / (TP + FN) if (TP + FN) != 0 else 0
    TNR = TN / (TN + FP) if (TN + FP) != 0 else 0
    
    informedness = TPR + TNR - 1
    print("Informedness:\n", informedness)

    return {"accuracy": accuracy, "AUC": auc, "confusionMatrix": confusionMatrix, "Informedness": informedness}

In [86]:
def simulations(model_types, df, results=None, *args, **kwargs):
    if results is None:
        results = {}
    for i in model_types:
        name = i.__name__
        print("\n\nWyniki dla "+ name)
        try:
            results[name] = buildAndEvaluateModel(i, df,  *args, **kwargs)
        except:
            print("Błąd dla " + i)
    return results

In [87]:
model_types = [LogisticRegression, FMClassifier, DecisionTreeClassifier, RandomForestClassifier, GBTClassifier]
results_of_simulations = simulations(model_types, df_classification)



Wyniki dla LogisticRegression


TypeError: can only concatenate str (not "ABCMeta") to str

In [20]:
#Logistic Regression
lr = LogisticRegression(featuresCol="final_features", labelCol="AcceptedAnswerExist")
pipeline_regLog = Pipeline(stages=[num_assembler, scaler, final_assembler, lr])
model_regLog = pipeline_regLog.fit(train)

In [23]:
#Logistic Regression Evaluation

predictions = model_regLog.transform(test)
evaluator = BinaryClassificationEvaluator(labelCol="AcceptedAnswerExist", rawPredictionCol="rawPrediction",
                                          metricName="areaUnderROC")
auc = evaluator.evaluate(predictions)

print("AUC:", auc)

evaluator = MulticlassClassificationEvaluator(labelCol="AcceptedAnswerExist", predictionCol="prediction",
                                          metricName="accuracy")
accuracy = evaluator.evaluate(predictions)
print("Accuracy:", accuracy)

predictionAndLabels = predictions.select("prediction", "AcceptedAnswerExist").rdd.map(
    lambda row: (float(row[0]), float(row[1])))

metrics = MulticlassMetrics(predictionAndLabels)
confusionMatrix = metrics.confusionMatrix().toArray()
print("Confusion Matrix:\n", confusionMatrix)

AUC: 0.7511411013033443
Accuracy: 0.6985458019318543
Confusion Matrix:
 [[5076.  716.]
 [2124. 1505.]]


In [24]:
#Decision Tree
dc = DecisionTreeClassifier(featuresCol="final_features", labelCol="AcceptedAnswerExist")
pipeline_decisionTree = Pipeline(stages=[num_assembler, scaler, final_assembler, dc])
model_decisionTree = pipeline_decisionTree.fit(train)

In [25]:
#Decision Tree Evaluation

predictions = model_decisionTree.transform(test)
evaluator = BinaryClassificationEvaluator(labelCol="AcceptedAnswerExist", rawPredictionCol="rawPrediction",
                                          metricName="areaUnderROC")
auc = evaluator.evaluate(predictions)

print("AUC:", auc)

evaluator = MulticlassClassificationEvaluator(labelCol="AcceptedAnswerExist", predictionCol="prediction",
                                          metricName="accuracy")
accuracy = evaluator.evaluate(predictions)
print("Accuracy:", accuracy)

predictionAndLabels = predictions.select("prediction", "AcceptedAnswerExist").rdd.map(
    lambda row: (float(row[0]), float(row[1])))

metrics = MulticlassMetrics(predictionAndLabels)
confusionMatrix = metrics.confusionMatrix().toArray()
print("Confusion Matrix:\n", confusionMatrix)

AUC: 0.7361625350727489
Accuracy: 0.7891943530410784
Confusion Matrix:
 [[4328. 1464.]
 [ 522. 3107.]]


Joinować tylko to co potrzebujemy, żeby nie było duplikatów
Kolumna z najpolpularniejszym tagiem i najmniej popularnym

In [84]:
#Random Forest
dc = FMClassifier(featuresCol="final_features", labelCol="AcceptedAnswerExist")
pipeline_randomForest = Pipeline(stages=[num_assembler, scaler, final_assembler, dc])
model_randomForest = pipeline_randomForest.fit(train)

In [85]:
#Random Forest Evaluation

predictions = model_randomForest.transform(test)
evaluator = BinaryClassificationEvaluator(labelCol="AcceptedAnswerExist", rawPredictionCol="rawPrediction",
                                          metricName="areaUnderROC")
auc = evaluator.evaluate(predictions)

print("AUC:", auc)

evaluator = MulticlassClassificationEvaluator(labelCol="AcceptedAnswerExist", predictionCol="prediction",
                                          metricName="accuracy")
accuracy = evaluator.evaluate(predictions)
print("Accuracy:", accuracy)

predictionAndLabels = predictions.select("prediction", "AcceptedAnswerExist").rdd.map(
    lambda row: (float(row[0]), float(row[1])))

metrics = MulticlassMetrics(predictionAndLabels)
confusionMatrix = metrics.confusionMatrix().toArray()
print("Confusion Matrix:\n", confusionMatrix)

AUC: 0.7607375325226954
Accuracy: 0.7067190319498992
Confusion Matrix:
 [[4896.  896.]
 [1867. 1762.]]


In [69]:
#Gradient Boost Trees
gbt = GBTClassifier(featuresCol="final_features", labelCol="AcceptedAnswerExist", maxIter=10)
pipeline_gbt = Pipeline(stages=[num_assembler, scaler, final_assembler, gbt])
model_gbt = pipeline_gbt.fit(train)

In [70]:
#Gradient Boost Trees Evaluation

predictions = model_gbt.transform(test)
evaluator = BinaryClassificationEvaluator(labelCol="AcceptedAnswerExist", rawPredictionCol="rawPrediction",
                                          metricName="areaUnderROC")
auc = evaluator.evaluate(predictions)

print("AUC:", auc)

evaluator = MulticlassClassificationEvaluator(labelCol="AcceptedAnswerExist", predictionCol="prediction",
                                          metricName="accuracy")
accuracy = evaluator.evaluate(predictions)
print("Accuracy:", accuracy)

predictionAndLabels = predictions.select("prediction", "AcceptedAnswerExist").rdd.map(
    lambda row: (float(row[0]), float(row[1])))

metrics = MulticlassMetrics(predictionAndLabels)
confusionMatrix = metrics.confusionMatrix().toArray()
print("Confusion Matrix:\n", confusionMatrix)

AUC: 0.8592415265913481
Accuracy: 0.7934401868166862
Confusion Matrix:
 [[4390. 1402.]
 [ 544. 3085.]]
