# Main script to call functions from other scripts

In [None]:
# Import libraries

In [9]:
# Initialize other notebooks
# ------------ Initialize preprocessing script
# %run executes preprocessing.ipynb from this notebook. Names that are used
# below but not defined here (e.g. get_wiki_df, get_label_count, make_diff)
# presumably come from that run -- TODO confirm; the same goes for spark, sc,
# udf, lit, StringType, StreamingContext and StreamingThread.
%run "preprocessing.ipynb"

In [10]:
# Load the wiki edits (helper is not defined in this notebook; presumably it
# returns a Spark data frame -- TODO confirm).
wiki_df = get_wiki_df()
# Report the number of edits per label. The output below shows a strong class
# imbalance: 30333 safe / 4136 unsafe / 270 vandal.
get_label_count(wiki_df)

root
 |-- comment: string (nullable = true)
 |-- label: string (nullable = true)
 |-- name_user: string (nullable = true)
 |-- text_new: string (nullable = true)
 |-- text_old: string (nullable = true)
 |-- title_page: string (nullable = true)
 |-- url_page: string (nullable = true)

+------+--------+
| label|count(1)|
+------+--------+
|  safe|   30333|
|unsafe|    4136|
|vandal|     270|
+------+--------+



## Full pipeline
Extracting edited part --> Lemmatization, stemming and removing stopwords --> TF-IDF --> Feed to model --> Return prediction

In [14]:
# Notebook-level flag: no model has been loaded yet. The commented-out
# model-loading scaffold further down checks it so a model is loaded only once.
models_loaded = False

# ------------ Replace this with a trained ML model
def predict(df):
    """
    Placeholder keyword classifier for a single wiki edit.

    Parameters
    ----------
    df : row-like object
        Must expose the edit's diff text as the attribute ``diff``.
        Despite the name, this is one row, not a whole data frame.

    Returns
    -------
    str
        'vandal' if the lower-cased diff contains 'bad', 'lol' or 'joke'
        as a substring, otherwise 'safe'. A missing (None) or empty diff
        is classified as 'safe'.
    """
    # A null diff has no text to inspect; treat it like an empty string
    # instead of failing on None.lower().
    text = (df.diff or '').lower()
    # Plain substring matching, so e.g. 'badge' also triggers on 'bad'.
    if any(word in text for word in ('bad', 'lol', 'joke')):
        return 'vandal'
    return 'safe'
# ------------ Replace this with a trained ML model

# Wrap predict as a UDF with a string return type so it can be applied to
# data frame columns (see the commented-out usage after process below).
predict_udf = udf(predict, StringType())

def process(time, rdd):
    """
    Process one streaming mini-batch of wiki edits.

    This function serves as the construction pipeline: all preprocessing
    functions need to be called here, and each of them must work per wiki
    edit, because a single mini-batch can contain several edits.

    Parameters
    ----------
    time
        Timestamp of the mini-batch; only used for the printed header.
    rdd
        RDD holding the edits of this mini-batch as JSON text; it is parsed
        with ``spark.read.json``.

    Returns
    -------
    None
        Results are printed with ``show()``; empty batches are skipped.
    """
    if rdd.isEmpty():
        return

    print("========= %s =========" % str(time))

    # Convert to data frame
    df = spark.read.json(rdd)
    df.show()

    # Apply make_diff to every edit in the batch. Computing the diff from
    # df.first() and attaching it with lit() would give all rows of a
    # multi-edit batch the diff of the first edit only.
    make_diff_udf = udf(make_diff, StringType())
    df_withdiff = df.withColumn(
        "diff", make_diff_udf(df["text_old"], df["text_new"])
    )
    df_withdiff.select('diff').show()
    
#     # Utilize our predict function
#     df_withpreds = df_withdiff.withColumn("pred", predict_udf(
#         struct([df_withdiff[x] for x in df_withdiff.columns])
#     ))
#     df_withpreds.show()
    
#     # Normally, you wouldn't use a UDF (User Defined Function) Python function to predict (you can)
#     # But an MLlib model you've built and saved with Spark
#     # In this case, you need to prevent loading your model in every call to "process" as follows:
    
#     # Load in the model if not yet loaded:
#     if not globals()['models_loaded']:
#         # load in your models here
#         globals()['my_model'] = '***' # Replace '***' with:    [...].load('my_logistic_regression')
#         globals()['models_loaded'] = True
        
#     # And then predict using the loaded model: 
#     # df_result = globals()['my_model'].transform(df)
#     # df_result.show()

## Begin online stream and make prediction per wiki edit

In [15]:
# Start the prediction task
ssc = StreamingContext(sc, 10) # ---------------------> Get minibatches every 10 seconds
lines = ssc.socketTextStream("seppe.net", 7778) # ----> Read text lines from the socket at seppe.net:7778
lines.foreachRDD(process) # --------------------------> Perform the function 'process' on every RDD
# StreamingThread is not defined in this notebook; presumably it runs the
# streaming context in a background thread so the notebook stays usable
# while batches arrive -- TODO confirm.
ssc_t = StreamingThread(ssc)
ssc_t.start()

+--------------------+-----+-----------+--------------------+--------------------+-------------+--------------------+
|             comment|label|  name_user|            text_new|            text_old|   title_page|            url_page|
+--------------------+-----+-----------+--------------------+--------------------+-------------+--------------------+
|clean up, rm extr...| safe|Kaltenmeyer|{{about|the compa...|{{about|the compa...|Faber-Castell|//en.wikipedia.or...|
+--------------------+-----+-----------+--------------------+--------------------+-------------+--------------------+

+--------------------+
|                diff|
+--------------------+
|--- 

+++ 

-File...|
+--------------------+

+--------------------+------+--------------------+--------------------+--------------------+--------------------+--------------------+
|             comment| label|           name_user|            text_new|            text_old|          title_page|            url_page|
+--------------------+-

In [16]:
# Stop the prediction task.
# As the output below shows, stopping can take a few seconds and a batch may
# still be printed after this call.
ssc_t.stop()

----- Stopping... this may take a few seconds -----
+--------------------+------+--------------+--------------------+--------------------+--------------------+--------------------+
|             comment| label|     name_user|            text_new|            text_old|          title_page|            url_page|
+--------------------+------+--------------+--------------------+--------------------+--------------------+--------------------+
|→‎Google Translat...|  safe|  Oneillge2029|{{short descripti...|{{short descripti...|List of Google Ap...|//en.wikipedia.or...|
|                    |unsafe|103.251.143.82|{{short descripti...|{{short descripti...|         Kuiper belt|//en.wikipedia.or...|
|       Remove spaces|  safe|    Anypodetos|{{about||other us...|{{about||other us...|              Gadget|//en.wikipedia.or...|
|                    |  safe|  Marquardtika|{{NPOV|date=March...|{{NPOV|date=March...|    Charlotte Clymer|//en.wikipedia.or...|
+--------------------+------+--------------+-