In [115]:
# https://github.com/tthustla/pyspark_sa_gcp/blob/master/pyspark_sa.py
import pandas as pd
import random
import sys
import pyspark as ps
import warnings
import re
from pyspark.sql import functions as f
from pyspark.sql import types as t
from pyspark.sql.types import StringType
from pyspark.ml.feature import Tokenizer, NGram, CountVectorizer, IDF, StringIndexer, VectorAssembler
from pyspark.ml import Pipeline
from pyspark.ml.classification import LogisticRegression
from pyspark.ml import PipelineModel

In [116]:
# Regex patterns and lookup tables used by pre_processing.
# Twitter handles such as @user_name
pat1 = r'@[A-Za-z0-9_]+'
# http:// and https:// URLs, up to (not including) the next space
pat2 = r'https?://[^ ]+'
combined_pat = r'|'.join((pat1,pat2))
# Scheme-less URLs. The dot is escaped so only a literal "www." prefix matches;
# an unescaped "." would also swallow ordinary words such as "awwwesome".
www_pat = r'www\.[^ ]+'
# Contracted negations (lowercase, straight apostrophe) mapped to expanded forms
negations_dic = {"isn't":"is not", "aren't":"are not", "wasn't":"was not", "weren't":"were not",
                "haven't":"have not","hasn't":"has not","hadn't":"had not","won't":"will not",
                "wouldn't":"would not", "don't":"do not", "doesn't":"does not","didn't":"did not",
                "can't":"can not","couldn't":"could not","shouldn't":"should not","mightn't":"might not",
                "mustn't":"must not"}
# re.escape keeps every key literal, so adding a key that contains a regex
# metacharacter later cannot silently change what the pattern matches.
neg_pattern = re.compile(r'\b(' + '|'.join(re.escape(key) for key in negations_dic) + r')\b')
# CSV of tweets to score
input_test="gs://cloud-project-bucket-3/batman_col_names.csv"

In [117]:
# preprocessing steps applied to each tweet:
# first_process: remove Twitter handles and http(s) URLs
# second_process: remove URLs starting with www.
# third_process: convert to lowercase
# fourth_process: replace contracted negations with their expanded forms
# result: remove numbers and special characters (everything except letters and spaces)
def pre_processing(column):
    """Clean a single raw tweet string for the sentiment model.

    Steps, in order: strip Twitter handles and http(s) URLs, strip URLs that
    start with "www.", lowercase, expand contracted negations listed in
    ``negations_dic``, then drop every character that is not an ASCII letter
    or a space.

    Args:
        column: the raw tweet text, or ``None`` for a null cell (this function
            is wrapped in a Spark UDF, which passes nulls through as ``None``).

    Returns:
        The cleaned text with leading/trailing whitespace removed. ``None``
        input yields an empty string so downstream stages always receive a
        string instead of the UDF raising ``TypeError``.
    """
    # re.sub cannot operate on None; treat a missing tweet as empty text.
    if column is None:
        return ''
    first_process = re.sub(combined_pat, '', column)
    second_process = re.sub(www_pat, '', first_process)
    third_process = second_process.lower()
    # Lowercasing happens first because the keys of negations_dic are lowercase.
    fourth_process = neg_pattern.sub(lambda x: negations_dic[x.group()], third_process)
    result = re.sub(r'[^A-Za-z ]','',fourth_process)
    return result.strip()

In [118]:
def save_predictions(predictions, fraction=0.25, seed=42, n_rows=10):
    """Print a small random preview of the model predictions.

    Despite its name, this function does not write anything to storage: it
    samples the predictions and prints the ``text`` and ``prediction`` columns
    of the first few sampled rows.

    Args:
        predictions: Spark DataFrame of predictions; it must contain the
            columns ``text`` and ``prediction``.
        fraction: fraction of rows to sample (without replacement).
        seed: seed for the sampler, so the preview is reproducible.
        n_rows: maximum number of sampled rows to print.

    Returns:
        None.
    """
    # The input CSV is deliberately not re-read here: nothing from it is used
    # in the preview, and loading it would pull the whole file onto the driver.
    sampled = predictions.sample(withReplacement=False, fraction=fraction, seed=seed)
    preview_rows = sampled.select('text', 'prediction').head(n_rows)
    print(preview_rows)
    
    

In [119]:
# the main function below is used here to get predictions with a loaded model
# (in the original script it could also be used for first training; the build_pipeline
# function that training relied on is not part of this notebook)
# first retrieve the data
# apply pre-processing by turning the pre_processing function defined above into a user defined function
# use the loaded pipeline model
# make predictions on the test set
# output the pipeline model and the Spark dataframe of the predictions
# (the prediction accuracy on the test set is currently commented out)

def main(sqlc,input_dir,loaded_model):
    """Score the test CSV with an already-fitted pipeline model.

    Args:
        sqlc: SQLContext used to read the CSV.
        input_dir: kept for interface compatibility; the data is read from the
            module-level ``input_test`` path, not from this directory.
        loaded_model: fitted pipeline model exposing ``transform``.

    Returns:
        A tuple ``(loaded_model, predictions)`` where ``predictions`` is the
        DataFrame returned by ``loaded_model.transform``.
    """
    # Log the path that is actually read (input_test), not input_dir.
    print('retrieving data from {}'.format(input_test))
    # Use the context passed in by the caller instead of reaching for a global.
    raw_tweets = sqlc.read.format('com.databricks.spark.csv').options(header='true', inferschema='true').load(input_test)
    print('preprocessing data...')
    reg_replaceUdf = f.udf(pre_processing, t.StringType())
    # The cleaned text goes into a new 'tweet' column; 'text' is left as loaded.
    test_set = raw_tweets.withColumn('tweet', reg_replaceUdf(f.col('text')))
    print('making predictions...')
    predictions = loaded_model.transform(test_set)
    return loaded_model, predictions

In [120]:
if __name__=="__main__":
    inputdir = "gs://cloud-project-bucket-3"
    modeldir = "gs://cloud-project-bucket-3/modeldir"
    outputfile = "gs://cloud-project-bucket-3/outputfile.csv"

    # getOrCreate reuses a SparkContext that is already running (e.g. when the
    # cell is re-run) and creates one otherwise, so `sc` and `sqlContext` are
    # always bound. Catching ValueError from SparkContext() left both names
    # undefined whenever a context already existed.
    sc = ps.SparkContext.getOrCreate()
    sc.setLogLevel("ERROR")
    sqlContext = ps.sql.SQLContext(sc)
    print('SparkContext ready')
    try:
        # load the fitted pipeline and retrieve the predictions by running main()
        loadedModel = PipelineModel.load(modeldir)
        pipelineFit, predictions = main(sqlContext,inputdir,loadedModel)
        save_predictions(predictions)
        print('predictions finished!')
        # NOTE: writing the predictions to `outputfile`, re-saving the model and
        # reporting accuracy are not performed in this notebook.
    finally:
        # Always release the SparkContext, even if loading or scoring failed.
        sc.stop()

                                                                                

retrieving data from gs://cloud-project-bucket-3
preprocessing data...
making predictions...


22/03/08 20:57:19 WARN org.apache.spark.ml.feature.StringIndexerModel: Input column sentiment does not exist during transformation. Skip StringIndexerModel for this column.
                                                                                

[Row(text='trent tells his fever dream about what the batman plot and the boys are also back for another week bone crushing recommendations for metal march drink the week pleasure chest ipa playalinda brewing company clown shoes undead party crasher heineken light', prediction=0.0), Row(text='you were terrific both movies you are real life ', prediction=0.0), Row(text='the batman 2022  ', prediction=0.0), Row(text='won’ happen all can tell won’ you can’ have batman movies the same time batfleck finish done ’ not coming back after the flash movie', prediction=1.0), Row(text='’ like see batman movie which someone who sees batman says “hey that guy dressed like batman like from the movies ”', prediction=0.0), Row(text='don’ understand how people like ben affleck’ batman ', prediction=0.0), Row(text='loved the fact that they emphasized the batman being detective pwede ulit kumain sinehan haha', prediction=0.0), Row(text='who the fuck said the batman was better than the dark knight ’all ain

In [121]:
# Stop the SparkContext; this repeats the sc.stop() at the end of the previous cell.
sc.stop()