# Twitter Sentiment Analysis
## Big Data

#### Brian Morris
#### December 2022

In [1]:
# Install textblob if not installed
!pip install textblob

In [0]:
from textblob import TextBlob
import pyspark.sql.functions as F
from pyspark.sql.functions import isnan, when, count, col
from pyspark.sql.types import TimestampType, IntegerType, FloatType, StringType, StructType, StructField
from pyspark.sql import SparkSession

# Machine learning
from pyspark.ml.feature import NGram, VectorAssembler, StopWordsRemover, HashingTF, IDF, Tokenizer, StringIndexer, CountVectorizer, ChiSqSelector
from pyspark.ml.classification import LogisticRegression
from pyspark.ml.evaluation import BinaryClassificationEvaluator
from pyspark.ml import Pipeline


Helper UDF, mount a bucket for file access

In [0]:
def mount_s3_bucket(access_key, secret_key, bucket_name, mount_folder):
  ACCESS_KEY_ID = access_key
  SECRET_ACCESS_KEY = secret_key
  ENCODED_SECRET_KEY = SECRET_ACCESS_KEY.replace("/", "%2F")

  print ("Mounting", bucket_name)

  try:
    # Unmount the data in case it was already mounted.
    dbutils.fs.unmount("/mnt/%s" % mount_folder)
    
  except:
    # If it fails to unmount it most likely wasn't mounted in the first place
    print ("Directory not unmounted: ", mount_folder)
    
  finally:
    # Lastly, mount our bucket.
    dbutils.fs.mount("s3a://%s:%s@%s" % (ACCESS_KEY_ID, ENCODED_SECRET_KEY, bucket_name), "/mnt/%s" % mount_folder)
    #dbutils.fs.mount("s3a://"+ ACCESS_KEY_ID + ":" + ENCODED_SECRET_KEY + "@" + bucket_name, mount_folder)
    print ("The bucket", bucket_name, "was mounted to", mount_folder, "\n")
    

Set access key variables

In [0]:
# Set AWS programmatic access credentials
ACCESS_KEY = "AKIAX6IZEAAAA4DMQOW2"
SECRET_ACCESS_KEY = "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"

Mount the WCD bucket, specifically twitter folder data, and name it topics

In [0]:
mount_s3_bucket(ACCESS_KEY, SECRET_ACCESS_KEY, 'weclouddata/twitter', 'topics')

Mounting weclouddata/twitter
/mnt/topics has been unmounted.
The bucket weclouddata/twitter was mounted to topics 



Explore the different topics that were scraped

In [0]:
%fs ls /mnt/topics/

path,name,size,modificationTime
dbfs:/mnt/topics/AI/,AI/,0,0
dbfs:/mnt/topics/BankofCanada/,BankofCanada/,0,0
dbfs:/mnt/topics/BlackFriday/,BlackFriday/,0,0
dbfs:/mnt/topics/CERB/,CERB/,0,0
dbfs:/mnt/topics/CSIS/,CSIS/,0,0
dbfs:/mnt/topics/CanadaHousing/,CanadaHousing/,0,0
dbfs:/mnt/topics/ElonMusk/,ElonMusk/,0,0
dbfs:/mnt/topics/Flames/,Flames/,0,0
dbfs:/mnt/topics/Inflation/,Inflation/,0,0
dbfs:/mnt/topics/Interest_rate/,Interest_rate/,0,0


Preview one of the folder contents

In [0]:
%fs ls /mnt/topics/AI/2022/12/08/18

path,name,size,modificationTime
dbfs:/mnt/topics/AI/2022/12/08/18/topic4-2-2022-12-08-18-24-27-2be53775-fbca-36fb-8bab-a9e84803a793,topic4-2-2022-12-08-18-24-27-2be53775-fbca-36fb-8bab-a9e84803a793,7676,1670524170000
dbfs:/mnt/topics/AI/2022/12/08/18/topic4-2-2022-12-08-18-29-11-6b698cf3-971d-3c2f-a0c0-ae88d5c8b9cd,topic4-2-2022-12-08-18-29-11-6b698cf3-971d-3c2f-a0c0-ae88d5c8b9cd,16486,1670524454000
dbfs:/mnt/topics/AI/2022/12/08/18/topic4-2-2022-12-08-18-34-07-612ecb35-016e-3cac-8d5d-c89e4129774c,topic4-2-2022-12-08-18-34-07-612ecb35-016e-3cac-8d5d-c89e4129774c,12526,1670524750000
dbfs:/mnt/topics/AI/2022/12/08/18/topic4-2-2022-12-08-18-38-58-93f597ec-f4af-3ba8-9050-ac4db971fd62,topic4-2-2022-12-08-18-38-58-93f597ec-f4af-3ba8-9050-ac4db971fd62,8227,1670525040000
dbfs:/mnt/topics/AI/2022/12/08/18/topic4-2-2022-12-08-18-43-42-8264e752-504a-3e57-b227-055bedd8b93a,topic4-2-2022-12-08-18-43-42-8264e752-504a-3e57-b227-055bedd8b93a,11783,1670525324000
dbfs:/mnt/topics/AI/2022/12/08/18/topic4-2-2022-12-08-18-48-49-c40fe1fa-ace1-30cf-8c3d-56376756e989,topic4-2-2022-12-08-18-48-49-c40fe1fa-ace1-30cf-8c3d-56376756e989,9315,1670525632000
dbfs:/mnt/topics/AI/2022/12/08/18/topic4-2-2022-12-08-18-53-38-62d1eb45-6705-3cb3-8b91-f5bcff482f8d,topic4-2-2022-12-08-18-53-38-62d1eb45-6705-3cb3-8b91-f5bcff482f8d,8345,1670525921000
dbfs:/mnt/topics/AI/2022/12/08/18/topic4-2-2022-12-08-18-58-35-25aee653-cb0c-3580-aab1-02af2820cd6b,topic4-2-2022-12-08-18-58-35-25aee653-cb0c-3580-aab1-02af2820cd6b,8446,1670526219000


Grab all files from all folders for the scraped AI topic

In [0]:
# 5 starts needed to get all 
path = '/mnt/topics/AI/*/*/*/*/*'

Create a Spark session

In [0]:
spark = (SparkSession
        .builder
        .appName('dataProject')
        .getOrCreate()
        )
print('Session created')
sc = spark.sparkContext

Session created


Build the table schema for the scraped data (tweets and supporting data)

In [0]:
# Create my schema
schema = StructType([
    StructField('id', StringType(), True),
    StructField('name', StringType(), True),
    StructField('screen_name', StringType(), True),
    StructField('tweet', StringType(), True),
    StructField('followers_count', StringType(), True),
    StructField('location', StringType(), True),
    StructField('geo', StringType(), True),
    StructField('created_at', StringType(), True)
])

Read the tweets data into a Spark dataframe

In [0]:
df = (spark
     .read
     .option('header', 'false')
     .option('delimiter','\t')
     .schema(schema)
      .csv(path)
     )

In [0]:
type(df)

Out[11]: pyspark.sql.dataframe.DataFrame

In [0]:
# Get shape of df
print(df.count(), len(df.columns))

10497 8


In [0]:
df.show(10)

+-------------------+-------------+-----------+--------------------+---------------+--------------------+----+--------------------+
|                 id|         name|screen_name|               tweet|followers_count|            location| geo|          created_at|
+-------------------+-------------+-----------+--------------------+---------------+--------------------+----+--------------------+
|1601172099045158912| YUNUS HANBAL|HanbalYunus|@CryptoEmdarks Th...|             21|                None|None|Fri Dec 09 11:09:...|
|1601172122730041344|  ASLI HANBAL| HanbalAsli|@Crypto__Diva #GP...|            152|                None|None|Fri Dec 09 11:09:...|
|1601172161372491778| YUNUS HANBAL|HanbalYunus|@CryptoThro There...|             21|                None|None|Fri Dec 09 11:09:...|
|1601172171602419712|  ASLI HANBAL| HanbalAsli|@belufrancese #GP...|            152|                None|None|Fri Dec 09 11:09:...|
|1601172214056767489| YUNUS HANBAL|HanbalYunus|@cryptojack There...|        

Mount project bucket

In [0]:
# Mount your own bucket
mount_s3_bucket(ACCESS_KEY, SECRET_ACCESS_KEY, 'b16-brian/dataproject', 'project_bucket')

Mounting b16-brian/dataproject
/mnt/project_bucket has been unmounted.
The bucket b16-brian/dataproject was mounted to project_bucket 



Write the mounted files to project bucket

In [0]:
(df
.write
.option('header', 'false')
.option('delimiter', '\t')
.csv('/mnt/project_bucket/AI.csv'))

Begin to explore the data and prepare for the machine learning model

In [0]:
# cache the dataframe for faster iteration
df.cache()

Out[16]: DataFrame[id: string, name: string, screen_name: string, tweet: string, followers_count: string, location: string, geo: string, created_at: string]

Clean up the tweet information before getting sentiment label

In [0]:
df_clean = df.withColumn('tweet', F.regexp_replace('tweet', r"http\S+", "")) \
            .withColumn('tweet', F.regexp_replace('tweet', r"[^a-zA-z]", " ")) \
            .withColumn('tweet', F.regexp_replace('tweet', r"\s+", " ")) \
            .withColumn('tweet', F.lower('tweet')) \
            .withColumn('tweet', F.trim('tweet')) 
df_clean.show(10)

+-------------------+-------------+-----------+--------------------+---------------+--------------------+----+--------------------+
|                 id|         name|screen_name|               tweet|followers_count|            location| geo|          created_at|
+-------------------+-------------+-----------+--------------------+---------------+--------------------+----+--------------------+
|1601172099045158912| YUNUS HANBAL|HanbalYunus|cryptoemdarks the...|             21|                None|None|Fri Dec 09 11:09:...|
|1601172122730041344|  ASLI HANBAL| HanbalAsli|crypto__diva gple...|            152|                None|None|Fri Dec 09 11:09:...|
|1601172161372491778| YUNUS HANBAL|HanbalYunus|cryptothro there ...|             21|                None|None|Fri Dec 09 11:09:...|
|1601172171602419712|  ASLI HANBAL| HanbalAsli|belufrancese gple...|            152|                None|None|Fri Dec 09 11:09:...|
|1601172214056767489| YUNUS HANBAL|HanbalYunus|cryptojack there ...|        

In [0]:
display(df_clean)

id,name,screen_name,tweet,followers_count,location,geo,created_at
1601172099045158912,YUNUS HANBAL,HanbalYunus,cryptoemdarks there are many innovations and surprises in the future project dxgm metaverse universe you can par,21.0,,,Fri Dec 09 11:09:11 +0000 2022
1601172122730041344,ASLI HANBAL,HanbalAsli,crypto__diva gplex with blockchain technology in the gaming world with the unique metaverse world waiting to be,152.0,,,Fri Dec 09 11:09:17 +0000 2022
1601172161372491778,YUNUS HANBAL,HanbalYunus,cryptothro there are many innovations and surprises in the future project dxgm metaverse universe you can partic,21.0,,,Fri Dec 09 11:09:26 +0000 2022
1601172171602419712,ASLI HANBAL,HanbalAsli,belufrancese gplex with blockchain technology in the gaming world with the unique metaverse world waiting to be,152.0,,,Fri Dec 09 11:09:28 +0000 2022
1601172214056767489,YUNUS HANBAL,HanbalYunus,cryptojack there are many innovations and surprises in the future project dxgm metaverse universe you can partic,21.0,,,Fri Dec 09 11:09:38 +0000 2022
1601172226631311362,ASLI HANBAL,HanbalAsli,cryptothro gplex with blockchain technology in the gaming world with the unique metaverse world waiting to be d,152.0,,,Fri Dec 09 11:09:41 +0000 2022
1601172266460454914,ASLI HANBAL,HanbalAsli,cryptoworld gplex with blockchain technology in the gaming world with the unique metaverse world waiting to,152.0,,,Fri Dec 09 11:09:51 +0000 2022
1601172313017581568,ASLI HANBAL,HanbalAsli,pascualprincipe gplex with blockchain technology in the gaming world with the unique metaverse world waiting to,152.0,,,Fri Dec 09 11:10:02 +0000 2022
1601172334810783744,YUNUS HANBAL,HanbalYunus,riccardogems there are many innovations and surprises in the future project dxgm metaverse universe you can part,21.0,,,Fri Dec 09 11:10:07 +0000 2022
1601172340188254208,Space ☆ Bruce,spacebruce,the vf phoenix variable fighter was an prototype for the vf valkyrie it served in as a front line fighter,952.0,68000 HEART ON FIRE /🔞 please,,Fri Dec 09 11:10:08 +0000 2022


In [0]:
# cache the dataframe for faster iteration
df_clean.cache()

Out[23]: DataFrame[id: string, name: string, screen_name: string, tweet: string, followers_count: string, location: string, geo: string, created_at: string]

Drop any empty tweets, and missing timestamp rows

In [0]:
df_clean = df_clean.na.drop(subset=['tweet', 'created_at'])

In [0]:
df_clean.count()

Out[25]: 10457

## Get sentiment labels

In [0]:
# Create a UDF which will label our tweets using TextBlob

def get_sentiment(text):
    sentiment = TextBlob(text).sentiment.polarity
    if sentiment >= 0:
        return 'positive'
    else:
        return 'negative'

In [0]:
# Set up the Spark UDF
sentiment_score = F.udf(lambda x: get_sentiment(x))

In [0]:
#df_sent = df_clean.select('*', sentiment_score('tweet').alias('sentiment')) # alternative way of doing the same thing as below

# Apply the UDF to the tweets and get a new column called 'sentiment' with the label
df_sent = df_clean.withColumn('sentiment', sentiment_score('tweet'))

In [0]:
df_sent.show(20)

+-------------------+-------------+--------------+--------------------+---------------+--------------------+----+--------------------+---------+
|                 id|         name|   screen_name|               tweet|followers_count|            location| geo|          created_at|sentiment|
+-------------------+-------------+--------------+--------------------+---------------+--------------------+----+--------------------+---------+
|1601172099045158912| YUNUS HANBAL|   HanbalYunus|cryptoemdarks the...|             21|                None|None|Fri Dec 09 11:09:...| positive|
|1601172122730041344|  ASLI HANBAL|    HanbalAsli|crypto__diva gple...|            152|                None|None|Fri Dec 09 11:09:...| positive|
|1601172161372491778| YUNUS HANBAL|   HanbalYunus|cryptothro there ...|             21|                None|None|Fri Dec 09 11:09:...| positive|
|1601172171602419712|  ASLI HANBAL|    HanbalAsli|belufrancese gple...|            152|                None|None|Fri Dec 09 11:09:

In [0]:
df_sent.groupBy('sentiment').count().show()

+---------+-----+
|sentiment|count|
+---------+-----+
| positive| 5165|
| negative| 5292|
+---------+-----+



In [0]:
df_sent.printSchema()

root
 |-- id: string (nullable = true)
 |-- name: string (nullable = true)
 |-- screen_name: string (nullable = true)
 |-- tweet: string (nullable = true)
 |-- followers_count: string (nullable = true)
 |-- location: string (nullable = true)
 |-- geo: string (nullable = true)
 |-- created_at: string (nullable = true)
 |-- sentiment: string (nullable = true)



In [0]:
# df_sent shape
print(df_sent.count(), len(df_sent.columns))

10457 9


In [0]:
df_sent.summary().show()

+-------+--------------------+--------------------+-------------------+--------------------+------------------+--------------------+--------------------+--------------------+---------+
|summary|                  id|                name|        screen_name|               tweet|   followers_count|            location|                 geo|          created_at|sentiment|
+-------+--------------------+--------------------+-------------------+--------------------+------------------+--------------------+--------------------+--------------------+---------+
|  count|               10457|               10457|              10457|               10457|             10457|               10457|               10457|               10457|    10457|
|   mean|1.601106867096045...|             30871.0|       2.00006535E7|                null|19078.221860954385|1.594896331738580...|                null|                null|     null|
| stddev|1.114409404797411...|   61723.33420460477|2.828476692931551E7|    

In [0]:
df_sent.show(5)

+-------------------+------------+-----------+--------------------+---------------+--------+----+--------------------+---------+
|                 id|        name|screen_name|               tweet|followers_count|location| geo|          created_at|sentiment|
+-------------------+------------+-----------+--------------------+---------------+--------+----+--------------------+---------+
|1601172099045158912|YUNUS HANBAL|HanbalYunus|cryptoemdarks the...|             21|    None|None|Fri Dec 09 11:09:...| positive|
|1601172122730041344| ASLI HANBAL| HanbalAsli|crypto__diva gple...|            152|    None|None|Fri Dec 09 11:09:...| positive|
|1601172161372491778|YUNUS HANBAL|HanbalYunus|cryptothro there ...|             21|    None|None|Fri Dec 09 11:09:...| positive|
|1601172171602419712| ASLI HANBAL| HanbalAsli|belufrancese gple...|            152|    None|None|Fri Dec 09 11:09:...| positive|
|1601172214056767489|YUNUS HANBAL|HanbalYunus|cryptojack there ...|             21|    None|None|

Add tokenized and stopword filtered columns to the dataframe

In [0]:
tokenizer = Tokenizer(inputCol="tweet", outputCol="tokens")
df_sent_tokens = tokenizer.transform(df_sent)

#now remove stopwords from the review(list of words)    
stopword_remover = StopWordsRemover(inputCol="tokens", outputCol="filtered")
df_sent_stopwords = stopword_remover.transform(df_sent_tokens)

display(df_sent_stopwords)

id,name,screen_name,tweet,followers_count,location,geo,created_at,sentiment,tokens,filtered
1601172099045158912,YUNUS HANBAL,HanbalYunus,cryptoemdarks there are many innovations and surprises in the future project dxgm metaverse universe you can par,21,,,Fri Dec 09 11:09:11 +0000 2022,positive,"List(cryptoemdarks, there, are, many, innovations, and, surprises, in, the, future, project, dxgm, metaverse, universe, you, can, par)","List(cryptoemdarks, many, innovations, surprises, future, project, dxgm, metaverse, universe, par)"
1601172122730041344,ASLI HANBAL,HanbalAsli,crypto__diva gplex with blockchain technology in the gaming world with the unique metaverse world waiting to be,152,,,Fri Dec 09 11:09:17 +0000 2022,positive,"List(crypto__diva, gplex, with, blockchain, technology, in, the, gaming, world, with, the, unique, metaverse, world, waiting, to, be)","List(crypto__diva, gplex, blockchain, technology, gaming, world, unique, metaverse, world, waiting)"
1601172161372491778,YUNUS HANBAL,HanbalYunus,cryptothro there are many innovations and surprises in the future project dxgm metaverse universe you can partic,21,,,Fri Dec 09 11:09:26 +0000 2022,positive,"List(cryptothro, there, are, many, innovations, and, surprises, in, the, future, project, dxgm, metaverse, universe, you, can, partic)","List(cryptothro, many, innovations, surprises, future, project, dxgm, metaverse, universe, partic)"
1601172171602419712,ASLI HANBAL,HanbalAsli,belufrancese gplex with blockchain technology in the gaming world with the unique metaverse world waiting to be,152,,,Fri Dec 09 11:09:28 +0000 2022,positive,"List(belufrancese, gplex, with, blockchain, technology, in, the, gaming, world, with, the, unique, metaverse, world, waiting, to, be)","List(belufrancese, gplex, blockchain, technology, gaming, world, unique, metaverse, world, waiting)"
1601172214056767489,YUNUS HANBAL,HanbalYunus,cryptojack there are many innovations and surprises in the future project dxgm metaverse universe you can partic,21,,,Fri Dec 09 11:09:38 +0000 2022,positive,"List(cryptojack, there, are, many, innovations, and, surprises, in, the, future, project, dxgm, metaverse, universe, you, can, partic)","List(cryptojack, many, innovations, surprises, future, project, dxgm, metaverse, universe, partic)"
1601172226631311362,ASLI HANBAL,HanbalAsli,cryptothro gplex with blockchain technology in the gaming world with the unique metaverse world waiting to be d,152,,,Fri Dec 09 11:09:41 +0000 2022,positive,"List(cryptothro, gplex, with, blockchain, technology, in, the, gaming, world, with, the, unique, metaverse, world, waiting, to, be, d)","List(cryptothro, gplex, blockchain, technology, gaming, world, unique, metaverse, world, waiting, d)"
1601172266460454914,ASLI HANBAL,HanbalAsli,cryptoworld gplex with blockchain technology in the gaming world with the unique metaverse world waiting to,152,,,Fri Dec 09 11:09:51 +0000 2022,positive,"List(cryptoworld, gplex, with, blockchain, technology, in, the, gaming, world, with, the, unique, metaverse, world, waiting, to)","List(cryptoworld, gplex, blockchain, technology, gaming, world, unique, metaverse, world, waiting)"
1601172313017581568,ASLI HANBAL,HanbalAsli,pascualprincipe gplex with blockchain technology in the gaming world with the unique metaverse world waiting to,152,,,Fri Dec 09 11:10:02 +0000 2022,positive,"List(pascualprincipe, gplex, with, blockchain, technology, in, the, gaming, world, with, the, unique, metaverse, world, waiting, to)","List(pascualprincipe, gplex, blockchain, technology, gaming, world, unique, metaverse, world, waiting)"
1601172334810783744,YUNUS HANBAL,HanbalYunus,riccardogems there are many innovations and surprises in the future project dxgm metaverse universe you can part,21,,,Fri Dec 09 11:10:07 +0000 2022,positive,"List(riccardogems, there, are, many, innovations, and, surprises, in, the, future, project, dxgm, metaverse, universe, you, can, part)","List(riccardogems, many, innovations, surprises, future, project, dxgm, metaverse, universe, part)"
1601172340188254208,Space ☆ Bruce,spacebruce,the vf phoenix variable fighter was an prototype for the vf valkyrie it served in as a front line fighter,952,68000 HEART ON FIRE /🔞 please,,Fri Dec 09 11:10:08 +0000 2022,positive,"List(the, vf, phoenix, variable, fighter, was, an, prototype, for, the, vf, valkyrie, it, served, in, as, a, front, line, fighter)","List(vf, phoenix, variable, fighter, prototype, vf, valkyrie, served, front, line, fighter)"


Write the sentiment dataframe with tokens and filtered tweets to project bucket

In [0]:
df_sent_stopwords.write.parquet('/mnt/project_bucket/AI-sentiment-cleaned.parquet')

In [0]:
df_sent_stopwords.columns

Out[42]: ['id',
 'name',
 'screen_name',
 'tweet',
 'followers_count',
 'location',
 'geo',
 'created_at',
 'sentiment',
 'tokens',
 'filtered']

# Machine learning pipeline in Spark

Select features

In [0]:
features = ['tweet', 'sentiment']

In [0]:
# Use 80% cases for training, 20% cases for testing
train, test = df_sent[features].randomSplit([0.8, 0.2], seed=42)

# Create transformers for the ML pipeline
tokenizer = Tokenizer(inputCol="tweet", outputCol="tokens")
stopword_remover = StopWordsRemover(inputCol="tokens", outputCol="filtered")
cv = CountVectorizer(vocabSize=2**16, inputCol="filtered", outputCol='cv')
idf = IDF(inputCol='cv', outputCol="1gram_idf", minDocFreq=5) #minDocFreq: remove sparse terms
assembler = VectorAssembler(inputCols=["1gram_idf"], outputCol="features")
label_encoder= StringIndexer(inputCol = "sentiment", outputCol = "label")
lr = LogisticRegression(maxIter=100)

pipeline = Pipeline(stages=[tokenizer, stopword_remover, cv, idf, assembler, label_encoder, lr])

pipeline_model = pipeline.fit(train)
predictions = pipeline_model.transform(test)

evaluator = BinaryClassificationEvaluator(rawPredictionCol="rawPrediction")
accuracy = predictions.filter(predictions.label == predictions.prediction).count() / float(test.count())
roc_auc = evaluator.evaluate(predictions)

print("Accuracy Score: {0:.4f}".format(accuracy))
print("ROC-AUC: {0:.4f}".format(roc_auc))




Accuracy Score: 0.9227
ROC-AUC: 0.9592


In [0]:
predictions.columns

Out[57]: ['tweet',
 'sentiment',
 'tokens',
 'filtered',
 'cv',
 '1gram_idf',
 'features',
 'label',
 'rawPrediction',
 'probability',
 'prediction']

In [0]:
predictions.printSchema()

root
 |-- tweet: string (nullable = true)
 |-- sentiment: string (nullable = true)
 |-- tokens: array (nullable = true)
 |    |-- element: string (containsNull = true)
 |-- filtered: array (nullable = true)
 |    |-- element: string (containsNull = true)
 |-- cv: vector (nullable = true)
 |-- 1gram_idf: vector (nullable = true)
 |-- features: vector (nullable = true)
 |-- label: double (nullable = false)
 |-- rawPrediction: vector (nullable = true)
 |-- probability: vector (nullable = true)
 |-- prediction: double (nullable = false)



In [0]:
display(predictions)

tweet,sentiment,tokens,filtered,cv,1gram_idf,features,label,rawPrediction,probability,prediction
_lia prof stephen hawking once said that efforts to create thinking machines pose a threat to our very existenc,positive,"List(_lia, prof, stephen, hawking, once, said, that, efforts, to, create, thinking, machines, pose, a, threat, to, our, very, existenc)","List(_lia, prof, stephen, hawking, said, efforts, create, thinking, machines, pose, threat, existenc)","Map(vectorType -> sparse, length -> 9690, indices -> List(126, 174, 401, 737, 1402, 1664, 2349, 3206, 4294, 5164, 8687), values -> List(1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0))","Map(vectorType -> sparse, length -> 9690, indices -> List(126, 174, 401, 737, 1402, 1664, 2349, 3206, 4294, 5164, 8687), values -> List(4.284180915518542, 4.618930498828308, 5.37420946049526, 6.093332127458466, 6.735186013630861, 6.95832956494507, 7.246011637396851, 0.0, 0.0, 0.0, 0.0))","Map(vectorType -> sparse, length -> 9690, indices -> List(126, 174, 401, 737, 1402, 1664, 2349), values -> List(4.284180915518542, 4.618930498828308, 5.37420946049526, 6.093332127458466, 6.735186013630861, 6.95832956494507, 7.246011637396851))",1.0,"Map(vectorType -> dense, length -> 2, values -> List(9.989804860077871, -9.989804860077871))","Map(vectorType -> dense, length -> 2, values -> List(0.9999541369476241, 4.586305237586075E-5))",0.0
abhishekrajaram blockchain artificial intelligence ke baare karte he badi badi foreign jake inse simple audit k,negative,"List(abhishekrajaram, blockchain, artificial, intelligence, ke, baare, karte, he, badi, badi, foreign, jake, inse, simple, audit, k)","List(abhishekrajaram, blockchain, artificial, intelligence, ke, baare, karte, badi, badi, foreign, jake, inse, simple, audit, k)","Map(vectorType -> sparse, length -> 9690, indices -> List(0, 2, 19, 753, 1530, 3791), values -> List(1.0, 1.0, 1.0, 1.0, 1.0, 1.0))","Map(vectorType -> sparse, length -> 9690, indices -> List(0, 2, 19, 753, 1530, 3791), values -> List(0.5021487797403018, 0.5727136696291976, 3.2788693327476257, 6.093332127458466, 6.840546529288687, 0.0))","Map(vectorType -> sparse, length -> 9690, indices -> List(0, 2, 19, 753, 1530), values -> List(0.5021487797403018, 0.5727136696291976, 3.2788693327476257, 6.093332127458466, 6.840546529288687))",0.0,"Map(vectorType -> dense, length -> 2, values -> List(8.366132940520423, -8.366132940520423))","Map(vectorType -> dense, length -> 2, values -> List(0.9997674407500288, 2.3255924997123145E-4))",0.0
adameshelton do you know dexgame there are metaverse nft and gamefi products the esports platform called,positive,"List(adameshelton, do, you, know, dexgame, there, are, metaverse, nft, and, gamefi, products, the, esports, platform, called)","List(adameshelton, know, dexgame, metaverse, nft, gamefi, products, esports, platform, called)","Map(vectorType -> sparse, length -> 9690, indices -> List(13, 20, 45, 102, 130, 134, 139, 160, 172, 3853), values -> List(1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0))","Map(vectorType -> sparse, length -> 9690, indices -> List(13, 20, 45, 102, 130, 134, 139, 160, 172, 3853), values -> List(3.13513777322354, 3.3541913392862246, 3.5655004329534314, 4.033824800679447, 4.310383287912566, 4.3743320125128395, 4.432600920636815, 4.549134736892767, 4.606954307781593, 0.0))","Map(vectorType -> sparse, length -> 9690, indices -> List(13, 20, 45, 102, 130, 134, 139, 160, 172), values -> List(3.13513777322354, 3.3541913392862246, 3.5655004329534314, 4.033824800679447, 4.310383287912566, 4.3743320125128395, 4.432600920636815, 4.549134736892767, 4.606954307781593))",1.0,"Map(vectorType -> dense, length -> 2, values -> List(-11.323972485315224, 11.323972485315224))","Map(vectorType -> dense, length -> 2, values -> List(1.2079695592351348E-5, 0.9999879203044076))",1.0
ai art needs a lot of serious debate and discussion and this is just the beginning,negative,"List(ai, art, needs, a, lot, of, serious, debate, and, discussion, and, this, is, just, the, beginning)","List(ai, art, needs, lot, serious, debate, discussion, beginning)","Map(vectorType -> sparse, length -> 9690, indices -> List(3, 5, 481, 794, 1778, 2262, 2558, 5141), values -> List(1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0))","Map(vectorType -> sparse, length -> 9690, indices -> List(3, 5, 481, 794, 1778, 2262, 2558, 5141), values -> List(1.4601372740221783, 2.506893478899021, 5.670475276638432, 6.147399348728742, 6.95832956494507, 7.246011637396851, 0.0, 0.0))","Map(vectorType -> sparse, length -> 9690, indices -> List(3, 5, 481, 794, 1778, 2262), values -> List(1.4601372740221783, 2.506893478899021, 5.670475276638432, 6.147399348728742, 6.95832956494507, 7.246011637396851))",0.0,"Map(vectorType -> dense, length -> 2, values -> List(10.036201248867096, -10.036201248867096))","Map(vectorType -> dense, length -> 2, values -> List(0.9999562161282682, 4.378387173176179E-5))",0.0
ai in its human form,positive,"List(ai, in, its, human, form)","List(ai, human, form)","Map(vectorType -> sparse, length -> 9690, indices -> List(3, 11, 309), values -> List(1.0, 1.0, 1.0))","Map(vectorType -> sparse, length -> 9690, indices -> List(3, 11, 309), values -> List(1.4601372740221783, 3.0690635466395406, 5.209129710135811))","Map(vectorType -> sparse, length -> 9690, indices -> List(3, 11, 309), values -> List(1.4601372740221783, 3.0690635466395406, 5.209129710135811))",1.0,"Map(vectorType -> dense, length -> 2, values -> List(3.7036394509280233, -3.7036394509280233))","Map(vectorType -> dense, length -> 2, values -> List(0.9759585208740148, 0.0240414791259852))",0.0
aiadvertising launches next generation investor website antonio business wire aiadver,positive,"List(aiadvertising, launches, next, generation, investor, website, antonio, business, wire, aiadver)","List(aiadvertising, launches, next, generation, investor, website, antonio, business, wire, aiadver)","Map(vectorType -> sparse, length -> 9690, indices -> List(135, 212, 510, 1232, 1890, 4178, 9640), values -> List(1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0))","Map(vectorType -> sparse, length -> 9690, indices -> List(135, 212, 510, 1232, 1890, 4178, 9640), values -> List(4.393380207483534, 4.833078487233941, 5.741934240620577, 6.639875833826536, 7.091860957569593, 0.0, 0.0))","Map(vectorType -> sparse, length -> 9690, indices -> List(135, 212, 510, 1232, 1890), values -> List(4.393380207483534, 4.833078487233941, 5.741934240620577, 6.639875833826536, 7.091860957569593))",1.0,"Map(vectorType -> dense, length -> 2, values -> List(-35.011402735551854, 35.011402735551854))","Map(vectorType -> dense, length -> 2, values -> List(6.23362953067061E-16, 0.9999999999999993))",1.0
akron entrepreneur aims artificial intelligence at cnc machining crain s cleveland business akron entrepreneur a,negative,"List(akron, entrepreneur, aims, artificial, intelligence, at, cnc, machining, crain, s, cleveland, business, akron, entrepreneur, a)","List(akron, entrepreneur, aims, artificial, intelligence, cnc, machining, crain, cleveland, business, akron, entrepreneur)","Map(vectorType -> sparse, length -> 9690, indices -> List(0, 2, 212, 260, 772, 3086, 5713, 6265, 6408, 6889), values -> List(1.0, 1.0, 1.0, 2.0, 1.0, 2.0, 1.0, 1.0, 1.0, 1.0))","Map(vectorType -> sparse, length -> 9690, indices -> List(0, 2, 212, 260, 772, 3086, 5713, 6265, 6408, 6889), values -> List(0.5021487797403018, 0.5727136696291976, 4.833078487233941, 9.989439677580712, 6.147399348728742, 0.0, 0.0, 0.0, 0.0, 0.0))","Map(vectorType -> sparse, length -> 9690, indices -> List(0, 2, 212, 260, 772), values -> List(0.5021487797403018, 0.5727136696291976, 4.833078487233941, 9.989439677580712, 6.147399348728742))",0.0,"Map(vectorType -> dense, length -> 2, values -> List(13.958812432086358, -13.958812432086358))","Map(vectorType -> dense, length -> 2, values -> List(0.9999991335082931, 8.66491706941197E-7))",0.0
an artificial intelligence ai artist from kerala named arun nura arun nura shares artificially generated artwo,negative,"List(an, artificial, intelligence, ai, artist, from, kerala, named, arun, nura, arun, nura, shares, artificially, generated, artwo)","List(artificial, intelligence, ai, artist, kerala, named, arun, nura, arun, nura, shares, artificially, generated, artwo)","Map(vectorType -> sparse, length -> 9690, indices -> List(0, 2, 3, 175, 585, 786, 2799, 2812, 4117, 4719, 6052), values -> List(1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 2.0, 2.0, 1.0))","Map(vectorType -> sparse, length -> 9690, indices -> List(0, 2, 3, 175, 585, 786, 2799, 2812, 4117, 4719, 6052), values -> List(0.5021487797403018, 0.5727136696291976, 1.4601372740221783, 4.618930498828308, 5.859717276276961, 6.147399348728742, 0.0, 0.0, 0.0, 0.0, 0.0))","Map(vectorType -> sparse, length -> 9690, indices -> List(0, 2, 3, 175, 585, 786), values -> List(0.5021487797403018, 0.5727136696291976, 1.4601372740221783, 4.618930498828308, 5.859717276276961, 6.147399348728742))",0.0,"Map(vectorType -> dense, length -> 2, values -> List(-3.0160901708219394, 3.0160901708219394))","Map(vectorType -> dense, length -> 2, values -> List(0.04670424347791239, 0.9532957565220876))",1.0
art generating ais are trained on datasets of millions of images scraped from the internet but the artists who cre,positive,"List(art, generating, ais, are, trained, on, datasets, of, millions, of, images, scraped, from, the, internet, but, the, artists, who, cre)","List(art, generating, ais, trained, datasets, millions, images, scraped, internet, artists, cre)","Map(vectorType -> sparse, length -> 9690, indices -> List(5, 81, 93, 185, 477, 1183, 2376, 2557, 3109, 3185), values -> List(1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0))","Map(vectorType -> sparse, length -> 9690, indices -> List(5, 81, 93, 185, 477, 1183, 2376, 2557, 3109, 3185), values -> List(2.506893478899021, 4.125116220888854, 3.9815253012765983, 4.720282993088596, 5.705566596449702, 6.552864456836906, 7.246011637396851, 0.0, 0.0, 0.0))","Map(vectorType -> sparse, length -> 9690, indices -> List(5, 81, 93, 185, 477, 1183, 2376), values -> List(2.506893478899021, 4.125116220888854, 3.9815253012765983, 4.720282993088596, 5.705566596449702, 6.552864456836906, 7.246011637396851))",1.0,"Map(vectorType -> dense, length -> 2, values -> List(-24.16788361473494, 24.16788361473494))","Map(vectorType -> dense, length -> 2, values -> List(3.191695908113488E-11, 0.9999999999680831))",1.0
artificial intelligence a bald eagle perched on a fence buildinpublic,negative,"List(artificial, intelligence, a, bald, eagle, perched, on, a, fence, buildinpublic)","List(artificial, intelligence, bald, eagle, perched, fence, buildinpublic)","Map(vectorType -> sparse, length -> 9690, indices -> List(0, 2, 8495), values -> List(1.0, 1.0, 1.0))","Map(vectorType -> sparse, length -> 9690, indices -> List(0, 2, 8495), values -> List(0.5021487797403018, 0.5727136696291976, 0.0))","Map(vectorType -> sparse, length -> 9690, indices -> List(0, 2), values -> List(0.5021487797403018, 0.5727136696291976))",0.0,"Map(vectorType -> dense, length -> 2, values -> List(9.937093565585156, -9.937093565585156))","Map(vectorType -> dense, length -> 2, values -> List(0.9999516547175681, 4.834528243191638E-5))",0.0


Save the prediction results to the project bucket

In [0]:
(predictions.write
 .parquet('/mnt/project_bucket/twitter_predictions.parquet')
)

# 2 NGrams - Logistic Regression

In [0]:
from pyspark.ml.feature import NGram, VectorAssembler, StopWordsRemover, HashingTF, IDF, Tokenizer, StringIndexer, NGram, ChiSqSelector, VectorAssembler
from pyspark.ml.classification import LogisticRegression
from pyspark.ml import Pipeline
from pyspark.ml.evaluation import BinaryClassificationEvaluator

# Use 80% cases for training, 20% cases for testing
train, test = df_sent[features].randomSplit([0.8, 0.2], seed=42)

# label
label_encoder= StringIndexer(inputCol = "sentiment", outputCol = "label")

# Create transformers for the ML pipeline
tokenizer = Tokenizer(inputCol="tweet", outputCol="tokens")
stopword_remover = StopWordsRemover(inputCol="tokens", outputCol="filtered")
cv = CountVectorizer(vocabSize=2**16, inputCol="filtered", outputCol='cv')
idf = IDF(inputCol='cv', outputCol="1gram_idf", minDocFreq=5) #minDocFreq: remove sparse terms

# Add NGram feature
ngram = NGram(n=3, inputCol="filtered", outputCol="2gram")
ngram_hashingtf = HashingTF(inputCol="2gram", outputCol="2gram_tf", numFeatures=20000)
ngram_idf = IDF(inputCol='2gram_tf', outputCol="2gram_idf", minDocFreq=5)

# Assemble all text features - this will combine the features we want to use

assembler = VectorAssembler(inputCols=["1gram_idf", "2gram_tf"], outputCol="features")

# assembler = VectorAssembler(inputCols=["1gram_idf", "3gram_tf"], outputCol="rawFeatures")
# Chi-square variable selection
# selector = ChiSqSelector(numTopFeatures=2**14,featuresCol='rawFeatures', outputCol="features")

# Regression model estimator
lr = LogisticRegression(maxIter=100)

# Build the pipeline

# with Chi-square Selection
# pipeline = Pipeline(stages=[label_encoder, tokenizer, stopword_remover, cv, idf, ngram, ngram_hashingtf, ngram_idf, assembler, selector, lr])

# without Chi-square Selection
pipeline = Pipeline(stages=[label_encoder, tokenizer, stopword_remover, cv, idf, ngram, ngram_hashingtf, ngram_idf, assembler, lr])

# Pipeline model fitting
pipeline_model = pipeline.fit(train)
predictions = pipeline_model.transform(test)

evaluator = BinaryClassificationEvaluator(rawPredictionCol="rawPrediction")
accuracy = predictions.filter(predictions.label == predictions.prediction).count() / float(test.count())
roc_auc = evaluator.evaluate(predictions)

print("Accuracy Score: {0:.4f}".format(accuracy))
print("ROC-AUC: {0:.4f}".format(roc_auc))

Accuracy Score: 0.9187
ROC-AUC: 0.9557


# 3 NGram - Logistic Regression

In [0]:
# Use 80% cases for training, 20% cases for testing
train, test = df_sent[features].randomSplit([0.8, 0.2], seed=42)

# label
label_encoder= StringIndexer(inputCol = "sentiment", outputCol = "label")

# Create transformers for the ML pipeline
tokenizer = Tokenizer(inputCol="tweet", outputCol="tokens")
stopword_remover = StopWordsRemover(inputCol="tokens", outputCol="filtered")
cv = CountVectorizer(vocabSize=2**16, inputCol="filtered", outputCol='cv')
idf = IDF(inputCol='cv', outputCol="1gram_idf", minDocFreq=5) #minDocFreq: remove sparse terms

# Add NGram feature
ngram = NGram(n=3, inputCol="filtered", outputCol="3gram")
ngram_hashingtf = HashingTF(inputCol="3gram", outputCol="3gram_tf", numFeatures=20000)
ngram_idf = IDF(inputCol='3gram_tf', outputCol="3gram_idf", minDocFreq=5)

# Assemble all text features - this will combine the features we want to use

assembler = VectorAssembler(inputCols=["1gram_idf", "3gram_tf"], outputCol="features")

# assembler = VectorAssembler(inputCols=["1gram_idf", "3gram_tf"], outputCol="rawFeatures")
# Chi-square variable selection
# selector = ChiSqSelector(numTopFeatures=2**14,featuresCol='rawFeatures', outputCol="features")

# Regression model estimator
lr = LogisticRegression(maxIter=100)

# Build the pipeline

# with Chi-square Selection
# pipeline = Pipeline(stages=[label_encoder, tokenizer, stopword_remover, cv, idf, ngram, ngram_hashingtf, ngram_idf, assembler, selector, lr])

# without Chi-square Selection
pipeline = Pipeline(stages=[label_encoder, tokenizer, stopword_remover, cv, idf, ngram, ngram_hashingtf, ngram_idf, assembler, lr])

# Pipeline model fitting
pipeline_model = pipeline.fit(train)
predictions = pipeline_model.transform(test)

evaluator = BinaryClassificationEvaluator(rawPredictionCol="rawPrediction")
accuracy = predictions.filter(predictions.label == predictions.prediction).count() / float(test.count())
roc_auc = evaluator.evaluate(predictions)

print("Accuracy Score: {0:.4f}".format(accuracy))
print("ROC-AUC: {0:.4f}".format(roc_auc))

Accuracy Score: 0.9187
ROC-AUC: 0.9557


# Try out some other models
- LinearSVC
- RandomForestClassifier
- GBTClassifier

In [0]:
from pyspark.ml.classification import LinearSVC, RandomForestClassifier, GBTClassifier

## LinearSVC

In [0]:
# Use 80% cases for training, 20% cases for testing
train, test = df_sent[features].randomSplit([0.8, 0.2], seed=42)

# Create transformers for the ML pipeline
tokenizer = Tokenizer(inputCol="tweet", outputCol="tokens")
stopword_remover = StopWordsRemover(inputCol="tokens", outputCol="filtered")
cv = CountVectorizer(vocabSize=2**16, inputCol="filtered", outputCol='cv')
idf = IDF(inputCol='cv', outputCol="1gram_idf", minDocFreq=5) #minDocFreq: remove sparse terms
assembler = VectorAssembler(inputCols=["1gram_idf"], outputCol="features")
label_encoder= StringIndexer(inputCol = "sentiment", outputCol = "label")
model = LinearSVC()

pipeline = Pipeline(stages=[tokenizer, stopword_remover, cv, idf, assembler, label_encoder, model])

pipeline_model = pipeline.fit(train)
predictions = pipeline_model.transform(test)

evaluator = BinaryClassificationEvaluator(rawPredictionCol="rawPrediction")
accuracy = predictions.filter(predictions.label == predictions.prediction).count() / float(test.count())
roc_auc = evaluator.evaluate(predictions)

print("Accuracy Score: {0:.4f}".format(accuracy))
print("ROC-AUC: {0:.4f}".format(roc_auc))




Accuracy Score: 0.9579
ROC-AUC: 0.9873


# RandomForestClassifier

In [0]:
# Use 80% cases for training, 20% cases for testing
train, test = df_sent[features].randomSplit([0.8, 0.2], seed=42)

# Create transformers for the ML pipeline
tokenizer = Tokenizer(inputCol="tweet", outputCol="tokens")
stopword_remover = StopWordsRemover(inputCol="tokens", outputCol="filtered")
cv = CountVectorizer(vocabSize=2**16, inputCol="filtered", outputCol='cv')
idf = IDF(inputCol='cv', outputCol="1gram_idf", minDocFreq=5) #minDocFreq: remove sparse terms
assembler = VectorAssembler(inputCols=["1gram_idf"], outputCol="features")
label_encoder= StringIndexer(inputCol = "sentiment", outputCol = "label")
model = RandomForestClassifier()

pipeline = Pipeline(stages=[tokenizer, stopword_remover, cv, idf, assembler, label_encoder, model])

pipeline_model = pipeline.fit(train)
predictions = pipeline_model.transform(test)

evaluator = BinaryClassificationEvaluator(rawPredictionCol="rawPrediction")
accuracy = predictions.filter(predictions.label == predictions.prediction).count() / float(test.count())
roc_auc = evaluator.evaluate(predictions)

print("Accuracy Score: {0:.4f}".format(accuracy))
print("ROC-AUC: {0:.4f}".format(roc_auc))

Accuracy Score: 0.8145
ROC-AUC: 0.9538


# GBTClassifier

In [0]:
# Use 80% cases for training, 20% cases for testing
train, test = df_sent[features].randomSplit([0.8, 0.2], seed=42)

# Create transformers for the ML pipeline
tokenizer = Tokenizer(inputCol="tweet", outputCol="tokens")
stopword_remover = StopWordsRemover(inputCol="tokens", outputCol="filtered")
cv = CountVectorizer(vocabSize=2**16, inputCol="filtered", outputCol='cv')
idf = IDF(inputCol='cv', outputCol="1gram_idf", minDocFreq=5) #minDocFreq: remove sparse terms
assembler = VectorAssembler(inputCols=["1gram_idf"], outputCol="features")
label_encoder= StringIndexer(inputCol = "sentiment", outputCol = "label")
model = GBTClassifier()

pipeline = Pipeline(stages=[tokenizer, stopword_remover, cv, idf, assembler, label_encoder, model])

pipeline_model = pipeline.fit(train)
predictions = pipeline_model.transform(test)

evaluator = BinaryClassificationEvaluator(rawPredictionCol="rawPrediction")
accuracy = predictions.filter(predictions.label == predictions.prediction).count() / float(test.count())
roc_auc = evaluator.evaluate(predictions)

print("Accuracy Score: {0:.4f}".format(accuracy))
print("ROC-AUC: {0:.4f}".format(roc_auc))


Accuracy Score: 0.9462
ROC-AUC: 0.9742


# Save Results

In [0]:
# Create my schema
model_results_schema = StructType([
    StructField('model_name', StringType(), True),
    StructField('accuracy', FloatType(), True),
    StructField('roc_auc', FloatType(), True),
    StructField('run_time', FloatType(), True)
])

model_results_df = spark.createDataFrame([], model_results_schema)

In [0]:
model_results_df.columns

Out[83]: ['model_name', 'accuracy', 'roc_auc', 'run_time']

In [0]:
lr_results = spark.createDataFrame([('logistic regression', 0.9227, 0.9592, 9.90)], model_results_schema)
lr2n_results = spark.createDataFrame([('logistic regression 2nGram', 0.9187, 0.9557, 11.07)], model_results_schema)
lr3n_results = spark.createDataFrame([('logistic regression 3nGram', 0.9187, 0.9557, 11.35)], model_results_schema)
lsvc_results = spark.createDataFrame([('linear svc', 0.9579, 0.9873, 19.25)], model_results_schema)
rfc_results = spark.createDataFrame([('random forest classifier', 0.8145, 0.9538, 1.56)], model_results_schema)
gbtc_results = spark.createDataFrame([('GBTClassifier', 0.9462, 0.9742, 10.53)], model_results_schema)

In [0]:
model_results_df = model_results_df.union(lr_results)
model_results_df = model_results_df.union(lr2n_results)
model_results_df = model_results_df.union(lr3n_results)
model_results_df = model_results_df.union(lsvc_results)
model_results_df = model_results_df.union(rfc_results)
model_results_df = model_results_df.union(gbtc_results)

In [0]:
model_results_df.show()

+--------------------+--------+-------+--------+
|          model_name|accuracy|roc_auc|run_time|
+--------------------+--------+-------+--------+
| logistic regression|  0.9227| 0.9592|     9.9|
|logistic regressi...|  0.9187| 0.9557|   11.07|
|logistic regressi...|  0.9187| 0.9557|   11.35|
|          linear svc|  0.9579| 0.9873|   19.25|
|random forest cla...|  0.8145| 0.9538|    1.56|
|       GBTClassifier|  0.9462| 0.9742|   10.53|
+--------------------+--------+-------+--------+



Write model results to project bucket

In [0]:
(model_results_df
 .write
 .option('header', 'false')
 .option('delimiter', '\t')
 .csv('/mnt/project_bucket/model_results.csv')
)