In [1]:
import pandas as pd
import numpy as np
import datetime as dt

import findspark
findspark.init()
import pyspark
from pyspark.sql import SparkSession
from pyspark.sql.functions import *#avg, count, expr
from pyspark.sql.types import *
from pyspark.ml.feature import StringIndexer, IndexToString, RegexTokenizer,\
    StopWordsRemover, Word2Vec, CountVectorizer, IDF
from pyspark.ml import Pipeline

pd.set_option('display.max_rows', None)
pd.set_option('display.max_columns', None)

In [2]:
# initialize
sc = pyspark.SparkContext()
spark = SparkSession(sc)
spark.sparkContext.appName = 'nlp'
# show the number of cores
print('%d cores'%spark._jsc.sc().getExecutorMemoryStatus().keySet().size())
spark

1 cores


In [3]:
# get the data
fil = '../data/kickstarter.csv'
schem = StructType([StructField('', IntegerType()), StructField('blurb', StringType()), StructField('state', StringType())])
kick = spark.read.format('csv').options(header=True).schema(schem).load(fil).withColumnRenamed('', 'id')

# talk
cnt = kick.count()
print('%d records'%cnt)
display(kick.limit(10).toPandas())

223627 records


Unnamed: 0,id,blurb,state
0,1,"Using their own character, users go on educati...",failed
1,2,"MicroFly is a quadcopter packed with WiFi, 6 s...",successful
2,3,"A small indie press, run as a collective for a...",failed
3,4,Zylor is a new baby cosplayer! Back this kicks...,failed
4,5,Hatoful Boyfriend meet Skeletons! A comedy Dat...,failed
5,6,FastMan is a Infinite running platformer. Go i...,failed
6,7,FADE. A dark and somber RPG about survival and...,failed
7,8,The next generation of space combat with onlin...,failed
8,9,Whip around planets and smash your way to vict...,failed
9,10,"Sneak in, find treasures, avoid cats and colle...",failed


In [4]:
''' handle missing values '''
# check for missing values
nullCounts = {colm:kick.select(colm).where(col(colm).isNull()).count() for colm in kick.columns}
nullCounts = {colm:(ncnt, ncnt/cnt) for (colm, ncnt) in nullCounts.items()}
nullCountsDF = pd.DataFrame(nullCounts).T.reset_index(drop=False).sort_values(1, ascending=False)
nullCountsDF.columns = ['Column', 'Freq.', 'Rel. Freq.']
nullCountsDF = nullCountsDF.merge(pd.DataFrame([[colm.name, colm.dataType] for colm in kick.schema], columns=['Column', 'Type']),
                                how='inner', on=['Column'])

# talk
display(nullCountsDF)

# remove
kick = kick.dropna(how='any')

# talk some more
print('%d records'%kick.count())

Unnamed: 0,Column,Freq.,Rel. Freq.,Type
0,state,13157.0,0.058835,StringType
1,id,8110.0,0.036266,IntegerType
2,blurb,1488.0,0.006654,StringType


209018 records


In [5]:
''' ensure state is only failed or successful '''
# count by state
kick.groupBy('state').count().orderBy(col('count').desc()).show()

# remove the bad rows
kick = kick.where(col('state').isin('failed', 'successful'))

# talk
print('%d records'%kick.count())

+--------------------+------+
|               state| count|
+--------------------+------+
|          successful|103192|
|              failed|101495|
| and get some col...|     8|
|          ","failed"|     6|
|     their childhood|     6|
| about a lonely f...|     5|
|                love|     5|
|              poetry|     4|
|            mastered|     4|
| solid surface on...|     3|
| She Wrote"" but ...|     3|
|                loss|     3|
|              2015."|     3|
|             romance|     3|
|              racism|     3|
|               music|     3|
| ""Tomorrow Comes...|     3|
|              2011."|     3|
|                NY."|     3|
|              2014."|     3|
+--------------------+------+
only showing top 20 rows

204687 records


In [6]:
''' get rid of
non-alphanumeric or whitespace chars
drop numbers if there is a space after
drop # if there is a space after
get rid of multiplied spaces
'''
kick = kick.select('id', 'state', regexp_replace(col('blurb'), '[^A-Za-z0-9\# ]', ' ').alias('blurb'))\
    .withColumn('blurb', regexp_replace(col('blurb'), '[0-9] ', ' '))\
    .withColumn('blurb', regexp_replace(col('blurb'), '\# ', ' '))\
    .withColumn('blurb', regexp_replace(col('blurb'), ' +', ' '))

# talk
kick.show(truncate=False)

+---+----------+-------------------------------------------------------------------------------------------------------------------------------------+
|id |state     |blurb                                                                                                                                |
+---+----------+-------------------------------------------------------------------------------------------------------------------------------------+
|1  |failed    |Using their own character users go on educational quests around a virtual world leveling up subject oriented skills ie Physics       |
|2  |successful|MicroFly is a quadcopter packed with WiFi sensors and processors for ultimate stability and fits in the palm of your hand            |
|3  |failed    |A small indie press run as a collective for authors who want to self publish and a sexy smart hilarious novel                        |
|4  |failed    |Zylor is a new baby cosplayer Back this kickstarter to help fund new cosplay p

In [7]:
# keep a copy of kick before any of this feature engineering
kick_orig = kick.select('*')

In [13]:
''' tokenize! '''
# split on whitespace & make it all lowercase - using space instead of '\\W' as the pattern to keep hashtags
toker = RegexTokenizer(inputCol='blurb', outputCol='words', pattern=' ', toLowercase=True)
kick = toker.transform(kick).select('id', 'state', 'words')

# talk
kick.show(truncate=False)

+---+----------+---------------------------------------------------------------------------------------------------------------------------------------------------------------+
|id |state     |words                                                                                                                                                          |
+---+----------+---------------------------------------------------------------------------------------------------------------------------------------------------------------+
|1  |failed    |[using, their, own, character, users, go, on, educational, quests, around, a, virtual, world, leveling, up, subject, oriented, skills, ie, physics]            |
|2  |successful|[microfly, is, a, quadcopter, packed, with, wifi, sensors, and, processors, for, ultimate, stability, and, fits, in, the, palm, of, your, hand]                |
|3  |failed    |[a, small, indie, press, run, as, a, collective, for, authors, who, want, to, self, publish, and, a

In [14]:
''' remove stop words '''
# remove
stop = StopWordsRemover(inputCol='words', outputCol='fewer_words')
kick = stop.transform(kick)

# talk
print('Removed = %s'%stop.getStopWords())
kick.show(truncate=False)

Removed = ['i', 'me', 'my', 'myself', 'we', 'our', 'ours', 'ourselves', 'you', 'your', 'yours', 'yourself', 'yourselves', 'he', 'him', 'his', 'himself', 'she', 'her', 'hers', 'herself', 'it', 'its', 'itself', 'they', 'them', 'their', 'theirs', 'themselves', 'what', 'which', 'who', 'whom', 'this', 'that', 'these', 'those', 'am', 'is', 'are', 'was', 'were', 'be', 'been', 'being', 'have', 'has', 'had', 'having', 'do', 'does', 'did', 'doing', 'a', 'an', 'the', 'and', 'but', 'if', 'or', 'because', 'as', 'until', 'while', 'of', 'at', 'by', 'for', 'with', 'about', 'against', 'between', 'into', 'through', 'during', 'before', 'after', 'above', 'below', 'to', 'from', 'up', 'down', 'in', 'out', 'on', 'off', 'over', 'under', 'again', 'further', 'then', 'once', 'here', 'there', 'when', 'where', 'why', 'how', 'all', 'any', 'both', 'each', 'few', 'more', 'most', 'other', 'some', 'such', 'no', 'nor', 'not', 'only', 'own', 'same', 'so', 'than', 'too', 'very', 's', 't', 'can', 'will', 'just', 'don', 'sh

In [15]:
''' prepare and assess labels '''
# make the label column
indxr = StringIndexer(inputCol='state', outputCol='label', stringOrderType='frequencyAsc')
kick = indxr.fit(kick).transform(kick)

# check balance
kick.groupBy('label').count().show()

+-----+------+
|label| count|
+-----+------+
|  0.0|101495|
|  1.0|103192|
+-----+------+



## Now, instead of all this piecemeal stuff, how about using a pipeline for some of it?
If something in a pipeline needs a fit, that should go at the end.

In [17]:
''' all feature engineering stuff '''
# tokenizer
toker = RegexTokenizer(inputCol='blurb', outputCol='words', pattern=' ', toLowercase=True)
# stopper
stop = StopWordsRemover(inputCol=toker.getOutputCol(), outputCol='fewer_words')
# label indexer
indxr = StringIndexer(inputCol='state', outputCol='label', stringOrderType='frequencyAsc')

# pipeline
featEngine = Pipeline(stages=[toker, stop, indxr]).fit(kick_orig)
kickML = featEngine.transform(kick_orig).select('id', 'state', 'label', 'fewer_words')

# talk
kickML.show(truncate=False)
# check balance
kickML.groupBy('label').count().show()

+---+----------+-----+----------------------------------------------------------------------------------------------------------------------------+
|id |state     |label|fewer_words                                                                                                                 |
+---+----------+-----+----------------------------------------------------------------------------------------------------------------------------+
|1  |failed    |0.0  |[using, character, users, go, educational, quests, around, virtual, world, leveling, subject, oriented, skills, ie, physics]|
|2  |successful|1.0  |[microfly, quadcopter, packed, wifi, sensors, processors, ultimate, stability, fits, palm, hand]                            |
|3  |failed    |0.0  |[small, indie, press, run, collective, authors, want, self, publish, sexy, smart, hilarious, novel]                         |
|4  |failed    |0.0  |[zylor, new, baby, cosplayer, back, kickstarter, help, fund, new, cosplay, photoshoots, sh

In [None]:
sc.stop()