In [1]:
import pyspark
from pyspark.sql import SparkSession

In [2]:
spark = SparkSession.builder.appName('SpamDetector').getOrCreate()

In [23]:
data = spark.read.option('header', 'true').csv('spam.csv', inferSchema=True)
sms = data.select(data.columns[:2])

In [24]:
sms.printSchema()

root
 |-- v1: string (nullable = true)
 |-- v2: string (nullable = true)



In [25]:
sms.show(3)

+----+--------------------+
|  v1|                  v2|
+----+--------------------+
| ham|Go until jurong p...|
| ham|Ok lar... Joking ...|
|spam|Free entry in 2 a...|
+----+--------------------+
only showing top 3 rows



In [26]:
sms = sms.withColumnRenamed("v1","label").withColumnRenamed("v2","text")
sms.show(2)

+-----+--------------------+
|label|                text|
+-----+--------------------+
|  ham|Go until jurong p...|
|  ham|Ok lar... Joking ...|
+-----+--------------------+
only showing top 2 rows



In [36]:
# Import the necessary functions
from pyspark.sql.functions import regexp_replace
from pyspark.ml.feature import Tokenizer

# Remove punctuation (REGEX provided) and numbers
wrangled = sms.withColumn('text', regexp_replace(sms.text, '[_():;,.!?\\-]', ' '))
wrangled = wrangled.withColumn('text', regexp_replace(wrangled.text, '[0-9]', ' '))

# Merge multiple spaces
wrangled = wrangled.withColumn('text', regexp_replace(wrangled.text, ' +', ' '))

# Split the text into words
wrangled = Tokenizer(inputCol='text', outputCol='words').transform(wrangled)

wrangled.show(1, truncate=True)

+-----+--------------------+--------------------+
|label|                text|               words|
+-----+--------------------+--------------------+
|  ham|Go until jurong p...|[go, until, juron...|
+-----+--------------------+--------------------+
only showing top 1 row



In [37]:
from pyspark.ml.feature import StopWordsRemover, HashingTF, IDF

# Remove stop words.
wrangled = StopWordsRemover(inputCol='words', outputCol='terms').transform(wrangled)

# Apply the hashing trick
wrangled = HashingTF(inputCol='terms', outputCol='hash', numFeatures=1024).transform(wrangled)

In [38]:
wrangled.show(3)

+-----+--------------------+--------------------+--------------------+--------------------+
|label|                text|               words|               terms|                hash|
+-----+--------------------+--------------------+--------------------+--------------------+
|  ham|Go until jurong p...|[go, until, juron...|[go, jurong, poin...|(1024,[12,171,191...|
|  ham|Ok lar Joking wif...|[ok, lar, joking,...|[ok, lar, joking,...|(1024,[3,493,565,...|
| spam|Free entry in a w...|[free, entry, in,...|[free, entry, wkl...|(1024,[16,24,35,5...|
+-----+--------------------+--------------------+--------------------+--------------------+
only showing top 3 rows



In [41]:
type(wrangled.hash)

pyspark.sql.column.Column

In [40]:
# Convert hashed symbols to TF-IDF
tf_idf = IDF(inputCol='hash', outputCol='features').fit(wrangled).transform(wrangled)
      
#tf_idf.select('terms', 'features').show(4, truncate=False)

Py4JJavaError: An error occurred while calling o468.fit.
: org.apache.spark.SparkException: Job aborted due to stage failure: Task 0 in stage 31.0 failed 1 times, most recent failure: Lost task 0.0 in stage 31.0 (TID 31) (172.20.10.30 executor driver): org.apache.spark.SparkException: Failed to execute user defined function (Tokenizer$$Lambda$2941/0x00000008018a1ba8: (string) => array<string>)
	at org.apache.spark.sql.errors.QueryExecutionErrors$.failedExecuteUserDefinedFunctionError(QueryExecutionErrors.scala:136)
	at org.apache.spark.sql.errors.QueryExecutionErrors.failedExecuteUserDefinedFunctionError(QueryExecutionErrors.scala)
	at org.apache.spark.sql.catalyst.expressions.GeneratedClass$GeneratedIteratorForCodegenStage1.processNext(Unknown Source)
	at org.apache.spark.sql.execution.BufferedRowIterator.hasNext(BufferedRowIterator.java:43)
	at org.apache.spark.sql.execution.WholeStageCodegenExec$$anon$1.hasNext(WholeStageCodegenExec.scala:759)
	at scala.collection.Iterator$$anon$10.hasNext(Iterator.scala:460)
	at scala.collection.Iterator$$anon$10.hasNext(Iterator.scala:460)
	at scala.collection.Iterator$$anon$10.hasNext(Iterator.scala:460)
	at scala.collection.Iterator.foreach(Iterator.scala:943)
	at scala.collection.Iterator.foreach$(Iterator.scala:943)
	at scala.collection.AbstractIterator.foreach(Iterator.scala:1431)
	at scala.collection.TraversableOnce.foldLeft(TraversableOnce.scala:199)
	at scala.collection.TraversableOnce.foldLeft$(TraversableOnce.scala:192)
	at scala.collection.AbstractIterator.foldLeft(Iterator.scala:1431)
	at scala.collection.TraversableOnce.aggregate(TraversableOnce.scala:260)
	at scala.collection.TraversableOnce.aggregate$(TraversableOnce.scala:260)
	at scala.collection.AbstractIterator.aggregate(Iterator.scala:1431)
	at org.apache.spark.rdd.RDD.$anonfun$treeAggregate$3(RDD.scala:1230)
	at org.apache.spark.rdd.RDD.$anonfun$treeAggregate$5(RDD.scala:1231)
	at org.apache.spark.rdd.RDD.$anonfun$mapPartitions$2(RDD.scala:863)
	at org.apache.spark.rdd.RDD.$anonfun$mapPartitions$2$adapted(RDD.scala:863)
	at org.apache.spark.rdd.MapPartitionsRDD.compute(MapPartitionsRDD.scala:52)
	at org.apache.spark.rdd.RDD.computeOrReadCheckpoint(RDD.scala:373)
	at org.apache.spark.rdd.RDD.iterator(RDD.scala:337)
	at org.apache.spark.scheduler.ResultTask.runTask(ResultTask.scala:90)
	at org.apache.spark.scheduler.Task.run(Task.scala:131)
	at org.apache.spark.executor.Executor$TaskRunner.$anonfun$run$3(Executor.scala:506)
	at org.apache.spark.util.Utils$.tryWithSafeFinally(Utils.scala:1462)
	at org.apache.spark.executor.Executor$TaskRunner.run(Executor.scala:509)
	at java.base/java.util.concurrent.ThreadPoolExecutor.runWorker(ThreadPoolExecutor.java:1130)
	at java.base/java.util.concurrent.ThreadPoolExecutor$Worker.run(ThreadPoolExecutor.java:630)
	at java.base/java.lang.Thread.run(Thread.java:832)
Caused by: java.lang.NullPointerException: Cannot invoke "String.toLowerCase()" because "x$1" is null
	at org.apache.spark.ml.feature.Tokenizer.$anonfun$createTransformFunc$1(Tokenizer.scala:40)
	... 30 more

Driver stacktrace:
	at org.apache.spark.scheduler.DAGScheduler.failJobAndIndependentStages(DAGScheduler.scala:2403)
	at org.apache.spark.scheduler.DAGScheduler.$anonfun$abortStage$2(DAGScheduler.scala:2352)
	at org.apache.spark.scheduler.DAGScheduler.$anonfun$abortStage$2$adapted(DAGScheduler.scala:2351)
	at scala.collection.mutable.ResizableArray.foreach(ResizableArray.scala:62)
	at scala.collection.mutable.ResizableArray.foreach$(ResizableArray.scala:55)
	at scala.collection.mutable.ArrayBuffer.foreach(ArrayBuffer.scala:49)
	at org.apache.spark.scheduler.DAGScheduler.abortStage(DAGScheduler.scala:2351)
	at org.apache.spark.scheduler.DAGScheduler.$anonfun$handleTaskSetFailed$1(DAGScheduler.scala:1109)
	at org.apache.spark.scheduler.DAGScheduler.$anonfun$handleTaskSetFailed$1$adapted(DAGScheduler.scala:1109)
	at scala.Option.foreach(Option.scala:407)
	at org.apache.spark.scheduler.DAGScheduler.handleTaskSetFailed(DAGScheduler.scala:1109)
	at org.apache.spark.scheduler.DAGSchedulerEventProcessLoop.doOnReceive(DAGScheduler.scala:2591)
	at org.apache.spark.scheduler.DAGSchedulerEventProcessLoop.onReceive(DAGScheduler.scala:2533)
	at org.apache.spark.scheduler.DAGSchedulerEventProcessLoop.onReceive(DAGScheduler.scala:2522)
	at org.apache.spark.util.EventLoop$$anon$1.run(EventLoop.scala:49)
	at org.apache.spark.scheduler.DAGScheduler.runJob(DAGScheduler.scala:898)
	at org.apache.spark.SparkContext.runJob(SparkContext.scala:2214)
	at org.apache.spark.SparkContext.runJob(SparkContext.scala:2309)
	at org.apache.spark.rdd.RDD.$anonfun$fold$1(RDD.scala:1183)
	at org.apache.spark.rdd.RDDOperationScope$.withScope(RDDOperationScope.scala:151)
	at org.apache.spark.rdd.RDDOperationScope$.withScope(RDDOperationScope.scala:112)
	at org.apache.spark.rdd.RDD.withScope(RDD.scala:414)
	at org.apache.spark.rdd.RDD.fold(RDD.scala:1177)
	at org.apache.spark.rdd.RDD.$anonfun$treeAggregate$1(RDD.scala:1246)
	at org.apache.spark.rdd.RDDOperationScope$.withScope(RDDOperationScope.scala:151)
	at org.apache.spark.rdd.RDDOperationScope$.withScope(RDDOperationScope.scala:112)
	at org.apache.spark.rdd.RDD.withScope(RDD.scala:414)
	at org.apache.spark.rdd.RDD.treeAggregate(RDD.scala:1222)
	at org.apache.spark.mllib.feature.IDF.fit(IDF.scala:55)
	at org.apache.spark.ml.feature.IDF.fit(IDF.scala:93)
	at java.base/jdk.internal.reflect.NativeMethodAccessorImpl.invoke0(Native Method)
	at java.base/jdk.internal.reflect.NativeMethodAccessorImpl.invoke(NativeMethodAccessorImpl.java:64)
	at java.base/jdk.internal.reflect.DelegatingMethodAccessorImpl.invoke(DelegatingMethodAccessorImpl.java:43)
	at java.base/java.lang.reflect.Method.invoke(Method.java:564)
	at py4j.reflection.MethodInvoker.invoke(MethodInvoker.java:244)
	at py4j.reflection.ReflectionEngine.invoke(ReflectionEngine.java:357)
	at py4j.Gateway.invoke(Gateway.java:282)
	at py4j.commands.AbstractCommand.invokeMethod(AbstractCommand.java:132)
	at py4j.commands.CallCommand.execute(CallCommand.java:79)
	at py4j.ClientServerConnection.waitForCommands(ClientServerConnection.java:182)
	at py4j.ClientServerConnection.run(ClientServerConnection.java:106)
	at java.base/java.lang.Thread.run(Thread.java:832)
Caused by: org.apache.spark.SparkException: Failed to execute user defined function (Tokenizer$$Lambda$2941/0x00000008018a1ba8: (string) => array<string>)
	at org.apache.spark.sql.errors.QueryExecutionErrors$.failedExecuteUserDefinedFunctionError(QueryExecutionErrors.scala:136)
	at org.apache.spark.sql.errors.QueryExecutionErrors.failedExecuteUserDefinedFunctionError(QueryExecutionErrors.scala)
	at org.apache.spark.sql.catalyst.expressions.GeneratedClass$GeneratedIteratorForCodegenStage1.processNext(Unknown Source)
	at org.apache.spark.sql.execution.BufferedRowIterator.hasNext(BufferedRowIterator.java:43)
	at org.apache.spark.sql.execution.WholeStageCodegenExec$$anon$1.hasNext(WholeStageCodegenExec.scala:759)
	at scala.collection.Iterator$$anon$10.hasNext(Iterator.scala:460)
	at scala.collection.Iterator$$anon$10.hasNext(Iterator.scala:460)
	at scala.collection.Iterator$$anon$10.hasNext(Iterator.scala:460)
	at scala.collection.Iterator.foreach(Iterator.scala:943)
	at scala.collection.Iterator.foreach$(Iterator.scala:943)
	at scala.collection.AbstractIterator.foreach(Iterator.scala:1431)
	at scala.collection.TraversableOnce.foldLeft(TraversableOnce.scala:199)
	at scala.collection.TraversableOnce.foldLeft$(TraversableOnce.scala:192)
	at scala.collection.AbstractIterator.foldLeft(Iterator.scala:1431)
	at scala.collection.TraversableOnce.aggregate(TraversableOnce.scala:260)
	at scala.collection.TraversableOnce.aggregate$(TraversableOnce.scala:260)
	at scala.collection.AbstractIterator.aggregate(Iterator.scala:1431)
	at org.apache.spark.rdd.RDD.$anonfun$treeAggregate$3(RDD.scala:1230)
	at org.apache.spark.rdd.RDD.$anonfun$treeAggregate$5(RDD.scala:1231)
	at org.apache.spark.rdd.RDD.$anonfun$mapPartitions$2(RDD.scala:863)
	at org.apache.spark.rdd.RDD.$anonfun$mapPartitions$2$adapted(RDD.scala:863)
	at org.apache.spark.rdd.MapPartitionsRDD.compute(MapPartitionsRDD.scala:52)
	at org.apache.spark.rdd.RDD.computeOrReadCheckpoint(RDD.scala:373)
	at org.apache.spark.rdd.RDD.iterator(RDD.scala:337)
	at org.apache.spark.scheduler.ResultTask.runTask(ResultTask.scala:90)
	at org.apache.spark.scheduler.Task.run(Task.scala:131)
	at org.apache.spark.executor.Executor$TaskRunner.$anonfun$run$3(Executor.scala:506)
	at org.apache.spark.util.Utils$.tryWithSafeFinally(Utils.scala:1462)
	at org.apache.spark.executor.Executor$TaskRunner.run(Executor.scala:509)
	at java.base/java.util.concurrent.ThreadPoolExecutor.runWorker(ThreadPoolExecutor.java:1130)
	at java.base/java.util.concurrent.ThreadPoolExecutor$Worker.run(ThreadPoolExecutor.java:630)
	... 1 more
Caused by: java.lang.NullPointerException
	at org.apache.spark.ml.feature.Tokenizer.$anonfun$createTransformFunc$1(Tokenizer.scala:40)
	... 30 more


In [None]:
# Split the data into training and testing sets
sms_train, sms_test = sms.randomSplit([0.8, 0.2], seed=13)

# Fit a Logistic Regression model to the training data
logistic = LogisticRegression(regParam=0.2).fit(sms_train)

# Make predictions on the testing data
prediction = logistic.transform(sms_test)                      

# Create a confusion matrix, comparing predictions to known labels
prediction.groupBy('label', 'prediction').count().show()

# Accuracy=(TN+TP)/(TN+TP+FN+FP)—proportionofcorrectpredictions.

# Pipeline

In [None]:
from pyspark.ml.feature import Tokenizer, StopWordsRemover, HashingTF, IDF

# Break text into tokens at non-word characters
tokenizer = Tokenizer(inputCol='text', outputCol='words')

# Remove stop words
remover = StopWordsRemover(inputCol='words', outputCol='terms')

# Apply the hashing trick and transform to TF-IDF
hasher = HashingTF(inputCol="terms", outputCol="hash")
idf = IDF(inputCol="hash", outputCol="features")

# Create a logistic regression object and add everything to a pipeline
logistic = LogisticRegression()
pipeline = Pipeline(stages=[tokenizer, remover, hasher, idf, logistic])

## regex and NLTK examples

In [1]:
tweets = ['This is the best #nlp exercise ive found online! #python',
 '#NLP is super fun! <3 #learning',
 'Thanks @datacamp :) #nlp #python']

In [2]:
# Tokenizing list of Tweets:
# Import the necessary modules
from nltk.tokenize import regexp_tokenize
from nltk.tokenize import TweetTokenizer
# Define a regex pattern to find hashtags: pattern1
pattern1 = r"#\w+"
# Use the pattern on the first tweet in the tweets list
hashtags = regexp_tokenize(tweets[0], pattern1)
print(hashtags)

['#nlp', '#python']


In [8]:
hashtags = regexp_tokenize(tweets[2], pattern1)
print(hashtags)

['#nlp', '#python']


In [10]:
#Finding `mentions` symbol in the tweets 
#source: https://stackoverflow.com/questions/7150652/regex-valid-twitter-mention
from nltk.tokenize import regexp_tokenize
from nltk.tokenize import TweetTokenizer
# Write a pattern that matches both mentions (@) and hashtags
pattern2 = r"([@#]\w+)"
# Use the pattern on the last tweet in the tweets list
mentions_hashtags = regexp_tokenize(tweets[-1], pattern2)
print(mentions_hashtags)

['@datacamp', '#nlp', '#python']


In [11]:
# Using list comprehension to tokenize the entire list of tweets, 
# instanciate an object for this:
from nltk.tokenize import regexp_tokenize
from nltk.tokenize import TweetTokenizer
# Use the TweetTokenizer to tokenize all tweets into one list
tknzr = TweetTokenizer()
all_tokens = [tknzr.tokenize(t) for t in tweets]
print(all_tokens)

[['This', 'is', 'the', 'best', '#nlp', 'exercise', 'ive', 'found', 'online', '!', '#python'], ['#NLP', 'is', 'super', 'fun', '!', '<3', '#learning'], ['Thanks', '@datacamp', ':)', '#nlp', '#python']]


In [12]:
# Text preprocessing:
lower_tokens=["'",
 "''",
 'debugging',
 "''",
 "'",
 'is',
 'the',
 'process',
 'of',
 'finding',
 'and',
 'resolving',
 'of',
 'defects',
 'that',
 'prevent',
 'correct',
 'operation',
 'of',
 'computer',
 'software',
 'or',
 'a',
 'system',
 '.',
 'numerous',
 'books',
 'have',
 'been',
 'written',
 'about',
 'debugging',
 '(',
 'see',
 'below',
 ':',
 '#',
 'further',
 'reading|further',
 'reading',
 ')',
 ',',
 'as',
 'it',
 'involves',
 'numerous',
 'aspects',
 ',',
 'including',
 'interactive',
 'debugging',
 ',',
 'control',
 'flow',
 ',',
 'integration',
 'testing',
 ',',
 'logfile|log',
 'files',
 ',',
 'monitoring',
 '(',
 'application',
 'monitoring|application',
 ',',
 'system',
 'monitoring|system',
 ')',
 ',',
 'memory',
 'dumps',
 ',',
 'profiling',
 '(',
 'computer',
 'programming',
 ')',
 '|profiling',
 ',',
 'statistical',
 'process',
 'control',
 ',',
 'and',
 'special',
 'design',
 'tactics',
 'to',
 'improve',
 'detection',
 'while',
 'simplifying',
 'changes',
 '.',
 'origin',
 'a',
 'computer',
 'log',
 'entry',
 'from',
 'the',
 'mark',
 '&',
 'nbsp',
 ';',
 'ii',
 ',',
 'with',
 'a',
 'moth',
 'taped',
 'to',
 'the',
 'page',
 'the',
 'terms',
 '``',
 'bug',
 "''",
 'and',
 '``',
 'debugging',
 "''",
 'are',
 'popularly',
 'attributed',
 'to',
 'admiral',
 'grace',
 'hopper',
 'in',
 'the',
 '1940s',
 '.',
 '[',
 'http',
 ':',
 '//foldoc.org/grace+hopper',
 'grace',
 'hopper',
 ']',
 'from',
 'foldoc',
 'while',
 'she',
 'was',
 'working',
 'on',
 'a',
 'harvard',
 'mark',
 'ii|mark',
 'ii',
 'computer',
 'at',
 'harvard',
 'university',
 ',',
 'her',
 'associates',
 'discovered',
 'a',
 'moth',
 'stuck',
 'in',
 'a',
 'relay',
 'and',
 'thereby',
 'impeding',
 'operation',
 ',',
 'whereupon',
 'she',
 'remarked',
 'that',
 'they',
 'were',
 '``',
 'debugging',
 "''",
 'the',
 'system',
 '.',
 'however',
 'the',
 'term',
 '``',
 'bug',
 "''",
 'in',
 'the',
 'meaning',
 'of',
 'technical',
 'error',
 'dates',
 'back',
 'at',
 'least',
 'to',
 '1878',
 'and',
 'thomas',
 'edison',
 '(',
 'see',
 'software',
 'bug',
 'for',
 'a',
 'full',
 'discussion',
 ')',
 ',',
 'and',
 '``',
 'debugging',
 "''",
 'seems',
 'to',
 'have',
 'been',
 'used',
 'as',
 'a',
 'term',
 'in',
 'aeronautics',
 'before',
 'entering',
 'the',
 'world',
 'of',
 'computers',
 '.',
 'indeed',
 ',',
 'in',
 'an',
 'interview',
 'grace',
 'hopper',
 'remarked',
 'that',
 'she',
 'was',
 'not',
 'coining',
 'the',
 'term',
 '{',
 '{',
 'citation',
 'needed|date=july',
 '2015',
 '}',
 '}',
 '.',
 'the',
 'moth',
 'fit',
 'the',
 'already',
 'existing',
 'terminology',
 ',',
 'so',
 'it',
 'was',
 'saved',
 '.',
 'a',
 'letter',
 'from',
 'j.',
 'robert',
 'oppenheimer',
 '(',
 'director',
 'of',
 'the',
 'wwii',
 'atomic',
 'bomb',
 '``',
 'manhattan',
 "''",
 'project',
 'at',
 'los',
 'alamos',
 ',',
 'nm',
 ')',
 'used',
 'the',
 'term',
 'in',
 'a',
 'letter',
 'to',
 'dr.',
 'ernest',
 'lawrence',
 'at',
 'uc',
 'berkeley',
 ',',
 'dated',
 'october',
 '27',
 ',',
 '1944',
 ',',
 'http',
 ':',
 '//bancroft.berkeley.edu/exhibits/physics/images/bigscience25.jpg',
 'regarding',
 'the',
 'recruitment',
 'of',
 'additional',
 'technical',
 'staff',
 '.',
 'the',
 'oxford',
 'english',
 'dictionary',
 'entry',
 'for',
 '``',
 'debug',
 "''",
 'quotes',
 'the',
 'term',
 '``',
 'debugging',
 "''",
 'used',
 'in',
 'reference',
 'to',
 'airplane',
 'engine',
 'testing',
 'in',
 'a',
 '1945',
 'article',
 'in',
 'the',
 'journal',
 'of',
 'the',
 'royal',
 'aeronautical',
 'society',
 '.',
 'an',
 'article',
 'in',
 '``',
 'airforce',
 "''",
 '(',
 'june',
 '1945',
 'p.',
 '&',
 'nbsp',
 ';',
 '50',
 ')',
 'also',
 'refers',
 'to',
 'debugging',
 ',',
 'this',
 'time',
 'of',
 'aircraft',
 'cameras',
 '.',
 'hopper',
 "'s",
 'computer',
 'bug|bug',
 'was',
 'found',
 'on',
 'september',
 '9',
 ',',
 '1947',
 '.',
 'the',
 'term',
 'was',
 'not',
 'adopted',
 'by',
 'computer',
 'programmers',
 'until',
 'the',
 'early',
 '1950s',
 '.',
 'the',
 'seminal',
 'article',
 'by',
 'gills',
 '.',
 'gill',
 ',',
 '[',
 'http',
 ':',
 '//www.jstor.org/stable/98663',
 'the',
 'diagnosis',
 'of',
 'mistakes',
 'in',
 'programmes',
 'on',
 'the',
 'edsac',
 ']',
 ',',
 'proceedings',
 'of',
 'the',
 'royal',
 'society',
 'of',
 'london',
 '.',
 'series',
 'a',
 ',',
 'mathematical',
 'and',
 'physical',
 'sciences',
 ',',
 'vol',
 '.',
 '206',
 ',',
 'no',
 '.',
 '1087',
 '(',
 'may',
 '22',
 ',',
 '1951',
 ')',
 ',',
 'pp',
 '.',
 '538-554',
 'in',
 '1951',
 'is',
 'the',
 'earliest',
 'in-depth',
 'discussion',
 'of',
 'programming',
 'errors',
 ',',
 'but',
 'it',
 'does',
 'not',
 'use',
 'the',
 'term',
 '``',
 'bug',
 "''",
 'or',
 '``',
 'debugging',
 "''",
 '.',
 'in',
 'the',
 'association',
 'for',
 'computing',
 'machinery|acm',
 "'s",
 'digital',
 'library',
 ',',
 'the',
 'term',
 '``',
 'debugging',
 "''",
 'is',
 'first',
 'used',
 'in',
 'three',
 'papers',
 'from',
 '1952',
 'acm',
 'national',
 'meetings.robert',
 'v.',
 'd.',
 'campbell',
 ',',
 '[',
 'http',
 ':',
 '//portal.acm.org/citation.cfm',
 '?',
 'id=609784.609786',
 'evolution',
 'of',
 'automatic',
 'computation',
 ']',
 ',',
 'proceedings',
 'of',
 'the',
 '1952',
 'acm',
 'national',
 'meeting',
 '(',
 'pittsburgh',
 ')',
 ',',
 'p',
 '29-32',
 ',',
 '1952.alex',
 'orden',
 ',',
 '[',
 'http',
 ':',
 '//portal.acm.org/citation.cfm',
 '?',
 'id=609784.609793',
 'solution',
 'of',
 'systems',
 'of',
 'linear',
 'inequalities',
 'on',
 'a',
 'digital',
 'computer',
 ']',
 ',',
 'proceedings',
 'of',
 'the',
 '1952',
 'acm',
 'national',
 'meeting',
 '(',
 'pittsburgh',
 ')',
 ',',
 'p.',
 '91-95',
 ',',
 '1952.howard',
 'b.',
 'demuth',
 ',',
 'john',
 'b.',
 'jackson',
 ',',
 'edmund',
 'klein',
 ',',
 'n.',
 'metropolis',
 ',',
 'walter',
 'orvedahl',
 ',',
 'james',
 'h.',
 'richardson',
 ',',
 '[',
 'http',
 ':',
 '//portal.acm.org/citation.cfm',
 '?',
 'id=800259.808982',
 'maniac',
 ']',
 ',',
 'proceedings',
 'of',
 'the',
 '1952',
 'acm',
 'national',
 'meeting',
 '(',
 'toronto',
 ')',
 ',',
 'p.',
 '13-16',
 'two',
 'of',
 'the',
 'three',
 'use',
 'the',
 'term',
 'in',
 'quotation',
 'marks',
 '.',
 'by',
 '1963',
 '``',
 'debugging',
 "''",
 'was',
 'a',
 'common',
 'enough',
 'term',
 'to',
 'be',
 'mentioned',
 'in',
 'passing',
 'without',
 'explanation',
 'on',
 'page',
 '1',
 'of',
 'the',
 'compatible',
 'time-sharing',
 'system|ctss',
 'manual',
 '.',
 '[',
 'http',
 ':',
 '//www.bitsavers.org/pdf/mit/ctss/ctss_programmersguide.pdf',
 'the',
 'compatible',
 'time-sharing',
 'system',
 ']',
 ',',
 'm.i.t',
 '.',
 'press',
 ',',
 '1963',
 'kidwell',
 "'s",
 'article',
 "''stalking",
 'the',
 'elusive',
 'computer',
 'bug',
 "''",
 'peggy',
 'aldrich',
 'kidwell',
 ',',
 '[',
 'http',
 ':',
 '//ieeexplore.ieee.org/xpl/freeabs_all.jsp',
 '?',
 'tp=',
 '&',
 'arnumber=728224',
 '&',
 'isnumber=15706',
 'stalking',
 'the',
 'elusive',
 'computer',
 'bug',
 ']',
 ',',
 'ieee',
 'annals',
 'of',
 'the',
 'history',
 'of',
 'computing',
 ',',
 '1998.',
 'discusses',
 'the',
 'etymology',
 'of',
 '``',
 'bug',
 "''",
 'and',
 '``',
 'debug',
 "''",
 'in',
 'greater',
 'detail',
 '.',
 'scope',
 'as',
 'software',
 'and',
 'electronic',
 'systems',
 'have',
 'become',
 'generally',
 'more',
 'complex',
 ',',
 'the',
 'various',
 'common',
 'debugging',
 'techniques',
 'have',
 'expanded',
 'with',
 'more',
 'methods',
 'to',
 'detect',
 'anomalies',
 ',',
 'assess',
 'impact',
 ',',
 'and',
 'schedule',
 'software',
 'patches',
 'or',
 'full',
 'updates',
 'to',
 'a',
 'system',
 '.',
 'the',
 'words',
 '``',
 'anomaly',
 "''",
 'and',
 '``',
 'discrepancy',
 "''",
 'can',
 'be',
 'used',
 ',',
 'as',
 'being',
 'more',
 'neutral',
 'terms',
 ',',
 'to',
 'avoid',
 'the',
 'words',
 '``',
 'error',
 "''",
 'and',
 '``',
 'defect',
 "''",
 'or',
 '``',
 'bug',
 "''",
 'where',
 'there',
 'might',
 'be',
 'an',
 'implication',
 'that',
 'all',
 'so-called',
 "''errors",
 "''",
 ',',
 "''defects",
 "''",
 'or',
 "''bugs",
 "''",
 'must',
 'be',
 'fixed',
 '(',
 'at',
 'all',
 'costs',
 ')',
 '.',
 'instead',
 ',',
 'an',
 'impact',
 'assessment',
 'can',
 'be',
 'made',
 'to',
 'determine',
 'if',
 'changes',
 'to',
 'remove',
 'an',
 "''anomaly",
 "''",
 '(',
 'or',
 "''discrepancy",
 "''",
 ')',
 'would',
 'be',
 'cost-effective',
 'for',
 'the',
 'system',
 ',',
 'or',
 'perhaps',
 'a',
 'scheduled',
 'new',
 'release',
 'might',
 'render',
 'the',
 'change',
 '(',
 's',
 ')',
 'unnecessary',
 '.',
 'not',
 'all',
 'issues',
 'are',
 'life-critical',
 'or',
 'mission-critical',
 'in',
 'a',
 'system',
 '.',
 'also',
 ',',
 'it',
 'is',
 'important',
 'to',
 'avoid',
 'the',
 'situation',
 'where',
 'a',
 'change',
 'might',
 'be',
 'more',
 'upsetting',
 'to',
 'users',
 ',',
 'long-term',
 ',',
 'than',
 'living',
 'with',
 'the',
 'known',
 'problem',
 '(',
 's',
 ')',
 '(',
 'where',
 'the',
 '``',
 'cure',
 'would',
 'be',
 'worse',
 'than',
 'the',
 'disease',
 "''",
 ')',
 '.',
 'basing',
 'decisions',
 'of',
 'the',
 'acceptability',
 'of',
 'some',
 'anomalies',
 'can',
 'avoid',
 'a',
 'culture',
 'of',
 'a',
 '``',
 'zero-defects',
 "''",
 'mandate',
 ',',
 'where',
 'people',
 'might',
 'be',
 'tempted',
 'to',
 'deny',
 'the',
 'existence',
 'of',
 'problems',
 'so',
 'that',
 'the',
 'result',
 'would',
 'appear',
 'as',
 'zero',
 "''defects",
 "''",
 '.',
 'considering',
 'the',
 'collateral',
 'issues',
 ',',
 'such',
 'as',
 'the',
 'cost-versus-benefit',
 'impact',
 'assessment',
 ',',
 'then',
 'broader',
 'debugging',
 'techniques',
 'will',
 'expand',
 'to',
 'determine',
 'the',
 'frequency',
 'of',
 'anomalies']

In [18]:
import nltk
nltk.download('wordnet')
nltk.download('stopwords')

[nltk_data] Downloading package wordnet to
[nltk_data]     /Users/dmitrymikhaylov/nltk_data...
[nltk_data]   Unzipping corpora/wordnet.zip.
[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/dmitrymikhaylov/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [19]:
# Import WordNetLemmatizer
from nltk.stem import WordNetLemmatizer

from nltk.corpus import stopwords
from collections import Counter
# Retain alphabetic words: alpha_only
alpha_only = [t for t in lower_tokens if t.isalpha()]

# Remove all stop words: no_stops
english_stops = set(stopwords.words('english'))
no_stops = [t for t in alpha_only if t not in english_stops]

# Instantiate the WordNetLemmatizer
wordnet_lemmatizer = WordNetLemmatizer()

# Lemmatize all tokens into a new list: lemmatized
lemmatized = [wordnet_lemmatizer.lemmatize(t) for t in no_stops]

# Create the bag-of-words: bow
bow = Counter(lemmatized)

# Print the 10 most common tokens
print(bow.most_common(10))

[('debugging', 13), ('term', 12), ('computer', 10), ('system', 9), ('bug', 8), ('http', 8), ('used', 5), ('software', 4), ('change', 4), ('hopper', 4)]


In [22]:
russian_stops = set(stopwords.words('russian'))
len(russian_stops)
len(english_stops)

179

## Creating and querying a corpus with `gensim`