In [52]:
import os

import findspark

# Locate the Spark installation. Prefer SPARK_HOME when it is set so the
# notebook is not tied to one machine's directory layout; otherwise fall back
# to the original hard-coded install location.
findspark.init(os.environ.get('SPARK_HOME') or '/home/spark/spark-2.1.0-bin-hadoop2.7')

# Kept after findspark.init(), as in the original cell order.
import pyspark
from pyspark.sql import SparkSession

In [53]:
# Obtain the session for this notebook: reuse an existing one if present,
# otherwise create one registered under the application name 'NlpProj'.
spark = (
    SparkSession.builder
    .appName('NlpProj')
    .getOrCreate()
)

In [54]:
from pyspark.ml.feature import Tokenizer,RegexTokenizer

In [55]:
from pyspark.sql.functions import col,udf
from pyspark.sql.types import IntegerType

In [56]:
# Three example sentences: two separated by spaces and one separated only by
# commas, so the two tokenizers can be compared on the same input.
sentenceDataFrame = spark.createDataFrame(
    [
        (0, "Hi I heard about Spark"),
        (1, "I wish Java could use case classes"),
        (2, "Logistic,regression,models,are,neat"),
    ],
    schema=["id", "sentence"],
)

In [57]:
# Preview the input frame; long sentence strings are truncated in the printed table.
sentenceDataFrame.show()

+---+--------------------+
| id|            sentence|
+---+--------------------+
|  0|Hi I heard about ...|
|  1|I wish Java could...|
|  2|Logistic,regressi...|
+---+--------------------+



In [58]:
# Plain tokenizer: reads the 'sentence' column and writes the token list to 'words'.
tockenizer = Tokenizer(
    inputCol='sentence',
    outputCol='words',
)

In [59]:
# Regex-based tokenizer: same input/output columns as the plain tokenizer, but
# configured with the pattern \W (non-word characters).
regex_tockenizer = RegexTokenizer(
    inputCol='sentence',
    outputCol='words',
    pattern="\\W",
)

In [60]:
# UDF that returns the number of tokens in an array column as an integer.
# A null array arrives in Python as None; return None (a null count) for it
# rather than letting len(None) raise a TypeError and fail the whole job.
count_tockens = udf(
    lambda words: len(words) if words is not None else None,
    IntegerType(),
)

In [61]:
# Run the plain tokenizer over the sentences; the result carries the extra 'words' column.
tockenized = tockenizer.transform(sentenceDataFrame)

In [62]:
# Display the tokenized frame (id, sentence, words).
tockenized.show()

+---+--------------------+--------------------+
| id|            sentence|               words|
+---+--------------------+--------------------+
|  0|Hi I heard about ...|[hi, i, heard, ab...|
|  1|I wish Java could...|[i, wish, java, c...|
|  2|Logistic,regressi...|[logistic,regress...|
+---+--------------------+--------------------+



In [63]:
# Append a 'tockens' column holding the per-row token count of 'words'
# (plain tokenizer output) and print the resulting table.
(
    tockenized
    .withColumn('tockens', count_tockens(col('words')))
    .show()
)

+---+--------------------+--------------------+-------+
| id|            sentence|               words|tockens|
+---+--------------------+--------------------+-------+
|  0|Hi I heard about ...|[hi, i, heard, ab...|      5|
|  1|I wish Java could...|[i, wish, java, c...|      7|
|  2|Logistic,regressi...|[logistic,regress...|      1|
+---+--------------------+--------------------+-------+



In [66]:
# Run the regex tokenizer over the same input frame for comparison with the plain tokenizer.
rg_tockenized = regex_tockenizer.transform(sentenceDataFrame)

In [67]:
# Append a 'tockens' column holding the per-row token count of 'words'
# (regex tokenizer output) and print the resulting table.
(
    rg_tockenized
    .withColumn('tockens', count_tockens(col('words')))
    .show()
)

+---+--------------------+--------------------+-------+
| id|            sentence|               words|tockens|
+---+--------------------+--------------------+-------+
|  0|Hi I heard about ...|[hi, i, heard, ab...|      5|
|  1|I wish Java could...|[i, wish, java, c...|      7|
|  2|Logistic,regressi...|[logistic, regres...|      5|
+---+--------------------+--------------------+-------+



In [68]:
from pyspark.ml.feature import StopWordsRemover

In [69]:
# Two already-tokenized example sentences, used as input for stop-word removal.
SentenceDataframewTwo = spark.createDataFrame(
    [
        (0, ['I', 'saw', 'the', 'green', 'horse']),
        (1, ['Marry', 'had', 'a', 'little', 'lamb']),
    ],
    schema=['id', 'tockens'],
)

In [71]:
# Preview the pre-tokenized frame (id, tockens).
SentenceDataframewTwo.show()

+---+--------------------+
| id|             tockens|
+---+--------------------+
|  0|[I, saw, the, gre...|
|  1|[Marry, had, a, l...|
+---+--------------------+



In [73]:
# Stop-word filter: reads token lists from 'tockens' and writes the remaining
# tokens to 'filtered'. No custom stop-word list is supplied here.
remover = StopWordsRemover(
    inputCol='tockens',
    outputCol='filtered',
)

In [75]:
# Apply the stop-word remover and display the original tokens next to the filtered ones.
remover.transform(SentenceDataframewTwo).show()

+---+--------------------+--------------------+
| id|             tockens|            filtered|
+---+--------------------+--------------------+
|  0|[I, saw, the, gre...| [saw, green, horse]|
|  1|[Marry, had, a, l...|[Marry, little, l...|
+---+--------------------+--------------------+



In [76]:
from pyspark.ml.feature import NGram

In [78]:

# Pre-tokenized sentences (original casing kept) used as input for n-gram generation.
wordDataFrame = spark.createDataFrame(
    [
        (0, ["Hi", "I", "heard", "about", "Spark"]),
        (1, ["I", "wish", "Java", "could", "use", "case", "classes"]),
        (2, ["Logistic", "regression", "models", "are", "neat"]),
    ],
    schema=["id", "words"],
)

In [79]:
# Bigram generator (n = 2): reads token lists from 'words' and writes the
# n-grams to 'grams'.
ngram = NGram(
    n=2,
    inputCol='words',
    outputCol='grams',
)

In [80]:
# Generate the bigrams and display them alongside the source token lists.
ngram.transform(wordDataFrame).show()

+---+--------------------+--------------------+
| id|               words|               grams|
+---+--------------------+--------------------+
|  0|[Hi, I, heard, ab...|[Hi I, I heard, h...|
|  1|[I, wish, Java, c...|[I wish, wish Jav...|
|  2|[Logistic, regres...|[Logistic regress...|
+---+--------------------+--------------------+

