In [72]:
# import pyspark
from pyspark.sql import SparkSession
from pyspark.ml.feature import Tokenizer, StopWordsRemover, HashingTF, IDF, StringIndexer
from pyspark.ml import Pipeline, PipelineModel
import numpy
import nltk
nltk.download('stopwords')
from nltk.corpus import stopwords

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\u1105800\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [29]:
spark = SparkSession.builder.master("local[1]")\
          .appName("SparkByExamples.com")\
          .getOrCreate()
df = spark.read.csv("C:/Users/u1105800/PG/Capstone/Capstone/capstone/Capstone/sample_dataset.csv",header=True)
df.printSchema()

root
 |-- Category: string (nullable = true)
 |-- Summary: string (nullable = true)



In [30]:
df.show(5)

+--------+--------------------+
|Category|             Summary|
+--------+--------------------+
|Business|Reuters - Short-s...|
|Business|Reuters - Private...|
|Business|Reuters - Soaring...|
|Business|Reuters - Authori...|
|Business|AFP - Tearaway wo...|
+--------+--------------------+
only showing top 5 rows



In [34]:
from pyspark.sql.functions import col
df.groupBy("Category") \
    .count() \
    .orderBy(col("count").desc()) \
    .show()

+--------------------+-----+
|            Category|count|
+--------------------+-----+
|  Science/Technology|35334|
|            Business|33992|
|              Sports|32757|
|               World|31900|
|       Entertainment| 2583|
|            Politics| 2377|
|This story has be...|  125|
|             However|   82|
|           Meanwhile|   27|
|             Further|   18|
|             In 2014|   17|
|    So far this year|   16|
|                Also|   15|
|          To be sure|   15|
|             However|   14|
|               Still|   14|
|             Besides|   13|
|                 Now|   12|
|           On Monday|   11|
|           At 9.15am|   10|
+--------------------+-----+
only showing top 20 rows



In [55]:
df.filter((df.Category == 'Business') | (df.Category == 'Sports') | (df.Category == 'Entertainment')|(df.Category == 'Politics')|(df.Category == 'Science/Technology')).count()

107043

In [62]:
print(df.count())
df = df.dropna()
print(df.count())

152599
149881


In [73]:
df = df['Summary','Category']

stopwords_ = stopwords.words('english')
tokenizer = Tokenizer(inputCol="Summary", outputCol="words")
stopwordsRemover = StopWordsRemover(inputCol="words", outputCol="filtered_data").setStopWords(stopwords_)
hashingTF = HashingTF(inputCol="filtered_data", outputCol="tf", numFeatures=10000)
idf = IDF(inputCol="tf", outputCol="idf", minDocFreq=5)
labelAnnotator = StringIndexer(inputCol = "Category", outputCol = "label")
preprocessorPipeline = Pipeline(stages=[tokenizer, stopwordsRemover, hashingTF, idf, labelAnnotator])
preprocessorPipelineFit = preprocessorPipeline.fit(df)

#preprocessorPipelineFit.save('preprocessor')

#preprocessor = PipelineModel.load("preprocessor")

#cleaned_df = preprocessorPipelineFit.transform(df)
cleaned_df = preprocessorPipelineFit.transform(df)

print('Data cleansing done!!')

print('Columns after data cleansing: ',cleaned_df.columns)

cleaned_df = cleaned_df['Summary', 'Category', 'tf', 'idf', 'label']

print('Schema after data cleansing')
cleaned_df.printSchema()

Data cleansing done!!
Columns after data cleansing:  ['Summary', 'Category', 'words', 'filtered_data', 'tf', 'idf', 'label']
Schema after data cleansing
root
 |-- Summary: string (nullable = true)
 |-- Category: string (nullable = true)
 |-- tf: vector (nullable = true)
 |-- idf: vector (nullable = true)
 |-- label: double (nullable = false)



In [74]:
cleaned_df.show(3)

+--------------------+--------+--------------------+--------------------+-----+
|             Summary|Category|                  tf|                 idf|label|
+--------------------+--------+--------------------+--------------------+-----+
|Reuters - Short-s...|Business|(10000,[551,1152,...|(10000,[551,1152,...|  1.0|
|Reuters - Private...|Business|(10000,[1152,1562...|(10000,[1152,1562...|  1.0|
|Reuters - Soaring...|Business|(10000,[217,532,7...|(10000,[217,532,7...|  1.0|
+--------------------+--------+--------------------+--------------------+-----+
only showing top 3 rows

