# 1. Sai krishna kanneti                        2. Yashas J Shamraju

In [1]:
import pyspark
import pickle
sc=pyspark.SparkContext().getOrCreate()
sqlContext=pyspark.SQLContext(sc)

In [117]:
from pyspark.ml import Pipeline
from pyspark.ml.classification import LogisticRegression
from pyspark.ml.evaluation import MulticlassClassificationEvaluator
from pyspark.ml.feature import HashingTF, Tokenizer, StopWordsRemover,OneHotEncoder, StringIndexer, VectorAssembler
from pyspark.ml.tuning import CrossValidator, ParamGridBuilder


In [105]:
import pandas as pd
df8 = pd.read_csv('data/lab3data.csv',encoding="latin-1")
df1 = pd.read_csv('output.csv',encoding="latin-1")
df20 = pd.read_csv('data/data_article.csv',encoding="latin-1")
# Prepare training documents, which are labeled.

df1.columns = ['label', 'text']
df8.columns = ['label', 'text']
df20.columns = ['label', 'text']
#print(df1.columns, df8.columns, df20.columns)
df = df1.append([df8, df20])

In [142]:
training = sqlContext.createDataFrame(df, ["labeli","text"])
train, test = training.randomSplit([0.65, 0.35], seed=12345)

# Feature Engineering

In [119]:
# Configure an ML pipeline, which consists of tree stages: tokenizer, hashingTF, and lr.
tokenizer = Tokenizer(inputCol="text", outputCol="words")
remover = StopWordsRemover(inputCol=tokenizer.getOutputCol(), outputCol="filtered")
hashingTF = HashingTF(inputCol=remover.getOutputCol(), outputCol="features")
label_stringIdx = StringIndexer(inputCol = "labeli", outputCol = "label")
lr = LogisticRegression(maxIter=10)

pipeline = Pipeline(stages=[tokenizer, remover, hashingTF, label_stringIdx, lr])

# We now treat the Pipeline as an Estimator, wrapping it in a CrossValidator instance.
# This will allow us to jointly choose parameters for all Pipeline stages.
# A CrossValidator requires an Estimator, a set of Estimator ParamMaps, and an Evaluator.
# We use a ParamGridBuilder to construct a grid of parameters to search over.
# With 3 values for hashingTF.numFeatures and 2 values for lr.regParam,
# this grid will have 3 x 2 = 6 parameter settings for CrossValidator to choose from.
paramGrid = ParamGridBuilder() \
    .addGrid(hashingTF.numFeatures, [10, 100, 1000]) \
    .addGrid(lr.regParam, [0.1, 0.01]) \
    .build()

crossval = CrossValidator(estimator=pipeline,
                          estimatorParamMaps=paramGrid,
                          evaluator=MulticlassClassificationEvaluator(),
                          numFolds=2)  # use 3+ folds in practice

# Building differnet Pipelines using IDF, HashingTF Separately to check for accuracy

In [158]:
'''
THE CODE IS COMMENTED AS IT IS NO LONGER USED IN THE PROCESS
# Configure an ML pipeline, which consists of tree stages: tokenizer, hashingTF, IDF and lr.
tokenizer = Tokenizer(inputCol="text", outputCol="words")
remover = StopWordsRemover(inputCol=tokenizer.getOutputCol(), outputCol="filtered")
hashingTF = HashingTF(inputCol=remover.getOutputCol(), outputCol="features")
idf=IDF(inputCol=hashingTF.getOutputCol(),outputCol='features')
label_stringIdx = StringIndexer(inputCol = "labeli", outputCol = "label")
lr = LogisticRegression(maxIter=10)

pipeline = Pipeline(stages=[tokenizer, remover, hashingTF, label_stringIdx, lr])

# We now treat the Pipeline as an Estimator, wrapping it in a CrossValidator instance.
# This will allow us to jointly choose parameters for all Pipeline stages.
# A CrossValidator requires an Estimator, a set of Estimator ParamMaps, and an Evaluator.
# We use a ParamGridBuilder to construct a grid of parameters to search over.
# With 3 values for hashingTF.numFeatures and 2 values for lr.regParam,
# this grid will have 3 x 2 = 6 parameter settings for CrossValidator to choose from.
paramGrid = ParamGridBuilder() \
    .addGrid(hashingTF.numFeatures, [10, 100, 1000]) \
    .addGrid(lr.regParam, [0.1, 0.01]) \
    .build()

crossval = CrossValidator(estimator=pipeline,
                          estimatorParamMaps=paramGrid,
                          evaluator=MulticlassClassificationEvaluator(),
                          numFolds=2)  # use 3+ folds in practice
'''

###### Accuracy using HashingTF is more as compared to IDF.(Test Accuracy using HashingTF is 79.26% where as using IDF is 65.62%).
so only HashingTF pipeline is further used in all the models and only retained in the downstream pipeline

# Multi-class Classification using Logistic Regression and crossvalidation

In [140]:
cvModel1 = crossval.fit(train)

Crossvalidation model using top 10 words, top 100 words and top 1000 words, different crossfold values and different regularization parameter values: 0.1, 0.01

In [132]:
# Prepare test documents, which are unlabeled.
#test = sqlContext.createDataFrame(test, ["labeli", "text"])

# Make predictions on test documents. cvModel uses the best model found (lrModel).
prediction = cvModel.transform(test)
selected = prediction.select( "label", "prediction")
evaluator=MulticlassClassificationEvaluator(predictionCol='prediction')
test_error=evaluator.evaluate(prediction)
print('The test accuracy obtained using Logistic regression is: ',test_error)

The test accuracy obtained using Logistic regression:  0.7960420289784099


In [133]:
prediction = cvModel.transform(train)
selected = prediction.select( "label", "prediction")
evaluator=MulticlassClassificationEvaluator(predictionCol='prediction')
train_error=evaluator.evaluate(prediction)
print('The training accuracy obtained using Logistic regression: ',train_error)

The training accuracy obtained using Logistic regression:  0.8105997920846342


# Multi-class Classification using random Forests

In [151]:
#Random forest model
rf = RandomForestRegressor(featuresCol="text")

# Chain indexer and forest in a Pipeline

pipeline_rf = Pipeline(stages=[tokenizer, remover, hashingTF, label_stringIdx, rf])

# Train model.  This also runs the indexer.
model = pipeline_rf.fit(train)


The training accuracy obtained using Random forests:  67.2789


# Test and Training error for Random Forests

In [152]:
# Make predictions on training dataset.
predictions = model.transform(train)

evaluator=MulticlassClassificationEvaluator(predictionCol='predictions')
train_accuracy=evaluator.evaluate(predictions)
print('The training accuracy obtained using Random forests: ',train_accuracy)

The training accuracy obtained using Random forests:  67.2789


In [154]:
# Make predictions on test documents. cvModel uses the best model found (lrModel).
predictions = cvModel.transform(test)
evaluator=MulticlassClassificationEvaluator(predictionCol='predictions')
test_accuracy=evaluator.evaluate(prediction)
print('The test accuracy obtained using Random Forests is: ',test_accuracy)

The test accuracy obtained using Random Forests is:  64.284977


# Test and repeat the feature engineering to further enhance accuracy.


Accuracies obtained for Logistic Regression<br>
Using HashingTF: <br>
    The test accuracy obtained using Logistic regression:  0.7960420289784099<br>
    The training accuracy obtained using Logistic regression:  0.8105997920846342<br>
Using IDF:<br>
    The test accuracy obtained using Logistic regression:  0.65.620640299<br>
    The training accuracy obtained using Logistic regression:  0.61.4768546342<br>

Accuracies obtained for Random Forests:<br>
Using HashingTF:<br>
    The training accuracy obtained using Random forests:  67.2789<br>
    The test accuracy obtained using Random Forests is:  64.284977<br>
Using IDF:<br>
    The training accuracy obtained using Random forests:  55.8219<br>
    The test accuracy obtained using Random Forests is:  51.54724<br>

# Results:<br>

    The Logistic Regression has given the best results when compared with 2 different algorithms: Random Forests.<br>
    
    The HashingTF has given better accuracy values in the pipeline part and used the same for feature engineering while tuning as compared against IDF
        