In [1]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here are several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input directory and print the full path of every file found.
for input_root, _, input_names in os.walk('/kaggle/input'):
    # Build the joined paths for this directory first, then emit them one per line.
    input_paths = [os.path.join(input_root, input_name) for input_name in input_names]
    for input_path in input_paths:
        print(input_path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

/kaggle/input/tweet-data/tweets.csv


In [2]:
pip install pyspark

Collecting pyspark
  Downloading pyspark-3.0.1.tar.gz (204.2 MB)
     |████████████████████████████████| 204.2 MB 29 kB/s 
Collecting py4j==0.10.9
  Downloading py4j-0.10.9-py2.py3-none-any.whl (198 kB)
     |████████████████████████████████| 198 kB 62.9 MB/s 
Building wheels for collected packages: pyspark
  Building wheel for pyspark (setup.py) ... done
  Created wheel for pyspark: filename=pyspark-3.0.1-py2.py3-none-any.whl size=204612244 sha256=9e821af643685ad482ee127db30f2240d09143157503f2a04d1b54c8c9f077c3
  Stored in directory: /root/.cache/pip/wheels/5e/34/fa/b37b5cef503fc5148b478b2495043ba61b079120b7ff379f9b
Successfully built pyspark
Installing collected packages: py4j, pyspark
Successfully installed py4j-0.10.9 pyspark-3.0.1
You

In [3]:
from pyspark.sql import SparkSession
from pyspark.sql.types import *
from pyspark.sql.functions import *
from pyspark.ml.classification import LogisticRegression
from pyspark.ml.feature import HashingTF, Tokenizer, StopWordsRemover

In [4]:
# Application name handed to the SparkSession builder.
appName = "Sentimen Analysis Spark"

# Get (or create) the session used by every later cell, using the 'local' master.
spark = (
    SparkSession.builder
    .master('local')
    .appName(appName)
    .getOrCreate()
)

In [5]:
# Read the tweet CSV data from the input directory: the first line is used as
# the header and column types are inferred rather than left as strings.
tweet_csv = spark.read.csv(
    "../input/tweet-data",
    header=True,
    inferSchema=True,
)

# Preview the first three rows with full (untruncated) cell contents.
tweet_csv.show(n=3, truncate=False)

+------+---------+---------------+---------------------------------+
|ItemID|Sentiment|SentimentSource|SentimentText                    |
+------+---------+---------------+---------------------------------+
|1038  |1        |Sentiment140   |that film is fantastic #brilliant|
|1804  |0        |Sentiment140   |this music is really bad #myband |
|1693  |0        |Sentiment140   |winter is terrible #thumbs-down  |
+------+---------+---------------+---------------------------------+
only showing top 3 rows



In [6]:
# Keep only what the model needs: the tweet text, plus the sentiment cast to an
# integer and exposed under the column name "label".
label_column = col("Sentiment").cast("Int").alias("label")
data = tweet_csv.select("SentimentText", label_column)

data.show(n=5, truncate=False)

+---------------------------------+-----+
|SentimentText                    |label|
+---------------------------------+-----+
|that film is fantastic #brilliant|1    |
|this music is really bad #myband |0    |
|winter is terrible #thumbs-down  |0    |
|this game is awful #nightmare    |0    |
|I love jam #loveit               |1    |
+---------------------------------+-----+
only showing top 5 rows



In [7]:
# Split into training and testing sets (weights 0.7 / 0.3).
# The seed is fixed so that the same input data yields the same split on every
# run; without it the row counts and the accuracy reported at the end of the
# notebook change each time the cell is executed.
SPLIT_SEED = 42
split_data = data.randomSplit([0.7, 0.3], seed=SPLIT_SEED)
train = split_data[0]

# In the test set the label column is renamed to true_label, so it stays
# distinguishable from the model's own prediction column later on.
test = split_data[1].withColumnRenamed("label", "true_label")
train_rows = train.count()
test_rows = test.count()

print("Total train :", train_rows)
print("Total test :", test_rows)

Total train : 1350
Total test : 582


In [8]:
# Prepare data: turn each tweet's text (SentimentText) into a list of word
# tokens stored in a new SentimentWords column.
tokenizer = Tokenizer(
    inputCol="SentimentText",
    outputCol="SentimentWords",
)
tokenizedTrain = tokenizer.transform(train)

tokenizedTrain.show(n=5, truncate=False)

+---------------------------------+-----+---------------------------------------+
|SentimentText                    |label|SentimentWords                         |
+---------------------------------+-----+---------------------------------------+
|I adore cheese #bestever         |1    |[i, adore, cheese, #bestever]          |
|I adore cheese #favorite         |1    |[i, adore, cheese, #favorite]          |
|I adore cheese #thumbs-up        |1    |[i, adore, cheese, #thumbs-up]         |
|I adore cheese #toptastic        |1    |[i, adore, cheese, #toptastic]         |
|I adore classical music #bestever|1    |[i, adore, classical, music, #bestever]|
+---------------------------------+-----+---------------------------------------+
only showing top 5 rows



In [9]:
# Remove stop words from the token lists. The remover reads whatever column the
# tokenizer writes to and stores the filtered list in MeaningfulWords.
swr = StopWordsRemover(
    inputCol=tokenizer.getOutputCol(),
    outputCol="MeaningfulWords",
)
SwRemovedTrain = swr.transform(tokenizedTrain)

SwRemovedTrain.show(n=5, truncate=False)

+---------------------------------+-----+---------------------------------------+------------------------------------+
|SentimentText                    |label|SentimentWords                         |MeaningfulWords                     |
+---------------------------------+-----+---------------------------------------+------------------------------------+
|I adore cheese #bestever         |1    |[i, adore, cheese, #bestever]          |[adore, cheese, #bestever]          |
|I adore cheese #favorite         |1    |[i, adore, cheese, #favorite]          |[adore, cheese, #favorite]          |
|I adore cheese #thumbs-up        |1    |[i, adore, cheese, #thumbs-up]         |[adore, cheese, #thumbs-up]         |
|I adore cheese #toptastic        |1    |[i, adore, cheese, #toptastic]         |[adore, cheese, #toptastic]         |
|I adore classical music #bestever|1    |[i, adore, classical, music, #bestever]|[adore, classical, music, #bestever]|
+---------------------------------+-----+---------------------------------------+------------------------------------+

In [10]:
# Convert the filtered word lists into numeric feature vectors, reading from
# the stop-word remover's output column and writing to "features".
hashTF = HashingTF(inputCol=swr.getOutputCol(), outputCol="features")

# Apply the hashing step, then keep only the columns used for training/inspection.
hashedTrain = hashTF.transform(SwRemovedTrain)
numericTrain = hashedTrain.select('label', 'MeaningfulWords', 'features')

numericTrain.show(n=3, truncate=False)

+-----+---------------------------+-------------------------------------------+
|label|MeaningfulWords            |features                                   |
+-----+---------------------------+-------------------------------------------+
|1    |[adore, cheese, #bestever] |(262144,[1689,91011,100089],[1.0,1.0,1.0]) |
|1    |[adore, cheese, #favorite] |(262144,[1689,100089,108624],[1.0,1.0,1.0])|
|1    |[adore, cheese, #thumbs-up]|(262144,[1689,88825,100089],[1.0,1.0,1.0]) |
+-----+---------------------------+-------------------------------------------+
only showing top 3 rows



In [11]:
# Training Model: logistic regression on the hashed features against the
# integer label, limited to 10 iterations with a regularization parameter of 0.01.
lr = LogisticRegression(
    featuresCol="features",
    labelCol="label",
    regParam=0.01,
    maxIter=10,
)
model = lr.fit(numericTrain)
print("Training Done")

Training Done


In [12]:
# Prepare Testing data by pushing the test set through the same three stages,
# in the same order, that were applied to the training set.
tokenizedTest = tokenizer.transform(test)        # text -> word tokens
SwRemovedTest = swr.transform(tokenizedTest)     # drop stop words
numericTest = hashTF.transform(SwRemovedTest)    # words -> feature vector

numericTest.show(n=2, truncate=False)

+-------------------------+----------+------------------------------+---------------------------+-------------------------------------------+
|SentimentText            |true_label|SentimentWords                |MeaningfulWords            |features                                   |
+-------------------------+----------+------------------------------+---------------------------+-------------------------------------------+
|I adore cheese #brilliant|1         |[i, adore, cheese, #brilliant]|[adore, cheese, #brilliant]|(262144,[1689,45361,100089],[1.0,1.0,1.0]) |
|I adore cheese #loveit   |1         |[i, adore, cheese, #loveit]   |[adore, cheese, #loveit]   |(262144,[1689,100089,254974],[1.0,1.0,1.0])|
+-------------------------+----------+------------------------------+---------------------------+-------------------------------------------+
only showing top 2 rows



In [13]:
#Prediction
# Apply the fitted model to the prepared test set, then print the schema of the
# result to see which columns are available.
raw_prediction = model.transform(numericTest)
raw_prediction.printSchema()


root
 |-- SentimentText: string (nullable = true)
 |-- true_label: integer (nullable = true)
 |-- SentimentWords: array (nullable = true)
 |    |-- element: string (containsNull = true)
 |-- MeaningfulWords: array (nullable = true)
 |    |-- element: string (containsNull = true)
 |-- features: vector (nullable = true)
 |-- rawPrediction: vector (nullable = true)
 |-- probability: vector (nullable = true)
 |-- prediction: double (nullable = false)



In [14]:
# Narrow the prediction output down to the words, the predicted class and the
# true label so the two can be compared side by side.
prediction_columns = ["MeaningfulWords", "prediction", "true_label"]
Final_prediction = raw_prediction.select(*prediction_columns)

Final_prediction.show(truncate=False, n=4)


+------------------------------------+----------+----------+
|MeaningfulWords                     |prediction|true_label|
+------------------------------------+----------+----------+
|[adore, cheese, #brilliant]         |1.0       |1         |
|[adore, cheese, #loveit]            |1.0       |1         |
|[adore, classical, music, #favorite]|1.0       |1         |
|[adore, coffee, #bestever]          |1.0       |1         |
+------------------------------------+----------+----------+
only showing top 4 rows



In [15]:
# Accuracy = share of test rows whose predicted class equals the true label.
Total_True = Final_prediction.filter(
    Final_prediction['prediction'] == Final_prediction['true_label']
).count()
Alldata = Final_prediction.count()

# The test set comes from a random split, so on a very small input it can be
# empty; report NaN in that case instead of failing with ZeroDivisionError.
Accuracy = Total_True / Alldata if Alldata else float("nan")
print("Accuracy Score is:", Accuracy*100, '%')

Accuracy Score is: 98.79725085910653 %
