In [1]:
sc

In [2]:
rdd=sc.textFile("C:/Users/Abhishek/Downloads/data.csv")

In [3]:
rdd.take(5)

['ItemID,Sentiment,SentimentText',
 '1,0,                     is so sad for my APL friend.............',
 '2,0,                   I missed the New Moon trailer...',
 '3,1,              omg its already 7:30 :O',
 "4,0,          .. Omgaga. Im sooo  im gunna CRy. I've been at this dentist since 11.. I was suposed 2 just get a crown put on (30mins)..."]

In [4]:
header=rdd.first()
header

'ItemID,Sentiment,SentimentText'

In [5]:
#Removing the header from the RDD

rdd=rdd.filter(lambda line:line!=header)

In [6]:
rdd.take(5)

['1,0,                     is so sad for my APL friend.............',
 '2,0,                   I missed the New Moon trailer...',
 '3,1,              omg its already 7:30 :O',
 "4,0,          .. Omgaga. Im sooo  im gunna CRy. I've been at this dentist since 11.. I was suposed 2 just get a crown put on (30mins)...",
 '5,0,         i think mi bf is cheating on me!!!       T_T']

In [7]:
rdd=rdd.map(lambda line:line.split(","))

In [8]:
rdd.take(2)

[['1', '0', '                     is so sad for my APL friend.............'],
 ['2', '0', '                   I missed the New Moon trailer...']]

In [9]:
#Removing S.No. from the RDD

rdd=rdd.map(lambda line:line[1:]) 

In [10]:
rdd.take(2)

[['0', '                     is so sad for my APL friend.............'],
 ['0', '                   I missed the New Moon trailer...']]

In [11]:
import re
rg1=re.compile(r'\d+') #For eliminating digits
rg2=re.compile(r'[\"\',-.:;&!#$%^*<>=?~_/]') #For eliminating all special characters

In [12]:
import nltk

from nltk.stem.wordnet import WordNetLemmatizer
lm = WordNetLemmatizer()

#from nltk.stem.snowball import SnowballStemmer
#stem = nltk.stem.SnowballStemmer('english')

#Just using Lemmatization

In [13]:
def transf(tweet):
    tweet=tweet.lower()
    tweet=rg1.sub("",tweet) 
    tweet=rg2.sub("",tweet)
    tweet=tweet.strip().split(" ") #Separating the words
    
    for word in tweet:
        word=lm.lemmatize(word,"v")
        #word=stem.stem(word) 
    
    return tweet

In [14]:
from pyspark.sql import Row

df=rdd.map(lambda line:Row(label=line[0],
                          words=transf(line[1]))).toDF()

In [15]:
#Another way to convert to DF

#rdd_new=rdd.map(lambda line:{"label":line[0],"words":transf(line[1])})
#df=rdd_new.toDF()

In [16]:
df.show()

+-----+--------------------+
|label|               words|
+-----+--------------------+
|    0|[is, so, sad, for...|
|    0|[i, missed, the, ...|
|    1|[omg, its, alread...|
|    0|[omgaga, im, sooo...|
|    0|[i, think, mi, bf...|
|    0|[or, i, just, wor...|
|    1|[juuuuuuuuuuuuuuu...|
|    0|[sunny, again, , ...|
|    1|[handed, in, my, ...|
|    1|[hmmmm, i, wonder...|
|    0|[i, must, think, ...|
|    1|[thanks, to, all,...|
|    0|[this, weekend, h...|
|    0|[jb, isnt, showin...|
|    0|[ok, thats, it, y...|
|    0|[lt, this, is, th...|
|    0|[awhhe, man, im, ...|
|    1|[feeling, strange...|
|    0|[huge, roll, of, ...|
|    0|[i, just, cut, my...|
+-----+--------------------+
only showing top 20 rows



In [17]:
from pyspark.ml.feature import StopWordsRemover

remover = StopWordsRemover(inputCol="words", outputCol="filteredWords")
df=remover.transform(df)

#Removed all the stopwords

In [18]:
#Only using TF as IDF lead to lower accuracy. This may be due to the fact that 
#IDF tries to find the tags unique to each tweet. Whereas, in case of sentiments,
#many similar words might lead to a positive sentiment in the tweets.
#Ex. the word "happy" may denote a positive tweet even if it appears in many tweets.

In [20]:
from pyspark.ml.feature import HashingTF

hashingTF = HashingTF(inputCol="filteredWords", outputCol="features")
rescaledData = hashingTF.transform(df)

from pyspark.sql.types import *
rescaledData=rescaledData.withColumn("label",rescaledData["label"].cast(IntegerType()))

train,test=rescaledData.randomSplit([0.7,0.3]) #30% kept as test data

In [21]:
#Now trying out various classification techniques

In [23]:
from pyspark.ml.classification import LogisticRegression
from pyspark.ml.evaluation import MulticlassClassificationEvaluator

lr = LogisticRegression(featuresCol = 'features', labelCol = 'label', maxIter=10)

# Fit the model
lrModel = lr.fit(train)

predictions=lrModel.transform(test)

evaluator = MulticlassClassificationEvaluator(labelCol="label", predictionCol="prediction", metricName="accuracy")
acc = evaluator.evaluate(predictions)
print("Test set Accuracy = %g" % (acc))

Test set Accuracy = 0.664379


In [24]:
from pyspark.ml.classification import LinearSVC

lsvc = LinearSVC(labelCol="label", featuresCol="features", maxIter=10)

# Fit the model
lsvcModel = lsvc.fit(train)

predictions=lsvcModel.transform(test)

evaluator = MulticlassClassificationEvaluator(labelCol="label", predictionCol="prediction", metricName="accuracy")
acc = evaluator.evaluate(predictions)
print("Test set Accuracy = %g" % (acc))

Test set Accuracy = 0.702819


In [25]:
from pyspark.ml.classification import NaiveBayes

#Creating the trainer and set its parameters
nb = NaiveBayes(modelType="multinomial")

#Training the model
model = nb.fit(train)

predictions = model.transform(test)

#Computing accuracy on the test set
evaluator = MulticlassClassificationEvaluator(labelCol="label", predictionCol="prediction",metricName="accuracy")
accuracy = evaluator.evaluate(predictions)
print("Test set accuracy = " + str(accuracy))

Test set accuracy = 0.7218390034824538


In [26]:
#Naive bayes led to the best accuracy of on the test set. Thus we will save that model for future use.

In [27]:
model.save("TwitterSentimentNB.model")