# Step 1

Defining environment variables

In [1]:
import os
# Tell the JVM-backed tooling where the local Java and Spark installations
# live. Both values are machine-specific paths and must be adapted per host.
os.environ.update({
    "JAVA_HOME": "C:/Java/jdk1.8.0_261",
    "SPARK_HOME": "D:/Spark/spark-3.0.1-bin-hadoop2.7",
})

Importing necessary libraries

In [2]:
import tweepy
import numpy as np
import pandas as pd
import re
import nltk

import findspark
# findspark.init() is deliberately called before pyspark is imported: it is
# the step that makes the local Spark installation importable from Python.
findspark.init()
from pyspark.sql import SparkSession

# Reuse an already running session if there is one, otherwise start a local
# one; "local[*]" runs Spark in-process on this machine.
spark = (
    SparkSession.builder
    .master("local[*]")
    .getOrCreate()
)

Credentials for the Twitter Developer API

In [3]:
# Twitter API credentials are taken from the environment so that real secrets
# never have to be pasted into (and saved with) the notebook. If a variable is
# unset or empty, the original placeholder string is used, so the names below
# always hold a non-empty string.
ACCESS_TOKEN = os.environ.get('TWITTER_ACCESS_TOKEN') or 'ACCESS_TOKEN'
ACCESS_SECRET = os.environ.get('TWITTER_ACCESS_SECRET') or 'ACCESS_SECRET'
CONSUMER_KEY = os.environ.get('TWITTER_CONSUMER_KEY') or 'CONSUMER_KEY'
CONSUMER_SECRET = os.environ.get('TWITTER_CONSUMER_SECRET') or 'CONSUMER_SECRET'

Authorizing tweepy with credentials

In [4]:
# Build the OAuth handler from the application (consumer) key and secret.
auth = tweepy.OAuthHandler(CONSUMER_KEY, CONSUMER_SECRET)
# Attach the user access token and secret to the same handler.
auth.set_access_token(ACCESS_TOKEN, ACCESS_SECRET)
# Authenticated API client; the tweet collection function below searches through it.
api = tweepy.API(auth)

# Step 2

Defining the dataframe to store the incoming tweets

In [5]:
# Empty, module-level frame that stream() fills in place: one row per tweet,
# holding the tweet text and its creation timestamp.
df = pd.DataFrame(
    columns=[
        'Tweets',
        'tweet_date',
    ],
)

Function to store the tweet stream in the defined dataframe

In [6]:
def stream(data, file_name, max_tweets=1200):
    """Collect tweets matching a search query into the global ``df`` and save them.

    Args:
        data: Search query, forwarded to ``api.search`` as the ``q`` argument.
        file_name: Output file name without extension; ``.xlsx`` is appended.
        max_tweets: Stop once this many tweets have been stored. Defaults to
            1200, the cap that used to be hard-coded. ``None`` removes the
            cap; a value below 1 collects nothing.

    Side effects:
        Writes one row per tweet into the module-level ``df`` (columns
        ``Tweets`` and ``tweet_date``), prints a running counter, and saves
        ``df`` to ``<file_name>.xlsx``.
    """
    if max_tweets is not None and max_tweets <= 0:
        return

    i = 0
    try:
        for tweet in tweepy.Cursor(api.search, q=data, count=100, lang='en').items():
            print(i, end='\r')
            df.loc[i, 'Tweets'] = tweet.text
            df.loc[i, 'tweet_date'] = tweet.created_at
            i += 1
            # Checked right after storing so no extra item (and possibly an
            # extra API page) is requested once the cap is reached.
            if max_tweets is not None and i >= max_tweets:
                break
    finally:
        # Save once instead of rewriting the whole workbook for every tweet.
        # Running in `finally` keeps the tweets gathered so far on disk even
        # if the search is interrupted; nothing is written when no tweet was
        # stored, so an earlier file is not overwritten by a failed run.
        if i:
            df.to_excel('{}.xlsx'.format(file_name))

Start streaming tweets into the defined dataframe and save them locally in a file named Data.xlsx

In [7]:
# Collect tweets for the #hospital hashtag and save them to Data.xlsx.
# NOTE(review): `data` is forwarded as the search `q` argument, and a list is
# passed here rather than a plain query string — confirm tweepy serialises it
# into the intended query.
stream(data = ['#hospital'], file_name = 'Data')

1199

Read the stored file and perform some data cleaning on the stored tweets

In [8]:
# Load the tweets saved by the streaming step. The former `inferSchema`
# keyword was removed: it is a Spark reader option, not a documented
# pandas.read_excel parameter, and pandas infers column dtypes on its own.
data = pd.read_excel("Data.xlsx")

# Raw string so the regex escapes (\w, \S, \t) reach the regex engine as-is
# instead of being treated as (invalid) Python string escapes.
# Alternatives, in order: @mentions, any character that is not an ASCII
# letter/digit/space/tab, and scheme://... URLs.
_TWEET_NOISE_RE = re.compile(r'(@[A-Za-z0-9]+)|([^0-9A-Za-z \t])|(\w+://\S+)')


def clean_tweet(tweet):
    """Strip mentions, URLs and non-alphanumeric characters from a tweet.

    Every match is replaced by a space and runs of whitespace are then
    collapsed to single spaces. Case is left untouched.

    Args:
        tweet: Raw tweet text. Non-string values (for example the NaN that
            pandas produces for an empty spreadsheet cell) are treated as
            empty text instead of raising ``TypeError``.

    Returns:
        The cleaned text; an empty string if nothing remains.
    """
    if not isinstance(tweet, str):
        return ''
    return ' '.join(_TWEET_NOISE_RE.sub(' ', tweet).split())

# Clean the tweets that were just loaded from disk. The source column is taken
# from `data` rather than the in-memory streaming frame `df`, so this cell
# also works on a fresh kernel when the streaming step is skipped, and the
# cleaned text always lines up with the rows it was computed from.
data['clean_tweet'] = data['Tweets'].apply(lambda x: clean_tweet(x).lower())
# Drop the index column that to_excel saved alongside the data.
data = data.drop(columns='Unnamed: 0')
data.head()

Unnamed: 0,Tweets,tweet_date,clean_tweet
0,RT @WorldofAnimal1: 🐾Beautiful Olive!!❤\n#dogs...,2020-10-20 17:27:57,rt beautiful olive dogsoftwitter doglover vete...
1,RT @WorldofAnimal1: 🐾Meet Gorgeous Sam!!😃\n#Ca...,2020-10-20 17:27:52,rt meet gorgeous sam catsontwitter catsoftwitt...
2,Injured \n\n#spooktober #digitalart #digitalil...,2020-10-20 17:19:35,injured spooktober digitalart digitalillustrat...
3,This video gives a high-level introduction to ...,2020-10-20 17:15:03,this video gives a high level introduction to ...
4,RT @WilliamsRef: Take a look at our pharmacy c...,2020-10-20 17:12:00,rt take a look at our pharmacy coldroom with b...


Converting to spark dataframe

In [9]:
# Hand the cleaned pandas frame over to Spark; no explicit schema is given.
sdf = spark.createDataFrame(data)
# Print a preview of the Spark DataFrame as a text table.
sdf.show()

+--------------------+-------------------+--------------------+
|              Tweets|         tweet_date|         clean_tweet|
+--------------------+-------------------+--------------------+
|RT @WorldofAnimal...|2020-10-20 17:27:57|rt beautiful oliv...|
|RT @WorldofAnimal...|2020-10-20 17:27:52|rt meet gorgeous ...|
|Injured 

#spookt...|2020-10-20 17:19:35|injured spooktobe...|
|This video gives ...|2020-10-20 17:15:03|this video gives ...|
|RT @WilliamsRef: ...|2020-10-20 17:12:00|rt take a look at...|
|RT @WorldofAnimal...|2020-10-20 17:08:17|rt beautiful oliv...|
|RT @WorldofAnimal...|2020-10-20 17:08:12|rt meet gorgeous ...|
|CM Inaugurates Su...|2020-10-20 17:08:00|cm inaugurates su...|
|Don't miss our la...|2020-10-20 17:05:10|don t miss our la...|
|RT @WorldofAnimal...|2020-10-20 17:02:28|rt meet gorgeous ...|
|Emergicare specia...|2020-10-20 17:02:18|emergicare specia...|
|Ride along with M...|2020-10-20 17:02:16|ride along with m...|
|RT @WorldofAnimal...|2020-10-20 16:59:5

Total number of tweets

In [10]:
# Number of rows (tweets) in the Spark DataFrame, displayed as the cell output.
sdf.count()

1200

Converting the cleaned tweet column to a list for tokenizing and generating the sentiment score

In [11]:
# Bring the cleaned tweet text back to the driver as a plain Python list,
# one string per row of the single selected column.
text = [row[0] for row in sdf.select("clean_tweet").collect()]

# Step 3

Read the Bing Liu Lexicon

In [13]:
def isNotNull(value):
    """Return True when ``value`` is neither None nor empty (length zero)."""
    if value is None:
        return False
    return len(value) != 0

def _load_lexicon(path):
    """Read a one-entry-per-line lexicon file into a list of lower-cased words.

    Blank lines are skipped, as are lines starting with ';'.

    Args:
        path: Location of the lexicon text file.

    Returns:
        List of entries in file order.
    """
    words = []
    # An explicit encoding keeps the result independent of the platform
    # default. NOTE(review): the opinion lexicon is commonly distributed as
    # Latin-1 text with ';'-prefixed header lines — confirm against the local
    # copy of the files.
    with open(path, 'r', encoding='latin-1') as lexicon_file:
        for line in lexicon_file:
            t = line.strip().lower()
            if isNotNull(t) and not t.startswith(';'):
                words.append(t)
    return words


# `with` inside the helper guarantees both files are closed even if reading fails.
dict_neg = _load_lexicon('IDS 561/Assignemnt3/opinion_lexicon/negative-words.txt')
dict_pos = _load_lexicon('IDS 561/Assignemnt3/opinion_lexicon/positive-words.txt')

Assign sentiment score from the Bing Liu lexicon to each tweet and store in a list

In [14]:
# Score every cleaned tweet as (#positive lexicon entries present) minus
# (#negative lexicon entries present). A lexicon entry counts once per tweet,
# however often the word occurs in it.
#
# The loop runs over `text` directly instead of range(sdf.count()): that
# avoids an extra Spark action and cannot index past the end of the list.
sentiment_analysis = []
for tweet_text in text:
    # A set gives constant-time membership tests while scanning the lexicons;
    # the counts are the same as with the token list.
    tokens = set(nltk.word_tokenize(tweet_text))
    pos_cnt = sum(1 for pos in dict_pos if pos in tokens)
    neg_cnt = sum(1 for neg in dict_neg if neg in tokens)
    sentiment_analysis.append(pos_cnt - neg_cnt)

Creating a dataframe of sentiment scores and merging it with the tweets dataframe

In [15]:
from pyspark.sql.types import IntegerType, StructType, StructField, LongType

# Wrap the list of per-tweet sentiment scores in a single-column Spark
# DataFrame of integers; the column is selected as "value" further down.
df2 = spark.createDataFrame(sentiment_analysis, IntegerType())

# Helper for merging the sentiment score dataframe with the original
# dataframe: both get a positional index column to join on.
def with_column_index(sdf):
    """Return ``sdf`` with an extra non-nullable long column ``ColumnIndex``.

    The value of the new column is the position assigned to each row by
    ``zipWithIndex`` on the underlying RDD.
    """
    index_field = StructField("ColumnIndex", LongType(), False)
    indexed_schema = StructType(sdf.schema.fields + [index_field])

    def append_index(pair):
        # zipWithIndex yields (row, position); extend the row with its position.
        record, position = pair
        return record + (position,)

    return sdf.rdd.zipWithIndex().map(append_index).toDF(schema=indexed_schema)

# Give the tweets and the scores the same positional index ...
df1_ci = with_column_index(sdf)
df2_ci = with_column_index(df2)
# ... pair them up row by row with an inner join on it, then discard the helper column.
# NOTE(review): the displayed rows are not in the original tweet order after
# the join; sort on the index before dropping it if order matters.
df_joined = df1_ci.join(df2_ci, df1_ci.ColumnIndex == df2_ci.ColumnIndex, 'inner').drop("ColumnIndex")
df_joined.show()

+--------------------+-------------------+--------------------+-----+
|              Tweets|         tweet_date|         clean_tweet|value|
+--------------------+-------------------+--------------------+-----+
|Surely Jonathan V...|2020-10-20 16:36:14|surely jonathan v...|    0|
|COVID-19 Update: ...|2020-10-20 16:35:09|covid 19 update g...|    1|
|Remaining HIPAA c...|2020-10-19 16:00:00|remaining hipaa c...|    2|
|BEST of the WEEK:...|2020-10-18 11:00:35|best of the week ...|    1|
|"Luna in the Gard...|2020-10-20 15:20:00|luna in the garde...|    1|
|Propofol sedation...|2020-10-20 11:07:02|propofol sedation...|    1|
|The #healthcare i...|2020-10-19 19:03:52|the healthcare in...|    1|
|RT @AFTER_MOUSE: ...|2020-10-19 12:58:18|rt mouse pong tou...|    1|
|Time for Marketin...|2020-10-19 11:48:55|time for marketin...|    0|
|RT @NewsRajwar: T...|2020-10-18 07:25:49|rt the adept team...|    0|
|Congrats to Henle...|2020-10-20 09:30:48|congrats to henle...|    0|
|RT @DrDMacaskill:..

Selecting and renaming columns as per requirement

In [16]:
# Keep only the raw tweet text and its score, renaming both columns
# positionally in one step.
df_final = (
    df_joined
    .select('Tweets', 'value')
    .toDF("Tweet_content", "Sentiment_Score")
)
df_final.show()

+--------------------+---------------+
|       Tweet_content|Sentiment_Score|
+--------------------+---------------+
|Surely Jonathan V...|              0|
|COVID-19 Update: ...|              1|
|Remaining HIPAA c...|              2|
|BEST of the WEEK:...|              1|
|"Luna in the Gard...|              1|
|Propofol sedation...|              1|
|The #healthcare i...|              1|
|RT @AFTER_MOUSE: ...|              1|
|Time for Marketin...|              0|
|RT @NewsRajwar: T...|              0|
|Congrats to Henle...|              0|
|RT @DrDMacaskill:...|              0|
|RT @BUMBLEance: T...|              1|
|RT @HyperfineR: T...|              0|
|Who will tackle #...|             -1|
|RT @NewsRajwar: T...|              0|
|RT @butterflyswim...|             -2|
|RT @ejmarkow: Vol...|              0|
|Improving #Cybers...|              2|
|RT @HyperfineR: T...|              0|
+--------------------+---------------+
only showing top 20 rows



Converting to pandas dataframe and saving locally as a csv

In [17]:
# Collect the final Spark DataFrame to the driver as a pandas frame.
pandas_df = df_final.toPandas()
# Save it next to the notebook, with a header row and without the pandas index.
pandas_df.to_csv('final_output.csv', header=True, index=False)
pandas_df.head()

Unnamed: 0,Tweet_content,Sentiment_Score
0,Surely Jonathan Van-Tam or #JVT as he's being ...,0
1,COVID-19 Update: Global Electronic Health Reco...,1
2,Remaining HIPAA compliant is one of the top pr...,2
3,BEST of the WEEK: North Wing extension of Rigs...,1
4,"""Luna in the Garden"" \nhttps://t.co/Z76wZkdk9V...",1
