## Pair Programming
#### This file is another version solution, topic: breakfast, lunch, dinner.

#### For weekly cross check and efficient code review, we wrote two versions code, same structure with different methods and examples. Thus not only help us improve our programming level, but also hope to provide more references for other developers.

In [1]:
import re
import json
from csv import reader, writer, DictWriter
import numpy as np
import pandas as pd
from bokeh.plotting import figure, show
from bokeh.transform import factor_cmap, factor_mark
from bokeh.models import ColumnDataSource

# PART 1 - Working with Texts and Sentiments

## 1A - Parsing the Raw Tweets Texts

In [2]:
# takes in a string of text as its parameter 
# returns a list of the words in that string
def extract_words(text):
    pat = '[a-zA-Z]+'
    res = re.findall(pat,text)
    return [s.lower() for s in res]

In [3]:
extract_words("Eating noddles for dinner. I feel like I\u2019m back in college lol")

['eating',
 'noddles',
 'for',
 'dinner',
 'i',
 'feel',
 'like',
 'i',
 'm',
 'back',
 'in',
 'college',
 'lol']

## 1B - Using an Objective Sentiment Dictionary

In [4]:
# takes in as a parameter -- a file containing word sentiment scores (e.g.:AFINN-111.csv)
# return a dictionary whose keys are the words in and whose values are the numeric sentiment scores of those words
def load_sentiments(senti_file):
    f = open(senti_file, 'r')
    read = reader(f)
    result = {}

    for row in read:
        result[row[0]] = float(row[1])

    return result    

In [5]:
location = %pwd
print(location)

E:\jupyter_files_save\DAMG_6105\Twitter_Sentiment_Analysis


In [6]:
sentiment_file = location + "\AFINN-111.csv"
print(sentiment_file)

E:\jupyter_files_save\DAMG_6105\Twitter_Sentiment_Analysis\AFINN-111.csv


In [7]:
sentiment_dictionary = load_sentiments(sentiment_file)
print(sentiment_dictionary)



## 1C - Get the Sentiment Score of a "Clean" Sentence

In [8]:
# use 1A to divide string into words
# use 1B's dict to get value(score) of every word; 
# sum all the scores
def text_sentiment(text, senti_dic):
    wordsList = extract_words(text)

    sumScores = 0

    for word in wordsList:
        if word in senti_dic:
            sumScores = sumScores + senti_dic.get(word) 

    return sumScores

In [9]:
text_sentiment("Eating noddles for dinner. I feel like Im back in college lol", sentiment_dictionary)

5.0

# PART 2 - Working with Twitter JSON Files

## Preparation - The "data" folder contains .txt files we downloaded for further analysis using "twitter_stream.py" and Twitter API from the terminal.
#### The filenames also contain our search terms when requesting those tweets, e.g.: "breakfast".
#### We've taken out our own ACCESS_TOKEN_KEY following Twitter's security protocols.

In [10]:
data_location = location + "\data"
print(data_location)

E:\jupyter_files_save\DAMG_6105\Twitter_Sentiment_Analysis\data


In [11]:
twitter_stream_breakfast_file = data_location + "/12.9.2021_breakfast_count100.txt"
print(twitter_stream_breakfast_file)

E:\jupyter_files_save\DAMG_6105\Twitter_Sentiment_Analysis\data/12.9.2021_breakfast_count100.txt


In [12]:
twitter_stream_lunch_file = data_location + "/12.9.2021_lunch_count100.txt"
print(twitter_stream_lunch_file)

E:\jupyter_files_save\DAMG_6105\Twitter_Sentiment_Analysis\data/12.9.2021_lunch_count100.txt


In [13]:
twitter_stream_dinner_file = data_location + "/12.9.2021_dinner_count100.txt"
print(twitter_stream_dinner_file)

E:\jupyter_files_save\DAMG_6105\Twitter_Sentiment_Analysis\data/12.9.2021_dinner_count100.txt


## 2A - Tidy up the Twitter data from JSON string

In [14]:
# takes in a file containing tweets 
# returns a list of objects (dictionaries) representing those tweets
def load_tweets(tweets_file):
    resultList = []
    newDict = {}

    with open(tweets_file) as txtData:
        lines = txtData.readlines()
    for line in lines:
        line_dict = json.loads(line)

        new_dict = {key: line_dict[key] for key in ['created_at', 'text', 'retweet_count', 'favorite_count']}

        new_dict['text'] = new_dict['text'].lower()
        new_dict['user.screen_name'] = line_dict.get('user').get('screen_name')

        new_dict['entities.hashtags[i].text'] = []
        hashtagsList = line_dict.get('entities').get('hashtags')
        for hashtagDict in hashtagsList:
            new_dict['entities.hashtags[i].text'].append(hashtagDict.get('text'))

        resultList.append(new_dict)

    return resultList

In [15]:
# use "breakfast" topic tweets file as an example
load_tweets(twitter_stream_breakfast_file)

[{'created_at': 'Thu Dec 09 23:26:38 +0000 2021',
  'text': 'rt @himacomfort: look at the way this cutie sunflower ate her breakfast 💕 https://t.co/y0ojklad7l',
  'retweet_count': 67,
  'favorite_count': 0,
  'user.screen_name': 'NaruHinaHeart',
  'entities.hashtags[i].text': []},
 {'created_at': 'Thu Dec 09 23:44:42 +0000 2021',
  'text': '@supreeeme that’s what i’m having for dinner! i didn’t have a taste for anything but needed something on my stomach… https://t.co/ldzgki7cem',
  'retweet_count': 0,
  'favorite_count': 0,
  'user.screen_name': 'LOVESHANNIE87',
  'entities.hashtags[i].text': []},
 {'created_at': 'Thu Dec 09 23:44:42 +0000 2021',
  'text': "@apeach_here don't skip your breakfast, be happy and have a nice day",
  'retweet_count': 0,
  'favorite_count': 0,
  'user.screen_name': 'onyourwntr',
  'entities.hashtags[i].text': []},
 {'created_at': 'Thu Dec 09 23:44:39 +0000 2021',
  'text': "@nauudoyiee don't skip your breakfast and have a nice day!!",
  'retweet_count': 0,


## 2B - Calculate Tweet Popularity from the JSON File

In [16]:
# takes in a file containing tweets 
# returns a tuple containing the average number of _retweets_ (in the first entry)
# and the average number of _favorites_ (in the second entry)
def popularity(tweets_file):
    filelist = load_tweets(tweets_file)

    sumRetweets = 0
    countRetweets = 0
    sumFavorite = 0
    countFavorite = 0

    for tweetdict in filelist:
        # find retweets number
        # add it into sum
        sumRetweets = sumRetweets + tweetdict.get('retweet_count')
        countRetweets = countRetweets + 1
        # find favorites number
        # add it into its sum
        sumFavorite = sumFavorite + tweetdict.get('favorite_count')
        countFavorite = countFavorite + 1

    # calculate the average of retweets and fav, add them into tuple
    aveRetweets = float(sumRetweets / countRetweets)
    aveFavorite = float(sumFavorite / countFavorite)
    return tuple((aveRetweets, aveFavorite))    

#### It looks like "lunch" is the most popular topic today, because its average retweets times is much higher than other two.   But "breakfast" got highest average number of favorites...

In [17]:
breakfast_pop = popularity(twitter_stream_breakfast_file)
print(breakfast_pop)

(1201.32, 0.09)


In [18]:
lunch_pop = popularity(twitter_stream_lunch_file)
print(lunch_pop)

(4097.12, 0.02)


In [19]:
dinner_pop = popularity(twitter_stream_dinner_file)
print(dinner_pop)

(526.82, 0.05)


## What are the Trending Hashtags related to these topics?

## 2C - Trending Hashtags

In [20]:
# takes in a file containing tweets 
# returns a list of tuples, where each tuple contains a hashtag in the data set 
# and the number of times that hashtag was used. 

# This list will be ordered by the frequency, so that most popular hashtags are at the top.
def hashtag_counts(tweets_file):
    fileList = load_tweets(tweets_file)
    resDict = {}

    # find hashtag as key, count times as value, sort by value
    for tweetDict in fileList:
        hashtagsList = tweetDict.get('entities.hashtags[i].text')
        hashtagsSet = set(hashtagsList)

        for hashtag in hashtagsSet:
            hashtag = '#' + hashtag

            if hashtag in resDict:
                resDict[hashtag] = resDict[hashtag] + 1
            else:
                resDict[hashtag] = 1

    # transfer dict into tuple
    resTupleList = sorted(resDict.items(), key = lambda x : x[1], reverse = True)

    return resTupleList

In [21]:
# use "breakfast" topic tweets file as an example
hashtag_counts(twitter_stream_breakfast_file)

[('#TravisJapan', 5),
 ('#SnowMan', 5),
 ('#SixTONES', 5),
 ('#ザ少年倶楽部', 3),
 ('#美少年', 2),
 ('#少クラ', 2),
 ('#HiHiJets', 2),
 ('#lunch', 1),
 ('#food', 1),
 ('#Calabasasfood', 1),
 ('#Calabasas', 1),
 ('#foodie', 1),
 ('#foodlover', 1),
 ('#dinner', 1),
 ('#Calabasasfoodie', 1),
 ('#JumatBerkah', 1),
 ('#bakingbacon', 1),
 ('#GetNoticed', 1),
 ('#ZouainEntertainment', 1),
 ('#ZouainManor', 1),
 ('#PlaceWar', 1),
 ('#Metaverse', 1),
 ('#NFTGame', 1)]

# PART 3 - Working with Both Tweets and Sentiments

## 3A - What is the sentiment score for each tweet?

In [22]:
# takes in two parameters: the tweet data file and a sentiment data file 
# return a list of tweet objects:
# 1.similar to load_tweets() method
# 2.but each tweet object has an additional field - sentiment - holds the sentiment of the tweet's text. 

def tweet_sentiments(tweets_file, senti_file):
    # use 2A load_tweets() produce a list of dicts
    # every dicts in this list should add one key: 'sentiment'
    # calculate the value of 'sentiment' by 1C text_sentiment()
    tweetsList = load_tweets(tweets_file)
    sentimentDict = load_sentiments(senti_file)

    for tweetDict in tweetsList:
        tweetDict['sentiment'] = text_sentiment(tweetDict['text'], sentimentDict)

    return tweetsList  

In [23]:
# use "breakfast" topic tweets file as an example
list_of_breakfast_objects = tweet_sentiments(twitter_stream_breakfast_file, sentiment_file)
print(list_of_breakfast_objects)

[{'created_at': 'Thu Dec 09 23:26:38 +0000 2021', 'text': 'rt @himacomfort: look at the way this cutie sunflower ate her breakfast 💕 https://t.co/y0ojklad7l', 'retweet_count': 67, 'favorite_count': 0, 'user.screen_name': 'NaruHinaHeart', 'entities.hashtags[i].text': [], 'sentiment': 0}, {'created_at': 'Thu Dec 09 23:44:42 +0000 2021', 'text': '@supreeeme that’s what i’m having for dinner! i didn’t have a taste for anything but needed something on my stomach… https://t.co/ldzgki7cem', 'retweet_count': 0, 'favorite_count': 0, 'user.screen_name': 'LOVESHANNIE87', 'entities.hashtags[i].text': [], 'sentiment': 0}, {'created_at': 'Thu Dec 09 23:44:42 +0000 2021', 'text': "@apeach_here don't skip your breakfast, be happy and have a nice day", 'retweet_count': 0, 'favorite_count': 0, 'user.screen_name': 'onyourwntr', 'entities.hashtags[i].text': [], 'sentiment': 6.0}, {'created_at': 'Thu Dec 09 23:44:39 +0000 2021', 'text': "@nauudoyiee don't skip your breakfast and have a nice day!!", 'retwee

In [24]:
# prepare other two lists for part4 visualation use
list_of_lunch_objects = tweet_sentiments(twitter_stream_lunch_file, sentiment_file)
list_of_dinner_objects = tweet_sentiments(twitter_stream_dinner_file, sentiment_file)

## 3B - What's the average sentiment scores for a specific hashtag in the JSON File?

In [25]:
# takes as parameters a tweet data file, and a sentiment data file
# return a list of tuples, where each tuple contains a hashtag in the data set 
# and the sentiment of that hashtag, defined as: the average sentiment of the tweets that contain that hashtag.

# This list will be ordered by the sentiment, so that most positive hashtags are at the top.

# optional third parameter: query - representing a "search term".
# If the function is called with this parameter, then the returned list will only contain hashtags that have the parameter's value _`in`_ them
# e.g.: if the query is "breakfast" then "breakfast" and the hashtag string contains "breakfast" would be returned
def hashtag_sentiments(tweets_file, senti_file, query=None):
    tweetsList = tweet_sentiments(tweets_file, senti_file)
    hashtagFrequencyDict = {}
    totalSentimentDict = {}
    resDict = {}
    resQueryDict = {}

# find hashtag as key, count times as value, sort by value
    for tweetDict in tweetsList:
        # get hashtags
        hashtagsList = tweetDict.get('entities.hashtags[i].text')
        hashtagsSet = set(hashtagsList)

        #calculate the hashtag's frequency
        for hashtag in hashtagsSet:
            hashtag = '#' + hashtag

            if hashtag in hashtagFrequencyDict:
                hashtagFrequencyDict[hashtag] = hashtagFrequencyDict[hashtag] + 1
                totalSentimentDict[hashtag] = totalSentimentDict[hashtag] + tweetDict.get('sentiment')
            else:
                hashtagFrequencyDict[hashtag] = 1
                totalSentimentDict[hashtag] = tweetDict.get('sentiment')

    # for third parameter query
    for hashtag in totalSentimentDict:
#         if re.findall(query, hashtag.lower()):
        resDict[hashtag] = totalSentimentDict[hashtag] / hashtagFrequencyDict[hashtag]
        
    # transfer dict into tuple list
    resTupleList = sorted(resDict.items(), key = lambda x : x[1], reverse = True)

    return resTupleList  

In [26]:
breakfast_hashtag_lists = hashtag_sentiments(twitter_stream_breakfast_file, sentiment_file)
print(breakfast_hashtag_lists)

[('#JumatBerkah', 3.0), ('#lunch', 2.0), ('#food', 2.0), ('#Calabasasfood', 2.0), ('#Calabasas', 2.0), ('#foodie', 2.0), ('#foodlover', 2.0), ('#dinner', 2.0), ('#Calabasasfoodie', 2.0), ('#bakingbacon', 0.0), ('#GetNoticed', 0.0), ('#ZouainEntertainment', 0.0), ('#ZouainManor', 0.0), ('#PlaceWar', 0.0), ('#Metaverse', 0.0), ('#NFTGame', 0.0), ('#TravisJapan', -1.0), ('#ザ少年倶楽部', -1.0), ('#SnowMan', -1.0), ('#SixTONES', -1.0), ('#美少年', -1.0), ('#少クラ', -1.0), ('#HiHiJets', -1.0)]


In [27]:
dinner_hashtag_lists = hashtag_sentiments(twitter_stream_breakfast_file, sentiment_file)
print(dinner_hashtag_lists)

[('#JumatBerkah', 3.0), ('#lunch', 2.0), ('#food', 2.0), ('#Calabasasfood', 2.0), ('#Calabasas', 2.0), ('#foodie', 2.0), ('#foodlover', 2.0), ('#dinner', 2.0), ('#Calabasasfoodie', 2.0), ('#bakingbacon', 0.0), ('#GetNoticed', 0.0), ('#ZouainEntertainment', 0.0), ('#ZouainManor', 0.0), ('#PlaceWar', 0.0), ('#Metaverse', 0.0), ('#NFTGame', 0.0), ('#TravisJapan', -1.0), ('#ザ少年倶楽部', -1.0), ('#SnowMan', -1.0), ('#SixTONES', -1.0), ('#美少年', -1.0), ('#少クラ', -1.0), ('#HiHiJets', -1.0)]


## 3C - Is it true that positive tweets will get more retweets?
### Calculating the [Pearson correlation coefficient](https://en.wikipedia.org/wiki/Pearson_correlation_coefficient)

In [28]:
# takes as parameters a tweet data file, and a sentiment data file
# return the correlation between the sentiment of a tweet and the number of times that tweet was retweeted.
def popular_sentiment(tweets_file, senti_file):
    tweetsList = tweet_sentiments(tweets_file, senti_file)

    sentimentList = []
    retweetedNumList = []
    # x - get every sentiment number, and save into list
    # y - get every retweets number, and save into list
    for tweetDict in tweetsList:
        sentimentList.append(tweetDict.get('sentiment'))
        retweetedNumList.append(tweetDict.get('retweet_count'))

    # corrcoef(x, y)
    x = np.array(sentimentList)
    y = np.array(retweetedNumList)

    return np.corrcoef(x, y)

In [29]:
popular_sentiment(twitter_stream_breakfast_file, sentiment_file)

array([[1.        , 0.01530874],
       [0.01530874, 1.        ]])

In [30]:
popular_sentiment(twitter_stream_lunch_file, sentiment_file)

array([[1.        , 0.04176552],
       [0.04176552, 1.        ]])

In [31]:
popular_sentiment(twitter_stream_dinner_file, sentiment_file)

array([[1.        , 0.15551894],
       [0.15551894, 1.        ]])

#### It seems the answer is yes related to this topic? And it shows that there is more correlation among "dinner" tweets than "lunch" and "breakfast" tweets.

# PART 4 - Data Visualization

## 4A - Write all the processed data into a csv file

In [32]:
headers = ['created_at', 'text', 'retweet_count', 'favorite_count', 'user.screen_name', 'entities.hashtags[i].text', 'sentiment', 'search_term', 'senti_ratio']

In [33]:
# write the processed tweets data into a csv file.
def export_to_csv(filename, list_of_tweet_objects, headers):
    with open(filename, "w", encoding='utf-8', newline='') as file: 
        csv_writer = DictWriter(file, fieldnames = headers)
        csv_writer.writeheader()
        for obj in list_of_tweet_objects:
            csv_writer.writerow(obj)

In [34]:
csv_location = location + '/exported_csv'
breakfast_filename = csv_location + '/breakfast.csv'
lunch_filename = csv_location + '/lunch.csv'
dinner_filename = csv_location + '/dinner.csv'
print(breakfast_filename)

E:\jupyter_files_save\DAMG_6105\Twitter_Sentiment_Analysis/exported_csv/breakfast.csv


## 4B - Attach search_term and sentiment ratio to the objects

In [35]:
def attach_search_senti_ratio(list_of_tweet_objects, term, totalscore):
    for obj in list_of_tweet_objects:
        obj['search_term'] = term
        obj['senti_ratio'] = obj['sentiment'] / totalscore

In [36]:
attach_search_senti_ratio(list_of_breakfast_objects, "breakfast", popular_sentiment(twitter_stream_breakfast_file, sentiment_file)[1])
export_to_csv(breakfast_filename, list_of_breakfast_objects, headers)

In [37]:
attach_search_senti_ratio(list_of_lunch_objects, "lunch", popular_sentiment(twitter_stream_lunch_file, sentiment_file)[1])
export_to_csv(lunch_filename, list_of_lunch_objects, headers)

In [38]:
attach_search_senti_ratio(list_of_dinner_objects, "dinner", popular_sentiment(twitter_stream_dinner_file, sentiment_file)[1])
export_to_csv(dinner_filename, list_of_dinner_objects, headers)

## 4C - Create suitable dataframes with Pandas

In [39]:
df_1 = pd.read_csv(breakfast_filename)
df_2 = pd.read_csv(lunch_filename)
df_3 = pd.read_csv(dinner_filename)
df_list = [df_1,df_2, df_3]
merged_df = pd.concat(df_list, axis = 0)
merged_df.shape

(300, 9)

In [40]:
merged_df.head()

Unnamed: 0,created_at,text,retweet_count,favorite_count,user.screen_name,entities.hashtags[i].text,sentiment,search_term,senti_ratio
0,Thu Dec 09 23:26:38 +0000 2021,rt @himacomfort: look at the way this cutie su...,67,0,NaruHinaHeart,[],0.0,breakfast,[0. 0.]
1,Thu Dec 09 23:44:42 +0000 2021,@supreeeme that’s what i’m having for dinner! ...,0,0,LOVESHANNIE87,[],0.0,breakfast,[0. 0.]
2,Thu Dec 09 23:44:42 +0000 2021,"@apeach_here don't skip your breakfast, be hap...",0,0,onyourwntr,[],6.0,breakfast,[391.9328567 6. ]
3,Thu Dec 09 23:44:39 +0000 2021,@nauudoyiee don't skip your breakfast and have...,0,0,Naaajefff,[],3.0,breakfast,[195.96642835 3. ]
4,Thu Dec 09 23:44:39 +0000 2021,"rt @lovisdeli: crepes for brunch, yes, please!...",1,0,CalabasasAlive,"['food', 'foodie', 'foodlover', 'lunch', 'dinn...",2.0,breakfast,[130.64428557 2. ]


## 4D - Visualize data with Bokeh

In [41]:
# using Bokeh:
# source = ColumnDataSource(data=merged_df)
# p = figure()
# p.circle(x='senti_ratio', y='sentiment', source=source)
# show(p)
SEARCH_TERMS = sorted(merged_df.search_term.unique())
MARKERS = ['hex', 'circle_x', 'triangle']
p = figure(title = "Correlation", background_fill_color="#fafafa")
p.xaxis.axis_label = 'Sentiment'
p.yaxis.axis_label = 'senti_ratio'
p.scatter("sentiment", "retweet_count", source=merged_df,
          legend_group="search_term", fill_alpha=0.4, size=12,
          marker=factor_mark('search_term', MARKERS, SEARCH_TERMS),
          color=factor_cmap('search_term', 'Category10_3', SEARCH_TERMS))

p.legend.location = "top_left"
p.legend.title = "Search Terms"

show(p)

In [42]:
tags = []
scores = []
for hashtag, score in breakfast_hashtag_lists:
    tags.append(hashtag)
    scores.append(score)

p = figure(x_range=tags, width = 2000, height=850, title="Hashtags",
           toolbar_location=None, tools="")
p.vbar(x=tags, top=scores, width=0.6)

p.xgrid.grid_line_color = None
p.xgrid.minor_grid_line_width = 100
p.ygrid.band_hatch_scale = 100.0
p.xaxis.major_label_orientation = 1
p.xaxis.major_label_text_font_size = "16px"

show(p)