# Syria Twitter Data Processing, Visualization, and NLP Analysis 


In [4]:
# import core libraries 
import datetime
import json
import csv
import ast
import pathlib
import itertools
from collections import Counter
from itertools import islice

# import third-party libraries
import lxml.html
import pandas
from pandas.io.json import json_normalize
from pandas import ExcelWriter

# import visualizations
%matplotlib inline
import matplotlib
import matplotlib.pyplot as plt


In [5]:
# new line json streamer for processing the data -> MAIN FUNCTION FOR ANALYSIS
# json objects must be stored per line, not in an arrary 
# my dataset contains separate JSON object on each line
def nljson_generator(json_path):
    with open(json_path) as file:
        for line in file: 
            yield json.loads(line)

# get the total number of tweet json objects in dataset
def count_tweet_objects(json_path):
    count = 0
    for tweet in nljson_generator(json_path): 
        count+=1
    return count

# read n number of json objects from the tweets dataset
def get_n_tweets(json_path, n_tweets):
    data = []
    for line in islice(nljson_generator(json_path), n_tweets):
        data.append(line)
    return(data)  


In [7]:
# set directory path data
twitter_data_dir = pathlib.Path('/Users/adamstueckrath/Desktop/twitter_data/')

# set tweets_no_rts_json
tweets_no_rts_json = twitter_data_dir / 'tweets_no_retweets' / 'tweets_no_retweets.json'


### Identify tweet attributes for analysis
I'm going identify all important attributes for this project and list them below. This will be a part of the data cleaning/processing section. I referenced the Twitter API official documentation to understand all of the fields and identify what information I want to pull from each tweet. Here's the link: https://developer.twitter.com/en/docs/tweets/data-dictionary/overview/tweet-object

**Twitter Tweet Attributes** <br /> 
Tweets are the basic atomic building block of all things Twitter. Tweets are also known as “status updates.” 
* `created_at`: UTC time when this Tweet was created.
* `id`: The integer representation of the unique identifier for this Tweet. 
* `id_str`: The string representation of the unique identifier for this Tweet.
* `text`: The actual UTF-8 text of the tweet.
* `source`: Utility used to post the Tweet as an HTML-formatted string.
* `retweet_count`: Number of times the tweet was retweeted.
* `favorite_count`: Number of times the tweet was favorited.
* `lang`: When present, indicates a BCP 47 language identifier corresponding to the machine-detected language of the Tweet text, or `und` if no language could be detected. 
* `coordinates`: Represents the geographic location of this Tweet as reported by the user or client application.
* `geo`: This deprecated attribute has its coordinates formatted as [lat, long], while all other Tweet geo is formatted as [long, lat].
* `place`: When present, indicates that the tweet is associated (but not necessarily originating from)

**Twitter User Attributes** <br />
The `user` object contains public Twitter account metadata and describes the account.
* `name`: The name of the user, as they’ve defined it.
* `screen_name`: The screen name, handle, or alias that this user identifies themselves with.
* `location`: The user-defined location for this account’s profile. 
* `verified`: When true, indicates that the user has a verified account.
* `followers_count`: The number of followers this account currently has. 
* `utc_offset`: To calculate the time relative to the user's timezone.

**Twitter Tweet Entities Attributes** <br /> 
The `entities` section provides arrays of common things included in Tweets: hashtags, user mentions, links, stock tickers (symbols), Twitter polls, and attached media.
* `hashtags`: Represents hashtags which have been parsed out of the Tweet text. (e.g. "#Syria" appears as "Syria")
* `user_mentions`: Represents other Twitter users mentioned in the text of the Tweet. 
* `media`: Represents media elements uploaded with the Tweet. 
* `type`: The actual type of media is specified in the media.
* `url`: the expanded version of urls included in the tweet (e.g. "https://t.co/ljRAxRICTr" is the shortened URL in the tweet and the full url is https://www.nytimes.com/reuters/2017/03/21/world/middleeast/21reuters-israel-syria-iran.html)
* `title`: HTML title for the link.
* `description`: HTML description for the link.


In [8]:
# data cleaning functions 
def clean_hashtags(tweet_hashtags):
    """
    Turns data with any number of hashtags like this:
    'hashtags': [{'text': 'FAKENEWS', 'indices': [80, 89]}]
    to a list like this -> ['FAKENEWS']
    """
    hashtags_cleaned = []
    if len(tweet_hashtags) >= 1:
        for tag in range(len(tweet_hashtags)):
            hashtag_text = tweet_hashtags[tag]['text'].lower()
            hashtags_cleaned.append(hashtag_text)
    return hashtags_cleaned

def clean_source(source):
    """
    Turns data including the source and some html like this:
    <a href="http://twitter.com/download/android" rel="nofollow">Twitter for Android</a> 
    to a string like this -> 'Twitter for Android'
    """
    try:
        raw = lxml.html.document_fromstring(source)
        raw = raw.text_content()
    except:
        return None
    return raw

def string_to_datetime(tweet_date):
    """
    Turns a datetime string like this: 
    '2017-07-06T18:34:37.000Z' 
    to a Python datetime object like this -> 2017-07-06 18:34:41
    """
    return datetime.datetime.strptime(tweet_date, "%Y-%m-%dT%H:%M:%S.%fZ")


def clean_user_mentions(user_mentions):
    """
    Turns data like this:
    [{'screen_name': 'TheSwogBlog', 'name': 'The Swog Blog', 'id': 7.130089490967429e+17, 
    'id_str': '713008949096742912', 'indices': [0, 12]}]
    into to a list -> ['TheSwogBlog']
    """
    user_mentions_cleaned = []
    if len(user_mentions) >= 1:
        for user in range(len(user_mentions)):
            mention = user_mentions[user]['screen_name']
            user_mentions_cleaned.append(mention)
    return user_mentions_cleaned

def clean_geo_and_coordinates(tweet_geo_or_coordinates):
    """
    Extracts elements of a dictionary like:  
    {'type': 'Point', 'coordinates': [35.3612, 31.3893]}
    into a list like this -> [35.3612, 31.3893]
    """
    tweet_coordinates = None
    if tweet_geo_or_coordinates:
        tweet_coordinates = tweet_geo_or_coordinates['coordinates']
    return tweet_coordinates

def clean_places(tweet_place):
    """
    Extracts elements of a dictionary like:  
    {'id': '65b23b0045f450f6', 'url': 'https://api.twitter.com/1.1/geo/id/65b23b0045f450f6.json', 
    'place_type': 'city', 'name': 'Kingston upon Thames', 'full_name': 'Kingston upon Thames, London', 
    'country_code': 'GB', 'country': 'United Kingdom', 
    'bounding_box': {'type': 'Polygon', 'coordinates': [[[-0.322917, 51.34286], 
    [-0.322917, 51.437266], [-0.234011, 51.437266], [-0.234011, 51.34286]]]}, 'attributes': {}}
    
    Returns a tuple of the dictionary elements: 
    ('city', 'Kingston upon Thames', 'Kingston upon Thames, London', 'GB', 'United Kingdom', 'Polygon', 
    [[-0.322917, 51.34286], [-0.322917, 51.437266], [-0.234011, 51.437266], [-0.234011, 51.34286]])
    
    """
    place_type = name = full_name = country_code = None
    country = bounding_box_type = bounding_box_coordinates = None

    if tweet_place:
        place_type = tweet_place['place_type']
        name = tweet_place['name']
        full_name = tweet_place['full_name']
        country_code = tweet_place['country_code']
        country = tweet_place['country']
        bounding_box_type = tweet_place['bounding_box']['type']
        bounding_box_coordinates = tweet_place['bounding_box']['coordinates'][0]
    
    return place_type, name, full_name, country_code, \
           country, bounding_box_type, bounding_box_coordinates

def clean_entities_url(tweet_entities):
    """
    Extracts the expanded url from a dictionary like:  
    [{'url': 'https://t.co/Eiqt4Gu4hs', 'expanded_url': 'https://twitter.com/i/web/status/883031529303232512', 
    'display_url': 'twitter.com/i/web/status/8…', 'indices': [121, 144]}]
    To a string like this -> 'https://twitter.com/i/web/status/883031529303232512'
    """
    tweet_entities_urls_expanded_url = None
    if tweet_entities['urls']:
        tweet_entities_urls_expanded_url = tweet_entities['urls'][0]['expanded_url']
    return tweet_entities_urls_expanded_url

def clean_extended_entities(tweet):
    """
    Extracts elements of the tweet extended_entities attribute like:  
    {'expanded_url': 'https://twitter.com/Reuters/status/883028019266281472/video/1', 'type': 'video'}
    """
    tweet_extended_entities_media_type = None
    tweet_extended_entities_media_url = None
    tweet_extended_entities = tweet.get('extended_entities', None)
    if tweet_extended_entities:
        tweet_entended_entities_media = tweet_extended_entities.get('media', None)
        if tweet_entended_entities_media:
            tweet_extended_entities_media_type = tweet_entended_entities_media[0]['type']       
            tweet_extended_entities_media_url = tweet_entended_entities_media[0]['expanded_url']
    return tweet_extended_entities_media_type, tweet_extended_entities_media_url


In [9]:
# write twitter data to flatten csv file
def write_tweets_to_csv(tweets_json, outfile):
    # the headers are the fields that we identified in step 4
    headers = ['tweet_id', 'tweet_id_str', 'tweet_created_at', 'tweet_geo', 'tweet_coordinates', 
               'place_type', 'place_name', 'place_full_name', 'place_country', 'place_country_code',
               'bounding_box_type', 'bounding_box_coordinates', 'tweet_lang', 
               'tweet_source', 'tweet_text', 'tweet_retweet_count', 'tweet_favorite_count', 
               'user_id_str', 'user_screen_name', 'user_name', 'user_location', 'user_utc_offset', 
               'user_verified', 'user_followers_count', 'tweet_hashtags', 'tweet_user_mentions', 
               'tweet_expanded_url', 'tweet_media_type', 'tweet_media_url']
            

In [58]:
# tweets_no_rts_csv file path
tweets_no_rts_csv = twitter_data_dir / 'tweets_no_retweets' / 'tweets_no_retweets.csv'


In [59]:
# load tweets into dataframe from csv file
tweets_no_rts_df = pandas.read_csv(tweets_no_rts_csv, header=0,
                                   parse_dates=['tweet_created_at'], 
                                   date_parser=string_to_datetime)


In [61]:
# print dataframe
tweets_no_rts_df.shape


(1160088, 29)

In [62]:
test = 'Russian warplanes bombed the town of Suha in Hama province Neither fatalities nor injuries are reported'
event_list = test.lower().split()
print(test_list)

['russian', 'warplanes', 'bombed', 'the', 'town', 'of', 'suha', 'in', 'hama', 'province', 'neither', 'fatalities', 'nor', 'injuries', 'are', 'reported']


In [67]:
import re
def clean_tweet(tweet):
    '''
    Utility function to clean the text in a tweet by removing 
    links and special characters using regex.
    '''
    return ' '.join(re.sub("(@[A-Za-z0-9]+)|([^0-9A-Za-z \t])|(\w+:\/\/\S+)", " ", tweet).split())

def check_tweet(tweet):
    
    tweet = clean_tweet(tweet.lower())
    tweet = tweet.split()
    for word in tweet: 
        if word in event_list:
            return True
        return False 
    
tweets_no_rts_df['Test'] = tweets_no_rts_df["tweet_text"].apply(lambda tweet: check_tweet(tweet))


In [74]:
event_date = pandas.to_datetime('2017-07-06').date()
print(event_date)


2017-07-06


In [75]:
def testing(tweet_date):
    if tweet_date.date() == event_date:
        return True
    return False 

tweets_no_rts_df['Test_Date'] = tweets_no_rts_df["tweet_created_at"].apply(lambda tweet: testing(tweet))



In [76]:
tweets_no_rts_df['Test_Date'].value_counts()

False    1153599
True        6489
Name: Test_Date, dtype: int64

In [79]:
df_filtered = tweets_no_rts_df[(tweets_no_rts_df.Test_Date == True) & (tweets_no_rts_df.Test == True)]

In [84]:
df_filtered.to_csv('/Users/adamstueckrath/Desktop/twitter_data/test.csv')

In [78]:
test_df = tweets_no_rts_df[tweets_no_rts_df['Test_Date'] == True & tweets_no_rts_df['Test'] == True]

ValueError: The truth value of a Series is ambiguous. Use a.empty, a.bool(), a.item(), a.any() or a.all().

## Top Tweet hashtags

In [110]:
# get top hashtags
top_hashtags_df = pandas.DataFrame(tweets_no_rts_df, 
                                   columns=['tweet_hashtags'])
top_hashtags_df['tweet_hashtags'] = top_hashtags_df['tweet_hashtags'].apply(lambda x: ast.literal_eval(x))
top_hashtags_list = top_hashtags_df['tweet_hashtags'].tolist()
top_hashtags_list = list(itertools.chain.from_iterable(top_hashtags_list))
top_n_hashtags = Counter(top_hashtags_list).most_common(20)


In [1]:
# make dataframe of top hashtags to chart
#  top_hashtags_df = pandas.DataFrame(top_n_hashtags, columns=['hashtag', 'hashtag_count'])



## Tweet Media Types

In [114]:
from textblob import TextBlob
import re

def clean_tweet(tweet):
    '''
    Utility function to clean the text in a tweet by removing 
    links and special characters using regex.
    '''
    return ' '.join(re.sub("(@[A-Za-z0-9]+)|([^0-9A-Za-z \t])|(\w+:\/\/\S+)", " ", tweet).split())

def analize_sentiment(tweet):
    '''
    Utility function to classify the polarity of a tweet
    using textblob.
    '''
    analysis = TextBlob(clean_tweet(tweet))
    if analysis.sentiment.polarity > 0:
        return 1
    elif analysis.sentiment.polarity == 0:
        return 0
    else:
        return -1

In [115]:
# get sentiment of english tweets
tweet_sentiment_analysis_df = tweets_no_rts_df[(tweets_no_rts_df.tweet_lang == 'en')]
tweet_sentiment_analysis_df = tweet_sentiment_analysis_df.reset_index(drop=True) 
tweet_sentiment_analysis_df['SentimentAnalysis'] = tweet_sentiment_analysis_df['tweet_text'].apply(analize_sentiment)


In [161]:
# print sentiment analysis values
total_values = len(tweet_sentiment_analysis_df.index)
sentiment_values = tweet_sentiment_analysis_df['SentimentAnalysis'].value_counts().to_dict()
positive_tweets = sentiment_values[1]
neutral_tweets = sentiment_values[0]
negative_tweets = sentiment_values[-1]
print("Percentage of positive tweets: {}%".format(round(positive_tweets*100/total_values)))
print("Percentage of neutral tweets: {}%".format(round(neutral_tweets*100/total_values)))
print("Percentage de negative tweets: {}%".format(round(negative_tweets*100/total_values)))


Percentage of positive tweets: 28%
Percentage of neutral tweets: 52%
Percentage de negative tweets: 20%


In [162]:
sentiment_values['Posivite'] = sentiment_values.pop(1)
sentiment_values['Neutral'] = sentiment_values.pop(0)
sentiment_values['Negative'] = sentiment_values.pop(-1)

For more accurate results I should consider adding the retweets back into my dataset, but dropping any duplicates. An interesting idea would be to analyze the polarity of the tweets from different media types. It might be deterministic that by only considering the tweets from a specific media type, the polarity would result more positive/negative.

## Next steps in my analysis