# Visualisation of Data streamed to MongoDB

## Imports

In [None]:
import re
import plotly
import plotly.plotly as py
import plotly.tools as tls
from plotly.graph_objs import *


# http://tweepy.readthedocs.io/en/v3.5.0/index.html
import tweepy
from tweepy import OAuthHandler
# https://pandasguide.readthedocs.io/en/latest/
import pandas as pd
# https://numpy.readthedocs.io/en/latest/
import numpy as np
# https://api.mongodb.com/python/current/
import pymongo

from IPython.display import display
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline

# Connection URL handed to pymongo.MongoClient when tweets are loaded.
MONGO_URL = 'mongodb://twitter-mongodb:27017/'

## MongoDB
To access MongoDB, the `pymongo` library is used.

In the first step, the MongoDB URL is defined.

MONGO_URL = 'mongodb://twitter-mongodb:27017/'

In [None]:
def load_from_mongo(mongo_db, mongo_db_coll, return_cursor=False, criteria=None, projection=None):
    """Fetch documents from a MongoDB collection on the server at ``MONGO_URL``.

    Args:
        mongo_db: Name of the database to read from.
        mongo_db_coll: Name of the collection inside that database.
        return_cursor: If true, return the live ``pymongo`` cursor instead of
            a list. Recommended for large amounts of data. The client is
            left open in that case because the cursor still needs it.
        criteria: Optional query filter passed to ``find``; ``None`` is
            replaced by ``{}`` (match every document).
        projection: Optional projection passed to ``find`` to limit the
            fields that are returned.

    Returns:
        The cursor when ``return_cursor`` is true, otherwise a list holding
        every matching document.

    See http://docs.mongodb.org/manual/reference/method/db.collection.find/
    """
    # Connect to the MongoDB server and reference the requested collection
    client = pymongo.MongoClient(MONGO_URL)
    coll = client[mongo_db][mongo_db_coll]

    if criteria is None:
        criteria = {}
    if projection is None:
        cursor = coll.find(criteria)
    else:
        cursor = coll.find(criteria, projection)

    # Returning a cursor is recommended for large amounts of data
    if return_cursor:
        return cursor

    # The results are fully materialised here, so the connection is no longer
    # needed afterwards; close it instead of leaking it on every call.
    try:
        return list(cursor)
    finally:
        client.close()

## Get some tweets to work with

In [None]:
# Load every document of the 'tweets' collection in the 'trump' database
tweets = load_from_mongo('trump', 'tweets')

# Preview the text of the first five tweets
preview = tweets[:5]
for tweet in preview:
    print(tweet['text'])

## Creating a DataFrame with Pandas

In [None]:
# Collect the text of every tweet and wrap it in a one-column DataFrame
tweet_texts = [tweet['text'] for tweet in tweets]
data = pd.DataFrame(data=tweet_texts, columns=['Tweets'])

# Display the first 5 rows of the DataFrame
first_rows = data.head(5)
display(first_rows)

## Extend our DataFrame

In [None]:
# Extend the DataFrame with further per-tweet columns, added in this order:
# len, ID, Date, Source, Likes, RTs.
data['len'] = np.array([len(tweet['text']) for tweet in tweets])

# Columns copied unchanged from a field of each tweet document
for column, field in (('ID', 'id'), ('Date', 'created_at')):
    data[column] = np.array([tweet[field] for tweet in tweets])

# Keep only what the regex captures between '>' and '<' in the 'source' field
# (presumably an HTML anchor tag -- confirm against the stored documents)
source_names = [''.join(re.findall('.*>(.*)<.*', tweet['source'])) for tweet in tweets]
data['Source'] = np.array(source_names)

for column, field in (('Likes', 'favorite_count'), ('RTs', 'retweet_count')):
    data[column] = np.array([tweet[field] for tweet in tweets])

display(data.head(10))



## Get average length of Tweets

In [None]:
# Average tweet length: the mean of the 'len' column
mean = data['len'].mean()

summary = "The lenght's average in tweets: {}"
print(summary.format(mean))

## Get Tweets with the Most Likes and Retweets

In [None]:
# Find the tweet with the most likes and the tweet with the most retweets.

# Highest like and retweet counts in the data set
fav_max = data['Likes'].max()
rt_max  = data['RTs'].max()

# Index label of the first row that reaches each maximum
fav = data.loc[data['Likes'] == fav_max].index[0]
rt  = data.loc[data['RTs'] == rt_max].index[0]

# Most-liked tweet:
print("The tweet with more likes is: \n{}".format(data.loc[fav, 'Tweets']))
print("Number of likes: {}".format(fav_max))
print("{} characters.\n".format(data.loc[fav, 'len']))

# Most-retweeted tweet:
print("The tweet with more retweets is: \n{}".format(data.loc[rt, 'Tweets']))
print("Number of retweets: {}".format(rt_max))
print("{} characters.\n".format(data.loc[rt, 'len']))

## Get Tweets with the Least Likes and Retweets

In [None]:
# Find the tweet with the fewest likes and the tweet with the fewest retweets.

# Lowest like and retweet counts in the data set
fav_min = data['Likes'].min()
rt_min  = data['RTs'].min()

# Index label of the first row that reaches each minimum
data_fav_min = data.loc[data['Likes'] == fav_min].index[0]
data_rt_min  = data.loc[data['RTs'] == rt_min].index[0]

# Least-liked tweet:
print("The tweet with the minimum likes is: \n{}".format(data.loc[data_fav_min, 'Tweets']))
print("Number of likes: {}".format(fav_min))
print("{} characters.\n".format(data.loc[data_fav_min, 'len']))

# Least-retweeted tweet:
print("The tweet with minimum retweets is: \n{}".format(data.loc[data_rt_min, 'Tweets']))
print("Number of retweets: {}".format(rt_min))
print("{} characters.\n".format(data.loc[data_rt_min, 'len']))

## We create time series for data

In [None]:
# Build a Series of tweet lengths labelled by the 'Date' column
# (a one-dimensional labelled array, used here as a time series).
tweet_lengths = data['len'].values
tlen = pd.Series(data=tweet_lengths, index=data['Date'])

# Plot the tweet lengths over time
length_plot_options = {'figsize': (14, 10), 'color': 'r', 'label': 'Length of Tweets'}
tlen.plot(**length_plot_options);

## Limit the Data used (Length of Tweets)

In [None]:
# Restrict the data to the first 100 tweets so the plot stays readable
data_oneHundred = data.head(100)

# Series of tweet lengths for that subset, labelled by the 'Date' column
subset_lengths = data_oneHundred['len'].values
tlen = pd.Series(data=subset_lengths, index=data_oneHundred['Date'])

# Plot the tweet lengths over time
subset_plot_options = {'figsize': (16, 10), 'color': 'r', 'label': 'Length of Tweets'}
tlen.plot(**subset_plot_options);

## Likes and Retweets

In [None]:
# Likes and retweets as Series labelled by the 'Date' column
dates = data['Date']
tfav = pd.Series(data=data['Likes'].values, index=dates)
tret = pd.Series(data=data['RTs'].values, index=dates)

# Draw both series in the same figure: likes first, then retweets
for series, series_label in ((tfav, "Likes"), (tret, "Retweets")):
    series.plot(figsize=(16, 4), label=series_label, legend=True);

## Limit the data used (Likes and retweets)

In [None]:
# Likes and retweets of the first 100 tweets, labelled by the 'Date' column
subset_dates = data_oneHundred['Date']
tfav = pd.Series(data=data_oneHundred['Likes'].values, index=subset_dates)
tret = pd.Series(data=data_oneHundred['RTs'].values, index=subset_dates)

# Draw both series in the same figure: likes first, then retweets
for series, series_label in ((tfav, "Likes"), (tret, "Retweets")):
    series.plot(figsize=(16, 4), label=series_label, legend=True);

### Figure out (LIVE!) which tweet skyrocketed on the 3rd of January (dates may vary)

In [None]:
# Find the most-liked tweet among the first 100 tweets only.
likes_max = np.max(data_oneHundred['Likes'])
# Index label of the first row in the subset that reaches that like count
likes_data = data_oneHundred[data_oneHundred.Likes == likes_max].index[0]

print("The tweet with the maximum likes is: \n{}".format(data_oneHundred['Tweets'][likes_data]))
print("Number of likes: {}".format(likes_max))
# Read the length from the same 100-tweet subset as the text and like count,
# so the cell does not silently depend on `data` sharing index labels.
print("{} characters.\n".format(data_oneHundred['len'][likes_data]))

## Go through all sources and list them (unique entries only)

In [None]:
import re

# All distinct sources, in order of first appearance. Series.unique() keeps
# that order and avoids rescanning a growing list for every tweet.
sources = data['Source'].unique().tolist()

# Print the list of sources:
print("Creation of content sources:")
for source in sources:
    print("* {}".format(source))

In [None]:
# Number of tweets per source, aligned with the order of `sources`.
source_counts = data['Source'].value_counts()
percent = source_counts.reindex(sources, fill_value=0).values.astype(float)

# Turn the counts into real percentages of the actual number of tweets.
# Dividing by a fixed 100 is only correct when exactly 100 tweets are loaded.
total = percent.sum()
if total > 0:
    percent *= 100.0 / total

# Pie chart:
pie_chart = pd.Series(percent, index=sources, name='Sources')
pie_chart.plot.pie(fontsize=11, autopct='%.2f', figsize=(6, 6));