In [1]:
import sys
from datetime import datetime, timedelta
import pandas as pd
import numpy as np
import json
import os
sys.path.append("/Users/arankhanna/Dropbox/Dolores/Iverson")

In [2]:
from common.file_system.s3 import S3
fs = S3()

class ChatHistoryTracker(object):#CandleStickTracker):
    """An encapsulation of any group chat history.

    Attributes:
        service: A string representing the service vending the chat.
        msgs: List of ``(text, sender_id, datetime)`` tuples, one per record
            whose ``'_'`` field is ``'Message'``. Datetimes are naive UTC.
        alerts: List of ``(action, datetime)`` tuples, one per record whose
            ``'_'`` field is ``'MessageService'``; ``action`` is stored as-is.
        usrs: Set of sender ids seen in ``msgs``.
    """

    def __init__(self, service='telegram', balance=0.0):
        """Create an empty tracker.

        Args:
            service: Name of the chat service; used as a path component when
                listing history files.
            balance: Not used by this class; kept so existing callers that
                pass it keep working.
        """
        self.service = service
        self.msgs = []
        self.alerts = []
        self.usrs = set()

    def initalize(self, chat_name='officialios'):
        """(Re)load the complete history of ``chat_name`` from remote storage.

        Every file listed under ``data/<service>/<chat_name>`` is downloaded
        into ``dumps/``, parsed as a JSON list of records and deleted again.
        The tracker's state is replaced (not extended) by what was loaded, so
        calling this repeatedly never duplicates messages. The state is only
        swapped in after every file parsed successfully.

        Args:
            chat_name: Name of the chat whose history files should be loaded.

        Raises:
            ValueError: If a record's ``'_'`` field is neither ``'Message'``
                nor ``'MessageService'``.
        """
        names = fs.list_files('data/'+self.service+'/'+chat_name)
        tot_files = len(names)

        print("Pulling "+str(tot_files)+" history files from server please wait...")

        # Make sure the scratch directory exists before anything is downloaded.
        if not os.path.isdir('dumps'):
            os.makedirs('dumps')

        # Collect into locals first so a failure half-way through leaves the
        # previously loaded history untouched.
        msgs = []
        alerts = []
        usrs = set()

        for remote_name in names:
            fs.download_file(remote_name, 'dumps/')
            local_path = 'dumps/'+remote_name.split('/')[-1]
            try:
                with open(local_path, 'r') as infile:
                    chat_history = json.load(infile)

                for message in chat_history:
                    kind = message['_']
                    if kind == 'Message':
                        msgs.append((message['message'],
                                     message['from_id'],
                                     datetime.utcfromtimestamp(message['date'])))
                        usrs.add(message['from_id'])
                    elif kind == 'MessageService':
                        alerts.append((message['action'],
                                       datetime.utcfromtimestamp(message['date'])))
                    else:
                        print(message)
                        raise ValueError("Unexpected Message Type: "+repr(kind))
            finally:
                # Always clean up the scratch copy, even when parsing failed.
                if os.path.exists(local_path):
                    os.remove(local_path)

        self.msgs = msgs
        self.alerts = alerts
        self.usrs = usrs

        tot = len(self.msgs)
        if tot == 0:
            print("0 total messages")
            return

        # Use min/max rather than list position: nothing here guarantees the
        # files (or the records inside them) arrive in chronological order.
        first = min(m[2] for m in self.msgs).strftime("%Y-%m-%d")
        last = max(m[2] for m in self.msgs).strftime("%Y-%m-%d")
        print(str(tot)+" total messages from "+first+" to "+last)

In [3]:
from IPython.display import display, display_javascript
from IPython.display import clear_output
import ipywidgets as widgets
from IPython.display import Javascript
from IPython.core.display import display, HTML

# Widen the notebook container to the full page width for dashboard layouts.
display(HTML("<style>.container { width:100% !important; }</style>"))

# DECLARE ALL INITIAL TRACKERS
ct = ChatHistoryTracker('telegram')

# State for Dashboard
done = False      # wizard cells only render once this is flipped to True
widget_ids = []   # (wizard name, cell index) pairs for wizards added at runtime

# Welcome banner. The second heading is now closed with </h2>; the original
# used <h2/>, which is not a closing tag.
display(HTML(
    "<h1>Welcome to the IV Lab Dashboard!</h1>"
    "<h2>This area titled 'Hidden Cells' is where the available Wizards outside of your workspace live</h2>"
    "<h3>To move these up into your workspace hover over and click the button in the top right</h3>"
    "<h4>And to move widgets down from your workspace above hover over and click X the button in the top right</h4>"
))

In [4]:
''' 
xxxxxxxxxxxxxx
xxxxxxxxxxxxxx
MASTER WIDGET 
xxxxxxxxxxxxxx
xxxxxxxxxxxxxx
'''
from ipywidgets import IntSlider, Text, VBox
import time

# Chat selector; its current value is handed to the tracker as the chat name.
assets = widgets.Dropdown(options=['officialios'])

# selected_start = widgets.DatePicker(
#     description='From',
#     disabled=False
# )

# selected_end = widgets.DatePicker(
#     description='To',
#     disabled=False
# )

refresh_button = widgets.Button(description="Refresh Data")


def refresh(b):
    """Button callback: load the history of the chat selected in ``assets``.

    Args:
        b: The button that was clicked (not used).
    """
    # INIT TRACKERS
    ct.initalize(assets.value)


print("Tracker Control Panel")

# Show the controls first, then hook the button up to its handler.
display(assets)
display(refresh_button)
refresh_button.on_click(refresh)

Tracker Control Panel


Dropdown(options=('officialios',), value='officialios')

Button(description='Refresh Data', style=ButtonStyle())

In [5]:
''' 
xxxxxxxxxxxxxx
xxxxxxxxxxxxxx
USER GROWTH WIDGET 
xxxxxxxxxxxxxx
xxxxxxxxxxxxxx
'''
from highcharts import Highstock
import pandas as pd


def display_user_growth_wizard():
    """Show the "Growth Selector" menu and chart daily chat size on demand.

    Displays an accordion holding a single "Get Growth" button. Each click
    clears the cell output, redraws the menu and renders a Highstock
    candlestick chart of the running user count derived from ``ct.alerts``.
    """
    growth_button = widgets.Button(description="Get Growth")

    vbox = VBox([growth_button])
    menu = widgets.Accordion(children=[vbox])
    menu.set_title(0, 'Growth Selector')

    def get_growth(b):
        """Button callback: rebuild the chart from the current ``ct.alerts``.

        Args:
            b: The button that was clicked (not used).
        """
        # reset view
        clear_output()
        display(menu)

        if not ct.alerts:
            print("No join/leave events loaded yet - use 'Refresh Data' first.")
            return

        # Walk the service events, keeping a running head-count. Events other
        # than add/delete leave the count unchanged but still add a sample.
        # NOTE(review): the count starts at 1, presumably for the chat's
        # creator - confirm.
        num_users = 1
        event_times = []
        user_counts = []
        for action, when in ct.alerts:
            action_type = action['_']
            if action_type == 'MessageActionChatDeleteUser':
                num_users -= 1
            elif action_type == 'MessageActionChatAddUser':
                num_users += 1
            event_times.append(when)
            user_counts.append(num_users)

        usr_timeline = pd.Series(user_counts,
                                 index=pd.DatetimeIndex(event_times),
                                 name='users')

        # One open/high/low/close candle per calendar day. 'D' gives the same
        # midnight-anchored bins as the deprecated '24H' alias.
        usr_candles = usr_timeline.resample('D').ohlc()

        chart_name = 'Telegram User Growth'

        chart = Highstock()
        options = {
            'title': {
                'text': chart_name
            },
            'rangeSelector': {
                'selected': 4
            },
            'yAxis': [{
                'labels': {
                    'align': 'right',
                    'x': -3
                },
                'title': {
                    'text': 'OHLC'
                },
                'height': '60%',
                'lineWidth': 2
            }]
        }

        # `.values` replaces Series.as_matrix(), which newer pandas no longer
        # provides; it yields the same numpy arrays.
        ohlc = [
            [day, day_open, day_high, day_low, day_close]
            for day, day_open, day_high, day_low, day_close in zip(
                usr_candles.index.tolist(),
                usr_candles['open'].values,
                usr_candles['high'].values,
                usr_candles['low'].values,
                usr_candles['close'].values,
            )
        ]

        chart.add_data_set(ohlc, 'candlestick', 'users')
        chart.set_dict_options(options)

        display(chart)

    # Wire up controls
    growth_button.on_click(get_growth)

    # reset view
    display(menu)

# display_user_growth_wizard()

In [6]:
''' 
xxxxxxxxxxxxxx
xxxxxxxxxxxxxx
MESSAGES WIDGET 
xxxxxxxxxxxxxx
xxxxxxxxxxxxxx
'''

from highcharts import Highchart

def display_messages_wizard():
    """Show the "Message Selector" menu and chart messages per day on demand.

    Displays an accordion holding a single "Get Messages" button. Each click
    clears the cell output, redraws the menu and renders a Highcharts line
    chart with the number of messages in ``ct.msgs`` for every calendar day.
    """
    messages_button = widgets.Button(description="Get Messages")

    vbox = VBox([messages_button])
    menu = widgets.Accordion(children=[vbox])
    menu.set_title(0, 'Message Selector')

    def get_messages(b):
        """Button callback: rebuild the chart from the current ``ct.msgs``.

        Args:
            b: The button that was clicked (not used).
        """
        # reset view
        clear_output()
        display(menu)

        if not ct.msgs:
            print("No messages loaded yet - use 'Refresh Data' first.")
            return

        # Each entry of ct.msgs is (text, sender, timestamp); only the text
        # and timestamp are needed for counting.
        msg_timeline = pd.Series(
            [m[0] for m in ct.msgs],
            index=pd.DatetimeIndex([m[2] for m in ct.msgs]),
            name='message',
        )

        # Msg counts per calendar day. 'D' gives the same midnight-anchored
        # bins as the deprecated '24H' alias.
        msg_counts = msg_timeline.resample('D').count()

        chart = Highchart()

        chart_name = 'Messages Per Day'
        options = {
            'title': {
                'text': chart_name
            },
            'plotOptions':{'line': {
                        'dataLabels': {
                            'enabled': True
                            }}},
            'chart': {
                'zoomType': 'x'
            },
            'xAxis': { 'type': 'datetime' }
        }

        # `.values` replaces Series.as_matrix(), which newer pandas no longer
        # provides. Counts are cast to plain ints for serialisation.
        hc_series = [
            [day, int(count)]
            for day, count in zip(msg_counts.index.tolist(), msg_counts.values)
        ]

        chart.add_data_set(hc_series, 'line', 'Messages')
        chart.set_dict_options(options)

        display(chart)

    # Wire up controls
    messages_button.on_click(get_messages)

    # reset view
    display(menu)

# display_messages_wizard()

In [7]:
''' 
xxxxxxxxxxxxxx
xxxxxxxxxxxxxx
LDA WIDGET 
xxxxxxxxxxxxxx
xxxxxxxxxxxxxx
'''
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.decomposition import LatentDirichletAllocation
import pyLDAvis
import pyLDAvis.sklearn
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.decomposition import LatentDirichletAllocation

pyLDAvis.enable_notebook()
projection_types = ['pca', 'tsne', 'mmds']

def display_lda_wizard():
    """Show the LDA topic-model wizard and wire up its "Get LDA" button.

    The menu offers a projection type plus a start and end date. Clicking
    "Get LDA" fits a 20-topic LDA model on the term counts of all messages in
    ``ct.msgs`` dated within the chosen range (both days inclusive) and
    displays the pyLDAvis visualisation.
    """
    projection_selector = widgets.Select(
        options=projection_types,
        description='Projection',
        disabled=False
    )

    lda_start = widgets.DatePicker(
        description='Start Date',
        disabled=False
    )

    lda_end = widgets.DatePicker(
        description='End Date',
        disabled=False
    )

    lda_button = widgets.Button(description="Get LDA")

    vbox = VBox([projection_selector, lda_start, lda_end, lda_button])
    menu = widgets.Accordion(children=[vbox])
    menu.set_title(0, 'Argument Selector')

    def get_lda(b):
        """Button callback: fit the model for the selected range and show it.

        Args:
            b: The button that was clicked (not used).
        """
        # reset view
        clear_output()
        display(menu)

        # An untouched DatePicker has value None.
        if lda_start.value is None or lda_end.value is None:
            print("Please choose both a start date and an end date.")
            return

        first = datetime.fromordinal(lda_start.value.toordinal())
        last = datetime.fromordinal(lda_end.value.toordinal())
        # `last` is midnight at the START of the end day; compare against the
        # following midnight so the end day itself is included.
        end_exclusive = last + timedelta(days=1)

        mds = projection_selector.value

        docs_raw = [msg[0] for msg in ct.msgs
                    if first <= msg[2] < end_exclusive]
        tot = len(docs_raw)

        first_str = first.strftime("%Y-%m-%d")
        last_str = last.strftime("%Y-%m-%d")

        if tot == 0:
            print("No messages found from "+first_str+" to "+last_str)
            return

        print("Running LDA on "+str(tot)+" messages from "+first_str+" to "+last_str+"\n this could take a while, leave the window open...")

        tf_vectorizer = CountVectorizer(strip_accents = 'unicode',
                                        stop_words = 'english',
                                        lowercase = True,
                                        token_pattern = r'\b[a-zA-Z]{3,}\b',
                                        max_df = 0.5,
                                        min_df = 10)
        try:
            dtm_tf = tf_vectorizer.fit_transform(docs_raw)
        except ValueError as err:
            # Raised when the min_df/max_df pruning leaves no usable terms.
            print("Not enough text to build a vocabulary for this range: "+str(err))
            return

        # Only the term-frequency model is visualised, so the second TF-IDF
        # model the original also fitted (and never used) is no longer built.
        # `n_components` is the current name of the former `n_topics` argument.
        lda_tf = LatentDirichletAllocation(n_components=20, random_state=0)
        lda_tf.fit(dtm_tf)

        if mds in ['mmds', 'tsne']:
            output = pyLDAvis.sklearn.prepare(lda_tf, dtm_tf, tf_vectorizer, mds=mds)
        else:
            output = pyLDAvis.sklearn.prepare(lda_tf, dtm_tf, tf_vectorizer) #pca

        display(output)
        display(HTML("<style>.container { width:100% !important; }</style>"))

    # Wire up controls
    lda_button.on_click(get_lda)

    # reset view
    display(menu)

#display_lda_wizard()

In [8]:
''' 
xxxxxxxxxxxxxx
xxxxxxxxxxxxxx
TEXT SEARCH WIDGET 
xxxxxxxxxxxxxx
xxxxxxxxxxxxxx
'''

def display_search_wizard():
    """Show the text-search wizard and wire up its "Search" button.

    The menu offers a search box plus a start and end date. Clicking "Search"
    renders an HTML table of every message in ``ct.msgs`` that is dated within
    the chosen range (both days inclusive) and matches the search text. The
    text is treated as a regular expression; if it is not a valid pattern it
    is matched literally instead.
    """
    search_text = Text("")

    search_start = widgets.DatePicker(
        description='Start Date',
        disabled=False
    )

    search_end = widgets.DatePicker(
        description='End Date',
        disabled=False
    )

    search_button = widgets.Button(description="Search")

    vbox = VBox([search_text, search_start, search_end, search_button])
    menu = widgets.Accordion(children=[vbox])
    menu.set_title(0, 'Search Bar')

    def get_search(b):
        """Button callback: run the search and display the matching rows.

        Args:
            b: The button that was clicked (not used).
        """
        import re

        # Reset view
        clear_output()
        display(menu)

        # An untouched DatePicker has value None.
        if search_start.value is None or search_end.value is None:
            print("Please choose both a start date and an end date.")
            return
        if not ct.msgs:
            print("No messages loaded yet - use 'Refresh Data' first.")
            return

        # Search
        first = datetime.fromordinal(search_start.value.toordinal())
        last = datetime.fromordinal(search_end.value.toordinal())
        # `last` is midnight at the START of the end day; compare against the
        # following midnight so the end day itself is included.
        end_exclusive = last + timedelta(days=1)
        search = search_text.value

        msg_timeline = pd.DataFrame(
            {
                'message': [m[0] for m in ct.msgs],
                'user': [m[1] for m in ct.msgs],
            },
            columns=['message', 'user'],
            index=pd.DatetimeIndex([m[2] for m in ct.msgs], name='time'),
        )

        # na=False keeps the mask strictly boolean when a message is missing.
        try:
            mask = msg_timeline['message'].str.contains(search, na=False)
        except re.error:
            mask = msg_timeline['message'].str.contains(search, regex=False, na=False)

        # A boolean range test works whether or not the index is sorted,
        # unlike label slicing with [first:last].
        in_range = (msg_timeline.index >= first) & (msg_timeline.index < end_exclusive)
        hits = msg_timeline.loc[in_range & mask.values]

        # Limit the column width for this rendering only instead of changing
        # the pandas option globally.
        with pd.option_context('display.max_colwidth', 100):
            display(HTML(hits.to_html()))

    # Wire up controls
    search_button.on_click(get_search)

    # Reset view
    display(menu)
    
#display_search_wizard()

In [9]:
''' 
xxxxxxxxxxxxxx
xxxxxxxxxxxxxx
WORDCLOUD WIDGET 
xxxxxxxxxxxxxx
xxxxxxxxxxxxxx
'''
import matplotlib.pyplot as plt
from os import path
# from wordcloud import WordCloud


def display_wordcloud_wizard():
    """Show the word-cloud wizard and wire up its "Get Cloud" button.

    The menu offers a start and end date. Clicking "Get Cloud" joins the text
    of every message in ``ct.msgs`` dated within the chosen range (both days
    inclusive) and plots a word cloud of it with matplotlib.
    """
    cloud_start = widgets.DatePicker(
        description='Start Date',
        disabled=False
    )

    cloud_end = widgets.DatePicker(
        description='End Date',
        disabled=False
    )

    cloud_button = widgets.Button(description="Get Cloud")

    vbox = VBox([cloud_start, cloud_end, cloud_button])
    menu = widgets.Accordion(children=[vbox])
    menu.set_title(0, 'Argument Selector')

    def get_cloud(b):
        """Button callback: build and plot the cloud for the selected range.

        Args:
            b: The button that was clicked (not used).
        """
        # reset view
        clear_output()
        display(menu)

        # The module-level import of WordCloud is commented out in this
        # notebook, so resolve it here and fail with a readable message
        # instead of a NameError.
        try:
            from wordcloud import WordCloud
        except ImportError:
            print("The 'wordcloud' package is required for this wizard but is not installed.")
            return

        # An untouched DatePicker has value None.
        if cloud_start.value is None or cloud_end.value is None:
            print("Please choose both a start date and an end date.")
            return

        first = datetime.fromordinal(cloud_start.value.toordinal())
        last = datetime.fromordinal(cloud_end.value.toordinal())
        # `last` is midnight at the START of the end day; compare against the
        # following midnight so the end day itself is included.
        end_exclusive = last + timedelta(days=1)

        docs_raw = [msg[0] for msg in ct.msgs
                    if first <= msg[2] < end_exclusive]
        tot = len(docs_raw)

        first_str = first.strftime("%Y-%m-%d")
        last_str = last.strftime("%Y-%m-%d")
        print("Generating Wordcloud on "+str(tot)+" messages from "+first_str+" to "+last_str+"\n this could take a while, leave the window open...")

        # Skip empty/missing message bodies so the join cannot fail on them.
        text = ' '.join(doc for doc in docs_raw if doc)
        if not text.strip():
            print("No message text found in the selected range.")
            return

        wordcloud = WordCloud(max_font_size=40).generate(text)

        # Size this figure only, rather than changing rcParams for every
        # later plot in the kernel.
        plt.figure(figsize=(20, 7))
        plt.imshow(wordcloud, interpolation="bilinear")
        plt.axis("off")
        plt.show()

    # Wire up controls
    cloud_button.on_click(get_cloud)

    # reset view
    display(menu)

# display_wordcloud_wizard()

In [10]:
# Template cell for the user-growth wizard. `done` is initialised to False and
# only set to True by a later cell, so this renders nothing until then.
# NOTE(review): appears to be one of the cells add_wizard() copies and executes
# by index - confirm against widget_map.
if done:
    display_user_growth_wizard()

In [11]:
# Template cell for the messages-per-day wizard. `done` is initialised to False
# and only set to True by a later cell, so this renders nothing until then.
# NOTE(review): appears to be one of the cells add_wizard() copies and executes
# by index - confirm against widget_map.
if done:
    display_messages_wizard()

In [12]:
# Template cell for the LDA topic wizard. `done` is initialised to False and
# only set to True by a later cell, so this renders nothing until then.
# NOTE(review): appears to be one of the cells add_wizard() copies and executes
# by index - confirm against widget_map.
if done:
    display_lda_wizard()

In [13]:
# Template cell for the text-search wizard. `done` is initialised to False and
# only set to True by a later cell, so this renders nothing until then.
# NOTE(review): appears to be one of the cells add_wizard() copies and executes
# by index - confirm against widget_map.
if done:
    display_search_wizard()

In [14]:
# Template cell for the word-cloud wizard. `done` is initialised to False and
# only set to True by a later cell, so this renders nothing until then.
# NOTE(review): appears to be one of the cells add_wizard() copies and executes
# by index - confirm against widget_map.
if done:
    display_wordcloud_wizard()

In [15]:
''' 
xxxxxxxxxxxxxx
xxxxxxxxxxxxxx
WIDGET WIDGET 
xxxxxxxxxxxxxx
xxxxxxxxxxxxxx
'''

this_cell = 15 # This should be the last UI cell so use this ID specifically

# Wizard name -> index of the template cell that add_wizard() copies, as used
# with Jupyter.notebook.get_cells(). Each wizard needs its own template cell:
# 'search' previously pointed at 10, the same cell as 'messages', so choosing
# it produced the messages wizard. The templates are consecutive cells, so the
# search template sits at 12, between 'topics' (11) and 'wordcloud' (13).
widget_map = {
    'users':9,
    'messages':10,
    'topics':11,
    'search':12,
    'wordcloud':13
}


widget_types = widgets.Select(
    options=list(widget_map.keys()),
    description='Wizards',
    disabled=False
)

def add_wizard(x):
    """Button callback: copy the selected wizard's template cell and run it.

    Looks up the template cell index for the wizard currently chosen in
    ``widget_types``, emits JavaScript that copies that cell, pastes it below
    and executes it, then records the new wizard in ``widget_ids``.

    Args:
        x: The button that was clicked (not used).
    """
    wizard_name = widget_types.value
    template_index = widget_map[wizard_name]

    # Front-end script: copy the template cell through the notebook clipboard,
    # paste it, execute it, then toggle the dashboard view buttons.
    script = '''
    var cell_id = %s;
    var all_cells =Jupyter.notebook.get_cells();
    var cells = [all_cells[cell_id]]
    Jupyter.notebook.clipboard = [];
    var cell_json;
    for (var i=0; i < cells.length; i++) {
        cell_json = cells[i].toJSON();
        if (cell_json.metadata.deletable !== undefined) {
            delete cell_json.metadata.deletable;
        }
        Jupyter.notebook.clipboard.push(cell_json);
    }
    Jupyter.notebook.enable_paste();

    // Paste
    if (Jupyter.notebook.clipboard !== null && Jupyter.notebook.paste_enabled) {
        var first_inserted = null;
        for (var i = Jupyter.notebook.clipboard.length-1; i >= 0; i--) {
            var cell_data = Jupyter.notebook.clipboard[i];
            var new_cell = Jupyter.notebook.insert_cell_below(cell_data.cell_type);
            new_cell.fromJSON(cell_data);
            if (first_inserted === null) {
                first_inserted = new_cell;
            }
        }
        first_inserted.focus_cell();
    }

    // Execute
    Jupyter.notebook.execute_cell();
    $('button[data-dashboard-state="notebook"]').click();
    $('button[data-dashboard-state="grid"]').click();
    ''' % template_index

    display_javascript(Javascript(script))

    # Remember the wizard together with the cell index assigned to it, so
    # remove_wizards() knows which cells to delete.
    assigned_index = this_cell + len(widget_ids)
    widget_ids.append((wizard_name, assigned_index))
    
def remove_wizards(x):
    """Button callback: delete every wizard cell recorded in ``widget_ids``.

    Args:
        x: The button that was clicked (not used).
    """
    global widget_ids

    # Second element of each entry is the cell index stored by add_wizard().
    cell_indices = [cell_index for _, cell_index in widget_ids]

    # JS to delete the recorded cells in one call
    script = 'Jupyter.notebook.delete_cells(%s)' % (cell_indices,)
    display_javascript(Javascript(script))

    # Nothing left to track once the cells are gone.
    widget_ids = []

    
# Create the two control buttons for the wizard picker.
add_widget_button = widgets.Button(description="New Wizard")
remove_widget_button = widgets.Button(description="Clear Wizards")

# Hook each button up to its handler.
add_widget_button.on_click(add_wizard)
remove_widget_button.on_click(remove_wizards)

# Render the picker followed by its buttons.
display(widget_types)
display(add_widget_button)
display(remove_widget_button)

Select(description='Wizards', options=('users', 'messages', 'topics', 'search', 'wordcloud'), value='users')

Button(description='New Wizard', style=ButtonStyle())

Button(description='Clear Wizards', style=ButtonStyle())

In [16]:
# From here on the `if done:` wizard cells render when they are executed.
done = True
# Click the front-end button whose data-dashboard-state attribute is "grid".
# NOTE(review): presumably provided by a dashboard-layout extension - confirm.
js = '''$('button[data-dashboard-state="grid"]').click();'''
display_javascript(Javascript(js))

In [17]:
# # Get senders + frequency
# senders = {}
# for msg in ct.msgs:
#     # Agg sender
#     if msg[1] in senders.keys():
#         senders[msg[1]] += 1
#     else:
#         senders[msg[1]] = 1
# print(len(senders.keys()))

In [18]:
# # Plot frequency
# import seaborn as sns
# import matplotlib.pyplot as plt

# # f, ax = plt.subplots(figsize=(7, 7))
# # ax.set(xscale="log", yscale="log")
# # sns.distplot(list(senders.values()))
# # plt.show()

# plt.hist(list(senders.values()), bins=np.linspace(10, 100))
# #np.logspace(np.log10(1.0),np.log10(100.0), 50))
# plt.gca()
# plt.show()