In [1]:
import os
import pandas as pd
import json

In [2]:
import utils
import altair as alt


In [3]:
%load_ext autoreload
# Loading the extension only registers the magic; mode 2 is what actually
# re-imports edited modules (such as the local `utils`) before each cell runs.
%autoreload 2

# Inspect one user only

In [4]:
def get_user_final_timeline(user, type_user):
    """Load one user's timeline and tag every row with its account's outlayer entry.

    Parameters
    ----------
    user : str
        File stem of the timeline; the file read is ``data/<type_user>/<user>.json``
        relative to the current working directory.
    type_user : str
        Name of the sub-directory of ``data`` that holds the file.

    Returns
    -------
    pandas.DataFrame
        The frame returned by ``utils.get_retweet_and_quoted`` with an extra
        ``outlayer`` column. For each row it holds the entry that
        ``utils.relevant_outlayers`` gives for that row's ``user['screen_name']``.

    Raises
    ------
    FileNotFoundError
        If the timeline file does not exist.
    """
    # Build the path from its components so it resolves on every OS. The former
    # single-string literal used backslash separators (Windows only) and relied on
    # an invalid escape sequence.
    timeline_path = os.path.join("data", type_user, user + ".json")
    with open(timeline_path, 'r') as json_file:
        timeline = pd.DataFrame.from_dict(json.load(json_file))

    final = utils.get_retweet_and_quoted(timeline)
    # Extract the screen names once and reuse them for both the counts and the lookup.
    screen_names = final['user'].apply(lambda x: x['screen_name'])
    relevant_outlayers = utils.relevant_outlayers(screen_names.value_counts())
    final['outlayer'] = screen_names.apply(lambda name: relevant_outlayers[name])
    return final

In [5]:
def plot_top_users_time(user, type_user='politicians'):
    """Chart, per month, how many retweets/quotes `user` made, split by account.

    Two bar layers are built from the frame returned by
    ``get_user_final_timeline`` and stacked with independent colour scales:
    one coloured by ``user.screen_name`` and a black one whose opacity is
    driven by the ``outlayer`` column.
    """
    final = get_user_final_timeline(user, type_user)

    # Settings shared by both layers.
    rounded_top = dict(cornerRadiusTopLeft=3, cornerRadiusTopRight=3)
    month_axis = 'yearmonth(created_at):O'

    # Layer 1: one coloured segment per account, ordered by count.
    per_account_bars = (
        alt.Chart(final, width=400)
        .mark_bar(**rounded_top)
        .encode(
            x=month_axis,
            y=alt.Y('count():Q'),
            color=alt.Color('user.screen_name:N', sort='-y'),
            order=alt.Order('count():Q'),
        )
    )

    # Layer 2: black overlay whose opacity depends on the outlayer flag.
    outlayer_overlay = (
        alt.Chart(final, width=400)
        .mark_bar(**rounded_top, opacity=0.7, color='black')
        .encode(
            x=month_axis,
            y=alt.Y('count():Q'),
            opacity=alt.Opacity('outlayer:O', sort='-y', scale=alt.Scale(range=[0.35, 0])),
            order='order:N',
        )
        .transform_calculate(order="if (datum.outlayer, 1, 0)")
    )

    layered = alt.layer(per_account_bars, outlayer_overlay)
    return layered.resolve_scale(color='independent')

In [1]:
#plot_top_users_time('LauraBorras')

In [9]:
def plot_top_rt_and_quote(user,type_user='politicians'):
    """Bar chart of how often `user` retweeted or quoted each account.

    Bars are sorted by descending count and coloured by the ``outlayer``
    column that ``get_user_final_timeline`` adds.
    """
    timeline = get_user_final_timeline(user, type_user)
    bars = alt.Chart(timeline).mark_bar(cornerRadiusTopLeft=3, cornerRadiusTopRight=3)
    return bars.encode(
        x=alt.X('user.screen_name:N', sort='-y'),
        y=alt.Y('count():Q'),
        color='outlayer',
    )

In [2]:
#plot_top_rt_and_quote('LauraBorras')

# General patterns

In [11]:
def create_dict(type_user='politicians'):
    """Load every JSON timeline found in ``data/<type_user>`` into a DataFrame.

    Parameters
    ----------
    type_user : str
        Name of the sub-directory of ``data`` (relative to the current working
        directory) to scan. Files that do not end in ``.json`` are ignored.

    Returns
    -------
    dict[str, pandas.DataFrame]
        One entry per JSON file, keyed by the file name without its ``.json``
        extension.

    Raises
    ------
    FileNotFoundError
        If the directory does not exist.
    """
    # Build the directory from its components so it resolves on every OS. The
    # former single-string literal used backslash separators (Windows only).
    folder = os.path.join("data", type_user)
    all_users = {}
    for file in os.listdir(folder):
        if not file.endswith(".json"):
            continue
        # splitext removes only the final extension, so a dotted stem stays whole
        # and still matches the `<user>.json` lookup done for single users.
        key = os.path.splitext(file)[0]
        with open(os.path.join(folder, file), 'r') as json_file:
            all_users[key] = pd.DataFrame.from_dict(json.load(json_file))
    return all_users

In [2]:
# Every timeline under data/politicians, keyed by file stem.
all_congress = create_dict(type_user='politicians')

In [1]:
#top10
#len(all_congress)

In [13]:
def normalized_top10(all_timelines, n_top=10):
    """Return each user's most retweeted/quoted accounts as shares of that top.

    Parameters
    ----------
    all_timelines : dict[str, pandas.DataFrame]
        Timeline per user, as accepted by ``utils.get_retweet_and_quoted``.
    n_top : int or None
        How many of the most frequent accounts to keep per user. A falsy value
        keeps every account.

    Returns
    -------
    pandas.DataFrame
        One row per user that has at least one retweet/quote, one column per
        rank (0 = most frequent). Each row holds count / sum-of-kept-counts, so
        it adds up to 1. Users with fewer than ``n_top`` accounts are padded
        with NaN.
    """
    dict_top = {}
    for name, timeline in all_timelines.items():
        if len(timeline) == 0:
            continue
        rt_quoted = utils.get_retweet_and_quoted(timeline)
        if len(rt_quoted) == 0:
            # Nothing retweeted or quoted: no distribution to normalise.
            continue
        # value_counts sorts by descending frequency, so slicing keeps the top.
        top = rt_quoted['user'].apply(lambda x: x['screen_name']).value_counts()
        if n_top:
            top = top.iloc[:n_top]
        # Vectorised division replaces the removed Series.iteritems loop, which
        # also wrote floats back into an integer Series.
        dict_top[name] = (top / top.sum()).tolist()
    # orient='index' gives one row per user and tolerates rows of unequal length.
    return pd.DataFrame.from_dict(dict_top, orient='index')

313

In [None]:
# Retweet/quote extraction for every user whose timeline is not empty.
# The values here are whatever utils.get_retweet_and_quoted returns, not counts yet.
dict_value_counts = {
    name: utils.get_retweet_and_quoted(timeline)
    for name, timeline in all_congress.items()
    if len(timeline) > 0
}

In [59]:
# One-step variant: screen-name counts per user, restricted to users whose raw
# timeline has more than 10 rows.
dict_value_counts = {
    name: (
        utils.get_retweet_and_quoted(timeline)['user']
        .apply(lambda x: x['screen_name'])
        .value_counts()
    )
    for name, timeline in all_congress.items()
    if len(timeline) > 10
}

In [65]:
# Turn each user's retweet/quote frame into counts of the screen names it holds.
# The name is rebound in place, so this cell may meet values that are already a
# Series of counts (when it is re-run, or when the one-step cell above ran last).
# Those pass through untouched instead of failing on v['user'].
dict_value_counts = {
    k: v if isinstance(v, pd.Series) else v['user'].apply(lambda x: x['screen_name']).value_counts()
    for k, v in dict_value_counts.items()
    if len(v) > 0
}

In [139]:
# For each user, the share that each of their ten most frequent accounts takes
# within that top ten (value_counts is already sorted by descending frequency).
# Only users with more than ten distinct accounts are kept, so every list has
# exactly ten entries.
# Vectorised division replaces Series.iteritems(), which pandas 2.0 removed.
dict_top10 = {
    k: (v.iloc[:10] / v.iloc[:10].sum()).tolist()
    for k, v in dict_value_counts.items()
    if len(v) > 10
}

In [140]:
# One row per user, one column per rank within the top ten.
top10_df = pd.DataFrame(dict_top10).T

In [141]:
# Number of rows in each user's raw timeline, in the order of the frame's index
# (the user keys). The lengths are read from all_congress because no notebook-level
# cell defines dict_len_tweets; it only exists as a local in normalized_top10.
top10_df['total_tweets'] = [len(all_congress[name]) for name in top10_df.index]

In [175]:
# Quartile (0-3) of each user's total tweet count.
top10_df['total_tweets_q'] = pd.qcut(top10_df['total_tweets'], q=4, labels=False)

In [191]:
#len(top10_df[top10_df['total_tweets_q']==0])


Unnamed: 0_level_0,0,1,2,3,4,5,6,7,8,9,total_tweets
total_tweets_q,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
0,0.320239,0.202902,0.124196,0.085277,0.065936,0.053058,0.045218,0.038623,0.034167,0.030384,791.558442
1,0.295963,0.179198,0.122301,0.088963,0.071681,0.061176,0.051943,0.047278,0.042923,0.038573,2875.831169
2,0.320445,0.183452,0.123488,0.089843,0.069335,0.055415,0.047302,0.041155,0.036789,0.032776,3202.55
3,0.352164,0.181028,0.108864,0.083374,0.063942,0.055116,0.047372,0.041031,0.035365,0.031744,3228.972973


In [7]:
# Preview of the long format: one row per (user, rank) pair.
(
    top10_df
    .reset_index()
    .melt(id_vars=['index', 'total_tweets', 'total_tweets_q'])
    .head(5)
)

In [None]:
# One line per user (share against rank), with one facet per tweet-count quartile.
(
    alt.Chart(
        top10_df.reset_index().melt(id_vars=['index', 'total_tweets', 'total_tweets_q'])
    )
    .mark_line()
    .encode(
        x=alt.X('variable'),
        y=alt.Y('value'),
        color=alt.Color('index'),
        column=alt.Column('total_tweets_q:Q'),
    )
)

In [3]:
# Share against rank as translucent bars, coloured and faceted by (binned) total tweets.
# The quartile column is dropped first: it is not listed in id_vars, so once it
# exists melt would turn it into an extra 'variable' category among the ranks.
# errors='ignore' keeps the cell working when the column has not been created yet.
alt.Chart(
    top10_df
    .drop(columns='total_tweets_q', errors='ignore')
    .reset_index()
    .melt(id_vars=['index', 'total_tweets'])
).mark_bar(
    opacity=0.7
).encode(
    x='variable',
    y='value',
    color=alt.Color('total_tweets:Q', scale=alt.Scale(scheme='rainbow')),
    column=alt.Column('total_tweets:Q', bin=True)
)

In [6]:
# Mean share per rank within each tweet-count quartile, reshaped to long format.
melted = (
    top10_df
    .groupby('total_tweets_q')
    .mean()
    .drop(columns='total_tweets')
    .reset_index()
    .melt(id_vars=['total_tweets_q'])
)
# One line per quartile.
alt.Chart(melted).mark_line().encode(
    x=alt.X('variable'),
    y=alt.Y('value'),
    color=alt.Color('total_tweets_q'),
)

In [None]:
#

In [None]:
# TODO: with and without ones (NOTE(review): meaning of "ones" is unclear - confirm)
# TODO: distribution of outliers
# TODO: clustering of the distributions

# TODO: outliers vs table
# TODO: time span
# TODO: clustering of the table, or PCA



# TODO: correlations
# TODO: random -> words