In [None]:
import config
import gzip
import lib_db
import lib_nlp
import datetime
import lib_vk
from dateutil.relativedelta import relativedelta

In [None]:
# Start of the reporting window: the per-day comment index built further down begins here.
from_datetime = datetime.datetime(2017,7,25,0,0,0)

In [None]:
# Module-level store keyed by name; load_data() replaces it with the unpickled contents.
data = {}
import pickle
# Pickle file shared by save_data() and load_data().
datafile = 'saved_data.pkl'
def save_data():
    """Serialize the module-level ``data`` object to ``datafile`` with pickle."""
    # Only rebinding a global requires a ``global`` statement; reading it does not.
    with open(datafile, mode='wb') as out_file:
        pickle.dump(data, out_file)
def load_data():
    """Rebind the module-level ``data`` to the object unpickled from ``datafile``.

    NOTE(review): ``pickle.load`` can execute arbitrary code embedded in the file,
    so only load pickle files that were produced by a trusted source.
    """
    global data
    with open(datafile, mode='rb') as in_file:
        loaded = pickle.load(in_file)
    data = loaded

In [None]:
# Populate `data` from the pickle file; the cells below read it.
load_data()

In [None]:
# Per-name roll-ups stored back into `data`:
#   'total'            - each metric summed over twitter, fb and vk (missing values count as 0)
#   'sentiments_count' - number of VK comments labelled 'pos' and 'neg'
for name in data:
    account = data[name]

    totals = account['total'] = {}
    for k in ('post_count', 'likes', 'reposts', 'comments', 'subscribers_count'):
        totals[k] = 0
        for social in ('twitter', 'fb', 'vk'):
            totals[k] += account.get(social, {}).get(k, 0)

    tally = account['sentiments_count'] = {'pos': 0, 'neg': 0}
    for c in account['vk']['comments_objects']:
        label = c['sentiment']
        if label == 'pos':
            tally['pos'] += 1
        elif label == 'neg':
            tally['neg'] += 1

In [None]:
import pandas as pd
import matplotlib as mpl
# mpl.rc('font', family='Verdana') # Russian (Cyrillic) text when running locally
mpl.rc('font', family='DejaVu Sans') # Russian (Cyrillic) text when running on the server
import matplotlib.pyplot as plt
%matplotlib inline

In [None]:
# Pie chart with one slice per social network.
# NOTE(review): the slice sizes and the title are hard-coded, and the sizes sum to 590
# while the title says 123 - confirm the numbers or derive them from `data`.
labels = ('ВКонтакте', 'Twitter', 'Facebook')
sizes = [215, 130, 245]
colors = ['gold', 'yellowgreen', 'lightskyblue']
explode = (0.1, 0, 0)  # pull the first slice out of the pie
plt.pie(
    sizes,
    explode=explode,
    labels=labels,
    colors=colors,
    autopct='%1.1f%%',
    shadow=True,
    startangle=140,
)
plt.title('Всего: 123 публикации')
plt.axis('equal')
plt.show()

In [None]:
import pandas as pd
from collections import defaultdict
def prepare_df(data, key_name):
    """Tabulate one metric per name and social network.

    Args:
        data: mapping of name -> dict with 'twitter', 'fb' and 'vk' sub-dicts.
        key_name: metric to read from every network sub-dict (e.g. 'post_count').

    Returns:
        pandas.DataFrame indexed by the keys of ``data`` with the columns
        'twitter', 'fb' and 'vk'.

    NOTE: later cells of this notebook define other functions under the same
    name ``prepare_df``; whichever definition ran last is the one in effect.
    """
    names = list(data)
    columns = {
        network: pd.Series([data[n][network][key_name] for n in names], index=names)
        for network in ('twitter', 'fb', 'vk')
    }
    return pd.DataFrame(columns)

In [None]:
# Stacked bar chart of the 'post_count' metric, one bar segment per social network.
df = prepare_df(data, 'post_count')
df.plot(kind='bar', stacked=True, figsize=(6, 5), title='Количество постов')

In [None]:
# Sum only the per-network columns: dropping an existing 'Всего' column first keeps
# the cell idempotent, so re-running it does not fold the old total into the new one.
df['Всего'] = df.drop(columns='Всего', errors='ignore').sum(axis=1)
df.sort_values('Всего', ascending=False)

In [None]:
# Rebuild `df` for the 'subscribers_count' metric (overwrites the previous table).
df = prepare_df(data, 'subscribers_count')

In [None]:
# Stacked bar chart of the current `df`; the 'bank' row is left out of the chart.
df[df.index != 'bank'].plot(kind='bar', stacked=True, figsize=(6, 5), title='Количество подписчиков')

In [None]:
# Sum only the per-network columns: dropping an existing 'Всего' column first keeps
# the cell idempotent, so re-running it does not fold the old total into the new one.
df['Всего'] = df.drop(columns='Всего', errors='ignore').sum(axis=1)

In [None]:
# Display the table ordered by the 'Всего' column, largest first (df itself is not modified).
df.sort_values('Всего', ascending=False)

In [None]:
from collections import defaultdict
def prepare_df(data):
    """Build the engagement index table from each entry's 'total' roll-up.

    For every name and every reaction type the value is
    ``100 * reactions / post_count / subscribers_count``.

    Args:
        data: mapping of name -> dict holding a 'total' dict with the keys
            'likes', 'reposts', 'comments', 'post_count' and 'subscribers_count'.

    Returns:
        pandas.DataFrame indexed by the keys of ``data`` with the columns
        'likes', 'reposts' and 'comments'. Entries with zero posts or zero
        subscribers get NaN instead of raising ZeroDivisionError.

    NOTE: other cells of this notebook define different functions under the same
    name ``prepare_df``; whichever definition ran last is the one in effect.
    """
    names = list(data)
    columns = {}
    for k in ('likes', 'reposts', 'comments'):
        values = []
        for name in names:
            total = data[name]['total']
            posts = total['post_count']
            subscribers = total['subscribers_count']
            if posts and subscribers:
                values.append(100 * total[k] / posts / subscribers)
            else:
                # The index is undefined without posts or without subscribers.
                values.append(float('nan'))
        columns[k] = pd.Series(values, index=names)
    return pd.DataFrame(columns)

In [None]:
# Rebuild `df` with the single-argument prepare_df defined in the cell above.
df = prepare_df(data)

In [None]:
# Stacked bar chart of the current `df`; the 'bank' row is left out of the chart.
df[df.index != 'bank'].plot(kind='bar', stacked=True, figsize=(12, 4), title='Индекс Вовлеченности')

In [None]:
# Sum only the reaction columns: dropping an existing 'Всего' column first keeps
# the cell idempotent, so re-running it does not fold the old total into the new one.
df['Всего'] = df.drop(columns='Всего', errors='ignore').sum(axis=1)
df.sort_values('Всего', ascending=False)

In [None]:
from collections import defaultdict
def prepare_df(data):
    """Build the engagement index table per social network.

    For every name and network the value is
    ``100 * (likes + reposts + comments) / post_count / subscribers_count``;
    a reaction type missing from a network's dict counts as 0.

    Args:
        data: mapping of name -> dict with 'twitter', 'fb' and 'vk' sub-dicts, each
            holding 'post_count' and 'subscribers_count' plus optional reaction counts.

    Returns:
        pandas.DataFrame indexed by the keys of ``data`` with the columns
        'twitter', 'fb' and 'vk'. Networks with zero posts or zero subscribers
        get NaN instead of raising ZeroDivisionError.

    NOTE: other cells of this notebook define different functions under the same
    name ``prepare_df``; whichever definition ran last is the one in effect.
    """
    names = list(data)
    columns = {}
    for social in ('twitter', 'fb', 'vk'):
        values = []
        for name in names:
            stats = data[name][social]
            sum_reactions = sum(stats.get(k, 0) for k in ('likes', 'reposts', 'comments'))
            posts = stats['post_count']
            subscribers = stats['subscribers_count']
            if posts and subscribers:
                values.append(100 * sum_reactions / posts / subscribers)
            else:
                # The index is undefined without posts or without subscribers.
                values.append(float('nan'))
        columns[social] = pd.Series(values, index=names)
    return pd.DataFrame(columns)

In [None]:
# Per-network table from the prepare_df defined in the cell above.
# NOTE(review): the 'Всего' ("Total") column holds the row mean here, not a sum - confirm the label.
df = prepare_df(data)
df['Всего'] = df.mean(axis=1)
df.sort_values('Всего', ascending=False)

In [None]:
from math import sqrt

In [None]:
cnt = 0
def draw_scatter(data, social, ax=None):
    """Scatter every entry by repost rate (x) versus like/comment rate (y).

    x is ``100000 * reposts / post_count / subscribers_count`` and y is
    ``100 * (likes + comments) / post_count / subscribers_count``; the marker
    size is ``int(sqrt(subscribers_count))``. The 'bank' entry is skipped, and so
    are entries with zero posts or zero subscribers (both rates would divide by zero).

    Args:
        data: mapping of name -> dict that contains a sub-dict under ``social``.
        social: key of the sub-dict to plot (e.g. 'total', 'twitter', 'fb', 'vk');
            also used as the subplot title.
        ax: axes-like object to draw on. Defaults to the current matplotlib axes,
            i.e. the subplot the calling cell created last.

    NOTE: a later cell defines another function under the same name
    ``draw_scatter``; whichever definition ran last is the one in effect.
    """
    if ax is None:
        ax = plt.gca()

    points_x = []
    points_y = []
    names = []
    sizes = []
    for name in data.keys():
        if name == 'bank':
            continue
        d = data[name][social]
        posts = d['post_count']
        subscribers = d['subscribers_count']
        if not posts or not subscribers:
            continue
        points_x.append(100000 * d['reposts'] / posts / subscribers)
        points_y.append(100 * (d['likes'] + d.get('comments', 0)) / posts / subscribers)
        names.append(name)
        sizes.append(int(sqrt(subscribers)))

    # A constant colour value with a named colormap gives all markers the same colour.
    ax.scatter(points_x, points_y, marker='o', c=[0] * len(sizes), s=sizes, cmap='Spectral')
    ax.set_title(social)
    ax.set_xlabel("Индекс Виральности")
    ax.set_ylabel("Индекс Вовлечённости")
    for label, x, y in zip(names, points_x, points_y):
        ax.annotate(
            label,
            xy=(x, y), xytext=(-20, 20),
            textcoords='offset points', ha='right', va='bottom',
            bbox=dict(boxstyle='round,pad=0.5', fc='yellow', alpha=0.5),
            arrowprops=dict(arrowstyle='->', connectionstyle='arc3,rad=0'))

In [None]:
# 2x2 grid of scatter plots: the cross-network total plus one panel per network.
# Only one figure is created; an extra bare plt.figure() would just add an empty figure.
fig = plt.figure(figsize=(12, 12))
for indx, social in enumerate(['total', 'twitter', 'fb', 'vk'], start=1):
    ax = fig.add_subplot(2, 2, indx)  # panel `indx` of the 2x2 grid; draw_scatter() renders into it
    draw_scatter(data, social)

In [None]:
import pandas as pd
from collections import defaultdict
def prepare_df(comments, start=None, end=None):
    """Count 'neg' and 'pos' comments per calendar day (UTC).

    Both the day index and the comment dates use UTC, so a comment can never fall
    on a day that is missing from the index just because of the local time zone.

    Args:
        comments: iterable of dicts with a 'sentiment' label and a 'date' Unix
            timestamp (anything ``int()`` accepts).
        start: first day of the index as a naive datetime; defaults to the
            module-level ``from_datetime``.
        end: naive UTC datetime bounding the index (exclusive); defaults to the
            current UTC time.

    Returns:
        pandas.DataFrame indexed by 'YYYY-MM-DD' strings, one row per day stepping
        from ``start`` while the step is earlier than ``end``, with the columns
        'neg' and 'pos'. Comments with another sentiment, or dated outside the
        index, are not counted.

    NOTE: other cells of this notebook define different functions under the same
    name ``prepare_df``; whichever definition ran last is the one in effect.
    """
    if start is None:
        start = from_datetime
    if end is None:
        # Naive "now" in UTC, comparable with the naive ``start``.
        end = datetime.datetime.now(datetime.timezone.utc).replace(tzinfo=None)

    indexes = []
    day = start
    while day < end:
        indexes.append(day.strftime('%Y-%m-%d'))
        day += datetime.timedelta(days=1)

    # Single pass over the comments; their order does not affect the counts.
    sentiments = ('neg', 'pos')
    counts = {sent: defaultdict(int) for sent in sentiments}
    for c in comments:
        sent = c['sentiment']
        if sent in counts:
            c_date = datetime.datetime.fromtimestamp(int(c['date']), tz=datetime.timezone.utc)
            counts[sent][c_date.strftime('%Y-%m-%d')] += 1

    df_data = {
        sent: pd.Series([counts[sent].get(day_str, 0) for day_str in indexes], index=indexes)
        for sent in sentiments
    }
    return pd.DataFrame(df_data)

In [None]:
# One stacked bar chart per name: daily 'neg'/'pos' counts of its VK comments.
# DataFrame.plot() opens its own figure on every call, so no figure is created up front
# (pre-created figures would only be rendered as empty output).
for name in data.keys():
    df = prepare_df(data[name]['vk']['comments_objects'])
    df.plot(kind='bar', stacked=True, figsize=(12, 5), title=name)

In [None]:
import pandas as pd
from collections import defaultdict
def prepare_df(data):
    """Tabulate the share of negative and positive comments per name.

    Each share is ``count / (neg + pos)`` taken from the entry's
    'sentiments_count' dict.

    Args:
        data: mapping of name -> dict holding a 'sentiments_count' dict with the
            keys 'neg' and 'pos'.

    Returns:
        pandas.DataFrame indexed by the keys of ``data`` with the columns 'neg'
        and 'pos'. Entries without any counted comment get NaN instead of raising
        ZeroDivisionError.

    NOTE: other cells of this notebook define different functions under the same
    name ``prepare_df``; whichever definition ran last is the one in effect.
    """
    names = list(data)
    columns = {}
    for sent in ('neg', 'pos'):
        shares = []
        for name in names:
            sc = data[name]['sentiments_count']
            total_com_count = sc['neg'] + sc['pos']
            if total_com_count:
                shares.append(sc[sent] / total_com_count)
            else:
                # No classified comments: the share is undefined.
                shares.append(float('nan'))
        columns[sent] = pd.Series(shares, index=names)
    return pd.DataFrame(columns)

In [None]:
# Stacked bar chart of the 'neg'/'pos' table from the cell above, ordered by the 'neg' column.
# NOTE(review): the chart title is an empty string - consider giving it a title.
df = prepare_df(data)
df = df.sort_values('neg')
df.plot(kind='bar', stacked=True, figsize=(10, 4), title='')

In [None]:
cnt = 0
def draw_scatter(data, ax=None):
    """Scatter every entry by VK comment sentiment (x) versus VK comment count (y).

    x is ``(pos - neg) / (pos + neg)`` from the entry's 'sentiments_count'; y is
    the 'comments' value of its 'vk' dict. Two markers are drawn per entry: one
    sized ``int(views / 100)`` and one sized ``post_count * 10``. Entries without
    any counted comment are skipped because x would divide by zero.

    Args:
        data: mapping of name -> dict with a 'vk' dict (keys 'comments', 'views',
            'post_count') and a 'sentiments_count' dict (keys 'pos', 'neg').
        ax: axes-like object to draw on. Defaults to the current matplotlib axes,
            i.e. the subplot the calling cell created last.

    NOTE: an earlier cell defines another function under the same name
    ``draw_scatter``; whichever definition ran last is the one in effect.
    """
    # The chart reads only the 'vk' data, so the title is fixed here instead of
    # depending on a loop variable leaked by an earlier cell.
    social = 'vk'
    if ax is None:
        ax = plt.gca()

    points_x = []
    points_y = []
    names = []
    sizes = []
    sizes2 = []
    for name in data.keys():
        d = data[name][social]
        sc = data[name]['sentiments_count']
        classified = sc['pos'] + sc['neg']
        if not classified:
            continue
        points_x.append((sc['pos'] - sc['neg']) / classified)
        points_y.append(d['comments'])
        names.append(name)
        sizes.append(int(d['views'] / 100))
        sizes2.append(d['post_count'] * 10)

    ax.scatter(points_x, points_y, marker='o', c=[20] * len(sizes), s=sizes, cmap='Greys')
    ax.scatter(points_x, points_y, marker='o', c=[0] * len(sizes), s=sizes2, cmap='Spectral')
    ax.set_title(social)
    ax.set_xlabel("тональность")
    ax.set_ylabel("комментарии")
    for label, x, y in zip(names, points_x, points_y):
        ax.annotate(
            label,
            xy=(x, y), xytext=(-20, 20),
            textcoords='offset points', ha='right', va='bottom',
            bbox=dict(boxstyle='round,pad=0.5', fc='yellow', alpha=0.5),
            arrowprops=dict(arrowstyle='->', connectionstyle='arc3,rad=0'))

In [None]:
# Single 7x7-inch figure for the sentiment scatter. Only one figure is created;
# an extra bare plt.figure() would just add an empty figure to the output.
fig = plt.figure(figsize=(7, 7))
ax = fig.add_subplot(1, 1, 1)  # the only subplot; draw_scatter() renders into it
draw_scatter(data)

In [None]:
# Number of VK comment objects stored for the 'bank' entry.
len(data['bank']['vk']['comments_objects'])

In [None]:
from collections import Counter

In [None]:
# Tally the objects that ner.get_objects() returns for the first 150 VK comments of
# 'bank', and remember which comments each object came from.
# NOTE(review): `ner` is not defined in the cells shown here - make sure it is created
# before this cell runs.
top_obj = Counter()
obj_text_map = defaultdict(list)
for c in data['bank']['vk']['comments_objects'][:150]:
    try:
        unused, objs = ner.get_objects(lib_nlp.vk_remove_mention(c['text']), need_clean=False)
    except Exception as exc:
        # Best effort: skip the comment, but report why instead of hiding the error.
        # (A bare ``except:`` would also swallow KeyboardInterrupt.)
        print('skipped comment:', repr(exc))
        continue
    for o in objs:
        obj_text_map[o].append(c)
    print(objs)
    top_obj.update(objs)

In [None]:
# The 30 most frequent objects with their counts.
top_obj.most_common(30)

In [None]:
def smart_part(obj, text):
    text = text.replace(',', '.').lower()
    parts = text.split('.')
    for p in parts:
        if obj in p:
            p = p.strip()
            words = p.split(' ')
            for i in range(len(words)):
                w = words[i]
                if obj in w:
                    from_pos = max(0, i - 4)
                    to_pos = min(len(words), i + 4)
                    return ' '.join(words[from_pos:to_pos])
    return ''

In [None]:
# For each of the 20 most frequent objects, print its name followed by snippets from
# at most 5 of its comments, stopping early once 4 non-empty snippets were printed.
for obj, cnt in top_obj.most_common(20):
    print(obj)
    good_cnt = 0
    for c in obj_text_map[obj][:5]:
        if obj not in c['text'].lower():
            continue
        short_txt = smart_part(obj, c['text'])
        if short_txt:
            good_cnt += 1
        print(short_txt)
        if good_cnt >= 4:
            break