# EDA

## Setup dependencies and config

In [1]:
import datetime
import matplotlib.pyplot as plt
import altair as alt

import numpy as np
from collections import Counter
import json
from pathlib import Path
import pandas as pd
from tqdm import tqdm

pd.set_option('display.max_rows', 500)
pd.set_option('display.max_columns', 500)
pd.set_option('display.width', 1000)

## Read data and transform

In [2]:
data = []
for p in tqdm(list(Path().glob('data/*.json')), desc='Load data'):
#     print(p)
    d = json.loads(p.read_text())
    data.extend(d['results'])

data_intreg = []

df_scraped_at, df_playlist, df_id, df_title, df_channel, df_duration, df_rank = [], [], [], [], [], [], []
for d in tqdm(data, desc='Transform data'):
    data_intreg.append([d['result']['fields']['videoCount'], len(d['result']['fields']['videos'])])
    
    for i, r in enumerate(d['result']['fields']['videos']):
        df_id.append(r['id'])
        # append channel + id to simplify analysis later on
        df_title.append(r['channelName'] + ' - ' + r['title'] + ' - ' + r['id'])
        df_channel.append(r['channelName'])
        df_duration.append(r['duration'])
        df_rank.append(i)
        
        df_scraped_at.append(d['scrapedAt'])
        df_playlist.append(d['label'])
df=pd.DataFrame(data={'scrapedAt':df_scraped_at, 'playlist': df_playlist, 'id': df_id, 'title': df_title, 'channel': df_channel, 'duration': df_duration, 'rank': df_rank})
df['scrapedAt'] = pd.to_datetime(df['scrapedAt'])
df['hour'] = df['scrapedAt'].dt.floor('h')
df['day'] = df['scrapedAt'].dt.floor('d')

if not Path('df.pkl').is_file():
    df.to_pickle('df.pkl')

Load data: 100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 49/49 [00:03<00:00, 15.11it/s]
Transform data: 100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 82890/82890 [00:01<00:00, 74858.77it/s]


### Check for data integriety

In [3]:
diff_counter = Counter([x - y for [x, y] in data_intreg])
diff_counter

Counter({1: 13384, 0: 68683, 2: 784, 3: 39})

In [4]:
diff_counter = Counter([x for [x, y] in data_intreg if x != y])
diff_counter

Counter({16: 12506,
         18: 160,
         17: 888,
         15: 590,
         12: 14,
         11: 13,
         14: 27,
         13: 7,
         6: 2})

There is some data missing when the playlist contain 16+ videos. Most likely because of lazy loading. This should however now affect the results.

## Some Statistics

In [5]:
min_day = df['day'].min()
max_day = df['day'].max()
diff_days = (max_day-min_day).days
diff_days

48

In [6]:
min_day

Timestamp('2021-10-29 00:00:00+0000', tz='UTC')

In [7]:
max_day

Timestamp('2021-12-16 00:00:00+0000', tz='UTC')

In [8]:
df_news = df[df['playlist'] == df.iloc[0]['playlist']]

In [9]:
dif_minutes= (df_news['scrapedAt'].max() - df_news['scrapedAt'].min()).seconds / 60

In [10]:
dif_minutes

1411.0

In [11]:
len(df_news['scrapedAt'].unique()) / dif_minutes

7.3437278525868175

How often was data collected? 7.3... times per hour. 

In [12]:
60 / 7.3437278525868175

8.170237405906196

Roughly every 8 minutes

## Analysis

### Plot counts for channels

In [13]:
def plot_counts(df, col, title, max_bars=None):
    counts = df[col].value_counts().reset_index().rename(columns={col:'counts', 'index': col})
    
    if max_bars is not None:
        counts=counts[:max_bars]
    
    chart = alt.Chart(counts).mark_bar().encode(
        x='counts',
        y=alt.Y(col + ':N', sort='-x',  axis=alt.Axis(labelLimit=500))
    ).properties(height=700, title=title)
    return chart

In [14]:
plot_counts(df, 'channel', 'Counts for all playlists')

In [47]:
counts_perc = df['channel'].value_counts().reset_index()

In [48]:
counts_perc['channel'] /= (df.shape[0] * 0.01)

In [49]:
print(counts_perc[:15].to_csv())

,index,channel
0,WELT Nachrichtensender,26.53799610999789
1,Handelsblatt,8.441647518776364
2,faz,7.264113505754165
3,BILD,7.222102462851459
4,AFP Deutschland,6.5517629940278175
5,tagesschau,6.02644123598203
6,SPORT BILD,5.836595415284671
7,DER AKTIONÄR TV,3.4638901000916156
8,BR24,3.2744117229181873
9,DW Deutsch,2.6586988579855273
10,phoenix,1.9148706843758114
11,WDR aktuell,1.672602920931053
12,ZDFheute Nachrichten,1.3265936025632858
13,DER SPIEGEL,1.2833577479141456
14,hessenschau,1.2439188096789522



In [16]:
df[df['channel'] == 'WELT Nachrichtensender'].shape[0] / df.shape[0]

0.2653799610999789

In [17]:
df[df['channel'] == 'WELT Nachrichtensender'].shape[0] / df.shape[0]

0.2653799610999789

In [18]:
playlists = df['playlist'].value_counts().keys()

In [19]:
playlist_charts = [plot_counts(df[df['playlist'] == p], 'channel', f"Counts for {p}") for p in playlists]

In [20]:
alt.hconcat(*playlist_charts)

### How often does a video stay in the playlists?

In [21]:
plot_counts(df, 'title', 'Counts for video id in all playlists', max_bars=50)

In [22]:
df['id'].value_counts()[:10]

POe2fMzceG0    9010
1dA5NSLqylU    7693
ViFXbAdMxZ8    6029
Eh48Xm8ISEA    5418
wF-6Etm27i4    4425
AR5mjnYpsIo    3893
FY-QO16YuAo    3828
zI2cjcWdpwU    3746
8tChnY_i4wI    3724
CeWXEMPLXNw    3508
Name: id, dtype: int64

In [23]:
playlist_charts_id = [plot_counts(df[df['playlist'] == p], 'title', f"Video counts for {p}", max_bars=50) for p in playlists]

In [24]:
alt.hconcat(*playlist_charts_id)

### Details about the most listed video

In [25]:
cleaned = df[df['id'] == 'POe2fMzceG0'].drop_duplicates(subset=['rank', 'day']).sort_values('scrapedAt')

In [26]:
cleaned

Unnamed: 0,scrapedAt,playlist,id,title,channel,duration,rank,hour,day
711966,2021-10-30 11:08:56.645000+00:00,Entertainment News / Unterhaltungsnachrichten ...,POe2fMzceG0,BILD - Bushido und Anna-Maria schonungslos ehr...,BILD,3124000,8,2021-10-30 11:00:00+00:00,2021-10-30 00:00:00+00:00
713217,2021-10-30 12:50:29.404000+00:00,Entertainment News / Unterhaltungsnachrichten ...,POe2fMzceG0,BILD - Bushido und Anna-Maria schonungslos ehr...,BILD,3124000,7,2021-10-30 12:00:00+00:00,2021-10-30 00:00:00+00:00
715771,2021-10-30 16:08:52.273000+00:00,Entertainment News / Unterhaltungsnachrichten ...,POe2fMzceG0,BILD - Bushido und Anna-Maria schonungslos ehr...,BILD,3124000,1,2021-10-30 16:00:00+00:00,2021-10-30 00:00:00+00:00
716120,2021-10-30 16:36:25.046000+00:00,Entertainment News / Unterhaltungsnachrichten ...,POe2fMzceG0,BILD - Bushido und Anna-Maria schonungslos ehr...,BILD,3124000,6,2021-10-30 16:00:00+00:00,2021-10-30 00:00:00+00:00
718653,2021-10-30 19:57:38.784000+00:00,Entertainment News / Unterhaltungsnachrichten ...,POe2fMzceG0,BILD - Bushido und Anna-Maria schonungslos ehr...,BILD,3124000,2,2021-10-30 19:00:00+00:00,2021-10-30 00:00:00+00:00
721665,2021-10-31 00:00:46.763000+00:00,Entertainment News / Unterhaltungsnachrichten ...,POe2fMzceG0,BILD - Bushido und Anna-Maria schonungslos ehr...,BILD,3124000,8,2021-10-31 00:00:00+00:00,2021-10-31 00:00:00+00:00
721988,2021-10-31 00:30:29.511000+00:00,Entertainment News / Unterhaltungsnachrichten ...,POe2fMzceG0,BILD - Bushido und Anna-Maria schonungslos ehr...,BILD,3124000,7,2021-10-31 00:00:00+00:00,2021-10-31 00:00:00+00:00
732922,2021-10-31 15:30:22.317000+00:00,Entertainment News / Unterhaltungsnachrichten ...,POe2fMzceG0,BILD - Bushido und Anna-Maria schonungslos ehr...,BILD,3124000,6,2021-10-31 15:00:00+00:00,2021-10-31 00:00:00+00:00
733156,2021-10-31 15:51:23.767000+00:00,Entertainment News / Unterhaltungsnachrichten ...,POe2fMzceG0,BILD - Bushido und Anna-Maria schonungslos ehr...,BILD,3124000,0,2021-10-31 15:00:00+00:00,2021-10-31 00:00:00+00:00
735284,2021-10-31 18:51:09.790000+00:00,Entertainment News / Unterhaltungsnachrichten ...,POe2fMzceG0,BILD - Bushido und Anna-Maria schonungslos ehr...,BILD,3124000,2,2021-10-31 18:00:00+00:00,2021-10-31 00:00:00+00:00


### Distribution over the days how ofte a video was listed

In [27]:
most_recommended_videos = list(df['title'].value_counts()[:100].keys())

In [28]:
def check_day(i):
    theday = min_day + datetime.timedelta(days=i)
    df_sub = df[df['day'] == theday]
    counts = [df_sub[df_sub['title'] == v].shape[0] for v in most_recommended_videos]
    return counts

In [29]:
x, y = np.meshgrid(range(0, diff_days), most_recommended_videos)
z = [check_day(i) for i in range(0, diff_days)]
# reshape & convert to np
z = np.array(list(zip(*z)))

In [30]:
# Convert this grid to columnar data expected by Altair
source = pd.DataFrame({'x': x.ravel(),
                     'y': y.ravel(),
                     'z': z.ravel()})

alt.Chart(source).mark_rect().encode(
    x='x:O',
    y=alt.Y('y:N',sort=None, axis=alt.Axis(labelLimit=500)),
    color='z:Q'
)

### Which videos were listed in more than one playlists?



In [31]:
df_stage1 = df.groupby(['title', 'playlist']).size().reset_index().rename(columns={0: 'count'})

In [32]:
df_stage1

Unnamed: 0,title,playlist,count
0,1. FC Kaiserslautern - FCK-Adventskalender 202...,Science and Technology News / Meldungen aus Wi...,31
1,1. FC Kaiserslautern - Pressekonferenz nach de...,Sports News / Sportnachrichten / PL4Yp_5ExVAU2...,109
2,1. FC Kaiserslautern - Pressekonferenz nach de...,Sports News / Sportnachrichten / PL4Yp_5ExVAU2...,401
3,1. FC Kaiserslautern - Pressekonferenz nach de...,Sports News / Sportnachrichten / PL4Yp_5ExVAU2...,352
4,1. FC Kaiserslautern - Pressekonferenz vor dem...,Sports News / Sportnachrichten / PL4Yp_5ExVAU2...,322
...,...,...,...
6273,"wetternet - Das ""Beast from the East"" lauert z...",National News / Nationale Nachrichten / PLNjtp...,23
6274,"wetternet - Das ""Beast from the East"" lauert z...",News / Nachrichten / PL3ZQ5CpNulQnRmIg0qmrmA-Q...,3
6275,wetternet - Geomagnetischer Sturm rast auf die...,National News / Nationale Nachrichten / PLNjtp...,4
6276,wetternet - Heftig: Eiskalte Januarprognose vo...,National News / Nationale Nachrichten / PLNjtp...,33


In [33]:
# In four different playlists

In [34]:
for group_name, df_group in df_stage1.groupby('title'):
    if df_group.shape[0] > 3:
        print(group_name)
        print(df_group)
        print()

BR24 - Corona-Lage: Ruf nach strengen Regeln wird lauter | BR24 Rundschau - 1UPr6iEABig
                                                  title                                           playlist  count
1261  BR24 - Corona-Lage: Ruf nach strengen Regeln w...  National News / Nationale Nachrichten / PLNjtp...    242
1262  BR24 - Corona-Lage: Ruf nach strengen Regeln w...  News / Nachrichten / PL3ZQ5CpNulQnRmIg0qmrmA-Q...    237
1263  BR24 - Corona-Lage: Ruf nach strengen Regeln w...  Sports News / Sportnachrichten / PL4Yp_5ExVAU2...     75
1264  BR24 - Corona-Lage: Ruf nach strengen Regeln w...  World News / Internationale Nachrichten / PLr1...    273

WELT Nachrichtensender - CORONA IN DER BUNDESLIGA: Bayern-Profi Joshua Kimmich will sich nun doch impfen lassen | Eilmeldung - lBOMv9Hg7sM
                                                  title                                           playlist  count
3200  WELT Nachrichtensender - CORONA IN DER BUNDESL...  National News / Nationale Nachr

In [35]:
# in three different playlists and in total over 1000 times listed
for group_name, df_group in df_stage1.groupby('title'):
    sum_counts = df_group['count'].sum()
    if df_group.shape[0] == 3 and sum_counts > 1000:
        print(group_name)
        print(df_group)
        print()

AFP Deutschland - Biontech-Chef für Corona-Booster nach drei Monaten | AFP - jXeYfK6J5Ao
                                                 title                                           playlist  count
132  AFP Deutschland - Biontech-Chef für Corona-Boo...  National News / Nationale Nachrichten / PLNjtp...    550
133  AFP Deutschland - Biontech-Chef für Corona-Boo...  News / Nachrichten / PL3ZQ5CpNulQnRmIg0qmrmA-Q...    706
134  AFP Deutschland - Biontech-Chef für Corona-Boo...  World News / Internationale Nachrichten / PLr1...    911

AFP Deutschland - Corona-Schnelltests ab sofort wieder kostenlos | AFP - zjQspMrTXTI
                                                 title                                           playlist  count
205  AFP Deutschland - Corona-Schnelltests ab sofor...  National News / Nationale Nachrichten / PLNjtp...    454
206  AFP Deutschland - Corona-Schnelltests ab sofor...  News / Nachrichten / PL3ZQ5CpNulQnRmIg0qmrmA-Q...    397
207  AFP Deutschland - Corona-Schn

In [36]:
df

Unnamed: 0,scrapedAt,playlist,id,title,channel,duration,rank,hour,day
0,2021-11-12 23:00:20.735000+00:00,News / Nachrichten / PL3ZQ5CpNulQnRmIg0qmrmA-Q...,gZD4hH8l3AA,AFP Deutschland - Niederlande kündigen Lockdow...,AFP Deutschland,45000,0,2021-11-12 23:00:00+00:00,2021-11-12 00:00:00+00:00
1,2021-11-12 23:00:20.735000+00:00,News / Nachrichten / PL3ZQ5CpNulQnRmIg0qmrmA-Q...,YBZ_GmnA7e8,Handelsblatt - Klimagipfel neigt sich dem Ende...,Handelsblatt,116000,1,2021-11-12 23:00:00+00:00,2021-11-12 00:00:00+00:00
2,2021-11-12 23:00:20.735000+00:00,News / Nachrichten / PL3ZQ5CpNulQnRmIg0qmrmA-Q...,erJgwCwk384,BILD - Putin schickt Atombomber nach Belarus –...,BILD,102000,2,2021-11-12 23:00:00+00:00,2021-11-12 00:00:00+00:00
3,2021-11-12 23:00:20.735000+00:00,News / Nachrichten / PL3ZQ5CpNulQnRmIg0qmrmA-Q...,z0lbTAewCjM,münchen.tv - Das sagen die Münchner Wirte zur ...,münchen.tv,126000,3,2021-11-12 23:00:00+00:00,2021-11-12 00:00:00+00:00
4,2021-11-12 23:00:20.735000+00:00,News / Nachrichten / PL3ZQ5CpNulQnRmIg0qmrmA-Q...,OEL1lYd5aKE,WELT Nachrichtensender - AKTUELLE CORONA-SCHOC...,WELT Nachrichtensender,59000,4,2021-11-12 23:00:00+00:00,2021-11-12 00:00:00+00:00
...,...,...,...,...,...,...,...,...,...
816447,2021-12-16 15:44:30.555000+00:00,Health News / Gesundheitsnachrichten / PLG3wws...,XUsX8MUlvjQ,DW Deutsch - 40 Jahre HIV/AIDS und noch immer ...,DW Deutsch,438000,2,2021-12-16 15:00:00+00:00,2021-12-16 00:00:00+00:00
816448,2021-12-16 15:44:30.555000+00:00,Health News / Gesundheitsnachrichten / PLG3wws...,mvUH-zJYHxA,tagesschau - Weltaidstag: Corona-Pandemie wirf...,tagesschau,566000,3,2021-12-16 15:00:00+00:00,2021-12-16 00:00:00+00:00
816449,2021-12-16 15:44:30.555000+00:00,Health News / Gesundheitsnachrichten / PLG3wws...,4v3nL7gBshA,faz - Videografik: So attackiert HIV das mensc...,faz,77000,4,2021-12-16 15:00:00+00:00,2021-12-16 00:00:00+00:00
816450,2021-12-16 15:44:30.555000+00:00,Health News / Gesundheitsnachrichten / PLG3wws...,CeWXEMPLXNw,DER SPIEGEL - Recherche auf Corona-Intensivsta...,DER SPIEGEL,340000,5,2021-12-16 15:00:00+00:00,2021-12-16 00:00:00+00:00


In [37]:
def jaccard_similarity(list1, list2):
    s1 = set(list1)
    s2 = set(list2)
    return float(len(s1.intersection(s2)) / len(s1.union(s2)))

In [38]:
import statistics

In [39]:
for playlist, df_group in df.groupby('playlist'):
    prev_titles = None

    same = 0
    notsame = 0
    dists = []

    idx = 0
    for group_name, df_group2 in df_group.sort_values(by=['scrapedAt']).groupby('scrapedAt'):
        title_list = df_group2['title'].tolist()

        if not prev_titles is None:
            if title_list == prev_titles:
                same += 1
            else:
                notsame += 1
            dists.append(jaccard_similarity(title_list, prev_titles))

        prev_titles = title_list
        # if idx > 20:
        #     break
        # idx += 1
        # print(group_name)
        # print(df_group)
    print(playlist)
    print(same)
    print(notsame)
    print(same / notsame)
    print(sum(dists) / len(dists))
    print(statistics.mean(dists))
    print()

Business News / Wirtschaftsmeldungen / PLQ3HMgwndlsXYLmN7jEl7JEnjmttNIPjP
505
9855
0.05124302384576357
0.9918456457742129
0.9918456457742172

Entertainment News / Unterhaltungsnachrichten / PLivYonEKHnxz0k2KP8IybytVIykYJK6R4
851
9509
0.08949416342412451
0.9904886565600838
0.9904886565600851

Health News / Gesundheitsnachrichten / PLG3wws6vwyGOxK7vHMDAiAvTm9PnSRJ7T
4780
5556
0.8603311735061195
0.9993407231313579
0.9993407231313578

National News / Nationale Nachrichten / PLNjtpXOAJhQJYbpJxMnoLKCUPanyEfv_j
448
9913
0.04519318067184505
0.9642750850681886
0.964275085068181

News / Nachrichten / PL3ZQ5CpNulQnRmIg0qmrmA-Q8VTLVJNmp
326
10035
0.032486297957149975
0.9549776942161715
0.9549776942161613

Science and Technology News / Meldungen aus Wissenschaft und Technik / PLZ3fbv488-iVNNpXJVGcgv_V4uIXtNSyV
3999
162
24.685185185185187
0.9955138989025075
0.9955138989025074

Sports News / Sportnachrichten / PL4Yp_5ExVAU2cZMOl2we9ArKj7FiNhPGw
369
9991
0.03693323991592433
0.9860222245375417
0.986022