In [None]:
# Import packages
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import dtale
from clean import CleanData
import seaborn as sns
import json
from sklearn.linear_model import LinearRegression
import statsmodels.api as sm

In [None]:
# Read the raw data as JSON. The context manager closes the file even when
# json.load() raises (e.g. on malformed input), which the manual
# open()/close() pair did not guarantee.
with open('data/trending.json', encoding="utf8") as file:
    raw_data = json.load(file)

# Select only the list with the video data
trending_videos_list = raw_data['collector']

# Example of a video object
print(json.dumps(trending_videos_list[15], indent=4, sort_keys=True))

In [None]:
# Build the working frame from the raw video list, then unnest the
# list-valued 'hashtags' and 'mentions' cells so that every list element
# ends up on a row of its own
df_tiktok_dataset = (
    pd.DataFrame(data=trending_videos_list)
    .explode('hashtags')
    .explode('mentions')
)

In [None]:
# Second frame built from the same raw video list, without exploding any column
df_tiktok_music = pd.DataFrame(data=trending_videos_list)

In [None]:
def object_to_columns(dfRow: pd.Series, **kwargs: str) -> pd.Series:
    '''Expand dictionary-valued cells of a row into flat ``prefix.key`` entries.

    Meant to be used as ``df.apply(object_to_columns, axis=1, column=prefix, ...)``.

    Parameters
    ----------
    dfRow : pd.Series
        A single row. It is left untouched; the expansion is done on a copy.
    **kwargs : str
        ``column=prefix`` pairs. For every ``column`` whose cell is a ``dict``,
        each ``key: value`` of that dict is stored under the label
        ``'<prefix>.<key>'``. Cells that are not dicts are skipped.

    Returns
    -------
    pd.Series
        Copy of ``dfRow`` extended with the generated entries. The source
        columns themselves are kept.

    Raises
    ------
    KeyError
        If one of the requested columns is not present in the row.
    '''
    # pandas does not support functions that mutate the object handed to
    # DataFrame.apply, so the new entries are written to a copy of the row.
    expanded = dfRow.copy()
    for column, prefix in kwargs.items():
        cell = expanded[column]
        if isinstance(cell, dict):
            for key, value in cell.items():
                expanded['{}.{}'.format(prefix, key)] = value
    return expanded

In [None]:
# Flatten the dictionary-valued columns. Keys are the source columns, values
# are the prefixes used for the generated '<prefix>.<key>' columns.
dict_column_prefixes = {
    'authorMeta': 'authorMeta',
    'musicMeta': 'musicMeta',
    'covers': 'cover',
    'videoMeta': 'videoMeta',
    'hashtags': 'hashtag',
}
df_tiktok_dataset = df_tiktok_dataset.apply(
    object_to_columns, axis=1, **dict_column_prefixes)


In [None]:
# Drop the columns that still hold the original dictionaries
expanded_source_columns = ['authorMeta', 'musicMeta', 'covers', 'videoMeta', 'hashtags']
df_tiktok_dataset = df_tiktok_dataset.drop(columns=expanded_source_columns)
df_tiktok_dataset

In [None]:
# De-duplicate on each entity's identifier column, keeping the first occurrence
df_unique_videos = df_tiktok_dataset.drop_duplicates(subset=['id'], keep='first')
df_unique_music = df_tiktok_dataset.drop_duplicates(
    subset=['musicMeta.musicId'], keep='first')
df_unique_authors = df_tiktok_dataset.drop_duplicates(
    subset=['authorMeta.id'], keep='first')

# (rows, columns) of every frame, keyed by variable name
dict(
    df_tiktok_dataset=df_tiktok_dataset.shape,
    df_unique_videos=df_unique_videos.shape,
    df_unique_music=df_unique_music.shape,
    df_unique_authors=df_unique_authors.shape,
)

# FIXME: problem finding the Audd data files (they are read in the next cell)

In [None]:
# Import Audd Data
# All Audd CSV files are read from the same directory; keep that location in
# one place so it only has to be changed once if the files live elsewhere.
AUDD_DIR = '../input/tiktok-trending-december-2020/audd'

# The main Audd table uses its 'id' column as the index
df_audd_music = pd.read_csv(AUDD_DIR + '/audd_music.csv', index_col='id')
df_audd_music_apple = pd.read_csv(AUDD_DIR + '/audd_music_apple_music.csv')
df_audd_music_spotify = pd.read_csv(AUDD_DIR + '/audd_music_spotify_music.csv')
df_audd_music_spotify_artists = pd.read_csv(
    AUDD_DIR + '/audd_music_spotify_music_artists.csv')


In [None]:
# The current version of the dataset contains duplicated rows, let's remove them.
# DataFrame.drop_duplicates() ignores the index, and 'id' is the index of this
# frame, so it is moved into the columns for the comparison. Otherwise rows
# with different ids but identical values would be collapsed into one and the
# dropped ids could no longer be matched when merging on 'id'.
df_audd_music = (
    df_audd_music
    .reset_index()
    .drop_duplicates()
    .set_index('id')
)

# Add prefix to this dataset, before merging (only column labels are
# prefixed; the index keeps the name 'id')
df_audd_music = df_audd_music.add_prefix('_audd_music.')
df_audd_music.shape

In [None]:
# Start again from the raw video list and flatten only the music metadata
# into 'musicMeta.<key>' columns
df_tiktok_music = pd.DataFrame(data=trending_videos_list).apply(
    object_to_columns, axis=1, musicMeta='musicMeta')

# Cast the music id to int64 so it can be used as a merge key
df_tiktok_music = df_tiktok_music.astype({'musicMeta.musicId': 'int64'})
df_tiktok_music.shape

In [None]:
# Left-join the Audd data onto the TikTok rows: every TikTok row is kept and
# rows without a matching Audd 'id' get missing values in the Audd columns
df_tiktok_audd_music = pd.merge(
    df_tiktok_music,
    df_audd_music,
    how='left',
    left_on='musicMeta.musicId',
    right_on='id',
)
df_tiktok_audd_music.shape


In [None]:
# Keep only the rows whose TikTok music name is 'origineel geluid'
# (Dutch for "original sound") and for which the Audd data has an artist
has_generic_name = df_tiktok_audd_music['musicMeta.musicName'] == 'origineel geluid'
has_audd_artist = df_tiktok_audd_music['_audd_music.artist'].notna()
df_tiktok_audd_music = df_tiktok_audd_music[has_generic_name & has_audd_artist]
df_tiktok_audd_music
