In [1]:
# Import packages
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import dtale
from clean import CleanData
import seaborn as sns
import json
from sklearn.linear_model import LinearRegression
import statsmodels.api as sm

In [2]:
# Open file with the raw data
file = open('data/trending.json',
            encoding="utf8")

# Load data as JSON
raw_data = json.load(file)

# Close the original file
file.close()

# Select only the list with the video data
trending_videos_list = raw_data['collector']

# Example of a video object
print(json.dumps(trending_videos_list[15], indent=4, sort_keys=True))

{
    "authorMeta": {
        "avatar": "https://p77-sign-sg.tiktokcdn.com/tos-maliva-avt-0068/ce5595b5c4874a234d0e41feec9b4ef9~c5_1080x1080.jpeg?x-expires=1608649200&x-signature=HXQeWxkqcbcR2JEz9JRdZWOhJ%2Fw%3D",
        "id": "6563548229581422598",
        "name": "jhullyduarte",
        "nickName": "Jhully Duarte",
        "secUid": "MS4wLjABAAAAByNqbJCtA9PfwY1ICwGomesu7URs6GhushkQeWleqhqsTSRmCKKi1GpUXOWabnbI",
        "signature": "\ud83e\udd2a\ud83e\udd2a",
        "verified": false
    },
    "commentCount": 267,
    "covers": {
        "default": "https://p16-sign-sg.tiktokcdn.com/obj/tos-maliva-p-0068/6416d73acd5a47c0bb04c0982e4b973e?x-expires=1608584400&x-signature=qa5fI0p5HrD71YMq70ImAirJtzI%3D",
        "dynamic": "https://p16-sign-sg.tiktokcdn.com/obj/tos-maliva-p-0068/b45558f5231340cdbf9f34f5574cc98c_1608239182?x-expires=1608584400&x-signature=IWs7ofM62Z2%2FcvyxJhpYLZowafM%3D",
        "origin": "https://p16-sign-sg.tiktokcdn.com/obj/tos-maliva-p-0068/c137bcc58fcc4798b3052

In [3]:
# Create a DataFrame of the data
df_tiktok_dataset = pd.DataFrame(trending_videos_list)

# Let's expand the hashtag cell containing lists to multiple rows
df_tiktok_dataset = df_tiktok_dataset.explode('hashtags').explode('mentions')

In [4]:
# Create a DataFrame of the data
df_tiktok_music = pd.DataFrame(trending_videos_list)

In [5]:
# Create a function
def object_to_columns(dfRow, **kwargs):
    '''Function to expand cells containing dictionaries, to columns'''
    for column, prefix in kwargs.items():
        if isinstance(dfRow[column], dict):
            for key, value in dfRow[column].items():
                columnName = '{}.{}'.format(prefix, key)
                dfRow[columnName] = value
    return dfRow

In [6]:
# Expand certain cells containing dictionaries to columns
df_tiktok_dataset = df_tiktok_dataset.apply(object_to_columns,
                                            authorMeta='authorMeta',
                                            musicMeta='musicMeta',
                                            covers='cover',
                                            videoMeta='videoMeta',
                                            hashtags='hashtag', axis=1)


In [7]:
# Remove the original columns containing the dictionaries
df_tiktok_dataset = df_tiktok_dataset.drop(
    ['authorMeta', 'musicMeta', 'covers', 'videoMeta', 'hashtags'], axis=1)
df_tiktok_dataset

Unnamed: 0,authorMeta.avatar,authorMeta.id,authorMeta.name,authorMeta.nickName,authorMeta.secUid,authorMeta.signature,authorMeta.verified,commentCount,cover.default,cover.dynamic,...,musicMeta.playUrl,playCount,shareCount,text,videoMeta.duration,videoMeta.height,videoMeta.width,videoUrl,videoUrlNoWaterMark,webVideoUrl
0,https://p16-sign-sg.tiktokcdn.com/aweme/1080x1...,6825540583826768902,ninakleij,Nina,MS4wLjABAAAA1FfFjRMUzr0hX2YPT7pRr7bCPQWa-kU_kV...,don’t tell my instagram I’m here\n19 👸🏼,False,68,https://p16-sign-sg.tiktokcdn.com/obj/tos-alis...,https://p16-sign-sg.tiktokcdn.com/obj/tos-alis...,...,https://sf16-sg.tiktokcdn.com/obj/tos-alisg-v-...,44800,50,Confidence went 📈,15,1024,576,https://v77.tiktokcdn.com/ed1f811617d7b5e18b8d...,,https://www.tiktok.com/@ninakleij/video/690722...
1,https://p16-sign-va.tiktokcdn.com/musically-ma...,6729292817489986566,joeysofo,JoeySofo,MS4wLjABAAAAvkCSTiPWJm7Ctqp7AN3mauS_Bi8tVrbtBg...,Tinder couldn’t help me lose my virginity so I...,False,936,https://p16-sign-sg.tiktokcdn.com/obj/tos-mali...,https://p16-sign-sg.tiktokcdn.com/obj/tos-mali...,...,https://sf77-sg.tiktokcdn.com/obj/musically-ma...,838100,1817,Quiet Zone... follow me on insta: joeysofo. Co...,11,1024,576,https://v77.tiktokcdn.com/ab935f1975cb8b69aebf...,,https://www.tiktok.com/@joeysofo/video/6875468...
2,https://p16-sign-sg.tiktokcdn.com/aweme/1080x1...,6791901371429913601,jackeyephone,JackJacko,MS4wLjABAAAAsI8XQOceYtnIhIbLZLhvz24tOWdWYavlPe...,Zakelijk 📩 jackeyephone@gmail.com\nInstagram: ...,False,27100,https://p16-sign-sg.tiktokcdn.com/obj/tos-alis...,https://p16-sign-sg.tiktokcdn.com/obj/tos-alis...,...,https://sf16-sg.tiktokcdn.com/obj/tos-alisg-v-...,15300000,21100,Iphone bend test🤗 #tiktok #viral #fyp #iphone ...,19,960,540,https://v21.tiktokcdn.com/video/tos/alisg/tos-...,,https://www.tiktok.com/@jackeyephone/video/689...
2,https://p16-sign-sg.tiktokcdn.com/aweme/1080x1...,6791901371429913601,jackeyephone,JackJacko,MS4wLjABAAAAsI8XQOceYtnIhIbLZLhvz24tOWdWYavlPe...,Zakelijk 📩 jackeyephone@gmail.com\nInstagram: ...,False,27100,https://p16-sign-sg.tiktokcdn.com/obj/tos-alis...,https://p16-sign-sg.tiktokcdn.com/obj/tos-alis...,...,https://sf16-sg.tiktokcdn.com/obj/tos-alisg-v-...,15300000,21100,Iphone bend test🤗 #tiktok #viral #fyp #iphone ...,19,960,540,https://v21.tiktokcdn.com/video/tos/alisg/tos-...,,https://www.tiktok.com/@jackeyephone/video/689...
2,https://p16-sign-sg.tiktokcdn.com/aweme/1080x1...,6791901371429913601,jackeyephone,JackJacko,MS4wLjABAAAAsI8XQOceYtnIhIbLZLhvz24tOWdWYavlPe...,Zakelijk 📩 jackeyephone@gmail.com\nInstagram: ...,False,27100,https://p16-sign-sg.tiktokcdn.com/obj/tos-alis...,https://p16-sign-sg.tiktokcdn.com/obj/tos-alis...,...,https://sf16-sg.tiktokcdn.com/obj/tos-alisg-v-...,15300000,21100,Iphone bend test🤗 #tiktok #viral #fyp #iphone ...,19,960,540,https://v21.tiktokcdn.com/video/tos/alisg/tos-...,,https://www.tiktok.com/@jackeyephone/video/689...
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
999,https://p77-sign-sg.tiktokcdn.com/musically-ma...,6798143079889470469,erinwilliams_1,Erin Williams,MS4wLjABAAAAdEeo7qMCWpjEHQIY8SBaOT4g79XSLKQz0l...,International Dressage Rider for GB 🇬🇧 \nIG: @...,False,874,https://p16-sign-sg.tiktokcdn.com/tos-alisg-p-...,https://p16-sign-sg.tiktokcdn.com/obj/tos-alis...,...,https://sf77-sg.tiktokcdn.com/obj/tiktok-obj/7...,309000,235,"The collab you didn’t know you needed, myself ...",14,1022,576,https://v77.tiktokcdn.com/a6a5e4b4310213520be7...,,https://www.tiktok.com/@erinwilliams_1/video/6...
999,https://p77-sign-sg.tiktokcdn.com/musically-ma...,6798143079889470469,erinwilliams_1,Erin Williams,MS4wLjABAAAAdEeo7qMCWpjEHQIY8SBaOT4g79XSLKQz0l...,International Dressage Rider for GB 🇬🇧 \nIG: @...,False,874,https://p16-sign-sg.tiktokcdn.com/tos-alisg-p-...,https://p16-sign-sg.tiktokcdn.com/obj/tos-alis...,...,https://sf77-sg.tiktokcdn.com/obj/tiktok-obj/7...,309000,235,"The collab you didn’t know you needed, myself ...",14,1022,576,https://v77.tiktokcdn.com/a6a5e4b4310213520be7...,,https://www.tiktok.com/@erinwilliams_1/video/6...
999,https://p77-sign-sg.tiktokcdn.com/musically-ma...,6798143079889470469,erinwilliams_1,Erin Williams,MS4wLjABAAAAdEeo7qMCWpjEHQIY8SBaOT4g79XSLKQz0l...,International Dressage Rider for GB 🇬🇧 \nIG: @...,False,874,https://p16-sign-sg.tiktokcdn.com/tos-alisg-p-...,https://p16-sign-sg.tiktokcdn.com/obj/tos-alis...,...,https://sf77-sg.tiktokcdn.com/obj/tiktok-obj/7...,309000,235,"The collab you didn’t know you needed, myself ...",14,1022,576,https://v77.tiktokcdn.com/a6a5e4b4310213520be7...,,https://www.tiktok.com/@erinwilliams_1/video/6...
999,https://p77-sign-sg.tiktokcdn.com/musically-ma...,6798143079889470469,erinwilliams_1,Erin Williams,MS4wLjABAAAAdEeo7qMCWpjEHQIY8SBaOT4g79XSLKQz0l...,International Dressage Rider for GB 🇬🇧 \nIG: @...,False,874,https://p16-sign-sg.tiktokcdn.com/tos-alisg-p-...,https://p16-sign-sg.tiktokcdn.com/obj/tos-alis...,...,https://sf77-sg.tiktokcdn.com/obj/tiktok-obj/7...,309000,235,"The collab you didn’t know you needed, myself ...",14,1022,576,https://v77.tiktokcdn.com/a6a5e4b4310213520be7...,,https://www.tiktok.com/@erinwilliams_1/video/6...


In [8]:
# Get unique rows from dataset
df_unique_videos = df_tiktok_dataset.drop_duplicates(subset='id', keep="first")
df_unique_music = df_tiktok_dataset.drop_duplicates(
    subset='musicMeta.musicId', keep="first")
df_unique_authors = df_tiktok_dataset.drop_duplicates(
    subset='authorMeta.id', keep="first")

# Show amount of rows per dataset
{
    'df_tiktok_dataset': df_tiktok_dataset.shape,
    'df_unique_videos': df_unique_videos.shape,
    'df_unique_music': df_unique_music.shape,
    'df_unique_authors': df_unique_authors.shape
}

{'df_tiktok_dataset': (5693, 37),
 'df_unique_videos': (1000, 37),
 'df_unique_music': (907, 37),
 'df_unique_authors': (802, 37)}

In [10]:
# Import Audd Data
df_audd_music = pd.read_csv('data/audd_music.csv', index_col='id')
df_audd_music_apple = pd.read_csv('data/audd_music_apple_music.csv')
df_audd_music_spotify = pd.read_csv('data/audd_music_spotify_music.csv')
df_audd_music_spotify_artists = pd.read_csv('data/audd_music_spotify_music_artists.csv')


In [11]:
# The current version of the dataset contains duplicated rows, let's remove them
df_audd_music = df_audd_music.drop_duplicates()

# Add prefix to this dataset, before merging
df_audd_music = df_audd_music.add_prefix('_audd_music.')
df_audd_music.shape

(471, 9)

In [12]:
# Create a DataFrame of the data
df_tiktok_music = pd.DataFrame(trending_videos_list)

# Expand certain cells containing dictionaries to columns
df_tiktok_music = df_tiktok_music.apply(object_to_columns,
                                        musicMeta='musicMeta', axis=1)

# Convert the column dtype to int64 so we can merge
df_tiktok_music['musicMeta.musicId'] = df_tiktok_music['musicMeta.musicId'].astype(
    'int64')
df_tiktok_music.shape

(1000, 25)

In [13]:
# Merge the Audd and TikTok data
df_tiktok_audd_music = df_tiktok_music.merge(
    df_audd_music, how='left', right_on='id', left_on='musicMeta.musicId')
df_tiktok_audd_music.shape


(1000, 34)

In [14]:
df_tiktok_audd_music = df_tiktok_audd_music[(
    df_tiktok_audd_music['musicMeta.musicName'] == 'origineel geluid') & df_tiktok_audd_music['_audd_music.artist'].notna()]
df_tiktok_audd_music


Unnamed: 0,id,text,createTime,authorMeta,musicMeta,covers,webVideoUrl,videoUrl,videoUrlNoWaterMark,videoMeta,...,musicMeta.coverLarge,_audd_music.artist,_audd_music.title,_audd_music.album,_audd_music.release_date,_audd_music.label,_audd_music.timecode,_audd_music.song_link,_audd_music.apple_music.isrc,_audd_music.spotify.id
6,6895303013867539713,Oh no,1605437840,"{'id': '6879814870579512326', 'secUid': 'MS4wL...","{'musicId': '6893870343761496834', 'musicName'...",{'default': 'https://p16-sign-sg.tiktokcdn.com...,https://www.tiktok.com/@milanvannleeuwen/video...,https://v77.tiktokcdn.com/51d223926618e0839ece...,,"{'height': 1024, 'width': 576, 'duration': 11}",...,https://p16-sign-sg.tiktokcdn.com/aweme/1080x1...,LPTHERAPPER,Lmss,Lmss,2018-10-15,AK Noise,00:17,https://lis.tn/Lmss,,
150,6889831681469975810,Fishikta ameley❤️#eritreanmusic#habeshatiktok#...,1604163946,"{'id': '6775997579706008582', 'secUid': 'MS4wL...","{'musicId': '6881382384315419394', 'musicName'...",{'default': 'https://p16-sign-sg.tiktokcdn.com...,https://www.tiktok.com/@nardosabrahale/video/6...,https://v77.tiktokcdn.com/a2cabbf8ac7d8910a355...,,"{'height': 1024, 'width': 576, 'duration': 15}",...,https://p16-sign-sg.tiktokcdn.com/aweme/1080x1...,Abraham Afewerki,Semay,Semay,2006-06-01,Negarit Production,01:36,https://lis.tn/Semay,,
202,6883565215552654593,we surprised my mom with her dream car for her...,1602704922,"{'id': '6781188628154942469', 'secUid': 'MS4wL...","{'musicId': '6883565213174598401', 'musicName'...",{'default': 'https://p16-sign-sg.tiktokcdn.com...,https://www.tiktok.com/@curlsnclouds/video/688...,https://v77.tiktokcdn.com/ce8dacbe8736107f2db1...,,"{'height': 1024, 'width': 576, 'duration': 54}",...,https://p16-sign-sg.tiktokcdn.com/musically-ma...,Ritt Momney,Put Your Records On,Put Your Records On,2020-04-24,QuarterZip,00:20,https://lis.tn/PutYourRecordsOn,,
242,6876145412105899265,De trend na doen toch 🙃🙃 🏐🏐🏐#volleybal #traini...,1600977365,"{'id': '6725806948447060997', 'secUid': 'MS4wL...","{'musicId': '6875389627230915330', 'musicName'...",{'default': 'https://p16-sign-sg.tiktokcdn.com...,https://www.tiktok.com/@detwensechicks/video/6...,https://v19.tiktokcdn.com/0a2ac02f6301fa9975c3...,,"{'height': 1024, 'width': 576, 'duration': 11}",...,https://p16-sign-sg.tiktokcdn.com/musically-ma...,Lehambar Hussainpuri,Sadi Gali,Tanu Weds Manu,2011-02-02,T-Series,00:51,https://lis.tn/SadiGali,,1tEto4JrqNmBZFH5uAiYqb
297,6884691950478298370,🦋🦋🦋🦋🦋vlinders,1602967260,"{'id': '56906553353601024', 'secUid': 'MS4wLjA...","{'musicId': '6864179215273659141', 'musicName'...",{'default': 'https://p16-sign-sg.tiktokcdn.com...,https://www.tiktok.com/@maximeee.r/video/68846...,https://v77.tiktokcdn.com/38aae384548de6ad3c93...,,"{'height': 1024, 'width': 576, 'duration': 7}",...,https://p16-sign-sg.tiktokcdn.com/musically-ma...,Kris Kross Amsterdam,Mij Niet Eens Gezien,Mij Niet Eens Gezien,2020-07-24,WMG - Spinnin' Records (Distribution),02:33,https://lis.tn/MijNietEensGezien,NLZ542001176,39X7P5VjmG0zk8efBJL2HD
298,6888218363923942657,Legend dat ie dat durft,1603788318,"{'id': '6592928888640536582', 'secUid': 'MS4wL...","{'musicId': '6888218379384064769', 'musicName'...",{'default': 'https://p16-sign-sg.tiktokcdn.com...,https://www.tiktok.com/@waaromjaron/video/6888...,https://v77.tiktokcdn.com/664f938bc427587e8d5f...,,"{'height': 1024, 'width': 576, 'duration': 24}",...,https://p16-sign-sg.tiktokcdn.com/aweme/1080x1...,Fernando Velázquez,The Monster Wakes Up,A Monster Calls (Original Motion Picture Sound...,2016-12-09,Back Lot Music,02:53,https://lis.tn/TheMonsterWakesUp,USQ4E1602375,1XkvBIOIf4up5aTusU0vf2
334,6898350532415540482,#soundwavestattoo #spotifytattoo #tattoo #amst...,1606147396,"{'id': '6789291619143123974', 'secUid': 'MS4wL...","{'musicId': '6898350581761477377', 'musicName'...",{'default': 'https://p16-sign-sg.tiktokcdn.com...,https://www.tiktok.com/@mstar.ink/video/689835...,https://v33.tiktokcdn.com/3f7710e225edc9013019...,,"{'height': 1020, 'width': 576, 'duration': 14}",...,https://p16-sign-sg.tiktokcdn.com/musically-ma...,Stef Bos,Papa,Is Dit Nu Later,1990-03-16,Hkm Records nv,00:03,https://lis.tn/mKsZTP,BEJ019000009,6NP6BCW2M2I4vdcnXMAvjl
347,6876860979787959554,#fy #voorjou #foryou #fördich #covid #quarantine,1601143970,"{'id': '6811941079287743494', 'secUid': 'MS4wL...","{'musicId': '6876860989241838338', 'musicName'...",{'default': 'https://p16-sign-sg.tiktokcdn.com...,https://www.tiktok.com/@vooraluwlekkerefilmjos...,https://v77.tiktokcdn.com/9977049fdcd9b8c45d61...,,"{'height': 1024, 'width': 576, 'duration': 13}",...,https://p16-sign-sg.tiktokcdn.com/musically-ma...,The Love Unlimited Orchestra,Love's Theme,100 Essential Hits - 70s,2011-01-21,UMG - Brunswick,00:55,https://lis.tn/xsWYnp,USUMG0000356,4VpLTZ81muzN8ixvIvhmAt
348,6895338661622123777,Follow the chain #bottle #viral #bottleflipcha...,1605446140,"{'id': '6843492627725059078', 'secUid': 'MS4wL...","{'musicId': '6895338668958042882', 'musicName'...",{'default': 'https://p16-sign-sg.tiktokcdn.com...,https://www.tiktok.com/@whassup_bro/video/6895...,https://v77.tiktokcdn.com/52650d3235aa85e994d6...,,"{'height': 1024, 'width': 576, 'duration': 17}",...,https://p16-sign-sg.tiktokcdn.com/aweme/1080x1...,Norman Price,Gdzie Jest Biały Węgorz (Zejście),Narkotyki Są Nielegalne,2015-12-03,Gamellon,00:14,https://lis.tn/cuVkQ,,
487,6903854177072499969,Ruut doet mee aan de hype 😂 #foryou #fyp #voor...,1607428817,"{'id': '6652957483734974469', 'secUid': 'MS4wL...","{'musicId': '6903854187734452993', 'musicName'...",{'default': 'https://p16-sign-sg.tiktokcdn.com...,https://www.tiktok.com/@npo3nl/video/690385417...,https://v77.tiktokcdn.com/b532cfa208f3160ec203...,,"{'height': 1024, 'width': 576, 'duration': 14}",...,https://p16-sign-sg.tiktokcdn.com/aweme/1080x1...,Rolf Sanchez,Más Más Más,Más Más Más,2020-07-17,8ball Music,00:55,https://lis.tn/M%C3%A1sM%C3%A1sM%C3%A1s,NLZ292000124,28hEtjNvlDhtb38fgXsLRa


In [15]:
df_tiktok_audd_music.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 25 entries, 6 to 994
Data columns (total 34 columns):
 #   Column                        Non-Null Count  Dtype 
---  ------                        --------------  ----- 
 0   id                            25 non-null     object
 1   text                          25 non-null     object
 2   createTime                    25 non-null     int64 
 3   authorMeta                    25 non-null     object
 4   musicMeta                     25 non-null     object
 5   covers                        25 non-null     object
 6   webVideoUrl                   25 non-null     object
 7   videoUrl                      25 non-null     object
 8   videoUrlNoWaterMark           25 non-null     object
 9   videoMeta                     25 non-null     object
 10  diggCount                     25 non-null     int64 
 11  shareCount                    25 non-null     int64 
 12  playCount                     25 non-null     int64 
 13  commentCount         

In [18]:
# Converting Apple Music and Spotify columns into binary variables
## Create new columns
df_tiktok_audd_music['spotify'] = ''
df_tiktok_audd_music['apple_music'] = ''


In [19]:
# Check to see if columns were made
df_tiktok_audd_music.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 25 entries, 6 to 994
Data columns (total 36 columns):
 #   Column                        Non-Null Count  Dtype 
---  ------                        --------------  ----- 
 0   id                            25 non-null     object
 1   text                          25 non-null     object
 2   createTime                    25 non-null     int64 
 3   authorMeta                    25 non-null     object
 4   musicMeta                     25 non-null     object
 5   covers                        25 non-null     object
 6   webVideoUrl                   25 non-null     object
 7   videoUrl                      25 non-null     object
 8   videoUrlNoWaterMark           25 non-null     object
 9   videoMeta                     25 non-null     object
 10  diggCount                     25 non-null     int64 
 11  shareCount                    25 non-null     int64 
 12  playCount                     25 non-null     int64 
 13  commentCount         

In [23]:
df_tiktok_audd_music['_audd_music.apple_music.isrc'] = df_tiktok_audd_music['_audd_music.apple_music.isrc'].fillna(0)


In [24]:
df_tiktok_audd_music['_audd_music.spotify.id'] = df_tiktok_audd_music['_audd_music.spotify.id'].fillna(0)

In [26]:
# Apple Music column
df_tiktok_audd_music['apple_music'] = np.where(
    df_tiktok_audd_music['_audd_music.apple_music.isrc'] == 0, 0, 1)


In [27]:
# Spotify column
df_tiktok_audd_music['spotify'] = np.where(
    df_tiktok_audd_music['_audd_music.spotify.id'] == 0, 0, 1)

In [29]:
df_tiktok_audd_music.head(5)

Unnamed: 0,id,text,createTime,authorMeta,musicMeta,covers,webVideoUrl,videoUrl,videoUrlNoWaterMark,videoMeta,...,_audd_music.title,_audd_music.album,_audd_music.release_date,_audd_music.label,_audd_music.timecode,_audd_music.song_link,_audd_music.apple_music.isrc,_audd_music.spotify.id,spotify,apple_music
6,6895303013867539713,Oh no,1605437840,"{'id': '6879814870579512326', 'secUid': 'MS4wL...","{'musicId': '6893870343761496834', 'musicName'...",{'default': 'https://p16-sign-sg.tiktokcdn.com...,https://www.tiktok.com/@milanvannleeuwen/video...,https://v77.tiktokcdn.com/51d223926618e0839ece...,,"{'height': 1024, 'width': 576, 'duration': 11}",...,Lmss,Lmss,2018-10-15,AK Noise,00:17,https://lis.tn/Lmss,0,0,0,0
150,6889831681469975810,Fishikta ameley❤️#eritreanmusic#habeshatiktok#...,1604163946,"{'id': '6775997579706008582', 'secUid': 'MS4wL...","{'musicId': '6881382384315419394', 'musicName'...",{'default': 'https://p16-sign-sg.tiktokcdn.com...,https://www.tiktok.com/@nardosabrahale/video/6...,https://v77.tiktokcdn.com/a2cabbf8ac7d8910a355...,,"{'height': 1024, 'width': 576, 'duration': 15}",...,Semay,Semay,2006-06-01,Negarit Production,01:36,https://lis.tn/Semay,0,0,0,0
202,6883565215552654593,we surprised my mom with her dream car for her...,1602704922,"{'id': '6781188628154942469', 'secUid': 'MS4wL...","{'musicId': '6883565213174598401', 'musicName'...",{'default': 'https://p16-sign-sg.tiktokcdn.com...,https://www.tiktok.com/@curlsnclouds/video/688...,https://v77.tiktokcdn.com/ce8dacbe8736107f2db1...,,"{'height': 1024, 'width': 576, 'duration': 54}",...,Put Your Records On,Put Your Records On,2020-04-24,QuarterZip,00:20,https://lis.tn/PutYourRecordsOn,0,0,0,0
242,6876145412105899265,De trend na doen toch 🙃🙃 🏐🏐🏐#volleybal #traini...,1600977365,"{'id': '6725806948447060997', 'secUid': 'MS4wL...","{'musicId': '6875389627230915330', 'musicName'...",{'default': 'https://p16-sign-sg.tiktokcdn.com...,https://www.tiktok.com/@detwensechicks/video/6...,https://v19.tiktokcdn.com/0a2ac02f6301fa9975c3...,,"{'height': 1024, 'width': 576, 'duration': 11}",...,Sadi Gali,Tanu Weds Manu,2011-02-02,T-Series,00:51,https://lis.tn/SadiGali,0,1tEto4JrqNmBZFH5uAiYqb,1,0
297,6884691950478298370,🦋🦋🦋🦋🦋vlinders,1602967260,"{'id': '56906553353601024', 'secUid': 'MS4wLjA...","{'musicId': '6864179215273659141', 'musicName'...",{'default': 'https://p16-sign-sg.tiktokcdn.com...,https://www.tiktok.com/@maximeee.r/video/68846...,https://v77.tiktokcdn.com/38aae384548de6ad3c93...,,"{'height': 1024, 'width': 576, 'duration': 7}",...,Mij Niet Eens Gezien,Mij Niet Eens Gezien,2020-07-24,WMG - Spinnin' Records (Distribution),02:33,https://lis.tn/MijNietEensGezien,NLZ542001176,39X7P5VjmG0zk8efBJL2HD,1,1


In [65]:
# EDA comparing like count and if its on popular streaming services
df_tiktok_audd_music['spotify'].value_counts()


1    17
0     8
Name: spotify, dtype: int64

In [66]:
df_tiktok_audd_music['apple_music'].value_counts()


1    16
0     9
Name: apple_music, dtype: int64

In [33]:
df_tiktok_audd_music[["spotify", "diggCount"]
         ].groupby("spotify").mean()


Unnamed: 0_level_0,diggCount
spotify,Unnamed: 1_level_1
0,9546.375
1,35873.705882


In [35]:
df_tiktok_audd_music[["apple_music", "diggCount"]
                     ].groupby("apple_music").mean()

Unnamed: 0_level_0,diggCount
apple_music,Unnamed: 1_level_1
0,6859.777778
1,39030.375


In [37]:
df_tiktok_audd_music.groupby(["apple_music", "spotify"])["diggCount"].mean()

apple_music  spotify
0            0           8524.428571
             1           1033.500000
1            0          16700.000000
             1          40519.066667
Name: diggCount, dtype: float64

In [38]:
# Adding Matt EDA script to merge datasets
data = json.load(open('./data/trending.json', encoding="utf8"))
data = data['collector']
toParse = pd.json_normalize(data)
df = CleanData(toParse)
df.process_the_data()
df.summary_of_data()

Missing Data:
None

Data Time Period:
Start Date: 2020-09-22 14:22:38
  End Date: 2020-12-21 03:18:36
 Timedelta: 89 days 12:55:58


In [39]:
df_clean = df.dfm
df_clean.sort_values(by=['likeCount'], ascending=False).head(3).T

Unnamed: 0,947,349,706
id,6894081763379924229,6885766692627107077,6890571273110392065
text,#TimeWarpScan,Beatbox keeping me sane before my flight 🗣✈️,How to draw hand #artchallenge #drawing
createTime,2020-11-12 03:58:55,2020-10-20 18:12:03,2020-11-02 16:55:47
webVideoUrl,https://www.tiktok.com/@billieeilish/video/689...,https://www.tiktok.com/@spencerx/video/6885766...,https://www.tiktok.com/@condsty/video/68905712...
likeCount,31000000,5200000,5000000
shareCount,220100,31600,110900
playCount,250800000,44600000,68700000
commentCount,625700,32000,12200
downloaded,True,True,True
mentions,[],[],[]


In [40]:
# setting binary variables for verified authors and original sound
df_clean["authorMeta.verified"] = df_clean["authorMeta.verified"].astype(int)
df_clean["musicMeta.musicOriginal"] = df_clean["musicMeta.musicOriginal"].astype(
    int)

In [69]:
df_merged = pd.merge(df_clean, df_tiktok_audd_music, how = "left", on="id")


In [71]:
df_merged.head(5)

Unnamed: 0,id,text_x,createTime_x,webVideoUrl_x,likeCount,shareCount_x,playCount_x,commentCount_x,downloaded_x,mentions_x,...,_audd_music.title,_audd_music.album,_audd_music.release_date,_audd_music.label,_audd_music.timecode,_audd_music.song_link,_audd_music.apple_music.isrc,_audd_music.spotify.id,spotify,apple_music
0,6907228749016714497,Confidence went 📈,2020-12-17 14:15:17,https://www.tiktok.com/@ninakleij/video/690722...,3710,50,44800,68,True,[],...,,,,,,,,,,
1,6875468410612993286,Quiet Zone... follow me on insta: joeysofo. Co...,2020-09-23 00:09:23,https://www.tiktok.com/@joeysofo/video/6875468...,55700,1817,838100,936,True,[@dwight_schnuute],...,,,,,,,,,,
2,6898699405898059010,Iphone bend test🤗 #tiktok #viral #fyp #iphone ...,2020-11-24 14:37:05,https://www.tiktok.com/@jackeyephone/video/689...,936200,21100,15300000,27100,True,[],...,,,,,,,,,,
3,6902819837345533186,,2020-12-05 17:06:27,https://www.tiktok.com/@naomivaneeren/video/69...,12900,197,94900,143,True,[],...,,,,,,,,,,
4,6905635666588192002,小技です👟✨#tiktok教室#tutorial,2020-12-13 07:13:20,https://www.tiktok.com/@io.dreamer_mk/video/69...,8805,198,115300,52,True,[],...,,,,,,,,,,


In [59]:
df_merged_2 = pd.merge(df_clean, df_tiktok_audd_music, how='inner',
                       left_index=True, right_index=True, suffixes=('', '_drop'))
# Doesn't work because it doesn't drop the columns I want it to. 


In [58]:
df_merged.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 25 entries, 0 to 24
Data columns (total 70 columns):
 #   Column                        Non-Null Count  Dtype         
---  ------                        --------------  -----         
 0   id                            25 non-null     object        
 1   text_x                        25 non-null     object        
 2   createTime_x                  25 non-null     datetime64[ns]
 3   webVideoUrl_x                 25 non-null     object        
 4   likeCount                     25 non-null     int64         
 5   shareCount_x                  25 non-null     int64         
 6   playCount_x                   25 non-null     int64         
 7   commentCount_x                25 non-null     int64         
 8   downloaded_x                  25 non-null     bool          
 9   mentions_x                    25 non-null     object        
 10  hashtags_x                    25 non-null     object        
 11  authorMeta.id                 25 n

In [60]:
df_merged_2.drop(
    [col for col in df_merged.columns if 'drop' in col], axis=1, inplace=True)


In [61]:
df_merged_2.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 25 entries, 6 to 994
Data columns (total 71 columns):
 #   Column                        Non-Null Count  Dtype         
---  ------                        --------------  -----         
 0   id                            25 non-null     object        
 1   text                          25 non-null     object        
 2   createTime                    25 non-null     datetime64[ns]
 3   webVideoUrl                   25 non-null     object        
 4   likeCount                     25 non-null     int64         
 5   shareCount                    25 non-null     int64         
 6   playCount                     25 non-null     int64         
 7   commentCount                  25 non-null     int64         
 8   downloaded                    25 non-null     bool          
 9   mentions                      25 non-null     object        
 10  hashtags                      25 non-null     object        
 11  authorMeta.id                 25 

In [73]:
df_merged_2.groupby(["apple_music", "spotify", "musicMeta.musicOriginal"])[
    "likeRate"].mean()


apple_music  spotify  musicMeta.musicOriginal
0            0        1                          0.072623
             1        1                          0.017265
1            0        1                          0.076430
             1        1                          0.063280
Name: likeRate, dtype: float64

In [74]:
df_merged['apple_music'] = df_merged['apple_music'].fillna(0)
df_merged['spotify'] = df_merged['spotify'].fillna(0)


<class 'pandas.core.frame.DataFrame'>
Int64Index: 1000 entries, 0 to 999
Data columns (total 70 columns):
 #   Column                        Non-Null Count  Dtype         
---  ------                        --------------  -----         
 0   id                            1000 non-null   object        
 1   text_x                        1000 non-null   object        
 2   createTime_x                  1000 non-null   datetime64[ns]
 3   webVideoUrl_x                 1000 non-null   object        
 4   likeCount                     1000 non-null   int64         
 5   shareCount_x                  1000 non-null   int64         
 6   playCount_x                   1000 non-null   int64         
 7   commentCount_x                1000 non-null   int64         
 8   downloaded_x                  1000 non-null   bool          
 9   mentions_x                    1000 non-null   object        
 10  hashtags_x                    1000 non-null   object        
 11  authorMeta.id                 1

In [79]:
df_merged['spotify'] = df_merged.spotify.astype(object)
df_merged['apple_music'] = df_merged.apple_music.astype(object)

In [81]:
df_merged.groupby(["apple_music", "spotify", "musicMeta.musicOriginal_x"])[
    "likeRate"].mean()


apple_music  spotify  musicMeta.musicOriginal_x
0.0          0.0      0                            0.087376
                      1                            0.104654
             1.0      1                            0.017265
1.0          0.0      1                            0.076430
             1.0      1                            0.063280
Name: likeRate, dtype: float64

In [77]:
df_merged[["musicMeta.musicOriginal_x", "likeRate"]
          ].groupby("musicMeta.musicOriginal_x").mean()


Unnamed: 0_level_0,likeRate
musicMeta.musicOriginal_x,Unnamed: 1_level_1
0,0.087376
1,0.10355


In [85]:
df_merged['musicMeta.musicAuthor_x'].unique()

array(['Pop Smoke', 'rapidsongs ', 'Sigurd Barrett', 'Noah', 'Mj.无名氏',
       'Sam Smith', '🌹Rosalie🌹', 'Moonpie', 'Imagine Dragons', 'Lauren',
       'The Kid LAROI', 'mememeiland', 'Elvis Presley', 'JVLES', '0',
       'Jhully Duarte', 'Kattyyy🍁', '🌸Ri Za🌸', 'Nisandu kaushalya💖',
       'adam :)', 'Marc', 'Unknown', 'Dua Lipa', 'Selena ',
       'Jason Wolbert', 'les', '1xbrock', '𝘽𝙮.𝙨𝙝𝙖𝙝𝙤𝙧𝙞', 'Kreepa',
       'Ahmad Baig', 'Elliot Van Coup', 'Rizwan Durrani', '⚡Acacia⚡',
       ' 🔥 SK ❤️', 'Ernie Senpai', 'Mason Schlang', 'Zak Langford',
       'Trevor Daniel', 'Jibran Khan', 'Brock Hutchinson ', 'Krotik',
       'ninichan🦋', 'MC Madan', 'PIKI', 'Armando Ackerman',
       'жена эйдана 🥺☂️🌠', '60 Second Sounds🦋', 'MARIYA', 'ᔕIᑎEᗩᗪ',
       'Sonnelebensfroh', 'Riovacci', 'TroyBoi', 'Jinx ', 'jose',
       'Holly Henry', 'tyana', 'favsoundds ', 'lindsn21',
       'Chri$tian Gate$', 'Sajjad Hamayun', 'kal :)', 'sarah ',
       'Dominik Hauser', 'Naomy. 🦋', 'Danny backup', 'kim.sahar',
 

In [86]:
print(sorted(df_merged['musicMeta.musicAuthor_x'].unique()))


[' Skrillex / Nero', ' 🔥 SK ❤️', '#PovSounds', '#fypsounds', '(Null)', '(⊃｡•́‿•̀｡)⊃', '.', '0', '1xbrock', '24KGoldn', '5 Seconds of Summer', '60 Second Sounds🦋', '7eer', ':(', ':)', 'A', 'A Boogie Wit Da Hoodie', 'A.K', 'Aaron Smith ', 'Abdellatif Ouisa ', 'Abdul_Khaliq2k20', 'AbigailJ', 'Adam Tahere', 'Addison B', 'Adrian Jonathan', 'Ahmad Baig', 'Aidan :)', 'Akatsuki', 'Akon', 'Alex Mac', 'Alex hampstead', 'Alex Øros', 'Alexandra', 'Alexandra Karadimas', 'Ali Berke', 'Alice', 'Alin Trần Hair Salon', 'Allison😗', 'Aly & AJ', 'Amyog3neofficial', 'Ander Huang & DJ Kuromi', 'Andra Day', 'Andrea Borromeo', 'Andrea Parascandolo', 'Andrew Burdge', 'Andy Arthur Smith', 'Angelo', 'Anita Luimes', 'Anuja Ranasinghe', 'Anushka Devruwan', 'Apollo Fresh', 'Ariana Grande', 'Armando Ackerman', 'Armin van Buuren', 'Ashley Kb', 'Astep', 'Atenas Novoa', 'Athaa Creations', 'Austin Ong', 'Avery', 'Ayad', 'Aydon Holley', 'Aye yo', 'BEA LOPES', 'BIBI_2020', 'BILLIE EILISH', 'BINTANG MT REAL', 'BLVKSHP', 'B

In [91]:
df_merged["musicMeta.musicAuthor_x"].value_counts().head(20)


Tim Thebodeau          17
Billie Eilish          10
Kreepa                  8
Pop Smoke               6
Evan Holmes             6
Riovacci                6
Tik Toker               6
mo.rab                  6
Aly & AJ                5
Unknown                 5
zopepijn                5
F1 moments              5
Nathan Freihofer        4
Sigurd Barrett          4
Caín Guzmán             4
The Black Eyed Peas     4
Mj.无名氏                  4
kyle thomas ✌️          4
rapidsongs              4
The Kid LAROI           4
Name: musicMeta.musicAuthor_x, dtype: int64

In [92]:
top_author = df_merged["musicMeta.musicAuthor_x"].value_counts().head(20)


In [107]:
music_meta = df_merged[["likeCount", "likeRate", "musicMeta.musicAuthor_x",
                      "musicMeta.musicOriginal_x", "spotify", "apple_music"]]


In [108]:
music_meta["musicMeta.musicAuthor_x"].value_counts().head(20)


Tim Thebodeau          17
Billie Eilish          10
Kreepa                  8
Pop Smoke               6
Evan Holmes             6
Riovacci                6
Tik Toker               6
mo.rab                  6
Aly & AJ                5
Unknown                 5
zopepijn                5
F1 moments              5
Nathan Freihofer        4
Sigurd Barrett          4
Caín Guzmán             4
The Black Eyed Peas     4
Mj.无名氏                  4
kyle thomas ✌️          4
rapidsongs              4
The Kid LAROI           4
Name: musicMeta.musicAuthor_x, dtype: int64

In [109]:
music_meta.groupby(["musicMeta.musicAuthor_x", "musicMeta.musicOriginal_x"])[
    "likeCount"].mean()

musicMeta.musicAuthor_x  musicMeta.musicOriginal_x
 Skrillex / Nero         0                             3230.0
 🔥 SK ❤️                 1                              368.0
#PovSounds               1                             8639.0
#fypsounds               0                            23900.0
(Null)                   1                            46900.0
                                                       ...   
🤍                        1                             1194.0
🥀@_Achi_🥀                1                             3450.0
🦋❤️🥺🧸🌹💕👑😈💯               1                             9588.0
🩹                        1                             6103.0
🪐☁️✨🌻                    1                             9099.0
Name: likeCount, Length: 801, dtype: float64

In [97]:
like_count = df_merged.groupby(["apple_music", "spotify", "musicMeta.musicAuthor_x"])[
    "likeCount"].mean()
