In [113]:
import pandas as pd
import numpy as np
from xgboost import XGBClassifier
from sklearn.linear_model import LogisticRegression
import matplotlib.pyplot as plt

In [114]:
df = pd.read_csv('rawdata/USvideos.csv')

In [115]:
df.isnull().sum()

video_id                    0
trending_date               0
title                       0
channel_title               0
category_id                 0
publish_time                0
tags                        0
views                       0
likes                       0
dislikes                    0
comment_count               0
thumbnail_link              0
comments_disabled           0
ratings_disabled            0
video_error_or_removed      0
description               570
dtype: int64

In [116]:
df = df.drop(columns=['description', 'video_id', 'thumbnail_link', 'video_error_or_removed'])
df

Unnamed: 0,trending_date,title,channel_title,category_id,publish_time,tags,views,likes,dislikes,comment_count,comments_disabled,ratings_disabled
0,17.14.11,WE WANT TO TALK ABOUT OUR MARRIAGE,CaseyNeistat,22,2017-11-13T17:13:01.000Z,SHANtell martin,748374,57527,2966,15954,False,False
1,17.14.11,The Trump Presidency: Last Week Tonight with J...,LastWeekTonight,24,2017-11-13T07:30:00.000Z,"last week tonight trump presidency|""last week ...",2418783,97185,6146,12703,False,False
2,17.14.11,"Racist Superman | Rudy Mancuso, King Bach & Le...",Rudy Mancuso,23,2017-11-12T19:05:24.000Z,"racist superman|""rudy""|""mancuso""|""king""|""bach""...",3191434,146033,5339,8181,False,False
3,17.14.11,Nickelback Lyrics: Real or Fake?,Good Mythical Morning,24,2017-11-13T11:00:04.000Z,"rhett and link|""gmm""|""good mythical morning""|""...",343168,10172,666,2146,False,False
4,17.14.11,I Dare You: GOING BALD!?,nigahiga,24,2017-11-12T18:01:41.000Z,"ryan|""higa""|""higatv""|""nigahiga""|""i dare you""|""...",2095731,132235,1989,17518,False,False
...,...,...,...,...,...,...,...,...,...,...,...,...
40944,18.14.06,The Cat Who Caught the Laser,AaronsAnimals,15,2018-05-18T13:00:04.000Z,"aarons animals|""aarons""|""animals""|""cat""|""cats""...",1685609,38160,1385,2657,False,False
40945,18.14.06,True Facts : Ant Mutualism,zefrank1,22,2018-05-18T01:00:06.000Z,[none],1064798,60008,382,3936,False,False
40946,18.14.06,I GAVE SAFIYA NYGAARD A PERFECT HAIR MAKEOVER ...,Brad Mondo,24,2018-05-18T17:34:22.000Z,I gave safiya nygaard a perfect hair makeover ...,1066451,48068,1032,3992,False,False
40947,18.14.06,How Black Panther Should Have Ended,How It Should Have Ended,1,2018-05-17T17:00:04.000Z,"Black Panther|""HISHE""|""Marvel""|""Infinity War""|...",5660813,192957,2846,13088,False,False


# Feature Engineering

In [None]:
import seaborn as sns

# feature list
# trending_date	,trending_date, title, channel_title, category_id, publish_time,tags , views, likes , dislikes , comment_count, comments_disabled,ratings_disabled

# engineering publish_time

df['at_what_hour'] = pd.to_datetime(df['publish_time']).dt.hour
df['dayofweek'] = pd.to_datetime(df['publish_time']).dt.dayofweek

# engineering trending_date

df['trending_date'] = pd.to_datetime(df['trending_date'], format='%y.%d.%m').dt.tz_localize('UTC')
df['how_long_till_trending'] = (df['trending_date'] - pd.to_datetime(df['publish_time'])).dt.total_seconds()


# engineering comments_disabled, ratings_disabled

df['comments_disabled'] = df['comments_disabled'].astype(int)
df['ratings_disabled'] = df['ratings_disabled'].astype(int)


df.drop(columns=['publish_time', 'trending_date', 'category_id'], inplace=True)

# Normalize numeric features
numeric_features = ['views', 'likes', 'dislikes', 'comment_count', 'at_what_hour', 'dayofweek', 'how_long_till_trending']
from sklearn.preprocessing import MinMaxScaler
scaler = MinMaxScaler()
df[numeric_features] = scaler.fit_transform(df[numeric_features])


Unnamed: 0,title,channel_title,category_id,tags,views,likes,dislikes,comment_count,comments_disabled,ratings_disabled,at_what_hour,dayofweek,how_long_till_trending
0,WE WANT TO TALK ABOUT OUR MARRIAGE,CaseyNeistat,22,SHANtell martin,0.003321,0.010247,0.001771,0.011717,0,0,0.739130,0.000000,0.000196
1,The Trump Presidency: Last Week Tonight with J...,LastWeekTonight,24,"last week tonight trump presidency|""last week ...",0.010738,0.017312,0.003671,0.009330,0,0,0.304348,0.000000,0.000292
2,"Racist Superman | Rudy Mancuso, King Bach & Le...",Rudy Mancuso,23,"racist superman|""rudy""|""mancuso""|""king""|""bach""...",0.014168,0.026013,0.003189,0.006008,0,0,0.826087,1.000000,0.000414
3,Nickelback Lyrics: Real or Fake?,Good Mythical Morning,24,"rhett and link|""gmm""|""good mythical morning""|""...",0.001521,0.001812,0.000398,0.001576,0,0,0.478261,0.000000,0.000257
4,I Dare You: GOING BALD!?,nigahiga,24,"ryan|""higa""|""higatv""|""nigahiga""|""i dare you""|""...",0.009303,0.023555,0.001188,0.012866,0,0,0.782609,1.000000,0.000425
...,...,...,...,...,...,...,...,...,...,...,...,...,...
40944,The Cat Who Caught the Laser,AaronsAnimals,15,"aarons animals|""aarons""|""animals""|""cat""|""cats""...",0.007482,0.006798,0.000827,0.001951,0,0,0.565217,0.666667,0.006405
40945,True Facts : Ant Mutualism,zefrank1,22,[none],0.004726,0.010689,0.000228,0.002891,0,0,0.043478,0.666667,0.006524
40946,I GAVE SAFIYA NYGAARD A PERFECT HAIR MAKEOVER ...,Brad Mondo,24,I gave safiya nygaard a perfect hair makeover ...,0.004733,0.008562,0.000616,0.002932,0,0,0.739130,0.666667,0.006360
40947,How Black Panther Should Have Ended,How It Should Have Ended,1,"Black Panther|""HISHE""|""Marvel""|""Infinity War""|...",0.025133,0.034372,0.001700,0.009612,0,0,0.739130,0.500000,0.006603


In [None]:
# make vector embedding using sentence transformers
from sentence_transformers import SentenceTransformer


model = SentenceTransformer('all-MiniLM-L6-v2')
df['title_embedding'] = model.encode(df['title'].astype(str).tolist(), show_progress_bar=True)
df['channel_title_embedding'] = model.encode(df['channel_title'].astype(str).tolist(), show_progress_bar=True)
df['tags_embedding'] = model.encode(df['tags'].astype(str).tolist(), show_progress_bar=True)


In [118]:
df.drop(columns=['publish_time', 'trending_date'], inplace=True)


KeyError: "['publish_time', 'trending_date'] not found in axis"