# Youtube Recommender System

### Group Meeting: 12/3/2019

In [5]:
# This tells matplotlib not to try opening a new window for each plot.
%matplotlib inline

# General libraries.
import re
import numpy as np
import matplotlib.pyplot as plt

# SK-learn libraries for learning.
from sklearn.pipeline import Pipeline
from sklearn.neighbors import KNeighborsClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import BernoulliNB
from sklearn.naive_bayes import MultinomialNB

# SK-learn libraries for evaluation.
from sklearn.metrics import confusion_matrix
from sklearn import metrics
from sklearn.metrics import classification_report

# SK-learn libraries for feature extraction from text.
from sklearn.feature_extraction.text import *

import pandas as pd
import nltk

from sklearn.tree import DecisionTreeClassifier 
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression

import json

In [3]:
video_data = pd.read_csv("dataset/USvideos.csv")
video_data.head()

Unnamed: 0,video_id,trending_date,title,channel_title,category_id,publish_time,tags,views,likes,dislikes,comment_count,thumbnail_link,comments_disabled,ratings_disabled,video_error_or_removed,description
0,2kyS6SvSYSE,17.14.11,WE WANT TO TALK ABOUT OUR MARRIAGE,CaseyNeistat,22,2017-11-13T17:13:01.000Z,SHANtell martin,748374,57527,2966,15954,https://i.ytimg.com/vi/2kyS6SvSYSE/default.jpg,False,False,False,SHANTELL'S CHANNEL - https://www.youtube.com/s...
1,1ZAPwfrtAFY,17.14.11,The Trump Presidency: Last Week Tonight with J...,LastWeekTonight,24,2017-11-13T07:30:00.000Z,"last week tonight trump presidency|""last week ...",2418783,97185,6146,12703,https://i.ytimg.com/vi/1ZAPwfrtAFY/default.jpg,False,False,False,"One year after the presidential election, John..."
2,5qpjK5DgCt4,17.14.11,"Racist Superman | Rudy Mancuso, King Bach & Le...",Rudy Mancuso,23,2017-11-12T19:05:24.000Z,"racist superman|""rudy""|""mancuso""|""king""|""bach""...",3191434,146033,5339,8181,https://i.ytimg.com/vi/5qpjK5DgCt4/default.jpg,False,False,False,WATCH MY PREVIOUS VIDEO ▶ \n\nSUBSCRIBE ► http...
3,puqaWrEC7tY,17.14.11,Nickelback Lyrics: Real or Fake?,Good Mythical Morning,24,2017-11-13T11:00:04.000Z,"rhett and link|""gmm""|""good mythical morning""|""...",343168,10172,666,2146,https://i.ytimg.com/vi/puqaWrEC7tY/default.jpg,False,False,False,Today we find out if Link is a Nickelback amat...
4,d380meD0W0M,17.14.11,I Dare You: GOING BALD!?,nigahiga,24,2017-11-12T18:01:41.000Z,"ryan|""higa""|""higatv""|""nigahiga""|""i dare you""|""...",2095731,132235,1989,17518,https://i.ytimg.com/vi/d380meD0W0M/default.jpg,False,False,False,I know it's been a while since we did this sho...


In [15]:
len(video_data)

40949

In [12]:
clean_vid_data = video_data.groupby(["title"]).min().reset_index()

In [13]:
clean_vid_data.head()

Unnamed: 0,title,video_id,trending_date,channel_title,category_id,publish_time,tags,views,likes,dislikes,comment_count,thumbnail_link,comments_disabled,ratings_disabled,video_error_or_removed
0,#184 Making a PCB using EasyEDA. // Review,BPmgDhwbd1w,17.07.12,MickMake,28,2017-12-02T14:05:07.000Z,"MickMake|""electronics""|""embedded""|""maker""|""diy...",3237,161,2,35,https://i.ytimg.com/vi/BPmgDhwbd1w/default.jpg,False,False,False
1,"#23 Feed The Homeless | One List, One Life",4qakFfGRV4E,17.01.12,"One List , One Life",22,2017-11-30T15:36:12.000Z,"homeless|""experiment""|""people""|""man""|""singing""...",32385,568,77,97,https://i.ytimg.com/vi/4qakFfGRV4E/default.jpg,False,False,False
2,#57: AM and SSB explained,l7n58h-Zj3I,18.04.01,Radio Physics and Electronics,28,2017-12-24T19:15:29.000Z,"AM|""SSB""|""amplitude modulation""|""single sideba...",3287,79,3,19,https://i.ytimg.com/vi/l7n58h-Zj3I/default.jpg,False,False,False
3,"#AboveTheNoise feat. Serena Williams, Neymar J...",oWithLP0VlQ,17.29.11,Beats by Dre,10,2017-11-22T17:23:20.000Z,[none],2152261,7824,158,324,https://i.ytimg.com/vi/oWithLP0VlQ/default.jpg,False,False,False
4,#DisneyParksLIVE: Fantasy In The Sky New Years...,FQp6F0PjfpI,18.06.01,Disney Parks,24,2018-01-01T05:11:37.000Z,"Fantasy In The Sky New Years Eve Fireworks|""Di...",130416,1812,30,110,https://i.ytimg.com/vi/FQp6F0PjfpI/default.jpg,False,False,False


In [14]:
len(clean_vid_data)

6455

## Part 1: Category Classification Model

In [25]:
def preprocess_text(text):
    text = text.apply(lambda x: x.lower())
    text = text.str.replace(r'[^\w\s]', '')
    return text

In [26]:
def predict_category(X_train, y_train, X_test, vectorizer, clf):
    X_train = vectorizer.fit_transform(X_train)
    clf.fit(X_train, y_train)
    y_pred = clf.predict(vectorizer.transform(X_test))
    return y_pred

In [27]:
## usage
clean_vid_data.title = preprocess_text(clean_vid_data.title)
X_train, X_test, y_train, y_test = train_test_split(clean_vid_data['title'], 
                                                    clean_vid_data['category_id'], 
                                                    test_size=0.25, 
                                                    shuffle=True
                                                    )
y_pred = predict_category(X_train, y_train, X_test, TfidfVectorizer(), DecisionTreeClassifier())

In [31]:
y_train

1703    22
5795    23
6123    22
308     27
6420     1
        ..
1788    24
3202    24
2190    20
2335    27
830     28
Name: category_id, Length: 4841, dtype: int64

In [30]:
metrics.accuracy_score(y_test, y_pred)

0.42874845105328374

ann

## Part 2: Same Category Video Rankings

annabelle

### 2.1 Ranking Based on Title

### 2.2 Ranking Based on Tags

In [20]:
def create_category_map():
    category_map = {}
    data = {}
    with open('dataset/US_category_id.json', 'r') as outfile:
        data = json.load(outfile)
        for item in data["items"]:
            category_map[item["id"]] = item["snippet"]["title"]
    return category_map
category_map = create_category_map()

In [21]:
category_map

{'1': 'Film & Animation',
 '2': 'Autos & Vehicles',
 '10': 'Music',
 '15': 'Pets & Animals',
 '17': 'Sports',
 '18': 'Short Movies',
 '19': 'Travel & Events',
 '20': 'Gaming',
 '21': 'Videoblogging',
 '22': 'People & Blogs',
 '23': 'Comedy',
 '24': 'Entertainment',
 '25': 'News & Politics',
 '26': 'Howto & Style',
 '27': 'Education',
 '28': 'Science & Technology',
 '29': 'Nonprofits & Activism',
 '30': 'Movies',
 '31': 'Anime/Animation',
 '32': 'Action/Adventure',
 '33': 'Classics',
 '34': 'Comedy',
 '35': 'Documentary',
 '36': 'Drama',
 '37': 'Family',
 '38': 'Foreign',
 '39': 'Horror',
 '40': 'Sci-Fi/Fantasy',
 '41': 'Thriller',
 '42': 'Shorts',
 '43': 'Shows',
 '44': 'Trailers'}

In [16]:
tags = clean_vid_data["tags"]
tags[1]

'homeless|"experiment"|"people"|"man"|"singing"|"prank"|"kids"|"to harvard"|"good"|"nice"'

In [22]:
def find_category_tags(interested_category):
    tags_df = pd.DataFrame()
    for i in range(len(tags)):
        category = clean_vid_data["category_id"][i]
        video_title = clean_vid_data["title"][i]
        if category == interested_category:
            ls = tags[i]
            individual_tags = ls.split("|")
            for it in individual_tags[0:5]:
                if it != '[none]':
                    tags_df = tags_df.append({"Tag Name": it, "Video Title": video_title}, ignore_index=True)
    return tags_df

In [23]:
tags_1 = find_category_tags(1)
tags_1.head()

Unnamed: 0,Tag Name,Video Title
0,call me by your name,'Call Me By Your Name': Which fruit would Elio...
1,"""armie hammer""",'Call Me By Your Name': Which fruit would Elio...
2,"""timothée chalamet""",'Call Me By Your Name': Which fruit would Elio...
3,"""interview""",'Call Me By Your Name': Which fruit would Elio...
4,"""luca guadagnino""",'Call Me By Your Name': Which fruit would Elio...
