In [1]:
import math
import sqlite3
import numpy as np
import json
import urllib
import datetime
import calendar
import re
import sys
from sklearn.cross_validation import train_test_split
from sklearn.preprocessing import OneHotEncoder, StandardScaler,MinMaxScaler
from sklearn.linear_model import  LinearRegression
from sklearn.ensemble import RandomForestRegressor 
from sklearn.metrics import accuracy_score
from sklearn.linear_model import SGDRegressor
import warnings
warnings.filterwarnings('ignore')



### EXTRACT DATABASE FUNCTION

In [1]:
def extract_database(database_name):
    """Load training features and targets from the ``Videos`` table.

    Parameters
    ----------
    database_name : str
        Database path passed straight to ``sqlite3.connect``.

    Returns
    -------
    tuple of (list, list)
        ``X`` holds one ``[viewCount, commentCount, dislikeCount, duration]``
        row per record and ``Y`` the matching ``likeCount`` values.
        NULL / falsy column values are replaced by 0.  When the query fails
        with ``sqlite3.OperationalError`` the error is printed (not raised)
        and both lists come back empty.
    """
    X = []
    Y = []  # likeCount

    db_connection = sqlite3.connect(database_name)
    try:
        db_crsr = db_connection.cursor()
        db_crsr.execute("Select likeCount,viewCount,commentCount,dislikeCount,duration from Videos")
        for likeCount, viewCount, commentCount, dislikeCount, duration in db_crsr.fetchall():
            Y.append(likeCount or 0)
            # numerical features, in the column order the models are trained on
            X.append([
                viewCount or 0,
                commentCount or 0,
                dislikeCount or 0,
                duration or 0,
            ])
    except sqlite3.OperationalError as e:
        # Python 3 syntax: the former ``except (sqlite3.OperationalError, e)``
        # evaluated the undefined name ``e`` and raised NameError instead.
        print('sqlite3.OperationalError:', e)
    finally:
        # Release the connection even when an unexpected exception escapes.
        db_connection.close()

    return X, Y



### REAL_VIDEO_FETCH FUNCTION

In [4]:
def _iso8601_duration_seconds(duration):
    """Convert a duration string such as ``PT1H2M3S`` into whole seconds."""
    seconds_per_unit = (
        ("w", 7 * 24 * 60 * 60),
        ("d", 24 * 60 * 60),
        ("h", 60 * 60),
        ("m", 60),
        ("s", 1),
    )
    total = 0
    for unit, factor in seconds_per_unit:
        found = re.search(r"(\d+)" + unit, duration, re.I)
        if found:
            total += int(found.group(1)) * factor
    return total


def real_video_fetch(video_id, api_key=None):
    """Fetch the numerical features and like count of one YouTube video.

    Parameters
    ----------
    video_id : str
        Id of the video to look up; it is URL-escaped before being sent.
    api_key : str, optional
        YouTube Data API key.  When omitted, the ``YOUTUBE_API_KEY``
        environment variable is used, falling back to the key this notebook
        has always shipped with.

    Returns
    -------
    tuple of (list, int)
        ``(numerical_features_real, likeCount)`` where the first element is a
        one-row list ``[[viewCount, commentCount, dislikeCount, duration]]``
        (duration in seconds).  Counters absent from the response are
        reported as 0.

    Raises
    ------
    IndexError
        If the response contains no item for ``video_id``.
    """
    # Imported here because a bare ``import urllib`` does not guarantee that
    # the ``request`` / ``parse`` submodules are loaded.
    import os
    import urllib.parse
    import urllib.request

    if api_key is None:
        # SECURITY NOTE(review): this literal is a credential committed to the
        # notebook.  It is kept only so existing runs keep working; rotate it
        # and supply the key through YOUTUBE_API_KEY instead.
        api_key = os.environ.get(
            "YOUTUBE_API_KEY", "AIzaSyAkNjqcFjNT86o-m3uloLS-EzhR1aCtlQE"
        )

    url = (
        "https://www.googleapis.com/youtube/v3/videos?id="
        + urllib.parse.quote(str(video_id), safe="")
        + "&key=" + urllib.parse.quote(str(api_key), safe="")
        + "&part=status,statistics,contentDetails,snippet"
    )
    # Context manager closes the HTTP response once the body has been read.
    with urllib.request.urlopen(url) as response:
        data = json.loads(response.read())

    items = data.get('items') or []
    if not items:
        raise IndexError('no video found for id %r' % (video_id,))
    video = items[0]

    print ('title :', video['snippet']['title'])

    duration = _iso8601_duration_seconds(video['contentDetails']['duration'])

    # Treat a missing counter as 0, the same way extract_database treats NULLs.
    statistics = video.get('statistics', {})
    viewCount = int(statistics.get('viewCount', 0))
    commentCount = int(statistics.get('commentCount', 0))
    dislikeCount = int(statistics.get('dislikeCount', 0))
    likeCount = int(statistics.get('likeCount', 0))

    # Same column order as the training rows built by extract_database.
    numerical_features_real = [[
        viewCount,
        commentCount,
        dislikeCount,
        duration,
    ]]

    return (numerical_features_real, likeCount)




### INIT_PREDICTOR FUNCTION

In [12]:
def init_predictor(videoids):
    """Predict the like count of each video with three regressors.

    Training rows are read from ``youtube.db``.  The scaler and the three
    models are fitted once on those rows; every requested video is then
    fetched live, scaled with the already-fitted scaler and scored by each
    model.  Once all videos are processed, ``train_test_check`` prints the
    hold-out scores for the same scaled training data.

    Parameters
    ----------
    videoids : sequence of str
        YouTube video ids to predict.  An empty sequence prints nothing.

    Raises
    ------
    ValueError
        If ``videoids`` is non-empty but the database yielded no training
        rows.
    """
    X,Y=extract_database('youtube.db')
    if not videoids:
        return
    if not X:
        raise ValueError("no training rows could be read from 'youtube.db'")

    # Fit the scaler on the training rows only.  Refitting it on
    # "training rows + the video being predicted" leaked the query sample
    # into the feature ranges and forced a full retrain for every video.
    scaler = MinMaxScaler(feature_range=(-2,2))
    X_scaled = scaler.fit_transform(X)

    # (printed label, estimator); labels are kept exactly as printed before.
    regressors = [
        ('Random Forest: ', RandomForestRegressor(random_state=2)),
        ('Linear Regresion: ', LinearRegression()),
        ('Gradient Descent Regresion: ', SGDRegressor(random_state=2)),
    ]
    # The training data no longer depends on the video, so fit only once.
    for _, model in regressors:
        model.fit(X_scaled, Y)

    for videoid in videoids:
        x_test_real, likeCount = real_video_fetch(videoid)
        # x_test_real is a one-row list, so the result is already (1, n_features).
        x_test_real_scaled = scaler.transform(x_test_real)

        for label, model in regressors:
            print (label)
            print ('video_id : ', videoid)
            print('PREDICTED: ', model.predict(x_test_real_scaled))
            print('ACTUAL: ', likeCount, '\n')

    # Run the hold-out check exactly once, after the last video; comparing
    # ids against the last element fired repeatedly when that id occurred
    # more than once in the list.
    train_test_check(X_scaled, Y)
        
        
    

In [21]:
def train_test_check(x,y):
    """Print train and hold-out scores for the three regressors.

    ``x`` / ``y`` are split 90 % / 10 % with a fixed split seed.  Each
    estimator is fitted on the larger part and its ``score`` is printed for
    both parts.
    """
    parts = train_test_split(x, y, train_size=.9, random_state=2)
    features_fit = np.array(parts[0])
    features_holdout = parts[1]
    targets_fit = np.array(parts[2])
    targets_holdout = np.array(parts[3])

    # (heading for the training line, estimator, extra trailing print args)
    candidates = (
        ('\nTRAIN TEST CHECK\nRandom Forest Regressor\nTraining Score:',
         RandomForestRegressor(random_state=2), ()),
        ('Linear Regression\nTraining Score:',
         LinearRegression(), ('\n',)),
        ('Gradient Descent\nTraining Score:',
         SGDRegressor (max_iter=1500), ('\n',)),
    )

    for heading, estimator, trailer in candidates:
        estimator.fit(features_fit, targets_fit)
        print(heading, estimator.score(features_fit, targets_fit))
        print('\nTesting Score:', estimator.score(features_holdout, targets_holdout), *trailer)

### CALLING THE PREDICTOR

In [22]:
# Video ids to predict; init_predictor performs one live API lookup per id.
videoids = [
    '3nA1hmKCRpE',
    'hhdSyBHuI88',
    'KxCjVIFxZNo',
    'Fb0OTqLotxU',
    'cYOB941gyXI',
]
init_predictor(videoids)

title : Daru Desi (Full Video Song) | Cocktail | Saif Ali Khan, Deepika Padukone & Diana Penty
Random Forest: 
video_id :  3nA1hmKCRpE
PREDICTED:  [ 74008.1]
ACTUAL:  55852 

Linear Regresion: 
video_id :  3nA1hmKCRpE
PREDICTED:  [ 120597.69853973]
ACTUAL:  55852 

Gradient Descent Regresion: 
video_id :  3nA1hmKCRpE
PREDICTED:  [ 74904.91761098]
ACTUAL:  55852 

title : Zaalima - Lyrical | Raees | Shah Rukh Khan & Mahira Khan | Arijit Singh & Harshdeep Kaur | JAM8
Random Forest: 
video_id :  hhdSyBHuI88
PREDICTED:  [ 336401.7]
ACTUAL:  227008 

Linear Regresion: 
video_id :  hhdSyBHuI88
PREDICTED:  [ 312554.1890563]
ACTUAL:  227008 

Gradient Descent Regresion: 
video_id :  hhdSyBHuI88
PREDICTED:  [ 129872.94597626]
ACTUAL:  227008 

title : Lat Lag Gayee - Lyrical Video | Race 2 | Saif Ali Khan, Jacqueline Fernandez | Benny Dayal, Shalmali
Random Forest: 
video_id :  KxCjVIFxZNo
PREDICTED:  [ 318891.4]
ACTUAL:  260132 

Linear Regresion: 
video_id :  KxCjVIFxZNo
PREDICTED:  [ 424244.