In [1]:
import csv
import pandas as pd
import numpy as np
import sys
from scipy import stats
import matplotlib.pyplot as plt

In [2]:
# Load the first 30,000 rows of the raw video statistics file.
data = pd.read_csv(
    'data_new.csv',
    encoding='utf-8',
    nrows=30000,
)
data.viewCount  # bare attribute access: nothing is displayed, but it fails if the column is absent
data.shape  # last expression of the cell -> displayed as (rows, columns)

(30000, 12)

# Cleaning

In [3]:
# Drop every row whose engagement counters cannot be parsed as integers
# (missing values, stray text, ...).
#
# * Only conversion errors are caught. A bare ``except`` would also hide a
#   missing column and silently mark every row for removal.
# * ``iterrows()`` yields copies of the rows, so values are only validated
#   here; the actual dtype conversion happens in the next cell.
# * Rows are dropped by index *label*. Looking the labels up positionally via
#   ``data.index[...]`` is only correct on a fresh 0..n-1 index and breaks when
#   the cell is run a second time.
count_columns = ['viewCount', 'likeCount', 'dislikeCount', 'favoriteCount', 'commentCount']
index_list = []

for index, row in data.iterrows():
    try:
        for column in count_columns:
            int(row[column])
    except (ValueError, TypeError, OverflowError):
        # NaN / non-numeric text -> ValueError, None -> TypeError, inf -> OverflowError
        index_list.append(index)

data = data.drop(index_list, axis=0)

In [4]:
# Text columns are stored as plain strings (missing values become the literal 'nan').
for text_column in ('videoId', 'category', 'description', 'channelTitle'):
    data[text_column] = data[text_column].astype('str')

# Publication timestamps are parsed into pandas datetimes.
data['publishedAt'] = data['publishedAt'].astype('datetime64[ns]')

# Engagement counters are stored as floats.
for count_column in ('viewCount', 'likeCount', 'dislikeCount', 'favoriteCount', 'commentCount'):
    data[count_column] = data[count_column].astype('float')

In [5]:
# DataFrame.drop returns a new frame, so its result has to be assigned back;
# calling it without assignment leaves ``data`` unchanged.
# 'channelId' is intentionally kept here: the preprocessing cell further down
# drops it by name and would raise a KeyError if it were already gone.
# errors='ignore' keeps the cell safe to re-run.
data = data.drop('favoriteCount', axis=1, errors='ignore')
print(data.head())

       videoId        category         publishedAt  \
0  0H_klG0ZWd4           Music 2007-12-31 23:58:35   
1  0Z7X0fy3Ews  Pets & Animals 2017-02-04 23:00:00   
2  0vYIicuAZ6o          Gaming 2016-12-24 20:38:51   
3  1BT1tPbZwg0    Short Movies 2015-12-21 12:38:36   
4  21P_D2pwLXs        Classics 2017-03-30 10:17:38   

                                               title  \
0                     Damien Rice - Delicate (Cover)   
1  Funny moments animals and pets cute animals vi...   
2                                      Point Blank#1   
3  Anti ragging - a short film on college ragging...   
4  Thonet reinterprets classic bentwood 209 chair...   

                                         description  \
0                                                nan   
1  Subscribe: https://goo.gl/MlxSfy TNTL-Tv inclu...   
2                                     Point Blank :)   
3  Give your feedback on https://www.facebook.com...   
4  Philipp Thonet explains how furniture manufact...   

 

# Merging

In [6]:
# Reference table of top-viewed videos (its 'videoId' column is used below).
top_view_data = pd.read_csv(
    'top_view.csv',
    encoding='utf-8',
)

In [None]:
# IDs of the top-viewed videos, normalised to strings.
top_view_ids = top_view_data.loc[:, 'videoId'].astype('str')
top_view_ids.shape  # last expression of the cell -> displays how many IDs there are

(310,)

In [None]:
# Collect the *positional* row numbers of ``data`` whose videoId is one of the
# top-view IDs.
#
# Membership is tested against a set of the actual ID values. Searching
# ``str(top_view_ids)`` instead would look inside the printed representation of
# the Series: long Series are abbreviated when rendered, and a substring search
# can also match index numbers or the dtype footer.
top_view_id_set = set(top_view_ids)
video_list = [
    position
    for position, video in enumerate(data.videoId)
    if str(video) in top_view_id_set
]
count = len(video_list)  # number of matching rows

print(video_list)

In [None]:
#print(data.columns)
top=[]
for video in data.iterrows():
    print((video[1].videoId))
    if video[1].videoId in top_view_ids:
        video[1].top_view = True
    else:
         video[1].top_view= False
print(data.head())

In [None]:
# Persist the cleaned frame (the row index is written as the first column).
data.to_csv(path_or_buf='complete_data.csv')

# Preprocessing

In [None]:
# One-hot encode the text/date columns: one indicator column per distinct value,
# named '<prefix>_<value>'.
# NOTE(review): title, description and publishedAt look close to unique per row,
# so this may produce a very wide frame - confirm that is intended.
title_dum = pd.get_dummies(data['title'], prefix='title_dum')
description_dum = pd.get_dummies(data['description'], prefix='description_dum')
channelTitle_dum = pd.get_dummies(data['channelTitle'], prefix='channelTitle_dum')
category_dum = pd.get_dummies(data['category'], prefix='category_dum')
publishedAt_dum = pd.get_dummies(data['publishedAt'], prefix='publishedAt_dum')

# Append all indicator blocks to the right of the existing columns, in this order.
data = pd.concat(
    [data, title_dum, description_dum, channelTitle_dum, category_dum, publishedAt_dum],
    axis=1,
)

In [None]:
# Remove the raw columns that were one-hot encoded above, the identifier
# columns, and viewCount.
columns_to_remove = [
    'title',
    'description',
    'channelTitle',
    'channelId',
    'publishedAt',
    'category',
    'videoId',
    'viewCount',
]
data = data.drop(columns_to_remove, axis=1)

print(data.head())

In [None]:
# Attach the boolean target column: False everywhere, True at the positional
# row numbers collected in ``video_list``.
#
# The mask is built with NumPy and assigned in a single step. Writing through
# ``data.top_view[label] = True`` is chained assignment: pandas warns about it,
# and with copy-on-write enabled the write never reaches ``data``.
is_top_view = np.zeros(len(data), dtype=bool)
is_top_view[np.asarray(video_list, dtype=int)] = True  # dtype=int keeps an empty list valid as an index
data['top_view'] = is_top_view

data.head()

# Modeling

In [None]:
# Every column except the last is a feature; the last column is the target.
feature_frame = data.iloc[:, :-1]
target_column = data.iloc[:, -1]
X = feature_frame.values
y = target_column.values

In [None]:
from sklearn.model_selection import train_test_split

# 75% of the rows go to training, the remainder to testing; the fixed seed
# keeps the split reproducible.
split_parts = train_test_split(
    X,
    y,
    train_size=0.75,
    random_state=1,
)
X_data_train, X_data_test, Y_data_train, Y_data_test = split_parts

# Random Forest

In [None]:
from sklearn.ensemble import RandomForestClassifier

# Random forest whose trees are limited to depth 2, seeded for reproducibility.
clf = RandomForestClassifier(
    random_state=0,
    max_depth=2,
)
clf.fit(X_data_train, Y_data_train)

In [None]:
# Keep the 20 largest feature importances, ordered from largest to smallest.
# Sorting the negated values ascending and negating again gives descending order.
descending_importances = np.negative(np.sort(np.negative(clf.feature_importances_)))
importances = descending_importances[:20]
print(importances)

In [None]:
# Column positions of the 20 most important features, most important first.
# They are taken from the fitted model's full importance vector: ``importances``
# has already been sorted and cut to 20 values at this point, so arg-sorting it
# would only give positions inside that short array, not real feature columns.
indices = np.argsort(clf.feature_importances_)[::-1][:20]

In [None]:
# Translate each position in ``indices`` into the column label at that position.
xaxis = data.columns
new_axis = [xaxis[position] for position in indices]

# Plotting feature importances

In [None]:
# Bar chart of the selected feature importances, one bar per entry of ``importances``.
# Tick positions and labels are sized from the data rather than a hard-coded 20,
# so the tick/label counts still match when fewer than 20 features are available.
bar_positions = range(len(importances))
plt.figure(figsize=(25,25))
plt.title("Feature importances",fontsize=10)
plt.bar(bar_positions, importances, color="r", align="center", width=0.3)
plt.xticks(bar_positions, new_axis[:len(importances)], rotation='vertical', fontsize=20)
plt.ylabel("feature-importance",fontsize=25)
plt.show()

# Accuracy

In [None]:
from sklearn.metrics import accuracy_score

# Accuracy on the same rows the model was fitted on.
predictions = clf.predict(X_data_train)
accuracyTrain = accuracy_score(y_true=Y_data_train, y_pred=predictions)
print("Accuracy from the training data set prediction is", accuracyTrain*100, "%")

In [None]:
# Accuracy on the held-out rows; ``predictions`` now refers to the test-set predictions.
predictions = clf.predict(X_data_test)
accuracyTest = accuracy_score(y_true=Y_data_test, y_pred=predictions)
print("Accuracy from the testing data set prediction is", accuracyTest*100, "%")

# Precision and recall scores

In [None]:
from sklearn.metrics import precision_recall_fscore_support

# Test-set scores under two averaging schemes.
# NOTE: the function returns (precision, recall, f-beta score, support); the
# fourth value is the support (None when ``average`` is given), even though it
# is bound to an ``fscore_*`` name here.
micro_scores = precision_recall_fscore_support(Y_data_test, predictions, average='micro')
weighted_scores = precision_recall_fscore_support(Y_data_test, predictions, average='weighted')

precision_micro, recall_micro, f_beta_score_micro, fscore_micro = micro_scores
precision_weighted, recall_weighted, f_beta_score_weighted, fscore_weighted = weighted_scores

print("Recall micro: {}".format(recall_micro))
print("Recall weighted: {}".format(recall_weighted))