In [1]:
import pandas as pd
import numpy as np
import tqdm
import datetime
import seaborn as sns
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.tree import plot_tree
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn import metrics

from scipy import sparse

# Show every column when a DataFrame is displayed. The full option name is
# spelled out because abbreviated names are resolved by pattern matching and
# can break if similarly named options are added to pandas.
pd.set_option("display.max_columns", None)

# Render matplotlib figures inside the notebook output.
%matplotlib inline
# NOTE(review): %pylab populates the interactive namespace from numpy and
# matplotlib (see the message printed by this cell); explicit imports would
# make it clearer where names come from.
%pylab inline

Populating the interactive namespace from numpy and matplotlib


In [2]:
# Load the CSV (its first column becomes the index) and keep only the rows
# whose label column `y` is filled in.
df = (
    pd.read_csv("raw_data_com_labels.csv", index_col=0)
    .loc[lambda frame: frame['y'].notnull()]
)
df.head()

Unnamed: 0,uploader,title,y,upload_date,user,view_count,like_count,dislike_count,thumbnail,width,height,categories,tags,channel_url,description
0,Yanjun Qi,S0-Introduction-Module3: Deep Learning and AI ...,0.0,2020-08-25,UCHMYETgeGbNHVHLidZSV8BQ,22,,,https://i.ytimg.com/vi/LkPmTGw1jqo/hqdefault.j...,1280,672,Science & Technology,Machine Learning,http://www.youtube.com/channel/UCHMYETgeGbNHVH...,Course Web: \nhttps://qiyanjun.github.io/2020f...
1,Ciência dos Dados,Machine Learning no Ensino Médio,0.0,2020-08-25,UCd3ThZLzVDDnKSZMsbK0icg,3,,,https://i.ytimg.com/vi_webp/R_gBq8IfwJc/maxres...,1920,1080,Education,machine learning|data science,http://www.youtube.com/channel/UCd3ThZLzVDDnKS...,"A matemática, sempre ela....\n\nDe uma maneira..."
2,iKennyHD,NBA LIVE 22: EA COULD USE DEEP MACHINE LEARNIN...,0.0,2020-08-25,KennyCallOfDuty,47,,,https://i.ytimg.com/vi/Tix2xon9MSs/maxresdefau...,1920,1080,Gaming,iKennyHD|nba live20|nba live 20|nba 2k20|live2...,http://www.youtube.com/channel/UCGMtoj9V9Go_im...,Wanna Donate? paypal.me/iKennyYT is where you ...
3,Amazon Web Services,Amazon Aurora Machine Learning – SageMaker Int...,0.0,2020-08-25,AmazonWebServices,335,,,https://i.ytimg.com/vi/w-2ip78NxAw/maxresdefau...,1920,1080,Science & Technology,AWS|Amazon Web Services|Cloud|AWS Cloud|Cloud ...,http://www.youtube.com/channel/UCd6MoB9NC6uYN2...,Learn how you can turn relational data into in...
4,"GMRIT, Rajam, AP",Machine Learning and Deep Learning Implementat...,1.0,2020-08-25,UC8g7hz4oXFzXNryt8h1gRPw,1486,,,https://i.ytimg.com/vi/f6XIY_M7FlA/hqdefault.j...,1280,720,People & Blogs,,http://www.youtube.com/channel/UC8g7hz4oXFzXNr...,Resource Person\nMr.S.Aravinth Seshadri\nCerti...


In [3]:
# Start the cleaned frame from a copy of the title column; it keeps df's index.
df_limpo = df[['title']].copy()

## 1. Limpeza da Data

In [4]:
# Parse the raw `upload_date` column into datetimes and store it as `date`.
df_limpo['date'] = pd.to_datetime(df['upload_date'])

## 2. Limpeza de views

In [5]:
# Missing view counts are replaced by 0 before being stored as `views`.
df_limpo['views'] = views = df['view_count'].fillna(0)

## 3. Features

In [6]:
# Target labels, copied from the labelled frame.
y = df['y'].copy()
# Empty feature table sharing the cleaned frame's index; columns are added later.
features = pd.DataFrame(index=df_limpo.index)

In [7]:
# Current local date as a "YYYY-MM-DD" string.
# NOTE(review): the age-based features computed from `today` change with the
# day the notebook is run.
today = datetime.date.today().isoformat()
today

'2020-09-02'

In [8]:
# Age of each video in days, measured from `today`. A same-day upload would
# have an age of 0 and therefore an infinite views-per-day value, which
# scikit-learn estimators reject, so the age is floored at one day.
dias_desde_pub = (
    (pd.to_datetime(today) - df_limpo['date']) / np.timedelta64(1, 'D')
).clip(lower=1)

features['views'] = df_limpo['views']
# Average views per day since publication; the age itself is not kept as a
# feature, so only `views` and `views_por_dia` end up in the table.
features['views_por_dia'] = (features['views'] / dias_desde_pub).round(3)

In [9]:
# Inspect 10 randomly chosen rows of the feature table (unseeded, so the rows
# shown differ between runs).
features.sample(10)

Unnamed: 0,views,views_por_dia
264,389,1.975
59,330,36.667
496,251208,355.819
190,194,2.18
96,334,25.692
123,42120,1238.824
159,6025,87.319
415,154144,274.766
208,96289,822.983
376,1070556,2337.459


In [10]:
# Temporal split: videos uploaded before 2020-03-10 are used for training,
# videos uploaded on or after that date for validation.
mask_train = df_limpo['date'] < '2020-03-10'
mask_val = df_limpo['date'] >= '2020-03-10'

Xtrain = features.loc[mask_train]
Xval = features.loc[mask_val]
ytrain = y.loc[mask_train]
yval = y.loc[mask_val]

# Show the shapes of the four pieces as a sanity check.
tuple(part.shape for part in (Xtrain, Xval, ytrain, yval))

((244, 2), (255, 2), (244,), (255,))

In [11]:
# Titles are split with the same train/validation masks as the numeric features.
title_train = df_limpo.loc[mask_train, 'title']
title_val = df_limpo.loc[mask_val, 'title']

# min_df=2: a term must occur in at least two training titles to become a column.
title_vec = TfidfVectorizer(min_df=2)
# The vectorizer is fitted on the training titles only; the validation titles
# are transformed with that fitted vocabulary.
title_bow_train = title_vec.fit_transform(title_train)
title_bow_val = title_vec.transform(title_val)

In [12]:
# Both matrices must have the same number of columns (one per vocabulary term).
title_bow_train.shape, title_bow_val.shape

((244, 235), (255, 235))

In [13]:
# Shape of the training title matrix (rows, vocabulary terms).
title_bow_train.shape

(244, 235)

In [14]:
# Shape of the numeric training features; the row count should match the title matrix.
Xtrain.shape

(244, 2)

In [15]:
# NOTE(review): repeats an earlier shape check of the same matrix.
title_bow_train.shape

(244, 235)

In [16]:
# Concatenate the numeric features with the columns produced by the
# TfidfVectorizer, side by side, for both the training and validation sets.
Xtrain_wtitle = sparse.hstack((Xtrain, title_bow_train))
Xval_wtitle = sparse.hstack((Xval, title_bow_val))

In [17]:
# Column count should equal the numeric feature columns plus the vocabulary size.
Xtrain_wtitle.shape, Xval_wtitle.shape

((244, 237), (255, 237))

In [18]:
# Random forest with 1000 trees, a fixed seed, balanced class weights and
# 6 parallel jobs.
mdl = RandomForestClassifier(
    n_estimators=1000,
    class_weight='balanced',
    random_state=0,
    n_jobs=6,
)
# Train on the combined numeric + title features.
mdl.fit(Xtrain_wtitle, ytrain)

RandomForestClassifier(class_weight='balanced', n_estimators=1000, n_jobs=6,
                       random_state=0)

In [19]:
# Keep column 1 of the predicted probabilities as the validation score
# (presumably the positive class y == 1 — confirm against mdl.classes_).
p = mdl.predict_proba(Xval_wtitle)[:, 1]

In [20]:
# Average precision of the validation scores against the validation labels.
metrics.average_precision_score(yval, p)

0.48191114078764824

In [21]:
# ROC AUC of the validation scores against the validation labels.
metrics.roc_auc_score(yval, p)

0.5975328947368422

### 5. Active Learning

- cerca de 70 exemplos em que o modelo tenha dificuldade
- cerca de 30 exemplos escolhidos aleatoriamente

In [22]:
# Reload the same CSV and keep only the rows without a label, discarding rows
# in which every column is empty.
df_unlabeled = (
    pd.read_csv("raw_data_com_labels.csv", index_col=0)
    .loc[lambda frame: frame['y'].isnull()]
    .dropna(how='all')
)
df_unlabeled.shape

(976, 15)

In [23]:
# Peek at the first unlabeled row.
df_unlabeled.head(1)

Unnamed: 0,uploader,title,y,upload_date,user,view_count,like_count,dislike_count,thumbnail,width,height,categories,tags,channel_url,description
350,PyB TV,Very Basic Math needed for Machine learning in...,,2019-07-28,UCvnhhDKv5takEN412dmVW8g,784,,,https://i.ytimg.com/vi_webp/MkPQ0xIdKVw/maxres...,1280,652,Education,,http://www.youtube.com/channel/UCvnhhDKv5takEN...,This is an introduction to basic math needed f...


In [24]:
# Cleaned frame for the unlabeled rows, starting from a copy of the title column.
df_limpo_u = df_unlabeled[['title']].copy()

In [25]:
# Parse the raw `upload_date` column of the unlabeled rows into datetimes.
df_limpo_u['date'] = pd.to_datetime(df_unlabeled['upload_date'])

In [26]:
# Missing view counts are replaced by 0. Note that this rebinds the global
# `views` to the unlabeled rows.
df_limpo_u['views'] = views = df_unlabeled['view_count'].fillna(0)

In [27]:
# Empty feature table sharing the unlabeled frame's index.
features_u = pd.DataFrame(index=df_limpo_u.index)
# The (empty) label column of the unlabeled rows gets its own name so that it
# does not overwrite `y`, the labelled target used for the train/validation split.
y_u = df_unlabeled['y'].copy()

In [28]:
# Age of each unlabeled video in days, measured from `today`. A same-day upload
# would have an age of 0 and therefore an infinite views-per-day value, which
# scikit-learn estimators reject, so the age is floored at one day.
dias_desde_pub_u = (
    (pd.to_datetime(today) - df_limpo_u['date']) / np.timedelta64(1, 'D')
).clip(lower=1)

features_u['views'] = df_limpo_u['views']
# Average views per day since publication; the age itself is not kept as a
# feature, so only `views` and `views_por_dia` end up in the table.
features_u['views_por_dia'] = (features_u['views'] / dias_desde_pub_u).round(3)

In [31]:
# Inspect 5 randomly chosen rows of the unlabeled feature table (unseeded).
features_u.sample(5)

Unnamed: 0,views,views_por_dia
1384,1073,2.021
738,104,0.86
902,19676,49.313
1320,27359,62.894
1129,139,1.188


In [33]:
# Encode the unlabeled titles with the already-fitted vectorizer (transform
# only, no refit), so the columns match the training matrix.
title_u = df_limpo_u['title']
title_bow_u = title_vec.transform(title_u)

In [34]:
# Display the sparse title matrix summary for the unlabeled rows.
title_bow_u

<976x235 sparse matrix of type '<class 'numpy.float64'>'
	with 3746 stored elements in Compressed Sparse Row format>

In [36]:
# Numeric features followed by the title columns, in the same order as the
# training matrix.
Xu_wtitle = sparse.hstack([features_u, title_bow_u])

In [37]:
# Display the combined sparse matrix summary for the unlabeled rows.
Xu_wtitle

<976x237 sparse matrix of type '<class 'numpy.float64'>'
	with 5686 stored elements in COOrdinate format>

In [38]:
# Score the unlabeled rows with the trained model, keeping column 1 of the
# predicted probabilities (presumably the positive class — confirm via mdl.classes_).
pu = mdl.predict_proba(Xu_wtitle)[: ,1]

In [39]:
# Attach the model score to each unlabeled row as column `p`.
df_unlabeled['p'] = pu

In [40]:
# Confirm that the new `p` column is present.
df_unlabeled.head(1)

Unnamed: 0,uploader,title,y,upload_date,user,view_count,like_count,dislike_count,thumbnail,width,height,categories,tags,channel_url,description,p
350,PyB TV,Very Basic Math needed for Machine learning in...,,2019-07-28,UCvnhhDKv5takEN412dmVW8g,784,,,https://i.ytimg.com/vi_webp/MkPQ0xIdKVw/maxres...,1280,652,Education,,http://www.youtube.com/channel/UCvnhhDKv5takEN...,This is an introduction to basic math needed f...,0.262007


In [69]:
# Select the unlabeled rows whose score lies in the closed interval [0.36, 1.0],
# then count how many rows were selected.
mask_u = df_unlabeled['p'].between(0.36, 1.)
mask_u.sum()

68

In [70]:
# Preview the rows selected by the score mask.
df_unlabeled[mask_u].head()

Unnamed: 0,uploader,title,y,upload_date,user,view_count,like_count,dislike_count,thumbnail,width,height,categories,tags,channel_url,description,p
500,Simplilearn,Machine Learning Basics | What Is Machine Lear...,,2018-09-19,Simplilearn,1388322,,,https://i.ytimg.com/vi_webp/ukzFI9rgwfU/maxres...,1920,1080,Education,machine learning basics|machine learning basic...,http://www.youtube.com/channel/UCsvqVGtbbyHaMo...,This Machine Learning basics video will help y...,0.514285
501,StatQuest with Josh Starmer,Machine Learning Fundamentals: Bias and Variance,,2018-09-17,joshstarmer,365084,,,https://i.ytimg.com/vi_webp/EuBBz3bI-aA/maxres...,1280,720,Education,Joshua Starmer|StatQuest|Machine Learning|Bias...,http://www.youtube.com/channel/UCtYLUTtgS3k1Fg...,Bias and Variance are two fundamental concepts...,0.417
516,Diego Nogare,Como trabalhar em um projeto de Data Science,,2020-08-26,DiegoNogare,84,,,https://i.ytimg.com/vi/_-6TNE3vles/maxresdefau...,1920,1080,Science & Technology,Nogare|Data Platform|Data Science|Big Data|FCI...,http://www.youtube.com/channel/UCtjb-k1uREGXpG...,"Prezados alunos, nesta aula vamos discutir os ...",0.427905
554,Ken Jee,Reviewing Your Data Science Projects - Episode...,,2020-08-24,UCiT9RITQ9PW6BhXK0y2jaeg,3973,,,https://i.ytimg.com/vi_webp/q6Lf2yhvluw/maxres...,3840,2160,People & Blogs,Data Science|Ken Jee|Machine Learning|data sci...,http://www.youtube.com/channel/UCiT9RITQ9PW6Bh...,"In this video, I review Alexander's Data Scien...",0.387913
581,Great Learning,How to Master Python for Data Science | Data S...,,2020-08-23,beaconelearning,5493,,,https://i.ytimg.com/vi/uvOX22w1klw/maxresdefau...,1280,720,Education,Great Learning|Python for data science|Data Sc...,http://www.youtube.com/channel/UCObs0kLIrDjX2L...,Great Learning brings you this live session on...,0.473252


In [72]:
# "Hard" examples: every unlabeled row selected by the score mask.
dificeis = df_unlabeled[mask_u]
# Add 32 rows drawn at random from the remaining unlabeled rows. The seed is
# fixed so that re-running the notebook produces the same labelling batch.
aleatorias = df_unlabeled[~mask_u].sample(32, random_state=0)

In [75]:
# Write the hard and random examples (hard ones first) to a CSV; the file is
# overwritten on every run.
pd.concat([dificeis, aleatorias]).to_csv("active_label1.csv")
dificeis.head()

Unnamed: 0,uploader,title,y,upload_date,user,view_count,like_count,dislike_count,thumbnail,width,height,categories,tags,channel_url,description,p
500,Simplilearn,Machine Learning Basics | What Is Machine Lear...,,2018-09-19,Simplilearn,1388322,,,https://i.ytimg.com/vi_webp/ukzFI9rgwfU/maxres...,1920,1080,Education,machine learning basics|machine learning basic...,http://www.youtube.com/channel/UCsvqVGtbbyHaMo...,This Machine Learning basics video will help y...,0.514285
501,StatQuest with Josh Starmer,Machine Learning Fundamentals: Bias and Variance,,2018-09-17,joshstarmer,365084,,,https://i.ytimg.com/vi_webp/EuBBz3bI-aA/maxres...,1280,720,Education,Joshua Starmer|StatQuest|Machine Learning|Bias...,http://www.youtube.com/channel/UCtYLUTtgS3k1Fg...,Bias and Variance are two fundamental concepts...,0.417
516,Diego Nogare,Como trabalhar em um projeto de Data Science,,2020-08-26,DiegoNogare,84,,,https://i.ytimg.com/vi/_-6TNE3vles/maxresdefau...,1920,1080,Science & Technology,Nogare|Data Platform|Data Science|Big Data|FCI...,http://www.youtube.com/channel/UCtjb-k1uREGXpG...,"Prezados alunos, nesta aula vamos discutir os ...",0.427905
554,Ken Jee,Reviewing Your Data Science Projects - Episode...,,2020-08-24,UCiT9RITQ9PW6BhXK0y2jaeg,3973,,,https://i.ytimg.com/vi_webp/q6Lf2yhvluw/maxres...,3840,2160,People & Blogs,Data Science|Ken Jee|Machine Learning|data sci...,http://www.youtube.com/channel/UCiT9RITQ9PW6Bh...,"In this video, I review Alexander's Data Scien...",0.387913
581,Great Learning,How to Master Python for Data Science | Data S...,,2020-08-23,beaconelearning,5493,,,https://i.ytimg.com/vi/uvOX22w1klw/maxresdefau...,1280,720,Education,Great Learning|Python for data science|Data Sc...,http://www.youtube.com/channel/UCObs0kLIrDjX2L...,Great Learning brings you this live session on...,0.473252
