In [1]:
import pandas as pd
import numpy as np
import tqdm
import datetime
import seaborn as sns
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.tree import plot_tree
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn import metrics

from scipy import sparse

# Show every column when a DataFrame is displayed.
# The fully qualified key is used on purpose: abbreviated keys such as "max.columns"
# are treated as regular-expression patterns and raise OptionError as soon as they
# match more than one registered option.
pd.set_option("display.max_columns", None)

# Render matplotlib figures inline in the notebook output.
%matplotlib inline
# NOTE(review): %pylab injects numpy and matplotlib names into the interactive
# namespace (see the "Populating the interactive namespace" message it prints),
# which can shadow other names; nothing in the visible cells appears to rely on
# it, so it is presumably removable -- confirm before dropping.
%pylab inline

Populating the interactive namespace from numpy and matplotlib


### Resultados do Active Learning

In [2]:
# First labelled batch: read the CSV (first column becomes the index) and keep
# only the rows whose label column 'y' is filled in.
df1 = pd.read_csv("raw_data_com_labels.csv", index_col=0).loc[
    lambda frame: frame['y'].notnull()
]
df1.shape

(499, 15)

In [3]:
# Batch labelled through active learning: read the CSV (first column becomes
# the index) and keep only the rows whose label column 'y' is filled in.
df2 = pd.read_csv("active_label1.csv", index_col=0)
# .copy() detaches the filtered rows from the frame that was read, so the column
# assignment below writes to an independent frame instead of a slice
# (avoids SettingWithCopyWarning / ambiguous view-vs-copy writes).
df2 = df2[df2['y'].notnull()].copy()
# Flag marking these rows as coming from the active-learning batch.
df2['novo'] = 1
df2.shape

(100, 17)

In [4]:
# Average precision and ROC AUC of the stored scores 'p' against the labels 'y'
# of the active-learning batch.
(
    metrics.average_precision_score(y_true=df2['y'], y_score=df2['p']),
    metrics.roc_auc_score(y_true=df2['y'], y_score=df2['p']),
)

(0.537660540958396, 0.6624331550802138)

In [5]:
# Stack both labelled batches; the score column 'p' only exists in df2 and is
# dropped first. Rows coming from df1 end up with NaN in 'novo'.
# NOTE(review): the original index labels of both frames are kept, so labels may
# repeat across the two batches -- confirm this is intended.
df = pd.concat([df1, df2.drop(columns='p')])
df.head(n=1)

Unnamed: 0,uploader,title,y,upload_date,user,view_count,like_count,dislike_count,thumbnail,width,height,categories,tags,channel_url,description,novo
0,Yanjun Qi,S0-Introduction-Module3: Deep Learning and AI ...,0.0,2020-08-25,UCHMYETgeGbNHVHLidZSV8BQ,22,,,https://i.ytimg.com/vi/LkPmTGw1jqo/hqdefault.j...,1280,672,Science & Technology,Machine Learning,http://www.youtube.com/channel/UCHMYETgeGbNHVH...,Course Web: \nhttps://qiyanjun.github.io/2020f...,


In [6]:
# Run date as a 'YYYY-MM-DD' string.
# NOTE(review): this changes with the day the notebook is executed, so any
# feature derived from it is not reproducible across runs.
today = datetime.date.today().isoformat()
today

'2020-09-09'

In [7]:
# Cleaned view of the labelled data, sharing df's index.
df_limpo = pd.DataFrame(index=df.index)
df_limpo['title'] = df['title']
# Rows that did not come from the active-learning batch have no 'novo' value -> 0.
df_limpo['novo'] = df['novo'].fillna(0)
df_limpo['date'] = pd.to_datetime(df['upload_date'])
# Missing view counts are treated as zero views.
views = df['view_count'].fillna(0)
df_limpo['views'] = views

# Target vector (copied so later edits to y do not touch df).
y = df['y'].copy()

# Age of each video in days, measured from `today`.
# Clipped to at least one day: a video uploaded on the run date would otherwise
# have age 0 and produce inf/NaN in views_por_dia, which the model cannot consume.
tempo_desde_pub = (
    (pd.to_datetime(today) - df_limpo['date']) / np.timedelta64(1, 'D')
).clip(lower=1)

# Numeric feature matrix: raw views and views per day since publication.
# The age itself is only an intermediate value and is not kept as a feature.
features = pd.DataFrame(index=df_limpo.index)
features['views'] = df_limpo['views']
features['views_por_dia'] = (features['views'] / tempo_desde_pub).round(3)

In [8]:
# Preview the first five rows of the numeric features.
features.head(n=5)

Unnamed: 0,views,views_por_dia
0,22,1.467
1,3,0.2
2,47,3.133
3,335,22.333
4,1486,99.067


### Aumenta validação

In [9]:
# Time-based split at 2020-03-10.
#   validation: every row dated on/after the cut-off (active-learning rows included)
#   training:   rows dated before the cut-off that are not from the active-learning batch
mask_val = df_limpo['date'] >= '2020-03-10'
mask_train = (df_limpo['novo'] == 0) & (df_limpo['date'] < '2020-03-10')

Xtrain, ytrain = features.loc[mask_train], y.loc[mask_train]
Xval, yval = features.loc[mask_val], y.loc[mask_val]
(Xtrain.shape, Xval.shape, ytrain.shape, yval.shape)

((244, 2), (289, 2), (244,), (289,))

In [10]:
# min_df=2: a term must occur in at least two training titles to become a column.
title_vec = TfidfVectorizer(min_df=2)

title_train = df_limpo.loc[mask_train, 'title']
title_val = df_limpo.loc[mask_val, 'title']

# The vocabulary is learned from the training titles only and then reused,
# unchanged, to encode the validation titles.
title_bow_train = title_vec.fit_transform(title_train)
title_bow_val = title_vec.transform(title_val)

In [11]:
# Shapes of the train / validation title matrices.
tuple(bow.shape for bow in (title_bow_train, title_bow_val))

((244, 235), (289, 235))

In [12]:
# Append the TF-IDF title columns to the right of the numeric features.
Xtrain_wtitle = sparse.hstack((Xtrain, title_bow_train))
Xval_wtitle = sparse.hstack((Xval, title_bow_val))

tuple(matrix.shape for matrix in (Xtrain_wtitle, Xval_wtitle))

((244, 237), (289, 237))

In [13]:
# Random forest with balanced class weights, fitted on the combined
# numeric + title features. fit() returns the estimator itself, so the
# trailing expression displays the same fitted model.
mdl = RandomForestClassifier(
    n_estimators=1000,
    class_weight='balanced',
    random_state=0,
    n_jobs=6,
).fit(Xtrain_wtitle, ytrain)
mdl

RandomForestClassifier(class_weight='balanced', n_estimators=1000, n_jobs=6,
                       random_state=0)

In [14]:
# Second column of predict_proba is used as the validation score.
# NOTE(review): presumably this is the positive class (label 1) -- confirm via mdl.classes_.
val_proba = mdl.predict_proba(Xval_wtitle)
p = val_proba[:, 1]

In [15]:
# Average precision of the validation scores.
metrics.average_precision_score(y_true=yval, y_score=p)

0.4804983314767605

In [16]:
# ROC AUC of the validation scores.
metrics.roc_auc_score(y_true=yval, y_score=p)

0.5979254390469344

### Aumentando o treino

In [17]:
# Second experiment: the active-learning rows dated before the cut-off join the
# training set, while validation keeps only rows that are not from the
# active-learning batch.
mask_train = df_limpo['date'] < '2020-03-10'
mask_val = (df_limpo['date'] >= '2020-03-10') & (df_limpo['novo'] == 0)

Xtrain, Xval = features[mask_train], features[mask_val]
ytrain, yval = y[mask_train], y[mask_val]

title_train = df_limpo.loc[mask_train, 'title']
title_val = df_limpo.loc[mask_val, 'title']

# min_df=2: a term must occur in at least two training titles to become a column.
title_vec = TfidfVectorizer(min_df=2)
title_bow_train = title_vec.fit_transform(title_train)
title_bow_val = title_vec.transform(title_val)

# Append the TF-IDF title columns to the right of the numeric features.
Xtrain_wtitle = sparse.hstack([Xtrain, title_bow_train])
Xval_wtitle = sparse.hstack([Xval, title_bow_val])

# Only the last expression of a cell is displayed, so bare shape tuples in the
# middle of the cell would be silently discarded; the shape checks are made
# explicit instead.
assert Xtrain_wtitle.shape[0] == len(ytrain), "train features/labels row mismatch"
assert Xval_wtitle.shape[0] == len(yval), "validation features/labels row mismatch"
assert Xtrain_wtitle.shape[1] == Xval_wtitle.shape[1], "train/validation column mismatch"

mdl = RandomForestClassifier(n_estimators=1000, random_state=0, class_weight='balanced', n_jobs=6)
mdl.fit(Xtrain_wtitle, ytrain)

# Second column of predict_proba is used as the validation score.
p = mdl.predict_proba(Xval_wtitle)[:, 1]

# (average precision, ROC AUC) on the validation rows.
metrics.average_precision_score(yval, p), metrics.roc_auc_score(yval, p)

(0.4927283761966597, 0.6091118421052631)

In [18]:
# Third CSV, read with its first column as the index; unlike df1/df2 no
# filtering on a label column is applied here.
df3 = pd.read_csv(filepath_or_buffer="raw_data_sem_labels.csv", index_col=0)
df3.shape

(1475, 14)

In [19]:
# Total number of labelled rows across both batches.
len(df1) + len(df2)

599

In [20]:
# Keep only the df3 rows whose title does not already appear among the labelled rows.
df3_1 = df3.loc[~df3['title'].isin(df['title'].unique())]
df3_1.shape

(874, 14)

In [21]:
# Full dataset: both labelled batches (without the helper columns 'p' and 'novo')
# followed by the remaining df3 rows.
df_reshaped = pd.concat([
    df1,
    df2.drop(columns=['p', 'novo']),
    df3_1,
])
df_reshaped.shape


(1473, 15)

In [22]:
# How often each title occurs; counts above 1 are repeated titles.
df_reshaped.loc[:, 'title'].value_counts()

Kaggle Challenge (LIVE)                                                                                 3
Q Learning Algorithm and Agent  - Reinforcement Learning p.2                                            2
Kaggle Reading Group: Generating Long Sequences with Sparse Transformers (Part 2) | Kaggle              2
Introducing the Next Generation Data Science Workspace | Keynote Spark + AI Summit 2020                 2
Jeremy Howard: fast.ai Deep Learning Courses and Research | Lex Fridman Podcast #35                     2
                                                                                                       ..
Submitting Predictions to Kaggle Competitions                                                           1
Supervised Machine Learning : Linear Regression - 2 | Lec 19 | Machine Learning | GATE CSE 2021 Exam    1
Data Cleaning | Kaggle Weekly | House Prices: Advanced Regression Techniques | Part 1                   1
Powered by TensorFlow: Airbnb uses machine lea

In [23]:
# Keep the first row for every title. Rebinding the name (instead of mutating
# with inplace=True) makes the data flow explicit and leaves the frame built
# by the concat untouched.
df_reshaped = df_reshaped.drop_duplicates(subset=['title'], keep='first')

In [24]:
# Re-check the title counts after de-duplication.
df_reshaped.loc[:, 'title'].value_counts()

AI Projects #2 Showcase - Project: Earthquake Prediction Kaggle Challenge                               1
Learn Data Science Online!                                                                              1
How to Build A Data Science Portfolio That Can Get You Jobs?                                            1
[LIVE] Criando um Dashboard para Data Science                                                           1
Covid19  Data Analysis | Python | Data Science                                                          1
                                                                                                       ..
Fired Florida data scientist speaks out as COVID-19 cases spike                                         1
SQL Summer Camp: Select, From and Where| Kaggle                                                         1
Neural Network Architectures and Deep Learning                                                          1
Hill Climbing Algorithm | Hill Climbing in Art

In [25]:
# Write the de-duplicated dataset to disk (the index is written as the first column).
df_reshaped.to_csv(path_or_buf="raw_data_all_labeled2.csv")