In [5]:
import sys
sys.path.append('/home/ego/Github/david/')

import os
from os.path import exists, join, isfile
from collections import namedtuple

import pandas as pd

from david.youtube.scraper import _write2json
from david.pipeline import TextMetrics, TextPreprocess

In [None]:
def download(video_id: str, dirpath='downloads',
             limit=None, load_corpus=False, force_download=False):
    '''
    Downloads the comments from a youtube video into a json file.

    NOTE: The file is named after the video id e.g. 4Dk3jOSbz_0.json

    TODO: Add a default global environment variable (like DAVID_DATA)
    for the location where the downloads will be saved. Work on this
    has already started with some methods from the sklearn library,
    located in the models directory of this project.

    Parameters
    ----------

    `video_id` : (str)
        The youtube id from a video url. For example, '4Dk3jOSbz_0'

    `dirpath` : (str, default='downloads')
        The directory where the downloads will be saved. It is created
        (together with any missing parent directories) if needed.

    `limit` : (int, default=None)
        Sets a limit to the number of comments to download. The value
        is passed unchanged to the scraper.

    `load_corpus` : (bool, default=False)
        If True, the full path of the scraped json file is returned.

    `force_download` : (bool, default=False)
        If True, the scraper runs even when a json file for this
        video id already exists in `dirpath`.

    Returns
    -------
    The path of the json file (str) when `load_corpus` is True,
    otherwise None.

    Raises
    ------
    FileExistsError
        If the json file for `video_id` already exists and
        `force_download` is False. (os.makedirs raises the same error
        type if `dirpath` exists but is not a directory.)
    '''
    # exist_ok avoids the check-then-create race of a separate
    # `exists()` test followed by `makedirs()`.
    os.makedirs(dirpath, exist_ok=True)
    fp = join(dirpath, f'{video_id}.json')

    # refuse to touch a video that has already been downloaded unless
    # the caller explicitly asks for it.
    if isfile(fp) and not force_download:
        raise FileExistsError(
            f'The video id: {video_id} already exists!\n'
            'Set force_download=True to override the exception.')
    _write2json(fp, video_id, limit)

    # if true, return the full path of the scraped file.
    if load_corpus:
        return fp
    return None

In [None]:
# the error is intentional: it saves you from re-downloading
# and corrupting the corpus (imagine a video with 1,000,000 comments).

download('4Dk3jOSbz_0', limit=100)

In [None]:
# set force_download=True, to re-download the comments again.

download('4Dk3jOSbz_0', limit=100, force_download=True)

In [2]:
metric = TextMetrics('downloads/4Dk3jOSbz_0.json')
metric.info()

<class 'david.pipeline._metric.TextMetrics'>
RangeIndex: 100 entries, 0 to 99
Data columns (total 4 columns):
author    100 non-null object
cid       100 non-null object
text      100 non-null object
time      100 non-null object
dtypes: object(4)
memory usage: 3.2+ KB


In [3]:
metric.index

RangeIndex(start=0, stop=100, step=1)

In [3]:
metric.get_all_metrics(sentiment=True, gettags=True)
metric['text'].head()

0    2016 Election put business person  in govermen...
1    I was so proud of my country the land of the A...
2    From some news blasts I've seen some of Donald...
3    Impeachment cannot come soon enough for the Li...
4    What has become of America the great! You have...
Name: text, dtype: object

### simple pipeline configuration:

* *downloads the comments to a json file.*

* *automatically loads the corpus to the TextMetrics instance with the `load_corpus=True` parameter.*

> modules used for this pipeline.

```python
from david.youtube import scraper
from david.pipeline import TextMetrics
```
___

**Since the `TextMetrics` class inherits from the `pandas.DataFrame` class, all the conventional methods are accessible right away, without having to import pandas itself (a common pattern when performing analysis).**

In [4]:
metric2 = TextMetrics('downloads/BmYZH7xt8sU.json')
metric2.info()

<class 'david.pipeline._metric.TextMetrics'>
RangeIndex: 4252 entries, 0 to 4251
Data columns (total 4 columns):
author    4252 non-null object
cid       4252 non-null object
text      4252 non-null object
time      4252 non-null object
dtypes: object(4)
memory usage: 133.0+ KB


In [5]:
metric2.head()

Unnamed: 0,author,cid,text,time
0,PNW Ryan,UgzaG3oJa98fF6qi32h4AaABAg,I always forget to like the vid :( remind us!!!:P,25 minutes ago
1,Brian McDonald,Ugwgnpcp2e1D3mWMX2p4AaABAg,Looked like there was oil coming down left sid...,2 hours ago
2,Lewis Harvey,UgzpJXjwsyxTe1CLe4R4AaABAg,i like videos if its something really really f...,5 hours ago
3,skip rose,UgwBlsLv64UrNOcONz54AaABAg,Pos cable to small of guage causing currant su...,12 hours ago
4,Price Check On VagiClean,UgzjoHUlBJ8PDM_I3Gx4AaABAg,GT3RS: One of the best track cars ever made. Y...,12 hours ago


In [6]:
# this method runs all the methods for stats.
# (it normalizes white spacing for better accuracy)

metric2.get_all_metrics(sentiment=True, gettags=True)
metric2['text'].head()

0    I always forget to like the vid :( remind us!!!:P
1    Looked like there was oil coming down left sid...
2    i like videos if its something really really f...
3    Pos cable to small of guage causing currant su...
4    GT3RS: One of the best track cars ever made. Y...
Name: text, dtype: object

In [7]:
metric2.describe()

Unnamed: 0,stringLength,avgWordLength,isStopwordCount,noStopwordCount,charDigitCount,charUpperCount,charLowerCount,sentiPolarity,sentiSubjectivity
count,4252.0,4252.0,4252.0,4252.0,4252.0,4252.0,4252.0,4252.0,4252.0
mean,135.003998,4.73963,26.677328,14.556914,0.375118,3.729774,101.602305,0.152794,0.435873
std,153.628255,0.969159,29.328706,15.255229,1.159446,5.6859,116.379166,0.264156,0.303939
min,1.0,0.0,1.0,0.0,0.0,0.0,0.0,-1.0,0.0
25%,51.0,4.2,10.0,6.0,0.0,1.0,38.0,0.0,0.125
50%,95.0,4.703203,19.0,10.0,0.0,2.0,72.0,0.1,0.5
75%,170.0,5.2,34.0,18.0,0.0,4.0,128.0,0.322247,0.65
max,3240.0,18.0,592.0,297.0,18.0,118.0,2477.0,1.0,1.0


In [8]:
# the slicing method returns its result, so assign it to a new variable
# just in case you didn't like the results :)

sliced_metric2 = metric2.slice_dataframe(by_setvalue=20)
sliced_metric2.describe()

Unnamed: 0,stringLength,avgWordLength,isStopwordCount,noStopwordCount,charDigitCount,charUpperCount,charLowerCount,sentiPolarity,sentiSubjectivity
count,3984.0,3984.0,3984.0,3984.0,3984.0,3984.0,3984.0,3984.0,3984.0
mean,143.263805,4.759653,28.295432,15.395331,0.389809,3.903614,107.848645,0.156686,0.452224
std,155.257378,0.850154,29.603868,15.399821,1.183579,5.820673,117.621503,0.263989,0.297304
min,21.0,1.666667,3.0,1.0,0.0,0.0,0.0,-1.0,0.0
25%,58.0,4.25,12.0,7.0,0.0,1.0,43.0,0.0,0.216667
50%,102.0,4.74037,20.0,11.0,0.0,2.0,77.0,0.11849,0.5
75%,177.0,5.214286,35.0,19.0,0.0,5.0,133.0,0.332708,0.65
max,3240.0,18.0,592.0,297.0,18.0,118.0,2477.0,1.0,1.0


In [72]:
from david.utils import pointer

# Group the DataFrame's columns by position: every entry maps a group
# name to a slice of the column index. The groups overlap on purpose
# ('all_metrics' and 'all_text' span several of the smaller groups).
cols = sliced_metric2.columns
features = dict(
    corpus_cols=cols[0:4],
    words=cols[4:8],
    chars=cols[8:11],
    sentiment=cols[11:14],
    tags=cols[14:],
    all_metrics=cols[4:18],
    all_text=cols[4:11],
)

# NOTE(review): `pointer` presumably builds a namedtuple-like object
# from the mapping so groups can be accessed as attributes — confirm.
Features = pointer('Features', features)
Features.__doc__

'Features(corpus_cols, words, chars, sentiment, tags, all_metrics, all_text)'

In [84]:
sliced_metric2[Features.words].mean()

stringLength       143.263805
avgWordLength        4.759653
isStopwordCount     28.295432
noStopwordCount     15.395331
dtype: float64

In [85]:
sliced_metric2[Features.sentiment].std()

sentiPolarity        0.263989
sentiSubjectivity    0.297304
dtype: float64

In [13]:
# youtube features: show the distinct values of the 'authorEmoji'
# column, then a blank line, then the column's describe() summary.
emojis_found = sliced_metric2['authorEmoji']

print(emojis_found.unique(), emojis_found.describe(), sep='\n\n')

['' '😂😂' '👌' '👍' '😅' '😂' '🤣🤣🤣' '😩' '🤣' '🤷♂' '🤗' '😒' '😎' '\U0001f9d0💪👍😂'
 '👏🤔' '👊😁' '\U0001f92f👍' '🤙' '😍👍' '😀' '😉' '🏁🏁👍👍👍' '👍😈' '🤧' '🤷♂🤷♂' '👌✌'
 '😅😂' '👍👍👍' '😆' '👌👍' '😁😁😉' '❤' '😂😂😂' '🤷♀' '💪' '😬😬😬' '😭😭😂😂😂' '😂💯👍🤘' '💪👌'
 '🔥' '😍' '🙌' '😄😉' '🙄' '😅😅👌' '😁👍' '🤘👊' '🤞🤣' '😂😉😂😂' '👍✌' '👌😅' '😆😁' '😭😭😢❤'
 '💦🤣' '🙏' '👍👍🤜🤛😎🔰🔰🔰💪' '🙋♂' '😂😂😂😂' '😜' '💁♀😁' '😁' '👍😊' '😛' '😘' '❤❤' '😆😆😆😆😆'
 '👍👍👍🤘' '🤦♂' '♥' '🤙🤙🤙' '🤣🤣' '😂🤙👌' '🤷♂😅' '💯' '👍😂' '☺' '🤘' '👀' '🤔✊' '🙃🙃🙃🙃'
 '😑' '🤔' '🤣🤙' '👊' '💪💪✌🤙🤘💪' '✌' '🙂' '😉😘' '💀🤷♂' '😏' '😥😥😥😥😥' '👍👍👍👍👍👍👍👍👍👍'
 '👍👍' '😹😹😹' '😂😆' '😄😂' '😥' '🤣👍' '💪💪😎😎' '👍😆' '\U0001f91f' '🤘🤠' '🤣👍👍' '💕'
 '😎👍\U0001f91f' '👍😉' '😊' '😇' '👌😂🤣' '🤦♂😂' '👍👍👍👍👍👍👍👍👍👍👍👍👍👍👍' '😼' '😂😁👍' '🤘😁'
 '🔥🔥🔥💪💯❤' '🏎' '🐶' '👍👍👍👍👍' '🙌🙌' '😎😉🤔✊😘' '🤙🤙😉' '😉😂👌' '😉😉' '✌👍' '😂👍👍👍👍👍👍👍👍😎'
 '✋' '😍😍😍' '🙃' '💪💪' '💙' '❤🏴' '👌😎' '😍😂😂😂😂😂' '😢' '😭😭😭' '😁😁😁' '💪👌🤜🤛' '🔰' '😏👍'
 '😁👌👍' '😁😁' '🤷♂👍' '👋🔥' '🖤' '👏' '❤😊' '🤘🤙' '😍😍' '😤' '🤷♂🤦♂' '😂🔥' '❤🙏']

count     3984
unique     152
top           
freq      3614
Name: authorEmoji, dtype: object


In [14]:
sliced_metric2.sentimentLabel.describe()

count         3984
unique           3
top       positive
freq          2372
Name: sentimentLabel, dtype: object

In [15]:
# for simplicity the path is saved under JSON_PATH

metric2.JSON_PATH

'downloads/BmYZH7xt8sU.json'

In [16]:
# so we can just use it like this and keep track of the dataset
# we are working on, without saving and loading back and forth.

prep = TextPreprocess(metric2.JSON_PATH)
prep.info()

<class 'david.pipeline._prep.TextPreprocess'>
RangeIndex: 4252 entries, 0 to 4251
Data columns (total 4 columns):
author    4252 non-null object
cid       4252 non-null object
text      4252 non-null object
time      4252 non-null object
dtypes: object(4)
memory usage: 133.0+ KB


In [17]:
from david.pipeline.utils import timeit
# there are also some utility tools for testing

@timeit
def funcprep():
    '''Call `prep.clean_all_text` inside a function so `timeit` can wrap it.'''
    # NOTE(review): both flags are passed explicitly as False; they are
    # reportedly always False (presumably the defaults) — confirm.
    prep.clean_all_text(lower_text=False, lemmatize=False)
funcprep()

( 'funcprep' ) took: 2000.11 ms


In [20]:
# after preprocessing
prep.text.head(10)

0          I always forget to like the vid remind us P
1    Looked like there was oil coming down left sid...
2    i like videos if its something really fuckin cool
3    Pos cable to small of guage causing currant surge
4    GT RS One of the best track cars ever made Yet...
5    I try and like every video I watch but sometim...
6    I literally like every video right after I cli...
7          I like for the mustache Adam got to love it
8    I like videos of content creators I truly supp...
9    There are certain channels I follow that get a...
Name: text, dtype: object

In [21]:
# before preprocessing, also notice how we are not loading and saving... life is easy :)
metric2.text.head(10)

0    I always forget to like the vid :( remind us!!!:P
1    Looked like there was oil coming down left sid...
2    i like videos if its something really really f...
3    Pos cable to small of guage causing currant su...
4    GT3RS: One of the best track cars ever made. Y...
5    I try and like every video I watch but sometim...
6    I literally like every video right after I cli...
7        I like for the mustache Adam gotta love it 😂😂
8    I like videos of content creators I truly supp...
9    There are certain channels I follow that get a...
Name: text, dtype: object

> closer look at the results:

* before: 7 - I like for the mustache Adam `gotta love it` 😂😂

* after: 7 - I like for the mustache Adam `got to love it`

In [23]:
metric2[Features.tags].count()

authorTimeTag      21
authorUrlLink       3
authorHashTag      17
authorEmoji      4252
dtype: int64

In [62]:
metric2['authorTimeTag'].unique()

array([nan, '12:40', '7:07', '12:30', '2:14', '8:03', '10:14', '12:42',
       '7:48', '10:24', '6:57', '12:28', '11:44', '9:22', '3:35', '10:30',
       '2:43', '2:25', '1:05', '3:36', '10:10'], dtype=object)

In [70]:
# print every distinct value of the authorUrlLink column, one per line.
for url in metric2.authorUrlLink.unique():
    print(url)

nan
https://www.youtube.com/watch?v=PmShNvBroiQ&t
https://www.youtube.com/watch?v=eHsShmm08c4
https://www.youtube.com/watch?v=2LUjtPyYiVM


In [86]:
metric2['authorHashTag'].unique()

array([nan, '#remindthestonerstolikethevideos', '#spillthebeans',
       '#LZSpec', '#justmytwocents', '#giveadamlikes', '#1', '#nohomo',
       '#boobsforlikes', '#yearoftheflex', '#Brag', '#mexico', '#2',
       '#LikeFlex2019', '#family', '#reallifeproblems', '#ADAMLZ'],
      dtype=object)