In [1]:
# Enable IPython's autoreload extension in mode 2 so that edits to imported
# local modules are picked up without restarting the kernel.
%load_ext autoreload
%autoreload 2

In [2]:
from pathlib import Path
import os
import sys
import datetime as dt
import pandas as pd
from google.cloud import storage
import json
from dotenv import load_dotenv
load_dotenv()
import tempfile

In [3]:
# Directory that contains the `airflow_dags` package imported in the next cell.
# The location is machine-specific, so it can be overridden with the
# AIRFLOW_DAGS_ROOT environment variable; the previous hard-coded path is
# kept as the default so existing setups keep working unchanged.
dag_path = Path(
    os.environ.get(
        'AIRFLOW_DAGS_ROOT',
        Path('/Users', 'calebcastleberry', 'Documents', 'GitHubRepos').as_posix(),
    )
)

# Guard the append so re-running this cell does not keep growing sys.path
# with duplicate entries.
if dag_path.as_posix() not in sys.path:
    sys.path.append(dag_path.as_posix())

In [4]:
from airflow_dags.services.reddit_analysis.dags import sub_overview_node as son
from airflow_dags.services.reddit_analysis.dags import post_detail_node as pdn

In [5]:
# Names of the settings this notebook needs. Each one is looked up as an
# Airflow Variable when Airflow is usable, otherwise read from the process
# environment (populated from .env by load_dotenv() in the imports cell).
_SETTING_NAMES = (
    'REDDIT_CLIENT_ID',
    'REDDIT_CLIENT_SECRET',
    'REDDIT_USER_AGENT',
    'GOOGLE_STORAGE_BUCKET_NAME',
)

try:
    from airflow.models import Variable
    # The environment value is passed as the default so a missing Airflow
    # Variable still resolves to whatever the environment provides.
    _settings = {
        name: Variable.get(name, default_var=os.environ.get(name))
        for name in _SETTING_NAMES
    }
except Exception:
    # Airflow is not installed or its Variable lookup failed: fall back to
    # the environment for every setting. `Exception` (rather than a bare
    # `except:`) lets KeyboardInterrupt and SystemExit propagate.
    _settings = {name: os.environ.get(name) for name in _SETTING_NAMES}

reddit_client_id = _settings['REDDIT_CLIENT_ID']
reddit_client_secret = _settings['REDDIT_CLIENT_SECRET']
reddit_user_agent = _settings['REDDIT_USER_AGENT']
google_storage_bucket_name = _settings['GOOGLE_STORAGE_BUCKET_NAME']

In [6]:
# Subreddit name and a calendar date.
# NOTE(review): neither value is referenced by the cells visible below;
# confirm they are still needed.
subreddit = 'destinythegame'
date = dt.date(2020, 1, 23)

In [7]:
# Object name of the comments dump for post 7mys8m, written with '/'
# separators.
blob_path = '/'.join(('reddit_analysis', 'comments', '7mys8m_comments.json'))

# Build a handle to that object in the configured bucket; the actual
# download happens in the next cell.
client = storage.Client()
bucket = client.bucket(google_storage_bucket_name)
json_blob = bucket.blob(blob_path)

In [8]:
# Fetch the blob's contents and parse them as JSON.
# NOTE(review): newer google-cloud-storage releases deprecate
# download_as_string in favour of download_as_bytes; confirm the installed
# version before switching.
raw_payload = json_blob.download_as_string()
comments = json.loads(raw_payload)

In [9]:
# Turn the parsed comment records into a DataFrame.
df = pd.DataFrame.from_records(data=comments)

In [10]:
# Row and column counts of the loaded comments frame.
df.shape

(29, 9)

In [11]:
# Preview the first ten comments.
df.head(n=10)

Unnamed: 0,id,parent_prefix,parent_id,parent,body,score,level,post_id,created_ts
0,drxp6xb,t3,7mys8m,t3_7mys8m,It's just a box (!),65,1,7mys8m,1514599000.0
1,drxqbv0,t3,7mys8m,t3_7mys8m,!,50,1,7mys8m,1514601000.0
2,dry854l,t3,7mys8m,t3_7mys8m,"Kept you waiting, huh?",11,1,7mys8m,1514635000.0
3,dry4pj2,t3,7mys8m,t3_7mys8m,Solid post.,26,1,7mys8m,1514625000.0
4,dryb97i,t3,7mys8m,t3_7mys8m,An MTX to surpass all other MTX.,9,1,7mys8m,1514643000.0
5,dry98q5,t3,7mys8m,t3_7mys8m,New Dexterity skill gem for the next expansion...,10,1,7mys8m,1514638000.0
6,dry8aa1,t3,7mys8m,t3_7mys8m,You're that ninja..,5,1,7mys8m,1514636000.0
7,drynyd1,t3,7mys8m,t3_7mys8m,So this is what Cipher has been working on in ...,2,1,7mys8m,1514660000.0
8,drxyemf,t3,7mys8m,t3_7mys8m,Quality MGSpost.,2,1,7mys8m,1514612000.0
9,dry6n50,t3,7mys8m,t3_7mys8m,Lilly in the Box,2,1,7mys8m,1514631000.0


In [12]:
from textblob import TextBlob

In [13]:
# Wrap every comment body in a TextBlob so the following cells can read
# noun phrases and sentiment from it.
tb_series = df['body'].apply(TextBlob)

In [14]:
# Store each comment's noun phrases as a plain Python list.
df['noun_phrases'] = tb_series.apply(lambda blob: list(blob.noun_phrases))

In [15]:
# Polarity component of TextBlob's sentiment for each comment.
df['sentiment_polarity'] = tb_series.apply(lambda blob: blob.sentiment.polarity)

In [16]:
# Subjectivity component of TextBlob's sentiment for each comment.
df['sentiment_subjectivity'] = tb_series.apply(
    lambda blob: blob.sentiment.subjectivity
)

In [17]:
# Show up to thirty rows, now including the TextBlob-derived columns.
df.head(n=30)

Unnamed: 0,id,parent_prefix,parent_id,parent,body,score,level,post_id,created_ts,noun_phrases,sentiment_polarity,sentiment_subjectivity
0,drxp6xb,t3,7mys8m,t3_7mys8m,It's just a box (!),65,1,7mys8m,1514599000.0,[],0.0,1.0
1,drxqbv0,t3,7mys8m,t3_7mys8m,!,50,1,7mys8m,1514601000.0,[],0.0,0.0
2,dry854l,t3,7mys8m,t3_7mys8m,"Kept you waiting, huh?",11,1,7mys8m,1514635000.0,[kept],0.0,0.0
3,dry4pj2,t3,7mys8m,t3_7mys8m,Solid post.,26,1,7mys8m,1514625000.0,[],0.0,0.1
4,dryb97i,t3,7mys8m,t3_7mys8m,An MTX to surpass all other MTX.,9,1,7mys8m,1514643000.0,"[mtx, mtx]",-0.125,0.375
5,dry98q5,t3,7mys8m,t3_7mys8m,New Dexterity skill gem for the next expansion...,10,1,7mys8m,1514638000.0,"[dexterity, skill gem, camouflage debris, spel...",0.062338,0.472078
6,dry8aa1,t3,7mys8m,t3_7mys8m,You're that ninja..,5,1,7mys8m,1514636000.0,[],0.0,0.0
7,drynyd1,t3,7mys8m,t3_7mys8m,So this is what Cipher has been working on in ...,2,1,7mys8m,1514660000.0,"[cipher, africa]",0.0,0.0
8,drxyemf,t3,7mys8m,t3_7mys8m,Quality MGSpost.,2,1,7mys8m,1514612000.0,[quality mgspost],0.0,0.0
9,dry6n50,t3,7mys8m,t3_7mys8m,Lilly in the Box,2,1,7mys8m,1514631000.0,[lilly],0.0,0.0


In [18]:
import spacy

In [19]:
# Load the spaCy 'en_core_web_md' pipeline used for entity extraction below.
nlp = spacy.load(name='en_core_web_md')

In [20]:
# Run every comment body through the spaCy pipeline, one result per row.
spacy_series = df['body'].apply(lambda text: nlp(text))

In [21]:
# List the text of each entity spaCy found in every comment.
spacy_series.apply(lambda doc: [entity.text for entity in doc.ents])

0                                      []
1                                      []
2                                      []
3                                      []
4                              [MTX, MTX]
5                              [Duration]
6                                      []
7                        [Cipher, Africa]
8                               [MGSpost]
9                            [Lilly, Box]
10                                     []
11                                     []
12                                     []
13    [Likly, Lilllllllllllyyyyyyyyyyyyy]
14                                 [Mech]
15                            [john cena]
16                                     []
17                                     []
18                                     []
19                                     []
20                                     []
21                                  [PoB]
22                                     []
23                                