In [1]:
import spacy
import yaml
import re
import json

In [2]:
# Enable IPython's autoreload extension in mode 2 so edited modules are
# re-imported automatically before each cell runs.
%load_ext autoreload
%autoreload 2

In [3]:
from tweepy import OAuthHandler
from tweepy.api import API as Twitter
from tweepy.error import TweepError
from tweepy.parsers import JSONParser

In [4]:
# Load the spaCy "en_core_web_sm" pipeline; it is used further down to extract
# named entities from the cleaned tweets.
nlp = spacy.load("en_core_web_sm")

In [5]:
# List the processing components of the loaded pipeline.
nlp.pipeline

[('tagger', <spacy.pipeline.pipes.Tagger at 0x7f6b2e676f90>),
 ('parser', <spacy.pipeline.pipes.DependencyParser at 0x7f6b2e39fb40>),
 ('ner', <spacy.pipeline.pipes.EntityRecognizer at 0x7f6b2e39fbb0>)]

In [7]:
# Read the Twitter API settings from the local JSON credentials file.
# The file is opened in binary mode; the JSON decoder handles the bytes.
with open('twitter_api_creds.json', "rb") as creds_file:
    conf = json.loads(creds_file.read())

In [8]:
# Confirm the config loaded by listing the credential field names only.
# The values are live API secrets and must never be echoed into a saved
# notebook output; any credential that was displayed here before should be
# revoked and regenerated.
sorted(conf["twitter"]["api"]["app"])

['key', 'secret', 'token', 'token_secret']

In [9]:
# Authenticate with the app key/secret and the access token pair taken from the
# loaded config, then build an API client that uses the JSON parser.
app_creds = conf["twitter"]["api"]["app"]
auth = OAuthHandler(app_creds["key"], app_creds["secret"])
auth.set_access_token(app_creds["token"], app_creds["token_secret"])
twitter = Twitter(auth, parser=JSONParser())

In [10]:
# Search for English-language tweets matching the #GoT hashtag. The result is
# indexed by key below (results['statuses']).
results = twitter.search('#GoT', lang='en')

In [11]:
# Number of tweets returned by the search.
len(results['statuses'])

15

In [12]:
# URL matcher used to strip links from tweet text. Four alternatives, tried in
# order:
#   1. http(s):// host of 3+ characters (hyphens allowed inside), dot, rest
#   2. www. host of 3+ characters (hyphens allowed inside), dot, rest
#   3. http(s):// purely alphanumeric host of any length (e.g. t.co), dot, rest
#   4. www. purely alphanumeric host of any length, dot, rest
# "rest" is two or more non-whitespace characters.
# Written as raw strings so the regex escapes (\/ \. \s) reach the re module
# untouched instead of relying on Python passing through invalid string escapes.
# The last alternative previously ended in a stray literal "A", which stopped it
# from matching ordinary bare "www.x.tld" addresses.
http_regexp = (
    r'https?:\/\/(?:www\.|(?!www))[a-zA-Z0-9][a-zA-Z0-9-]+[a-zA-Z0-9]\.[^\s]{2,}'
    r'|www\.[a-zA-Z0-9][a-zA-Z0-9-]+[a-zA-Z0-9]\.[^\s]{2,}'
    r'|https?:\/\/(?:www\.|(?!www))[a-zA-Z0-9]+\.[^\s]{2,}'
    r'|www\.[a-zA-Z0-9]+\.[^\s]{2,}'
)

In [13]:
# Sanity-check the URL pattern against a sample tweet that contains two links.
re.findall(http_regexp,'I miss the old Tyrion #GoT #GameofThrones https://t.co/YbBI6vkzYh https://t.co/j6XbRWYPqN' )

['https://t.co/YbBI6vkzYh', 'https://t.co/j6XbRWYPqN']

In [14]:
# Compile the URL pattern once; remove_urls() below reuses this object.
pattern = re.compile(http_regexp)

In [15]:
# Replacing every match with '' strips the links from the sample tweet.
pattern.sub('','I miss the old Tyrion #GoT #GameofThrones https://t.co/YbBI6vkzYh https://t.co/j6XbRWYPqN')

'I miss the old Tyrion #GoT #GameofThrones  '

In [16]:
def remove_urls(text):
    """Return ``text`` with every match of the module-level ``pattern`` deleted."""
    without_links = pattern.sub(repl='', string=text)
    return without_links

def remove_hashes(text):
    """Return ``text`` with every '#' character dropped (the tag words are kept)."""
    pieces = text.split('#')
    return ''.join(pieces)

def remove_newline(text):
    """Return ``text`` with each newline character replaced by a single space."""
    lines = text.split('\n')
    return ' '.join(lines)

In [17]:
def clean_tweets(tweets):
    """Return the cleaned text of every tweet in ``tweets['statuses']``.

    Each status's ``'text'`` is passed through ``remove_urls``, then
    ``remove_hashes``, then ``remove_newline``; the results are returned as a
    list in the same order as the statuses.
    """
    return [
        remove_newline(remove_hashes(remove_urls(status['text'])))
        for status in tweets['statuses']
    ]

In [18]:
# Clean the text of every tweet returned by the search.
cleaned_texts = clean_tweets(results)

In [19]:
# Inspect the cleaned tweet texts.
cleaned_texts

['RT @chriskane0: @BuzzFeed Wonder what @HBO didn’t like in the Pilot episode. Was it the backlash of the Game Of Thrones final season  gotp…',
 'RT @afterbuzztv: HBO has cancelled plans for a GameOfThrones prequel starring NaomiWatts. The network shot a pilot episode in Northern Ir…',
 "RT @notamoviepod: GoT update ❄️ 🔥   • Age of Heroes prequel canceled due to lengthy post-production &amp; 'issues during filming'  • Targaryen…",
 "RT @notamoviepod: GoT update ❄️ 🔥   • Age of Heroes prequel canceled due to lengthy post-production &amp; 'issues during filming'  • Targaryen…",
 'StarWars and GOT prequel BOTH canceled suddenly?   What did Benioff or Weiss do? ',
 "RT @notamoviepod: GoT update ❄️ 🔥   • Age of Heroes prequel canceled due to lengthy post-production &amp; 'issues during filming'  • Targaryen…",
 'Ppl hatin on D&amp;D, Pretty sure they will b ok, didnt they sign a 9 figure deal with netflix? if dont like them dont… ',
 'People are fickle and @HBO needs to get something in pro

In [20]:
# Run the spaCy pipeline over each cleaned tweet and print the tweet together
# with its recognised entities; tweets with no entities are skipped.
for sentence in cleaned_texts:
    doc = nlp(sentence)
    entities = doc.ents
    if not entities:
        continue
    output = {'sentence': sentence, 'entities': entities}
    print(output)

{'sentence': 'RT @chriskane0: @BuzzFeed Wonder what @HBO didn’t like in the Pilot episode. Was it the backlash of the Game Of Thrones final season  gotp…', 'entities': (@HBO, the Game Of Thrones)}
{'sentence': 'RT @afterbuzztv: HBO has cancelled plans for a GameOfThrones prequel starring NaomiWatts. The network shot a pilot episode in Northern Ir…', 'entities': (HBO, NaomiWatts, Northern Ir)}
{'sentence': "RT @notamoviepod: GoT update ❄️ 🔥   • Age of Heroes prequel canceled due to lengthy post-production &amp; 'issues during filming'  • Targaryen…", 'entities': (Targaryen,)}
{'sentence': "RT @notamoviepod: GoT update ❄️ 🔥   • Age of Heroes prequel canceled due to lengthy post-production &amp; 'issues during filming'  • Targaryen…", 'entities': (Targaryen,)}
{'sentence': 'StarWars and GOT prequel BOTH canceled suddenly?   What did Benioff or Weiss do? ', 'entities': (StarWars, GOT, Benioff, Weiss)}
{'sentence': "RT @notamoviepod: GoT update ❄️ 🔥   • Age of Heroes prequel canceled due to

### Model

In [21]:
import pandas as pd

In [22]:
from fastai.text import load_data, text_classifier_learner, AWD_LSTM
import torch
from pathlib import Path, PosixPath, PurePosixPath

In [23]:
# Name of CUDA device 0.
torch.cuda.get_device_name(0)

'GeForce RTX 2070 SUPER'

In [38]:
# Relative directory passed to load_data() below as the location of data_clas.pkl.
# NOTE(review): PurePosixPath is a pure path (no filesystem access) — confirm
# the loader accepts it, or switch to Path.
path=PurePosixPath('./model/sentiment')
path

PurePosixPath('model/sentiment')

In [39]:
# Location of the pickled data file that the next cell loads.
path/'data_clas.pkl'

PurePosixPath('model/sentiment/data_clas.pkl')

In [40]:
import os
import pathlib

bs = 48  # batch size handed to load_data

# This cell used to fail with
# "NotImplementedError: cannot instantiate 'PosixPath' on your system", which
# pathlib raises when a PosixPath is constructed on Windows. That indicates
# data_clas.pkl contains pickled PosixPath objects (presumably it was exported
# on a POSIX machine). While loading on Windows, temporarily point
# pathlib.PosixPath at WindowsPath so the stored paths are rebuilt as native
# ones; on other systems nothing is changed.
_posix_path_cls = pathlib.PosixPath
if os.name == 'nt':
    pathlib.PosixPath = pathlib.WindowsPath
try:
    data_clas = load_data(path, 'data_clas.pkl', bs=bs)
finally:
    # Always undo the alias so the rest of the session sees the real class.
    pathlib.PosixPath = _posix_path_cls

# Build the AWD_LSTM text classifier on top of the loaded data.
learn = text_classifier_learner(data_clas, AWD_LSTM, drop_mult=0.5)