In [75]:
import spacy
import pandas as pd
import numpy as np
from spacy.lang.en import English
import en_core_web_sm
from  spacy.lang.en.stop_words import STOP_WORDS
from spacy.matcher import Matcher
from spacy.matcher import PhraseMatcher
import string
import re
import nltk
from nltk.stem.porter import *
from wordcloud import WordCloud
import seaborn as sns

In [2]:
!pip install seaborn



In [3]:
!pip install wordcloud



In [76]:
stopwords = list(STOP_WORDS)

In [77]:
punctuations = string.punctuation.replace("#", "")
parser = English()

In [78]:
nlp = en_core_web_sm.load()
print(stopwords)

['whose', 'every', 'without', 'yourself', 'hereafter', '‘m', 'two', 'mostly', 'seem', 'therefore', "'re", 'alone', 'across', 'that', 'becomes', 'we', 'ten', 'anything', 'if', 'put', 'been', 'moreover', 'whom', 'would', 'still', 'elsewhere', 'many', 'per', 'them', 'while', 'own', "n't", 'thru', '’m', 'as', 'more', 'whereafter', '‘d', 'see', 'his', 'anywhere', 'off', 'get', 'name', 'until', 'enough', 'what', '‘ll', 're', 'full', 'former', 'whatever', 'but', 'toward', 'only', "'s", 'hereupon', 'whither', 'twelve', 'hereby', 'just', 'ca', 'somehow', 'do', 'front', 'my', 'various', 'less', 'who', 'anyhow', 'therein', 'well', "'ve", 'everywhere', 'yours', 'hence', 'those', 'amount', 'except', 'nothing', 'is', 'bottom', 'go', 'sometime', 'several', 'seems', 'whereby', 'cannot', 'three', 'nowhere', 'us', 'amongst', 'whole', 'itself', 'part', 'before', 'on', 'take', 'very', 'upon', 'keep', '’ll', 'anyway', 'another', 'against', "'m", 'once', 'should', 'thus', 'there', 'herself', 'below', 'one',

In [79]:
# Stemming
stemmer = PorterStemmer()

In [80]:
print(punctuations)

!"$%&'()*+,-./:;<=>?@[\]^_`{|}~


In [81]:
def spacy_tokenizer(sentence):
    tokens = parser(sentence)
    # Lower case all words and strip white spaces
    tokens = [word.lemma_.lower().strip() if word.lemma_ != "-PRON-" else word.lower_ for word in tokens]
    # Remove all stop words and punctuations
    tokens = [word for word in tokens if word not in stopwords and word not in punctuations]
    return tokens

In [82]:
def remove_pattern(input_txt, pattern):
    r = re.findall(pattern, input_txt)
    for i in r:
        input_txt = re.sub(i, '', input_txt)
        
    return input_txt

In [83]:
def tokenize_tweet(tweet):
    tokenized_tweet = [t.text for t in nlp(tweet)]
    
    tokenized_tweet = [stemmer.stem(t) for t in tokenized_tweet]
    
    tokenized_tweet = ' '.join(tokenized_tweet)
    
    return tokenized_tweet

In [84]:
phraseTerms = [
        u"windows 10",
        u"big data",
        u"data analytics"
    ]

In [110]:

def clean_sentence(sentence):
    doc = nlp(sentence)
    
    # Phrase matchers
    phraseMatcher = PhraseMatcher(nlp.vocab, attr="LOWER")
    phrasePatterns = [nlp.make_doc(text) for text in phraseTerms]
    
    phraseMatcher.add("Names", None, *phrasePatterns)
    
    for match_id, start, end in phraseMatcher(doc):
        span = doc[start:end]
        sentence = sentence.replace(span.text, ''.join(token.text for token in span))
    
    # Token matchers
    matcher = Matcher(nlp.vocab)
    
    # Create matcher to detect urls
    pattern = [{ "LIKE_URL": True }]
    matcher.add("UrlDetection", None, pattern)
    
    # Create matcher to detect ...
    pattern = [{ "TEXT": "..." }]
    matcher.add("MoreDotsDetection", None, pattern)
    
    matches = matcher(doc)
    
    for match_id, start, end in matches:
        string_id = nlp.vocab.strings[match_id]
        span = doc[start:end]
        sentence = sentence.replace(span.text, '')
    
    # Tokenize sentence and join
    sentence = ' '.join(str(token) for token in spacy_tokenizer(sentence))
    
    # Remove twitter handles
    sentence = remove_pattern(sentence, "@[\w]*")
    
    sentence = '#'.join([phrase for phrase in [e[1:] for e in (' ' + sentence).split("#")]])
    
    # remove words with length less than 3 and not #
    sentence = ' '.join([word for word in sentence.split() if word[0] == '#' or len(word)>3])
    
    return sentence

clean_sentence("The latest Logan's DX Daily #AD (Cloud, AI/ML, Analytics & IoT)! https://t.co/Ac3cKz73Gx Thanks to @JD_Corporate... https://t.co/OOyzLIPxA2")

'late logan daily #ad cloud analytics thanks'

In [111]:
df = pd.read_csv("data/flair_labeled_sentiments_long.csv", names=['feedback', 'text'])
df.head()

Unnamed: 0,feedback,text
0,__label__Negative,Big Data Is Dead. Long Live Big Data AI #Machi...
1,__label__Negative,Data Analytics 'Performance Gap' Destroying Cu...
2,__label__Negative,Big Data Is Dead. Long Live Big Data AI. - htt...
3,__label__Negative,Big Data and the Problem of Bias in Higher Edu...
4,__label__Positive,Using Twitter for big data analytics to analyz...


In [112]:
df['feedback'] = df['feedback'].replace('__label__Negative', 0)
df['feedback'] = df['feedback'].replace('__label__Positive', 1)

In [113]:
df.head()

Unnamed: 0,feedback,text
0,0,Big Data Is Dead. Long Live Big Data AI #Machi...
1,0,Data Analytics 'Performance Gap' Destroying Cu...
2,0,Big Data Is Dead. Long Live Big Data AI. - htt...
3,0,Big Data and the Problem of Bias in Higher Edu...
4,1,Using Twitter for big data analytics to analyz...


In [114]:
df = df.head(2000)
df['clean_text'] = np.vectorize(clean_sentence)(df['text'])

In [115]:
df.head()

Unnamed: 0,feedback,text,clean_text
0,0,Big Data Is Dead. Long Live Big Data AI #Machi...,bigdata dead long live bigdata #machinelearnin...
1,0,Data Analytics 'Performance Gap' Destroying Cu...,dataanalytics performance destroying customer ...
2,0,Big Data Is Dead. Long Live Big Data AI. - htt...,bigdata dead long live bigdata #bigdata #dataa...
3,0,Big Data and the Problem of Bias in Higher Edu...,bigdata problem bias higher education #bigdata...
4,1,Using Twitter for big data analytics to analyz...,twitter bigdataanalytics analyze disaster #soc...


In [116]:
# Remove duplicates
df = df.drop_duplicates(subset=['clean_text'], keep='first')

In [117]:
df.head(10)

Unnamed: 0,feedback,text,clean_text
0,0,Big Data Is Dead. Long Live Big Data AI #Machi...,bigdata dead long live bigdata #machinelearnin...
1,0,Data Analytics 'Performance Gap' Destroying Cu...,dataanalytics performance destroying customer ...
2,0,Big Data Is Dead. Long Live Big Data AI. - htt...,bigdata dead long live bigdata #bigdata #dataa...
3,0,Big Data and the Problem of Bias in Higher Edu...,bigdata problem bias higher education #bigdata...
4,1,Using Twitter for big data analytics to analyz...,twitter bigdataanalytics analyze disaster #soc...
7,1,Azure Big Data Analytics Platform Databricksto...,azure bigdataanalytics platform databrickstoka...
8,1,DZone >> Automate or Die: A Dramatic State of ...,dzone automate dramatic state affairs 21st cen...
9,1,Big data in a nutshell https://t.co/HxdnUe7IQI...,bigdata nutshell overview bigdata #crm fraud d...
10,1,Tired of your current job? Check this out! => ...,tired current check bigdata application develo...
11,1,Big Data And The Problem Of Bias In Higher Edu...,bigdata problem bias higher education explosiv...


In [118]:
import random
from pathlib import Path
import spacy
from spacy.util import minibatch, compounding

In [119]:
def main(model=None, output_dir=None, n_iter=20, n_texts=2000):
    if model is not None:
        # load existing spaCy model
        nlp = spacy.load(model)
        print("Loaded model '%s'" % model)
    else:
        # Create blank Language class
        nlp = spacy.blank('en')
        print("Created blank 'en' model")

    # Add the text classifier to the pipeline if it doesn't exist
    # nlp.create_pipe works for built-ins that are registered with spaCy
    if 'textcat' not in nlp.pipe_names:
        textcat = nlp.create_pipe('textcat')
        nlp.add_pipe(textcat, last=True)
    # Otherwise, get it, so we can add labels to it
    else:
        textcat = nlp.get_pipe('textcat')

    # Add label to text classifier
    textcat.add_label('POSITIVE')

    (train_texts, train_cats), (dev_texts, dev_cats) = load_data(limit=n_texts)
    print("Using {} examples ({} training, {} evaluation)"
          .format(n_texts, len(train_texts), len(dev_texts)))
    train_data = list(zip(train_texts,
                          [{'cats': cats} for cats in train_cats]))

    # Get names of other pipes to disable them during training
    other_pipes = [pipe for pipe in nlp.pipe_names if pipe != 'textcat']
    
    # Only train textcat by disabling other pipes
    with nlp.disable_pipes(*other_pipes):
        optimizer = nlp.begin_training()
        print("Training the model...")
        print('{:^5}\{:^5}\t{:^5}\t{:^5}\t{:^5}'.format('No.', 'LOSS', 'P', 'R', 'F'))
        for i in range(n_iter):
            losses = {}
            # Batch up the examples using spaCy's minibatch
            batches = minibatch(train_data, size=compounding(4., 32., 1.001))
            for batch in batches:
                texts, annotations = zip(*batch)
                nlp.update(texts, annotations, sgd=optimizer, drop=0.2,
                           losses=losses)
            
            with textcat.model.use_params(optimizer.averages):
                # Evaluate on the dev data split off in load_data()
                scores = evaluate(nlp.tokenizer, textcat, dev_texts, dev_cats)
            print('{0}\t{1:.3f}\t{2:.3f}\t{3:.3f}\t{4:.3f}'  # print a simple table
                  .format(i, losses['textcat'], scores['textcat_p'],
                          scores['textcat_r'], scores['textcat_f']))

    if output_dir is not None:
        output_dir = Path(output_dir)
        if not output_dir.exists():
            output_dir.mkdir()
        nlp.to_disk(output_dir)
        print("Saved model to", output_dir)

In [120]:
def evaluate(tokenizer, textcat, texts, cats):
    docs = (tokenizer(text) for text in texts)
    tp = 1e-8  # True positives
    fp = 1e-8  # False positives
    fn = 1e-8  # False negatives
    tn = 1e-8  # True negatives
    for i, doc in enumerate(textcat.pipe(docs)):
        gold = cats[i]
        for label, score in doc.cats.items():
            if label not in gold:
                continue
            if score >= 0.5 and gold[label] >= 0.5:
                tp += 1.
            elif score >= 0.5 and gold[label] < 0.5:
                fp += 1.
            elif score < 0.5 and gold[label] < 0.5:
                tn += 1
            elif score < 0.5 and gold[label] >= 0.5:
                fn += 1
    precision = tp / (tp + fp)
    recall = tp / (tp + fn)
    f_score = 2 * (precision * recall) / (precision + recall)
    return {'textcat_p': precision, 'textcat_r': recall, 'textcat_f': f_score}

In [96]:
def load_data(limit=0, split=0.8):
    # Partition off part of the train data for evaluation
    cats = [{'POSITIVE': bool(y)} for y in df['feedback']]
    split = int(len(df['clean_text'].values) * split)
    return (df['clean_text'][:split], cats[:split]), (df['clean_text'][split:], cats[split:])

In [97]:
main(model=None, output_dir="text_cnn_models/model", n_iter=20, n_texts=1892)

Created blank 'en' model
Using 1894 examples (1499 training, 375 evaluation)
Training the model...
 No. \LOSS 	  P  	  R  	  F  
0	4.506	1.000	1.000	1.000
1	4.053	1.000	1.000	1.000
2	3.255	1.000	0.989	0.995
3	2.333	1.000	0.984	0.992
4	1.788	1.000	0.965	0.982
5	1.238	1.000	0.944	0.971
6	1.088	1.000	0.931	0.964
7	0.725	1.000	0.925	0.961
8	0.637	1.000	0.925	0.961
9	0.566	1.000	0.941	0.970
10	0.479	1.000	0.931	0.964
11	0.603	1.000	0.944	0.971
12	0.492	1.000	0.947	0.973
13	0.371	1.000	0.947	0.973
14	0.563	1.000	0.949	0.974
15	0.257	1.000	0.936	0.967
16	0.353	1.000	0.928	0.963
17	0.379	1.000	0.931	0.964
18	0.308	1.000	0.928	0.963
19	0.167	1.000	0.931	0.964
Saved model to text_cnn_models\model


In [98]:
model = 'text_cnn_models/model'

nlp = spacy.load(model)

def predict_test(text):
    test_text = clean_sentence(text)
    doc = nlp(test_text)
    print(test_text, doc.cats)

In [99]:
predict_test("This is why I always hated working with #Windows. No, I was not missing it at all in my current job in #RedHat.... https://t.co/1gZlVbPwHH")

hate work #windows miss current #redhat .... {'POSITIVE': 0.9962859153747559}


In [100]:
predict_test("RT MakeUseOf: hate using it on your wireless ro... https://t.co/JLuKKhUC96")

makeuseof hate wireless {'POSITIVE': 0.9977158308029175}


In [101]:
predict_test("Want to win 50000 DOGECOINS Free? Register with only 2 clicks! https://t.co/CgpuxohQDe - #crypto #bitcoin #giveaway... https://t.co/ieTGdXvI2D")

want 50000 dogecoins free register click #crypto #bitcoin #giveaway {'POSITIVE': 0.9994334578514099}


In [102]:
predict_test("RT @bruntmor: Enter to #win Bruntmor's Enameled 12-Inch Deep Saute Pan in Any Color! #follow #retweet Winner Announced September 5th, USA O...")

enter #win bruntmor enameled 12-inch deep saute color #follow #retweet winner announced september {'POSITIVE': 0.9791330695152283}


In [103]:
predict_test("Dead End Job #Windows #Mac #Linux <Headup> ga, 2020Nian Pei Xin niYan Qi saremashita. https://t.co/d8Auyfps9F")

dead #windows #mac #linux headup 2020nian niyan saremashita {'POSITIVE': 0.9976792931556702}


In [104]:
predict_test("RT @OriginalFunko: Gears of War meets Funko is available for download now on iOS, Android and Windows 10 PC! Have you playe...")

gears meet funko available download android windows10 playe {'POSITIVE': 0.9999101161956787}


In [105]:
predict_test("Windows 10 update: #Microsoft could change this iconic feature and some fans aren't happy #operatingsystem...... https://t.co/wuquSEakzH")

windows10 update #microsoft change iconic feature happy #operatingsystem ...... {'POSITIVE': 0.9956941604614258}


In [106]:
predict_test("@vzverovich SuSE linux 6.1. I didn't understand anything about it and destroyed my Windows10 partition in the process.")

suse linux understand destroy windows10 partition process {'POSITIVE': 0.024774516001343727}


In [107]:
predict_test("freezing windows 10")

freeze windows10 {'POSITIVE': 0.9970779418945312}


In [108]:
predict_test("RT RedHatSupport: Choosing the right data storage is complicated. Learn more about block storage, file storage, and... https://t.co/zJOVbQh5sa")

redhatsupport choosing right datum storage complicate learn block storage file storage {'POSITIVE': 0.0010454102884978056}


In [109]:
predict_test("@RedHat Red Hat = MAGA = A symbol of hate & racism. Bad branding. Unless your company is pro-Trump?")

maga symbol hate racism brand company trump {'POSITIVE': 0.12466123700141907}
