In [90]:
import spacy
import numpy as np
import pandas as pd
import os
import random

path = os.path.join(os.getcwd(), "/Users/Ben/Desktop/Vital Strategies/Datasets/News_Category_Dataset_v2.json")

nlp = spacy.load("en_core_web_lg")

data = pd.read_json(path, lines=True)

SPLIT = int(.8*len(data))
train = data[:SPLIT]
test = data[SPLIT:]

categories = train.category.unique()
print(categories)
train.head(10)

['CRIME' 'ENTERTAINMENT' 'WORLD NEWS' 'IMPACT' 'POLITICS' 'WEIRD NEWS'
 'BLACK VOICES' 'WOMEN' 'COMEDY' 'QUEER VOICES' 'SPORTS' 'BUSINESS'
 'TRAVEL' 'MEDIA' 'TECH' 'RELIGION' 'SCIENCE' 'LATINO VOICES' 'EDUCATION'
 'COLLEGE' 'PARENTS' 'ARTS & CULTURE' 'STYLE' 'GREEN' 'TASTE'
 'HEALTHY LIVING' 'THE WORLDPOST' 'GOOD NEWS' 'WORLDPOST' 'FIFTY' 'ARTS'
 'WELLNESS' 'PARENTING' 'HOME & LIVING' 'STYLE & BEAUTY' 'DIVORCE'
 'WEDDINGS' 'FOOD & DRINK' 'MONEY' 'ENVIRONMENT' 'CULTURE & ARTS']


Unnamed: 0,authors,category,date,headline,link,short_description
0,Melissa Jeltsen,CRIME,2018-05-26,There Were 2 Mass Shootings In Texas Last Week...,https://www.huffingtonpost.com/entry/texas-ama...,She left her husband. He killed their children...
1,Andy McDonald,ENTERTAINMENT,2018-05-26,Will Smith Joins Diplo And Nicky Jam For The 2...,https://www.huffingtonpost.com/entry/will-smit...,Of course it has a song.
2,Ron Dicker,ENTERTAINMENT,2018-05-26,Hugh Grant Marries For The First Time At Age 57,https://www.huffingtonpost.com/entry/hugh-gran...,The actor and his longtime girlfriend Anna Ebe...
3,Ron Dicker,ENTERTAINMENT,2018-05-26,Jim Carrey Blasts 'Castrato' Adam Schiff And D...,https://www.huffingtonpost.com/entry/jim-carre...,The actor gives Dems an ass-kicking for not fi...
4,Ron Dicker,ENTERTAINMENT,2018-05-26,Julianna Margulies Uses Donald Trump Poop Bags...,https://www.huffingtonpost.com/entry/julianna-...,"The ""Dietland"" actress said using the bags is ..."
5,Ron Dicker,ENTERTAINMENT,2018-05-26,Morgan Freeman 'Devastated' That Sexual Harass...,https://www.huffingtonpost.com/entry/morgan-fr...,"""It is not right to equate horrific incidents ..."
6,Ron Dicker,ENTERTAINMENT,2018-05-26,Donald Trump Is Lovin' New McDonald's Jingle I...,https://www.huffingtonpost.com/entry/donald-tr...,"It's catchy, all right."
7,Todd Van Luling,ENTERTAINMENT,2018-05-26,What To Watch On Amazon Prime That’s New This ...,https://www.huffingtonpost.com/entry/amazon-pr...,There's a great mini-series joining this week.
8,Andy McDonald,ENTERTAINMENT,2018-05-26,Mike Myers Reveals He'd 'Like To' Do A Fourth ...,https://www.huffingtonpost.com/entry/mike-myer...,"Myer's kids may be pushing for a new ""Powers"" ..."
9,Todd Van Luling,ENTERTAINMENT,2018-05-26,What To Watch On Hulu That’s New This Week,https://www.huffingtonpost.com/entry/hulu-what...,You're getting a recent Academy Award-winning ...


In [70]:
textcat = nlp.create_pipe(
            "textcat",
            config={
                "exclusive_classes": True,
                "architecture": "simple_cnn",
            }
        )
nlp.add_pipe(textcat, last=True)

for label in train.category.unique():
    textcat.add_label(label)

In [116]:
#FORMAT: train_data = [(u"Uber blew through $1 million", {"entities": [(0, 4, "ORG")]})]

other_pipes = [pipe for pipe in nlp.pipe_names if pipe != "textcat"]
with nlp.disable_pipes(*other_pipes):
    optimizer = nlp.begin_training()
    
    len_of_data = len(train)
    
    idxs = list(range(len_of_data))
    
    ITERATIONS = 2
    SIZE = 100 #BATCH SIZE
    
    print("ESTIMATED TRAINING TIME:",(.002322*len_of_data*ITERATIONS/60),"mins\n")
    
    for i in range(ITERATIONS):
        print("epoch: ", i, "/", ITERATIONS)
        random.shuffle(idxs)
        
        #Batches:
        for batch in range(int(len_of_data/SIZE)):
            print("  batch: ",batch , "/", int(len_of_data/SIZE))
            annotations = []
            texts = []
            for idx in idxs[batch*SIZE:SIZE*(batch+1)]:
                
                temp = {category: category == train.category[idx] for category in categories}
                
                texts.append(train.headline[idx])
                annotations.append({"cats":temp})
                
            nlp.update(texts, annotations, sgd=optimizer)
    
print("Training Complete! Saving model...")
nlp.to_disk(os.path.join(os.getcwd(),"/Users/Ben/Desktop/Vital Strategies/trained models/newsClassifier"))

ESTIMATED TRAINING TIME: 12.436786799999998 mins

epoch:  0 / 2
  batch:  0 / 1606
  batch:  1 / 1606
  batch:  2 / 1606
  batch:  3 / 1606
  batch:  4 / 1606
  batch:  5 / 1606
  batch:  6 / 1606
  batch:  7 / 1606
  batch:  8 / 1606
  batch:  9 / 1606
  batch:  10 / 1606
  batch:  11 / 1606
  batch:  12 / 1606
  batch:  13 / 1606
  batch:  14 / 1606
  batch:  15 / 1606
  batch:  16 / 1606
  batch:  17 / 1606
  batch:  18 / 1606
  batch:  19 / 1606
  batch:  20 / 1606
  batch:  21 / 1606
  batch:  22 / 1606
  batch:  23 / 1606
  batch:  24 / 1606
  batch:  25 / 1606
  batch:  26 / 1606
  batch:  27 / 1606
  batch:  28 / 1606
  batch:  29 / 1606
  batch:  30 / 1606
  batch:  31 / 1606
  batch:  32 / 1606
  batch:  33 / 1606
  batch:  34 / 1606
  batch:  35 / 1606
  batch:  36 / 1606
  batch:  37 / 1606
  batch:  38 / 1606
  batch:  39 / 1606
  batch:  40 / 1606
  batch:  41 / 1606
  batch:  42 / 1606
  batch:  43 / 1606
  batch:  44 / 1606
  batch:  45 / 1606
  batch:  46 / 1606
  batc

  batch:  394 / 1606
  batch:  395 / 1606
  batch:  396 / 1606
  batch:  397 / 1606
  batch:  398 / 1606
  batch:  399 / 1606
  batch:  400 / 1606
  batch:  401 / 1606
  batch:  402 / 1606
  batch:  403 / 1606
  batch:  404 / 1606
  batch:  405 / 1606
  batch:  406 / 1606
  batch:  407 / 1606
  batch:  408 / 1606
  batch:  409 / 1606
  batch:  410 / 1606
  batch:  411 / 1606
  batch:  412 / 1606
  batch:  413 / 1606
  batch:  414 / 1606
  batch:  415 / 1606
  batch:  416 / 1606
  batch:  417 / 1606
  batch:  418 / 1606
  batch:  419 / 1606
  batch:  420 / 1606
  batch:  421 / 1606
  batch:  422 / 1606
  batch:  423 / 1606
  batch:  424 / 1606
  batch:  425 / 1606
  batch:  426 / 1606
  batch:  427 / 1606
  batch:  428 / 1606
  batch:  429 / 1606
  batch:  430 / 1606
  batch:  431 / 1606
  batch:  432 / 1606
  batch:  433 / 1606
  batch:  434 / 1606
  batch:  435 / 1606
  batch:  436 / 1606
  batch:  437 / 1606
  batch:  438 / 1606
  batch:  439 / 1606
  batch:  440 / 1606
  batch:  441

  batch:  785 / 1606
  batch:  786 / 1606
  batch:  787 / 1606
  batch:  788 / 1606
  batch:  789 / 1606
  batch:  790 / 1606
  batch:  791 / 1606
  batch:  792 / 1606
  batch:  793 / 1606
  batch:  794 / 1606
  batch:  795 / 1606
  batch:  796 / 1606
  batch:  797 / 1606
  batch:  798 / 1606
  batch:  799 / 1606
  batch:  800 / 1606
  batch:  801 / 1606
  batch:  802 / 1606
  batch:  803 / 1606
  batch:  804 / 1606
  batch:  805 / 1606
  batch:  806 / 1606
  batch:  807 / 1606
  batch:  808 / 1606
  batch:  809 / 1606
  batch:  810 / 1606
  batch:  811 / 1606
  batch:  812 / 1606
  batch:  813 / 1606
  batch:  814 / 1606
  batch:  815 / 1606
  batch:  816 / 1606
  batch:  817 / 1606
  batch:  818 / 1606
  batch:  819 / 1606
  batch:  820 / 1606
  batch:  821 / 1606
  batch:  822 / 1606
  batch:  823 / 1606
  batch:  824 / 1606
  batch:  825 / 1606
  batch:  826 / 1606
  batch:  827 / 1606
  batch:  828 / 1606
  batch:  829 / 1606
  batch:  830 / 1606
  batch:  831 / 1606
  batch:  832

  batch:  1169 / 1606
  batch:  1170 / 1606
  batch:  1171 / 1606
  batch:  1172 / 1606
  batch:  1173 / 1606
  batch:  1174 / 1606
  batch:  1175 / 1606
  batch:  1176 / 1606
  batch:  1177 / 1606
  batch:  1178 / 1606
  batch:  1179 / 1606
  batch:  1180 / 1606
  batch:  1181 / 1606
  batch:  1182 / 1606
  batch:  1183 / 1606
  batch:  1184 / 1606
  batch:  1185 / 1606
  batch:  1186 / 1606
  batch:  1187 / 1606
  batch:  1188 / 1606
  batch:  1189 / 1606
  batch:  1190 / 1606
  batch:  1191 / 1606
  batch:  1192 / 1606
  batch:  1193 / 1606
  batch:  1194 / 1606
  batch:  1195 / 1606
  batch:  1196 / 1606
  batch:  1197 / 1606
  batch:  1198 / 1606
  batch:  1199 / 1606
  batch:  1200 / 1606
  batch:  1201 / 1606
  batch:  1202 / 1606
  batch:  1203 / 1606
  batch:  1204 / 1606
  batch:  1205 / 1606
  batch:  1206 / 1606
  batch:  1207 / 1606
  batch:  1208 / 1606
  batch:  1209 / 1606
  batch:  1210 / 1606
  batch:  1211 / 1606
  batch:  1212 / 1606
  batch:  1213 / 1606
  batch:  

  batch:  1542 / 1606
  batch:  1543 / 1606
  batch:  1544 / 1606
  batch:  1545 / 1606
  batch:  1546 / 1606
  batch:  1547 / 1606
  batch:  1548 / 1606
  batch:  1549 / 1606
  batch:  1550 / 1606
  batch:  1551 / 1606
  batch:  1552 / 1606
  batch:  1553 / 1606
  batch:  1554 / 1606
  batch:  1555 / 1606
  batch:  1556 / 1606
  batch:  1557 / 1606
  batch:  1558 / 1606
  batch:  1559 / 1606
  batch:  1560 / 1606
  batch:  1561 / 1606
  batch:  1562 / 1606
  batch:  1563 / 1606
  batch:  1564 / 1606
  batch:  1565 / 1606
  batch:  1566 / 1606
  batch:  1567 / 1606
  batch:  1568 / 1606
  batch:  1569 / 1606
  batch:  1570 / 1606
  batch:  1571 / 1606
  batch:  1572 / 1606
  batch:  1573 / 1606
  batch:  1574 / 1606
  batch:  1575 / 1606
  batch:  1576 / 1606
  batch:  1577 / 1606
  batch:  1578 / 1606
  batch:  1579 / 1606
  batch:  1580 / 1606
  batch:  1581 / 1606
  batch:  1582 / 1606
  batch:  1583 / 1606
  batch:  1584 / 1606
  batch:  1585 / 1606
  batch:  1586 / 1606
  batch:  

  batch:  329 / 1606
  batch:  330 / 1606
  batch:  331 / 1606
  batch:  332 / 1606
  batch:  333 / 1606
  batch:  334 / 1606
  batch:  335 / 1606
  batch:  336 / 1606
  batch:  337 / 1606
  batch:  338 / 1606
  batch:  339 / 1606
  batch:  340 / 1606
  batch:  341 / 1606
  batch:  342 / 1606
  batch:  343 / 1606
  batch:  344 / 1606
  batch:  345 / 1606
  batch:  346 / 1606
  batch:  347 / 1606
  batch:  348 / 1606
  batch:  349 / 1606
  batch:  350 / 1606
  batch:  351 / 1606
  batch:  352 / 1606
  batch:  353 / 1606
  batch:  354 / 1606
  batch:  355 / 1606
  batch:  356 / 1606
  batch:  357 / 1606
  batch:  358 / 1606
  batch:  359 / 1606
  batch:  360 / 1606
  batch:  361 / 1606
  batch:  362 / 1606
  batch:  363 / 1606
  batch:  364 / 1606
  batch:  365 / 1606
  batch:  366 / 1606
  batch:  367 / 1606
  batch:  368 / 1606
  batch:  369 / 1606
  batch:  370 / 1606
  batch:  371 / 1606
  batch:  372 / 1606
  batch:  373 / 1606
  batch:  374 / 1606
  batch:  375 / 1606
  batch:  376

  batch:  720 / 1606
  batch:  721 / 1606
  batch:  722 / 1606
  batch:  723 / 1606
  batch:  724 / 1606
  batch:  725 / 1606
  batch:  726 / 1606
  batch:  727 / 1606
  batch:  728 / 1606
  batch:  729 / 1606
  batch:  730 / 1606
  batch:  731 / 1606
  batch:  732 / 1606
  batch:  733 / 1606
  batch:  734 / 1606
  batch:  735 / 1606
  batch:  736 / 1606
  batch:  737 / 1606
  batch:  738 / 1606
  batch:  739 / 1606
  batch:  740 / 1606
  batch:  741 / 1606
  batch:  742 / 1606
  batch:  743 / 1606
  batch:  744 / 1606
  batch:  745 / 1606
  batch:  746 / 1606
  batch:  747 / 1606
  batch:  748 / 1606
  batch:  749 / 1606
  batch:  750 / 1606
  batch:  751 / 1606
  batch:  752 / 1606
  batch:  753 / 1606
  batch:  754 / 1606
  batch:  755 / 1606
  batch:  756 / 1606
  batch:  757 / 1606
  batch:  758 / 1606
  batch:  759 / 1606
  batch:  760 / 1606
  batch:  761 / 1606
  batch:  762 / 1606
  batch:  763 / 1606
  batch:  764 / 1606
  batch:  765 / 1606
  batch:  766 / 1606
  batch:  767

  batch:  1106 / 1606
  batch:  1107 / 1606
  batch:  1108 / 1606
  batch:  1109 / 1606
  batch:  1110 / 1606
  batch:  1111 / 1606
  batch:  1112 / 1606
  batch:  1113 / 1606
  batch:  1114 / 1606
  batch:  1115 / 1606
  batch:  1116 / 1606
  batch:  1117 / 1606
  batch:  1118 / 1606
  batch:  1119 / 1606
  batch:  1120 / 1606
  batch:  1121 / 1606
  batch:  1122 / 1606
  batch:  1123 / 1606
  batch:  1124 / 1606
  batch:  1125 / 1606
  batch:  1126 / 1606
  batch:  1127 / 1606
  batch:  1128 / 1606
  batch:  1129 / 1606
  batch:  1130 / 1606
  batch:  1131 / 1606
  batch:  1132 / 1606
  batch:  1133 / 1606
  batch:  1134 / 1606
  batch:  1135 / 1606
  batch:  1136 / 1606
  batch:  1137 / 1606
  batch:  1138 / 1606
  batch:  1139 / 1606
  batch:  1140 / 1606
  batch:  1141 / 1606
  batch:  1142 / 1606
  batch:  1143 / 1606
  batch:  1144 / 1606
  batch:  1145 / 1606
  batch:  1146 / 1606
  batch:  1147 / 1606
  batch:  1148 / 1606
  batch:  1149 / 1606
  batch:  1150 / 1606
  batch:  

  batch:  1480 / 1606
  batch:  1481 / 1606
  batch:  1482 / 1606
  batch:  1483 / 1606
  batch:  1484 / 1606
  batch:  1485 / 1606
  batch:  1486 / 1606
  batch:  1487 / 1606
  batch:  1488 / 1606
  batch:  1489 / 1606
  batch:  1490 / 1606
  batch:  1491 / 1606
  batch:  1492 / 1606
  batch:  1493 / 1606
  batch:  1494 / 1606
  batch:  1495 / 1606
  batch:  1496 / 1606
  batch:  1497 / 1606
  batch:  1498 / 1606
  batch:  1499 / 1606
  batch:  1500 / 1606
  batch:  1501 / 1606
  batch:  1502 / 1606
  batch:  1503 / 1606
  batch:  1504 / 1606
  batch:  1505 / 1606
  batch:  1506 / 1606
  batch:  1507 / 1606
  batch:  1508 / 1606
  batch:  1509 / 1606
  batch:  1510 / 1606
  batch:  1511 / 1606
  batch:  1512 / 1606
  batch:  1513 / 1606
  batch:  1514 / 1606
  batch:  1515 / 1606
  batch:  1516 / 1606
  batch:  1517 / 1606
  batch:  1518 / 1606
  batch:  1519 / 1606
  batch:  1520 / 1606
  batch:  1521 / 1606
  batch:  1522 / 1606
  batch:  1523 / 1606
  batch:  1524 / 1606
  batch:  

In [95]:
from spacy.lang.en import English
nlp = spacy.load((os.path.join(os.getcwd(),
                                       "/Users/Ben/Desktop/Vital Strategies/trained models/newsClassifier")))

In [117]:
import re

def save_vectors(vectors):
    with open(os.path.join(os.getcwd(), "/Users/Ben/Desktop/Vital Strategies/trained models/nlpVectors/Asgn1.5vectors.tsv"), "w") as f:
        for vector in vectors:
            for feature in vector:
                f.write(str(feature))
                f.write("\t")
            f.write("\n")
            
    f.close()
def save_metadata(titles,labels):
    with open(os.path.join(os.getcwd(), "/Users/Ben/Desktop/Vital Strategies/trained models/nlpVectors/Asgn1.5metadata.tsv"), "w") as f:
        f.write("title\tcategory\n")
        for i, title in enumerate(titles):
            f.write(str(title))
            f.write("\t")
            f.write(labels[i])
            f.write("\n")
    f.close()

In [118]:
NUM_EXAMPLES_TO_SAVE = 1000

idxs = list(range(len(test)))
random.shuffle(idxs)
idxs = idxs[:NUM_EXAMPLES_TO_SAVE]

vectors = [] #vectors in 1D - How POSTIVE it thinks a title is
titles = []
labels = []

for idx in idxs:
    idx += SPLIT
    temp = test.headline[idx]
    doc = nlp(temp)
    vectors.append([doc.cats[cat] for cat in doc.cats])
    titles.append(str(idx) +": " + temp)
    
    #print(doc.cats)
    
    guess = max(zip(doc.cats.values(), doc.cats.keys()))[1]
    label = "INCORRECT" if guess != test.category[idx] else guess
    labels.append(label)

save_vectors(vectors)
save_metadata(titles,labels)

In [121]:
test.category[175917], test.headline[175917]

('TRAVEL', 'A Mekong Meditation')