In [162]:
import numpy as np 
import pandas as pd 
import os

from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer, TfidfTransformer
from sklearn.naive_bayes import MultinomialNB
from sklearn.pipeline import make_pipeline, Pipeline
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import precision_recall_fscore_support
from sklearn.metrics import accuracy_score
from sklearn.metrics import confusion_matrix
from sklearn.model_selection import train_test_split
from sklearn.svm import SVC

import nltk
nltk.download('stopwords')
nltk.download('punkt') # for nltk's sentence tokenization within word_tokenize
nltk.download('wordnet') # for nltk's sentence tokenization within word_tokenize


[nltk_data] Error loading stopwords: <urlopen error [Errno -3]
[nltk_data]     Temporary failure in name resolution>
[nltk_data] Error loading punkt: <urlopen error [Errno -3] Temporary
[nltk_data]     failure in name resolution>
[nltk_data] Error loading wordnet: <urlopen error [Errno -3] Temporary
[nltk_data]     failure in name resolution>


False

In [163]:
custom_stop_words = set(nltk.corpus.stopwords.words('english'))

In [164]:
path = os.path.join('data-payames', 'data-payames.csv')
df = pd.read_csv(path)
df.head()

Unnamed: 0.1,Unnamed: 0,text,title,label
0,0,U.S. Secretary of State John F. Kerry said Mon...,Kerry to go to Paris in gesture of sympathy,1
1,1,It's primary day in New York and front-runners...,The Battle of New York: Why This Primary Matters,1
2,2,A Czech stockbroker who saved more than 650 Je...,‘Britain’s Schindler’ Dies at 106,1
3,3,Hillary Clinton and Donald Trump made some ina...,Fact check: Trump and Clinton at the 'commande...,1
4,4,Iranian negotiators reportedly have made a las...,Iran reportedly makes new push for uranium con...,1


In [165]:
# drop rows with nan
# but this retains the original df index for rows
# so reset that index (axis is 0)
df.dropna(inplace=True)
df.reset_index(0, drop=True, inplace=True) # there are 8 rows with only title and label

In [166]:
''.join(sorted(list(set(''.join([i for i in df.text]))))) # the set of all characters that appear over all texts

'\t\n !"#$%&\'()*+,-./0123456789:;<=>?@ABCDEFGHIJKLMNOPQRSTUVWXYZ[\\]^_`abcdefghijklmnopqrstuvwxyz{|}~\x7f\x80\x81\x82\x83\x84\x85\x86\x8a\x8b\x8c\x8f\x91\x92\x93\x94\x95\x96\x97\x98\x99\x9a\x9b\x9c\x9d\x9f\xa0¡¢£¥§¨©ª«\xad®¯°±²´¶·¸¹º»¼½¾¿ÀÁÂÄÅÇÈÉÍÎÐÑÓÖ×ØÚÜßàáâãäåæçèéêëìíîïðñòóôõö÷øùúûüāĂăčďđğīİŃńōœŚşŠšŤťŸŹș˚˜̄ΝάίαβεηικλμνορυГЭабвгийклнорстуяאבגדהוזחטיךכלםמןנסעףפץצקרשתءأإابةتجحدذرضعفقلمنهوىيงยรẓị\u2002\u2008\u2009\u200a\u200b\u200c\u200e‑–—―‘’“”†‡•…\u202f′″‹›⁄€₹™⅔→⇩►●☕☢☮♥♦✓✟❖❦\uf022\uf0b7\uf0dd\uf50d\uf64c\uf6a8️\ufeff￼🇸🇺🌍🎃💚😀😂😉😍😭😳🙂🙄🧀'

In [167]:
def remove_junk_chars(text):
    return ''.join(ch for ch in text if ch in 'abcdefghijklmnopqrstuvwxyz''ABCDEFGHIJKLMNOPQRSTUVWXYZ''0123456789'' .,')

In [168]:
custom_stop_words = custom_stop_words.union(set(remove_junk_chars(text) for text in custom_stop_words))

In [169]:
# def clean_text(text_ser: pd.Series):
#     new_text_list = []
#     for text in text_ser:
#         text = remove_junk_chars(text)
#         new_text_list.append(text)
#     return pd.Series(new_text_list)


In [170]:
# df.title = clean_text(df.title)
# df.text = clean_text(df.text)

In [171]:
traindf, testdf = train_test_split(df)
len(df), len(traindf), len(testdf)

(19909, 14931, 4978)

In [172]:
ytrain = traindf.label
ytest = testdf.label

In [173]:
def train_and_score(model, xtrain, ytrain, xtest, ytest):
    model.fit(xtrain, ytrain)
    # train_score = model.score(xtrain, ytrain)
    test_score = model.score(xtest, ytest)
    print('For', model)
    # print('Train score: ', train_score)    
    print('Test score: ', test_score)
    return test_score

def train_and_score_title_and_text(model, xtrain, ytrain, xtest, ytest): # doesn't work right now
    model.fit(xtrain.text, ytrain)
    # train_score = model.score(xtrain, ytrain)
    test_score = model.score(xtest, ytest)
    print('For', model)
    # print('Train score: ', train_score)    
    print('Test score: ', test_score)
    return test_score

In [174]:
class TokenizerAndStemmer(object):
    stem = nltk.PorterStemmer().stem
    def __call__(self, text):
        return (self.stem(remove_junk_chars(itoken)) for itoken in nltk.word_tokenize(text)) # returning a generator is more efficient

class TokenizerAndLancasterStemmer(object):
    stem = nltk.LancasterStemmer().stem
    def __call__(self, text):
        return (self.stem(remove_junk_chars(itoken)) for itoken in nltk.word_tokenize(text))

class TokenizerAndLemmatizer(object):
    lemma = nltk.WordNetLemmatizer().lemmatize
    def __call__(self, text):
        return (self.lemma(remove_junk_chars(itoken)) for itoken in nltk.word_tokenize(text))
        

In [175]:
vectorizations = {
    'bow':          CountVectorizer,
    'tfidf':        TfidfVectorizer,
    'bigram':       lambda *args, **kwargs: CountVectorizer(ngram_range=(2, 2), *args, **kwargs),
}
normalizations = {
    'stem':         TokenizerAndStemmer(),
    'stemmer':      TokenizerAndStemmer(),
    'porter':       TokenizerAndStemmer(),
    'lanc':         TokenizerAndLancasterStemmer(),
    'lem':          TokenizerAndLemmatizer(),
    'lemmer':       TokenizerAndLemmatizer(),
    'lemmatizer':   TokenizerAndLemmatizer(),
    'default':      None,
    'def':          None,
}
algorithms = {
    'mnb':          MultinomialNB,
    'svm':          SVC,
}
results = {}

In [176]:
def decode(specification):
    str_vectorization, str_max_features, str_normalization, str_algorithm = specification.split()
    max_features = int(str_max_features)
    vectorizer = vectorizations[str_vectorization](
        stop_words=custom_stop_words,
        tokenizer=normalizations[str_normalization],
        max_features=max_features,
    )
    algomodel = algorithms[str_algorithm]()
    return vectorizer, algomodel

In [177]:
%%time
name = 'bow 1000 default mnb'
my_vectorizer, my_algomodel = decode(name)
my_pipeline = make_pipeline(my_vectorizer, my_algomodel)
test_score = train_and_score(my_pipeline, traindf.text, ytrain, testdf.text, ytest) # .86
results[name] = (my_vectorizer, my_algomodel, test_score)


For Pipeline(steps=[('countvectorizer',
                 CountVectorizer(max_features=1000,
                                 stop_words={'a', 'about', 'above', 'after',
                                             'again', 'against', 'ain', 'all',
                                             'am', 'an', 'and', 'any', 'are',
                                             'aren', "aren't", 'arent', 'as',
                                             'at', 'be', 'because', 'been',
                                             'before', 'being', 'below',
                                             'between', 'both', 'but', 'by',
                                             'can', 'couldn', ...})),
                ('multinomialnb', MultinomialNB())])
Test score:  0.8589795098433106
CPU times: user 9.52 s, sys: 51.8 ms, total: 9.57 s
Wall time: 9.6 s


In [178]:
%%time
name = 'bow 1000 porter mnb'
my_vectorizer, my_algomodel = decode(name)
my_pipeline = make_pipeline(my_vectorizer, my_algomodel)
test_score = train_and_score(my_pipeline, traindf.text, ytrain, testdf.text, ytest) # .85
results[name] = (my_vectorizer, my_algomodel, test_score)


For Pipeline(steps=[('countvectorizer',
                 CountVectorizer(max_features=1000,
                                 stop_words={'a', 'about', 'above', 'after',
                                             'again', 'against', 'ain', 'all',
                                             'am', 'an', 'and', 'any', 'are',
                                             'aren', "aren't", 'arent', 'as',
                                             'at', 'be', 'because', 'been',
                                             'before', 'being', 'below',
                                             'between', 'both', 'but', 'by',
                                             'can', 'couldn', ...},
                                 tokenizer=<__main__.TokenizerAndStemmer object at 0x7fdc24b85fd0>)),
                ('multinomialnb', MultinomialNB())])
Test score:  0.8479308959421454
CPU times: user 4min 39s, sys: 239 ms, total: 4min 39s
Wall time: 4min 39s


In [194]:
%%time
name = 'bow 1000 lanc mnb'
my_vectorizer, my_algomodel = decode(name)
my_pipeline = make_pipeline(my_vectorizer, my_algomodel)
test_score = train_and_score(my_pipeline, traindf.text, ytrain, testdf.text, ytest) # .84
results[name] = (my_vectorizer, my_algomodel, test_score)


For Pipeline(steps=[('countvectorizer',
                 CountVectorizer(max_features=1000,
                                 stop_words={'a', 'about', 'above', 'after',
                                             'again', 'against', 'ain', 'all',
                                             'am', 'an', 'and', 'any', 'are',
                                             'aren', "aren't", 'arent', 'as',
                                             'at', 'be', 'because', 'been',
                                             'before', 'being', 'below',
                                             'between', 'both', 'but', 'by',
                                             'can', 'couldn', ...},
                                 tokenizer=<__main__.TokenizerAndLancasterStemmer object at 0x7fdc24b85820>)),
                ('multinomialnb', MultinomialNB())])
Test score:  0.836078746484532
CPU times: user 4min 3s, sys: 247 ms, total: 4min 3s
Wall time: 4min 3s


In [195]:
%%time
name = 'bow 1000 lemmatizer mnb'
my_vectorizer, my_algomodel = decode(name)
my_pipeline = make_pipeline(my_vectorizer, my_algomodel)
test_score = train_and_score(my_pipeline, traindf.text, ytrain, testdf.text, ytest) # .87; finally an improvement!
results[name] = (my_vectorizer, my_algomodel, test_score)


For Pipeline(steps=[('countvectorizer',
                 CountVectorizer(max_features=1000,
                                 stop_words={'a', 'about', 'above', 'after',
                                             'again', 'against', 'ain', 'all',
                                             'am', 'an', 'and', 'any', 'are',
                                             'aren', "aren't", 'arent', 'as',
                                             'at', 'be', 'because', 'been',
                                             'before', 'being', 'below',
                                             'between', 'both', 'but', 'by',
                                             'can', 'couldn', ...},
                                 tokenizer=<__main__.TokenizerAndLemmatizer object at 0x7fdc24b85130>)),
                ('multinomialnb', MultinomialNB())])
Test score:  0.8686219365206911
CPU times: user 2min 31s, sys: 287 ms, total: 2min 31s
Wall time: 2min 32s


In [184]:
%%time
name = 'bigram 1000 default mnb'
my_vectorizer, my_algomodel = decode(name)
my_pipeline = make_pipeline(my_vectorizer, my_algomodel)
test_score = train_and_score(my_pipeline, traindf.text, ytrain, testdf.text, ytest) # .89
results[name] = (my_vectorizer, my_algomodel, test_score)


For Pipeline(steps=[('countvectorizer',
                 CountVectorizer(max_features=1000, ngram_range=(2, 2),
                                 stop_words={'a', 'about', 'above', 'after',
                                             'again', 'against', 'ain', 'all',
                                             'am', 'an', 'and', 'any', 'are',
                                             'aren', "aren't", 'arent', 'as',
                                             'at', 'be', 'because', 'been',
                                             'before', 'being', 'below',
                                             'between', 'both', 'but', 'by',
                                             'can', 'couldn', ...})),
                ('multinomialnb', MultinomialNB())])
Test score:  0.8897147448774608
CPU times: user 32.9 s, sys: 921 ms, total: 33.8 s
Wall time: 35.4 s


In [185]:
%%time
name = 'tfidf 1000 default mnb'
my_vectorizer, my_algomodel = decode(name)
my_pipeline = make_pipeline(my_vectorizer, my_algomodel)
test_score = train_and_score(my_pipeline, traindf.text, ytrain, testdf.text, ytest) # .86
results[name] = (my_vectorizer, my_algomodel, test_score)


For Pipeline(steps=[('tfidfvectorizer',
                 TfidfVectorizer(max_features=1000,
                                 stop_words={'a', 'about', 'above', 'after',
                                             'again', 'against', 'ain', 'all',
                                             'am', 'an', 'and', 'any', 'are',
                                             'aren', "aren't", 'arent', 'as',
                                             'at', 'be', 'because', 'been',
                                             'before', 'being', 'below',
                                             'between', 'both', 'but', 'by',
                                             'can', 'couldn', ...})),
                ('multinomialnb', MultinomialNB())])
Test score:  0.8648051426275613
CPU times: user 9.7 s, sys: 36 ms, total: 9.74 s
Wall time: 10.9 s


In [186]:
%%time
name = 'bow 1000 default svm'
my_vectorizer, my_algomodel = decode(name)
my_pipeline = make_pipeline(my_vectorizer, my_algomodel)
test_score = train_and_score(my_pipeline, traindf.text, ytrain, testdf.text, ytest) # .91
results[name] = (my_vectorizer, my_algomodel, test_score)


For Pipeline(steps=[('countvectorizer',
                 CountVectorizer(max_features=1000,
                                 stop_words={'a', 'about', 'above', 'after',
                                             'again', 'against', 'ain', 'all',
                                             'am', 'an', 'and', 'any', 'are',
                                             'aren', "aren't", 'arent', 'as',
                                             'at', 'be', 'because', 'been',
                                             'before', 'being', 'below',
                                             'between', 'both', 'but', 'by',
                                             'can', 'couldn', ...})),
                ('svc', SVC())])
Test score:  0.9152269987946967
CPU times: user 2min 7s, sys: 396 ms, total: 2min 7s
Wall time: 2min 9s


In [204]:
%%time
name = 'bigram 1000 lemmatizer mnb'
my_vectorizer, my_algomodel = decode(name)
my_pipeline = make_pipeline(my_vectorizer, my_algomodel)
test_score = train_and_score(my_pipeline, traindf.text, ytrain, testdf.text, ytest) # .87
results[name] = (my_vectorizer, my_algomodel, test_score)


For Pipeline(steps=[('countvectorizer',
                 CountVectorizer(max_features=1000, ngram_range=(2, 2),
                                 stop_words={'a', 'about', 'above', 'after',
                                             'again', 'against', 'ain', 'all',
                                             'am', 'an', 'and', 'any', 'are',
                                             'aren', "aren't", 'arent', 'as',
                                             'at', 'be', 'because', 'been',
                                             'before', 'being', 'below',
                                             'between', 'both', 'but', 'by',
                                             'can', 'couldn', ...},
                                 tokenizer=<__main__.TokenizerAndLemmatizer object at 0x7fdc24b85130>)),
                ('multinomialnb', MultinomialNB())])
Test score:  0.8748493370831659
CPU times: user 2min 33s, sys: 420 ms, total: 2min 34s
Wall time: 2min 34s


In [207]:
%%time
name = 'tfidf 1000 lemmatizer mnb'
my_vectorizer, my_algomodel = decode(name)
my_pipeline = make_pipeline(my_vectorizer, my_algomodel)
test_score = train_and_score(my_pipeline, traindf.text, ytrain, testdf.text, ytest) # .86
results[name] = (my_vectorizer, my_algomodel, test_score)


For Pipeline(steps=[('tfidfvectorizer',
                 TfidfVectorizer(max_features=1000,
                                 stop_words={'a', 'about', 'above', 'after',
                                             'again', 'against', 'ain', 'all',
                                             'am', 'an', 'and', 'any', 'are',
                                             'aren', "aren't", 'arent', 'as',
                                             'at', 'be', 'because', 'been',
                                             'before', 'being', 'below',
                                             'between', 'both', 'but', 'by',
                                             'can', 'couldn', ...},
                                 tokenizer=<__main__.TokenizerAndLemmatizer object at 0x7fdc24b85130>)),
                ('multinomialnb', MultinomialNB())])
Test score:  0.8623945359582161
CPU times: user 2min 41s, sys: 31.7 ms, total: 2min 41s
Wall time: 2min 41s


In [205]:
%%time
name = 'bow 1000 lemmatizer svm'
my_vectorizer, my_algomodel = decode(name)
my_pipeline = make_pipeline(my_vectorizer, my_algomodel)
test_score = train_and_score(my_pipeline, traindf.text, ytrain, testdf.text, ytest) # .80
results[name] = (my_vectorizer, my_algomodel, test_score)


For Pipeline(steps=[('countvectorizer',
                 CountVectorizer(max_features=1000,
                                 stop_words={'a', 'about', 'above', 'after',
                                             'again', 'against', 'ain', 'all',
                                             'am', 'an', 'and', 'any', 'are',
                                             'aren', "aren't", 'arent', 'as',
                                             'at', 'be', 'because', 'been',
                                             'before', 'being', 'below',
                                             'between', 'both', 'but', 'by',
                                             'can', 'couldn', ...},
                                 tokenizer=<__main__.TokenizerAndLemmatizer object at 0x7fdc24b85130>)),
                ('svc', SVC())])
Test score:  0.8029329047810365
CPU times: user 4min 6s, sys: 71.5 ms, total: 4min 6s
Wall time: 4min 6s


In [208]:
%%time
name = 'bigram 1000 lemmatizer svm'
my_vectorizer, my_algomodel = decode(name)
my_pipeline = make_pipeline(my_vectorizer, my_algomodel)
test_score = train_and_score(my_pipeline, traindf.text, ytrain, testdf.text, ytest) # .79
results[name] = (my_vectorizer, my_algomodel, test_score)


For Pipeline(steps=[('countvectorizer',
                 CountVectorizer(max_features=1000, ngram_range=(2, 2),
                                 stop_words={'a', 'about', 'above', 'after',
                                             'again', 'against', 'ain', 'all',
                                             'am', 'an', 'and', 'any', 'are',
                                             'aren', "aren't", 'arent', 'as',
                                             'at', 'be', 'because', 'been',
                                             'before', 'being', 'below',
                                             'between', 'both', 'but', 'by',
                                             'can', 'couldn', ...},
                                 tokenizer=<__main__.TokenizerAndLemmatizer object at 0x7fdc24b85130>)),
                ('svc', SVC())])
Test score:  0.7870630775411812
CPU times: user 3min 54s, sys: 752 ms, total: 3min 55s
Wall time: 3min 55s


In [217]:
%%time
name = 'bigram 1000 default svm'
my_vectorizer, my_algomodel = decode(name)
my_pipeline = make_pipeline(my_vectorizer, my_algomodel)
test_score = train_and_score(my_pipeline, traindf.text, ytrain, testdf.text, ytest) # .89
results[name] = (my_vectorizer, my_algomodel, test_score)


For Pipeline(steps=[('countvectorizer',
                 CountVectorizer(max_features=1000, ngram_range=(2, 2),
                                 stop_words={'a', 'about', 'above', 'after',
                                             'again', 'against', 'ain', 'all',
                                             'am', 'an', 'and', 'any', 'are',
                                             'aren', "aren't", 'arent', 'as',
                                             'at', 'be', 'because', 'been',
                                             'before', 'being', 'below',
                                             'between', 'both', 'but', 'by',
                                             'can', 'couldn', ...})),
                ('svc', SVC())])
Test score:  0.8869023704298915
CPU times: user 53.3 s, sys: 1 s, total: 54.3 s
Wall time: 55.5 s


In [209]:
%%time
name = 'tfidf 1000 lemmatizer svm'
my_vectorizer, my_algomodel = decode(name)
my_pipeline = make_pipeline(my_vectorizer, my_algomodel)
test_score = train_and_score(my_pipeline, traindf.text, ytrain, testdf.text, ytest) # .95
results[name] = (my_vectorizer, my_algomodel, test_score)


For Pipeline(steps=[('tfidfvectorizer',
                 TfidfVectorizer(max_features=1000,
                                 stop_words={'a', 'about', 'above', 'after',
                                             'again', 'against', 'ain', 'all',
                                             'am', 'an', 'and', 'any', 'are',
                                             'aren', "aren't", 'arent', 'as',
                                             'at', 'be', 'because', 'been',
                                             'before', 'being', 'below',
                                             'between', 'both', 'but', 'by',
                                             'can', 'couldn', ...},
                                 tokenizer=<__main__.TokenizerAndLemmatizer object at 0x7fdc24b85130>)),
                ('svc', SVC())])
Test score:  0.9463640016070711
CPU times: user 4min 2s, sys: 132 ms, total: 4min 2s
Wall time: 4min 2s


In [218]:
%%time
name = 'tfidf 1000 default svm'
my_vectorizer, my_algomodel = decode(name)
my_pipeline = make_pipeline(my_vectorizer, my_algomodel)
test_score = train_and_score(my_pipeline, traindf.text, ytrain, testdf.text, ytest) # .93
results[name] = (my_vectorizer, my_algomodel, test_score)


For Pipeline(steps=[('tfidfvectorizer',
                 TfidfVectorizer(max_features=1000,
                                 stop_words={'a', 'about', 'above', 'after',
                                             'again', 'against', 'ain', 'all',
                                             'am', 'an', 'and', 'any', 'are',
                                             'aren', "aren't", 'arent', 'as',
                                             'at', 'be', 'because', 'been',
                                             'before', 'being', 'below',
                                             'between', 'both', 'but', 'by',
                                             'can', 'couldn', ...})),
                ('svc', SVC())])
Test score:  0.9357171554841301
CPU times: user 2min 16s, sys: 168 ms, total: 2min 16s
Wall time: 2min 16s


In [216]:
from datetime import datetime
import joblib
fout = open('Results.txt', 'a')
print(file=fout)
for name, rest in results.items():
    my_vectorizer, my_algomodel, test_score = rest
    timenow = str(datetime.now())
    print(name, timenow, test_score, sep='\t', file=fout)
    joblib.dump(make_pipeline(my_vectorizer, my_algomodel), f'{timenow} {name}.joblib')
fout.close()

In [212]:
results

{'bow 1000 default mnb': (CountVectorizer(max_features=1000,
                  stop_words={'a', 'about', 'above', 'after', 'again', 'against',
                              'ain', 'all', 'am', 'an', 'and', 'any', 'are',
                              'aren', "aren't", 'arent', 'as', 'at', 'be',
                              'because', 'been', 'before', 'being', 'below',
                              'between', 'both', 'but', 'by', 'can', 'couldn', ...}),
  MultinomialNB(),
  0.8589795098433106),
 'bow 1000 porter mnb': (CountVectorizer(max_features=1000,
                  stop_words={'a', 'about', 'above', 'after', 'again', 'against',
                              'ain', 'all', 'am', 'an', 'and', 'any', 'are',
                              'aren', "aren't", 'arent', 'as', 'at', 'be',
                              'because', 'been', 'before', 'being', 'below',
                              'between', 'both', 'but', 'by', 'can', 'couldn', ...},
                  tokenizer=<__main__.Tokeni

In [219]:
from sklearn.metrics import roc_curve, roc_auc_score


In [228]:
# for name, rest in results:
rest = results['tfidf 1000 lemmatizer svm']
model = make_pipeline(rest[0], SVC(probability=True))
yproba = model.predict_proba(testdf.text)[::,1]
fpr, tpr, _ = roc_curve(y_test,  yproba)
auc = roc_auc_score(y_test, yproba)

NotFittedError: This SVC instance is not fitted yet. Call 'fit' with appropriate arguments before using this estimator.

In [222]:
model

TfidfVectorizer(max_features=1000,
                stop_words={'a', 'about', 'above', 'after', 'again', 'against',
                            'ain', 'all', 'am', 'an', 'and', 'any', 'are',
                            'aren', "aren't", 'arent', 'as', 'at', 'be',
                            'because', 'been', 'before', 'being', 'below',
                            'between', 'both', 'but', 'by', 'can', 'couldn', ...},
                tokenizer=<__main__.TokenizerAndLemmatizer object at 0x7fdc24b85130>)