In [1]:
import pandas as p
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.model_selection import GridSearchCV
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import confusion_matrix
from sklearn.metrics import classification_report
from sklearn.pipeline import Pipeline

In [2]:
%store -r data
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1600000 entries, 0 to 1599999
Data columns (total 7 columns):
 #   Column      Non-Null Count    Dtype 
---  ------      --------------    ----- 
 0   target      1600000 non-null  int64 
 1   ids         1600000 non-null  int64 
 2   date        1600000 non-null  object
 3   flag        1600000 non-null  object
 4   user        1600000 non-null  object
 5   text        1600000 non-null  object
 6   clean_text  1600000 non-null  object
dtypes: int64(2), object(5)
memory usage: 85.4+ MB


In [3]:
X = data['clean_text'].values
y = data['target'].values
print(len(X))
print(len(y))

1600000
1600000


In [4]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.01, random_state=0)
print(X_train.shape, y_train.shape, X_test.shape, y_test.shape)

(1584000,) (1584000,) (16000,) (16000,)


In [5]:
def show_most_and_least_informative_features(vectorizer, clf, n=10):
    feature_names = vectorizer.get_feature_names()
    coefs_with_fns = sorted(zip(clf.coef_[0], feature_names))
    top = zip(coefs_with_fns[:n], coefs_with_fns[-n:])
    print("Top ", n, " most and least informative features")
    for (coef_1, fn_1), (coef_2, fn_2) in top:
        print("\t%.4f\t%-15s\t\t%.4f\t%-15s" % (coef_1, fn_1, coef_2, fn_2))

In [6]:
def model(X_train, y_train, X_test, y_test, vectorizer, clf):
    
    X_train_vec = vectorizer.fit_transform(X_train)
    print("X_train_vec shape - ", X_train_vec.shape, "\n")
    X_test_vec = vectorizer.transform(X_test)
    print("X_test_vec shape - ", X_test_vec.shape, "\n")
    
    y_pred = clf.fit(X_train_vec, y_train).predict(X_test_vec)
    show_most_and_least_informative_features(vectorizer, clf, n=20)
    cm = confusion_matrix(y_test, y_pred, labels=[0,4])
    print()
    print("Confusion matrix\n", cm, "\n")

    print("Classification report\n", classification_report(y_test, y_pred, target_names=['0','4'], digits=4))
    
    return y_pred, vectorizer, clf

In [7]:
# MNB unigram baseline model
y_pred_mbn_1, vectorizer_mnb_1, clf_mnb_1 = model(X_train, y_train, X_test, y_test, CountVectorizer(encoding='latin-1'), MultinomialNB())

X_train_vec shape -  (1584000, 243575) 

X_test_vec shape -  (16000, 243575) 

Top  20  most and least informative features
	-16.0232	aabot          		-5.2207	at             
	-16.0232	aabout         		-5.1276	be             
	-16.0232	aabt           		-4.9863	have           
	-16.0232	aacattyisamazing		-4.9837	just           
	-16.0232	aacchhoo       		-4.9760	good           
	-16.0232	aacck          		-4.9482	with           
	-16.0232	aacckk         		-4.9422	so             
	-16.0232	aach           		-4.8551	me             
	-16.0232	aachar         		-4.7055	that           
	-16.0232	aachee         		-4.6925	on             
	-16.0232	aachhoo        		-4.6137	of             
	-16.0232	aacount        		-4.5088	in             
	-16.0232	aacs           		-4.4348	is             
	-16.0232	aadaamm        		-4.3605	for            
	-16.0232	aadam          		-4.2897	my             
	-16.0232	aaden          		-4.1297	it             
	-16.0232	aadha          		-4.1171	and            
	-16.023

In [78]:
# MNB unigram tf tuning
pipeline = Pipeline([('tf', CountVectorizer(encoding='latin-1')),('nb', MultinomialNB())])
parameters = {
    'tf__max_df': (0.01, 0.1, 0.2, 1.0),
    'tf__min_df': (5, 10, 20, 1),
    'nb__alpha': (1e-2, 1e-3, 1.0)
}

grid_search_tune = GridSearchCV(pipeline, parameters, cv=3, n_jobs=-1, verbose=3)
grid_search_tune.fit(X_train, y_train)

print("Best parameters set:")
print(grid_search_tune.best_estimator_.steps)

Fitting 3 folds for each of 48 candidates, totalling 144 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 12 concurrent workers.
[Parallel(n_jobs=-1)]: Done   8 tasks      | elapsed:   49.7s
[Parallel(n_jobs=-1)]: Done 104 tasks      | elapsed: 10.9min
[Parallel(n_jobs=-1)]: Done 144 out of 144 | elapsed: 14.1min finished


Best parameters set:
[('tf', CountVectorizer(analyzer='word', binary=False, decode_error='strict',
                dtype=<class 'numpy.int64'>, encoding='latin-1',
                input='content', lowercase=True, max_df=0.2, max_features=None,
                min_df=5, ngram_range=(1, 1), preprocessor=None,
                stop_words=None, strip_accents=None,
                token_pattern='(?u)\\b\\w\\w+\\b', tokenizer=None,
                vocabulary=None)), ('nb', MultinomialNB(alpha=1.0, class_prior=None, fit_prior=True))]


In [8]:
# MNB unigram tf best model
y_pred_mbn_2, vectorizer_mnb_2, clf_mnb_2 = model(X_train, y_train, X_test, y_test, CountVectorizer(encoding='latin-1', min_df=5, max_df=0.2), MultinomialNB())

X_train_vec shape -  (1584000, 49535) 

X_test_vec shape -  (16000, 49535) 

Top  20  most and least informative features
	-15.9242	aaden          		-5.1506	day            
	-15.9242	aaghh          		-5.1428	but            
	-15.9242	aarggh         		-5.1217	at             
	-15.9242	aargghh        		-5.0287	be             
	-15.9242	aarrgg         		-4.8873	have           
	-15.9242	aarrghh        		-4.8847	just           
	-15.9242	abre           		-4.8771	good           
	-15.9242	abscess        		-4.8492	with           
	-15.9242	abuser         		-4.8433	so             
	-15.9242	abuses         		-4.7562	me             
	-15.9242	acces          		-4.6066	that           
	-15.9242	accuweather    		-4.5936	on             
	-15.9242	acidic         		-4.5148	of             
	-15.9242	adamisarockstar		-4.4099	in             
	-15.9242	addictedto     		-4.3359	is             
	-15.9242	afriad         		-4.2616	for            
	-15.9242	afridi         		-4.1907	my             
	-15.9242	a

In [87]:
# MNB unigram tf-idf tuning
pipeline = Pipeline([('tfidf', TfidfVectorizer(encoding='latin-1')),('nb', MultinomialNB())])
parameters = {
    'tfidf__max_df': (0.01, 0.1, 0.2, 1.0),
    'tfidf__min_df': (5, 10, 20, 1),
    'nb__alpha': (1e-2, 1e-3, 1.0)
}

grid_search_tune = GridSearchCV(pipeline, parameters, cv=3, n_jobs=-1, verbose=3)
grid_search_tune.fit(X_train, y_train)

print("Best parameters set:")
print(grid_search_tune.best_estimator_.steps)

Fitting 3 folds for each of 48 candidates, totalling 144 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 12 concurrent workers.
[Parallel(n_jobs=-1)]: Done   8 tasks      | elapsed:  1.2min
[Parallel(n_jobs=-1)]: Done 104 tasks      | elapsed:  9.7min
[Parallel(n_jobs=-1)]: Done 144 out of 144 | elapsed: 12.5min finished


Best parameters set:
[('tfidf', TfidfVectorizer(analyzer='word', binary=False, decode_error='strict',
                dtype=<class 'numpy.float64'>, encoding='latin-1',
                input='content', lowercase=True, max_df=1.0, max_features=None,
                min_df=10, ngram_range=(1, 1), norm='l2', preprocessor=None,
                smooth_idf=True, stop_words=None, strip_accents=None,
                sublinear_tf=False, token_pattern='(?u)\\b\\w\\w+\\b',
                tokenizer=None, use_idf=True, vocabulary=None)), ('nb', MultinomialNB(alpha=1.0, class_prior=None, fit_prior=True))]


In [9]:
# MNB unigram tf-idf best model
y_pred_mbn_3, vectorizer_mnb_3, clf_mnb_3 = model(X_train, y_train, X_test, y_test, TfidfVectorizer(encoding='latin-1', min_df=10), MultinomialNB())

X_train_vec shape -  (1584000, 32041) 

X_test_vec shape -  (16000, 32041) 

Top  20  most and least informative features
	-14.6490	aarrghh        		-5.3847	have           
	-14.6490	abscess        		-5.3723	so             
	-14.6490	alergies       		-5.3721	just           
	-14.6490	alternator     		-5.3545	love           
	-14.6490	alzheimer      		-5.3093	with           
	-14.6490	antihistamines 		-5.2982	thanks         
	-14.6490	arrhh          		-5.2898	me             
	-14.6490	auchh          		-5.1913	on             
	-14.6490	ayatollah      		-5.1863	of             
	-14.6490	backorder      		-5.1199	that           
	-14.6490	bandages       		-5.1114	in             
	-14.6490	barakatday     		-5.0738	good           
	-14.6490	basij          		-4.9708	is             
	-14.6490	beefin         		-4.9639	my             
	-14.6490	blistered      		-4.8838	for            
	-14.6490	bodyguards     		-4.8403	and            
	-14.6490	bogged         		-4.7332	it             
	-14.6490	b

In [10]:
# MNB bigram baseline model
y_pred_mbn_4, vectorizer_mnb_4, clf_mnb_4 = model(X_train, y_train, X_test, y_test, CountVectorizer(encoding='latin-1', ngram_range=(1,2)), MultinomialNB())

X_train_vec shape -  (1584000, 3594521) 

X_test_vec shape -  (16000, 3594521) 

Top  20  most and least informative features
	-16.8362	aa after       		-6.0337	at             
	-16.8362	aa against     		-5.9407	be             
	-16.8362	aa all         		-5.7994	have           
	-16.8362	aa allstars    		-5.7967	just           
	-16.8362	aa already     		-5.7891	good           
	-16.8362	aa another     		-5.7612	with           
	-16.8362	aa anyone      		-5.7553	so             
	-16.8362	aa are         		-5.6682	me             
	-16.8362	aa as          		-5.5186	that           
	-16.8362	aa asyik       		-5.5056	on             
	-16.8362	aa at          		-5.4268	of             
	-16.8362	aa baas        		-5.3219	in             
	-16.8362	aa baby        		-5.2479	is             
	-16.8362	aa ball        		-5.1736	for            
	-16.8362	aa batterys    		-5.1028	my             
	-16.8362	aa bed         		-4.9428	it             
	-16.8362	aa beta        		-4.9302	and            
	-16.83

In [97]:
# MNB bigram tf tuning
pipeline = Pipeline([('tf', CountVectorizer(encoding='latin-1', ngram_range=(1,2))),('nb', MultinomialNB())])
parameters = {
    'tf__max_df': (0.05, 0.1, 0.2, 1.0),
    'tf__min_df': (2, 5, 10, 20, 1),
    'nb__alpha': (1e-2, 1e-3, 1.0)
}

grid_search_tune = GridSearchCV(pipeline, parameters, cv=3, n_jobs=-1, verbose=3)
grid_search_tune.fit(X_train, y_train)

print("Best parameters set:")
print(grid_search_tune.best_estimator_.steps)

Fitting 3 folds for each of 60 candidates, totalling 180 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 12 concurrent workers.
[Parallel(n_jobs=-1)]: Done   8 tasks      | elapsed:  2.6min
[Parallel(n_jobs=-1)]: Done 104 tasks      | elapsed: 23.9min
[Parallel(n_jobs=-1)]: Done 180 out of 180 | elapsed: 34.8min finished


Best parameters set:
[('tf', CountVectorizer(analyzer='word', binary=False, decode_error='strict',
                dtype=<class 'numpy.int64'>, encoding='latin-1',
                input='content', lowercase=True, max_df=0.1, max_features=None,
                min_df=2, ngram_range=(1, 2), preprocessor=None,
                stop_words=None, strip_accents=None,
                token_pattern='(?u)\\b\\w\\w+\\b', tokenizer=None,
                vocabulary=None)), ('nb', MultinomialNB(alpha=1.0, class_prior=None, fit_prior=True))]


In [11]:
# MNB bigram tf best model
y_pred_mbn_5, vectorizer_mnb_5, clf_mnb_5 = model(X_train, y_train, X_test, y_test, CountVectorizer(encoding='latin-1', ngram_range=(1,2), min_df=2, max_df=0.1), MultinomialNB())

X_train_vec shape -  (1584000, 1012914) 

X_test_vec shape -  (16000, 1012914) 

Top  20  most and least informative features
	-16.5273	aa at          		-5.9050	can            
	-16.5273	aa boring      		-5.9024	this           
	-16.5273	aa come        		-5.9020	up             
	-16.5273	aa demi        		-5.8977	all            
	-16.5273	aa did         		-5.8432	are            
	-16.5273	aa doesn       		-5.8335	your           
	-16.5273	aa don         		-5.8148	was            
	-16.5273	aa feel        		-5.7645	love           
	-16.5273	aa fuck        		-5.7536	day            
	-16.5273	aa hate        		-5.7458	but            
	-16.5273	aa how         		-5.7248	at             
	-16.5273	aa it          		-5.6318	be             
	-16.5273	aa its         		-5.4904	have           
	-16.5273	aa ive         		-5.4878	just           
	-16.5273	aa man         		-5.4801	good           
	-16.5273	aa on          		-5.4523	with           
	-16.5273	aa only        		-5.4464	so             
	-16.52

In [99]:
# MNB bigram tf-idf tuning
pipeline = Pipeline([('tfidf', TfidfVectorizer(encoding='latin-1', ngram_range=(1,2))),('nb', MultinomialNB())])
parameters = {
    'tfidf__max_df': (0.05, 0.1, 0.2, 1.0),
    'tfidf__min_df': (2, 5, 10, 20, 1),
    'nb__alpha': (1e-2, 1e-3, 1.0)
}

grid_search_tune = GridSearchCV(pipeline, parameters, cv=3, n_jobs=-1, verbose=3)
grid_search_tune.fit(X_train, y_train)

print("Best parameters set:")
print(grid_search_tune.best_estimator_.steps)

Fitting 3 folds for each of 60 candidates, totalling 180 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 12 concurrent workers.
[Parallel(n_jobs=-1)]: Done   8 tasks      | elapsed:  2.1min
[Parallel(n_jobs=-1)]: Done 104 tasks      | elapsed: 18.4min
[Parallel(n_jobs=-1)]: Done 180 out of 180 | elapsed: 29.9min finished


Best parameters set:
[('tfidf', TfidfVectorizer(analyzer='word', binary=False, decode_error='strict',
                dtype=<class 'numpy.float64'>, encoding='latin-1',
                input='content', lowercase=True, max_df=0.1, max_features=None,
                min_df=2, ngram_range=(1, 2), norm='l2', preprocessor=None,
                smooth_idf=True, stop_words=None, strip_accents=None,
                sublinear_tf=False, token_pattern='(?u)\\b\\w\\w+\\b',
                tokenizer=None, use_idf=True, vocabulary=None)), ('nb', MultinomialNB(alpha=1.0, class_prior=None, fit_prior=True))]


In [12]:
# MNB bigram tf-idf best model
y_pred_mbn_6, vectorizer_mnb_6, clf_mnb_6 = model(X_train, y_train, X_test, y_test, TfidfVectorizer(encoding='latin-1', ngram_range=(1,2), min_df=2, max_df=0.1), MultinomialNB())

X_train_vec shape -  (1584000, 1012914) 

X_test_vec shape -  (16000, 1012914) 

Top  20  most and least informative features
	-15.1928	aa at          		-6.8585	all            
	-15.1928	aa boring      		-6.8525	up             
	-15.1928	aa come        		-6.8521	now            
	-15.1928	aa demi        		-6.8196	was            
	-15.1928	aa did         		-6.8126	lol            
	-15.1928	aa doesn       		-6.7425	at             
	-15.1928	aa don         		-6.7192	are            
	-15.1928	aa feel        		-6.6887	your           
	-15.1928	aa fuck        		-6.6698	be             
	-15.1928	aa hate        		-6.6179	day            
	-15.1928	aa how         		-6.5515	have           
	-15.1928	aa it          		-6.5409	so             
	-15.1928	aa its         		-6.5297	just           
	-15.1928	aa ive         		-6.4928	love           
	-15.1928	aa man         		-6.4494	with           
	-15.1928	aa on          		-6.4467	me             
	-15.1928	aa only        		-6.3379	on             
	-15.19

In [27]:
def printErrors(y_pred, top=10):
    print(*([(X_test[i], y_test[i], y_pred[i]) for i in range(len(y_test)) if y_pred[i] != y_test[i]][:top]), sep='\n')

In [31]:
print("Unigram tf model")
printErrors(y_pred_mbn_1, top=20)
print()
# print("Unigram tf-idf model")
# printErrors(y_pred_mbn_3, top=20)
print()
print("Bigram tf model")
printErrors(y_pred_mbn_5, top=20)
print()
# print("Bigram tf-idf model")
# printErrors(y_pred_mbn_6, top=20)

Unigram tf model
('where the are my pinking shears rarararrarararr babyproofing while cutting stuff makes me stick shears random places forget them', 0, 4)
('not bad bit grumpy cause of exams but generally ok ta', 4, 0)
('can watch it what is it', 0, 4)
('song of my life now your love is lie simple plan beautifulylost', 0, 4)
('watching the last leno so glad got to go once', 0, 4)
('dropped your books off in the library', 4, 0)
('sun burns are noo fun bored sittin at home watching bride wars with my sister have good weekend everyone', 0, 4)
('looking for mascot for needs to be an octopus but the one youhave isnt cutting it if sign up ull see', 0, 4)
('no my dear then you just do what am doing read it again', 4, 0)
('umm its getting betterr than before but its still pretty bad lol', 4, 0)
('huh turns out like marmite when did stop being loved by all', 0, 4)
('arrgghh going to sydney for the first time on the holidays and then off to canberra and somewhere else in snow', 4, 0)
('thanks r