In [1]:
import pandas as p
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.model_selection import GridSearchCV
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import BernoulliNB
from sklearn.metrics import confusion_matrix
from sklearn.metrics import classification_report
from sklearn.pipeline import Pipeline

In [2]:
%store -r data
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1600000 entries, 0 to 1599999
Data columns (total 7 columns):
 #   Column      Non-Null Count    Dtype 
---  ------      --------------    ----- 
 0   target      1600000 non-null  int64 
 1   ids         1600000 non-null  int64 
 2   date        1600000 non-null  object
 3   flag        1600000 non-null  object
 4   user        1600000 non-null  object
 5   text        1600000 non-null  object
 6   clean_text  1600000 non-null  object
dtypes: int64(2), object(5)
memory usage: 85.4+ MB


In [3]:
X = data['clean_text'].values
y = data['target'].values
print(len(X))
print(len(y))

1600000
1600000


In [4]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.01, random_state=0)
print(X_train.shape, y_train.shape, X_test.shape, y_test.shape)

(1584000,) (1584000,) (16000,) (16000,)


In [5]:
def show_most_and_least_informative_features(vectorizer, clf, n=10):
    feature_names = vectorizer.get_feature_names()
    coefs_with_fns = sorted(zip(clf.coef_[0], feature_names))
    top = zip(coefs_with_fns[:n], coefs_with_fns[-n:])
    print("Top ", n, " most and least informative features")
    for (coef_1, fn_1), (coef_2, fn_2) in top:
        print("\t%.4f\t%-15s\t\t%.4f\t%-15s" % (coef_1, fn_1, coef_2, fn_2))

In [6]:
def model(X_train, y_train, X_test, y_test, vectorizer, clf):
    
    X_train_vec = vectorizer.fit_transform(X_train)
    print("X_train_vec shape - ", X_train_vec.shape, "\n")
    X_test_vec = vectorizer.transform(X_test)
    print("X_test_vec shape - ", X_test_vec.shape, "\n")
    
    y_pred = clf.fit(X_train_vec, y_train).predict(X_test_vec)
    show_most_and_least_informative_features(vectorizer, clf, n=20)
    cm = confusion_matrix(y_test, y_pred, labels=[0,4])
    print()
    print("Confusion matrix\n", cm, "\n")

    print("Classification report\n", classification_report(y_test, y_pred, target_names=['0','4'], digits=4))
    
    return y_pred, vectorizer, clf

In [7]:
# Bernoulli NB unigram baseline model
y_pred_mbn_1, vectorizer_mnb_1, clf_mnb_1 = model(X_train, y_train, X_test, y_test, CountVectorizer(encoding='latin-1', binary=True), BernoulliNB())

X_train_vec shape -  (1584000, 243575) 

X_test_vec shape -  (16000, 243575) 

Top  20  most and least informative features
	-13.5822	aabot          		-2.8219	but            
	-13.5822	aabout         		-2.7345	be             
	-13.5822	aabt           		-2.5922	have           
	-13.5822	aacattyisamazing		-2.5831	good           
	-13.5822	aacchhoo       		-2.5678	just           
	-13.5822	aacck          		-2.5637	so             
	-13.5822	aacckk         		-2.5438	with           
	-13.5822	aach           		-2.4756	me             
	-13.5822	aachar         		-2.3313	that           
	-13.5822	aachee         		-2.3106	on             
	-13.5822	aachhoo        		-2.2492	of             
	-13.5822	aacount        		-2.1482	in             
	-13.5822	aacs           		-2.0675	is             
	-13.5822	aadaamm        		-1.9901	for            
	-13.5822	aadam          		-1.9487	my             
	-13.5822	aaden          		-1.8386	it             
	-13.5822	aadha          		-1.7848	and            
	-13.582

In [8]:
# Bernoulli NB unigram tf tuning
pipeline = Pipeline([('tf', CountVectorizer(encoding='latin-1', binary=True)),('nb', BernoulliNB())])
parameters = {
    'tf__max_df': (0.01, 0.1, 0.2, 1.0),
    'tf__min_df': (5, 10, 20, 1),
    'nb__alpha': (1e-2, 1e-3, 1.0)
}

grid_search_tune = GridSearchCV(pipeline, parameters, cv=3, n_jobs=-1, verbose=3)
grid_search_tune.fit(X_train, y_train)

print("Best parameters set:")
print(grid_search_tune.best_estimator_.steps)

Fitting 3 folds for each of 48 candidates, totalling 144 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 12 concurrent workers.
[Parallel(n_jobs=-1)]: Done   8 tasks      | elapsed:  1.8min
[Parallel(n_jobs=-1)]: Done 104 tasks      | elapsed: 12.5min
[Parallel(n_jobs=-1)]: Done 144 out of 144 | elapsed: 20.3min finished


Best parameters set:
[('tf', CountVectorizer(analyzer='word', binary=True, decode_error='strict',
                dtype=<class 'numpy.int64'>, encoding='latin-1',
                input='content', lowercase=True, max_df=0.2, max_features=None,
                min_df=1, ngram_range=(1, 1), preprocessor=None,
                stop_words=None, strip_accents=None,
                token_pattern='(?u)\\b\\w\\w+\\b', tokenizer=None,
                vocabulary=None)), ('nb', BernoulliNB(alpha=1.0, binarize=0.0, class_prior=None, fit_prior=True))]


In [9]:
# MNB unigram tf best model
y_pred_mbn_2, vectorizer_mnb_2, clf_mnb_2 = model(X_train, y_train, X_test, y_test, CountVectorizer(encoding='latin-1', binary=True, max_df=0.2), BernoulliNB())

X_train_vec shape -  (1584000, 243573) 

X_test_vec shape -  (16000, 243573) 

Top  20  most and least informative features
	-13.5822	aabot          		-2.8733	day            
	-13.5822	aabout         		-2.8272	at             
	-13.5822	aabt           		-2.8219	but            
	-13.5822	aacattyisamazing		-2.7345	be             
	-13.5822	aacchhoo       		-2.5922	have           
	-13.5822	aacck          		-2.5831	good           
	-13.5822	aacckk         		-2.5678	just           
	-13.5822	aach           		-2.5637	so             
	-13.5822	aachar         		-2.5438	with           
	-13.5822	aachee         		-2.4756	me             
	-13.5822	aachhoo        		-2.3313	that           
	-13.5822	aacount        		-2.3106	on             
	-13.5822	aacs           		-2.2492	of             
	-13.5822	aadaamm        		-2.1482	in             
	-13.5822	aadam          		-2.0675	is             
	-13.5822	aaden          		-1.9901	for            
	-13.5822	aadha          		-1.9487	my             
	-13.582

In [10]:
# Bernoulli NB bigram baseline model
y_pred_mbn_3, vectorizer_mnb_3, clf_mnb_3 = model(X_train, y_train, X_test, y_test, CountVectorizer(encoding='latin-1', binary=True, ngram_range=(1,2)), BernoulliNB())

X_train_vec shape -  (1584000, 3594521) 

X_test_vec shape -  (16000, 3594521) 

Top  20  most and least informative features
	-13.5822	aa after       		-2.8219	but            
	-13.5822	aa against     		-2.7345	be             
	-13.5822	aa all         		-2.5922	have           
	-13.5822	aa allstars    		-2.5831	good           
	-13.5822	aa already     		-2.5678	just           
	-13.5822	aa another     		-2.5637	so             
	-13.5822	aa anyone      		-2.5438	with           
	-13.5822	aa are         		-2.4756	me             
	-13.5822	aa as          		-2.3313	that           
	-13.5822	aa asyik       		-2.3106	on             
	-13.5822	aa at          		-2.2492	of             
	-13.5822	aa baas        		-2.1482	in             
	-13.5822	aa baby        		-2.0675	is             
	-13.5822	aa ball        		-1.9901	for            
	-13.5822	aa batterys    		-1.9487	my             
	-13.5822	aa bed         		-1.8386	it             
	-13.5822	aa beta        		-1.7848	and            
	-13.58

In [11]:
# Bernoulli NB bigram tf tuning
pipeline = Pipeline([('tf', CountVectorizer(encoding='latin-1', binary=True, ngram_range=(1,2))),('nb', BernoulliNB())])
parameters = {
    'tf__max_df': (0.05, 0.1, 0.2, 1.0),
    'tf__min_df': (2, 5, 10, 20, 1)
}

grid_search_tune = GridSearchCV(pipeline, parameters, cv=3, n_jobs=-1, verbose=3)
grid_search_tune.fit(X_train, y_train)

print("Best parameters set:")
print(grid_search_tune.best_estimator_.steps)

Fitting 3 folds for each of 20 candidates, totalling 60 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 12 concurrent workers.
[Parallel(n_jobs=-1)]: Done   8 tasks      | elapsed:  3.8min
[Parallel(n_jobs=-1)]: Done  58 out of  60 | elapsed: 19.9min remaining:   41.0s
[Parallel(n_jobs=-1)]: Done  60 out of  60 | elapsed: 21.0min finished


Best parameters set:
[('tf', CountVectorizer(analyzer='word', binary=True, decode_error='strict',
                dtype=<class 'numpy.int64'>, encoding='latin-1',
                input='content', lowercase=True, max_df=0.05, max_features=None,
                min_df=1, ngram_range=(1, 2), preprocessor=None,
                stop_words=None, strip_accents=None,
                token_pattern='(?u)\\b\\w\\w+\\b', tokenizer=None,
                vocabulary=None)), ('nb', BernoulliNB(alpha=1.0, binarize=0.0, class_prior=None, fit_prior=True))]


In [12]:
# Bernoulli NB bigram tf best model
y_pred_mbn_4, vectorizer_mnb_4, clf_mnb_4 = model(X_train, y_train, X_test, y_test, CountVectorizer(encoding='latin-1', binary=True, ngram_range=(1,2), max_df=0.05), BernoulliNB())

X_train_vec shape -  (1584000, 3594492) 

X_test_vec shape -  (16000, 3594492) 

Top  20  most and least informative features
	-13.5822	aa after       		-3.4327	new            
	-13.5822	aa against     		-3.3809	go             
	-13.5822	aa all         		-3.3807	got            
	-13.5822	aa allstars    		-3.3736	from           
	-13.5822	aa already     		-3.3476	too            
	-13.5822	aa another     		-3.3424	will           
	-13.5822	aa anyone      		-3.3121	what           
	-13.5822	aa are         		-3.3044	time           
	-13.5822	aa as          		-3.3005	today          
	-13.5822	aa asyik       		-3.2912	do             
	-13.5822	aa at          		-3.2813	going          
	-13.5822	aa baas        		-3.2227	we             
	-13.5822	aa baby        		-3.1592	thanks         
	-13.5822	aa ball        		-3.1462	lol            
	-13.5822	aa batterys    		-3.1202	get            
	-13.5822	aa bed         		-3.1016	like           
	-13.5822	aa beta        		-3.0656	out            
	-13.58

In [13]:
def printErrors(y_pred, top=10):
    print(*([(X_test[i], y_test[i], y_pred[i]) for i in range(len(y_test)) if y_pred[i] != y_test[i]][:top]), sep='\n')

In [14]:
print("Unigram tf model")
printErrors(y_pred_mbn_1, top=20)
print()
print()
print("Bigram tf model")
printErrors(y_pred_mbn_4, top=20)

Unigram tf model
('where the are my pinking shears rarararrarararr babyproofing while cutting stuff makes me stick shears random places forget them', 0, 4)
('not bad bit grumpy cause of exams but generally ok ta', 4, 0)
('can watch it what is it', 0, 4)
('song of my life now your love is lie simple plan beautifulylost', 0, 4)
('watching the last leno so glad got to go once', 0, 4)
('dropped your books off in the library', 4, 0)
('do more that anything', 0, 4)
('looking for mascot for needs to be an octopus but the one youhave isnt cutting it if sign up ull see', 0, 4)
('once again show is rescheduled', 0, 4)
('no my dear then you just do what am doing read it again', 4, 0)
('umm its getting betterr than before but its still pretty bad lol', 4, 0)
('huh turns out like marmite when did stop being loved by all', 0, 4)
('arrgghh going to sydney for the first time on the holidays and then off to canberra and somewhere else in snow', 4, 0)
('thanks really appreciate that my skin is acting up