In [5]:
import pandas as p
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import confusion_matrix
from sklearn.metrics import classification_report
from sklearn.pipeline import Pipeline
from sklearn.model_selection import cross_val_score

In [2]:
# Restore the `data` variable from IPython's %store persistence into this
# kernel. NOTE(review): presumably it was saved with `%store data` by an
# earlier preprocessing notebook -- this cell fails on a machine where that
# has not been run.
%store -r data
# Print the column names, non-null counts, dtypes and memory footprint.
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1600000 entries, 0 to 1599999
Data columns (total 7 columns):
 #   Column      Non-Null Count    Dtype 
---  ------      --------------    ----- 
 0   target      1600000 non-null  int64 
 1   ids         1600000 non-null  int64 
 2   date        1600000 non-null  object
 3   flag        1600000 non-null  object
 4   user        1600000 non-null  object
 5   text        1600000 non-null  object
 6   clean_text  1600000 non-null  object
dtypes: int64(2), object(5)
memory usage: 85.4+ MB


In [3]:
# Pull the model input (cleaned tweet text) and the label column out of the
# frame as plain NumPy arrays.
X, y = data['clean_text'].values, data['target'].values

# Both arrays must have the same length; print each length on its own line.
print(len(X), len(y), sep="\n")

1600000
1600000


In [4]:
# Hold out 1% of the rows for testing; the fixed random_state keeps the
# split reproducible across runs.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.01,
    random_state=0,
)
# Show the shapes in the order: train X, train y, test X, test y.
print(*(part.shape for part in (X_train, y_train, X_test, y_test)))

(1584000,) (1584000,) (16000,) (16000,)


In [6]:
def show_most_and_least_informative_features(vectorizer, clf, n=10):
    """Print the ``n`` lowest- and ``n`` highest-weighted vocabulary features.

    Each printed row pairs one of the ``n`` lowest-weighted features (left
    columns) with one of the ``n`` highest-weighted features (right columns),
    both listed in ascending weight order.

    Parameters
    ----------
    vectorizer : fitted vectorizer
        Must expose ``get_feature_names_out()`` (current scikit-learn) or the
        legacy ``get_feature_names()``.
    clf : fitted classifier
        Must expose ``feature_log_prob_`` (naive Bayes models) or ``coef_``
        (linear models and legacy naive Bayes).
    n : int, default 10
        Number of features to show at each end of the ranking.

    Returns
    -------
    None
        The table is written to stdout.
    """
    # `get_feature_names()` was removed from recent scikit-learn releases in
    # favour of `get_feature_names_out()`; fall back for older installs.
    if hasattr(vectorizer, "get_feature_names_out"):
        feature_names = vectorizer.get_feature_names_out()
    else:
        feature_names = vectorizer.get_feature_names()

    # Naive Bayes estimators no longer expose `coef_`. Reproduce what
    # `coef_[0]` used to return for them: the second class's per-feature log
    # probabilities for a binary problem, the first class's otherwise.
    if hasattr(clf, "feature_log_prob_"):
        log_prob = clf.feature_log_prob_
        weights = log_prob[1] if len(log_prob) == 2 else log_prob[0]
    else:
        weights = clf.coef_[0]

    # Ascending sort by weight; ties are broken by feature name.
    coefs_with_fns = sorted(zip(weights, feature_names))
    top = zip(coefs_with_fns[:n], coefs_with_fns[-n:])
    print("Top ", n, " most and least informative features")
    for (coef_1, fn_1), (coef_2, fn_2) in top:
        print("\t%.4f\t%-15s\t\t%.4f\t%-15s" % (coef_1, fn_1, coef_2, fn_2))

In [7]:
def model(X_train, y_train, X_test, y_test, vectorizer, clf, labels=(0, 4)):
    """Vectorize the text, fit ``clf`` and print an evaluation on the test split.

    Parameters
    ----------
    X_train, X_test : sequence of str
        Raw documents for the training and test splits.
    y_train, y_test : array-like
        Class labels aligned with ``X_train`` / ``X_test``.
    vectorizer : unfitted text vectorizer
        Fitted on ``X_train`` only, then applied to ``X_test``.
    clf : unfitted classifier
        Fitted on the vectorized training data.
    labels : sequence, default (0, 4)
        Class labels, in display order, used for both the confusion matrix
        and the classification report.

    Returns
    -------
    tuple
        ``(y_pred, vectorizer, clf)`` -- the test-set predictions plus the
        fitted vectorizer and classifier.
    """
    labels = list(labels)

    # Learn the vocabulary from the training split only, then reuse it for
    # the test split so both matrices share the same columns.
    X_train_vec = vectorizer.fit_transform(X_train)
    print("X_train_vec shape - ", X_train_vec.shape, "\n")
    X_test_vec = vectorizer.transform(X_test)
    print("X_test_vec shape - ", X_test_vec.shape, "\n")

    y_pred = clf.fit(X_train_vec, y_train).predict(X_test_vec)
    show_most_and_least_informative_features(vectorizer, clf, n=20)

    cm = confusion_matrix(y_test, y_pred, labels=labels)
    print()
    print("Confusion matrix\n", cm, "\n")

    # Pass the same explicit labels as the confusion matrix: without them,
    # classification_report raises when a class is absent from the split
    # because target_names no longer matches the labels it infers.
    report = classification_report(
        y_test,
        y_pred,
        labels=labels,
        target_names=[str(label) for label in labels],
    )
    print("Classification report\n", report)

    return y_pred, vectorizer, clf

In [8]:
# MNB model 1: raw term counts (binary=False) fed to a multinomial naive
# Bayes classifier.
y_pred_mbn_1, vectorizer_mnb_1, clf_mnb_1 = model(
    X_train,
    y_train,
    X_test,
    y_test,
    CountVectorizer(encoding='latin-1', binary=False),
    MultinomialNB(),
)

X_train_vec shape -  (1584000, 243575) 

X_test_vec shape -  (16000, 243575) 

Top  20  most and least informative features
	-16.0232	aabot          		-5.2207	at             
	-16.0232	aabout         		-5.1276	be             
	-16.0232	aabt           		-4.9863	have           
	-16.0232	aacattyisamazing		-4.9837	just           
	-16.0232	aacchhoo       		-4.9760	good           
	-16.0232	aacck          		-4.9482	with           
	-16.0232	aacckk         		-4.9422	so             
	-16.0232	aach           		-4.8551	me             
	-16.0232	aachar         		-4.7055	that           
	-16.0232	aachee         		-4.6925	on             
	-16.0232	aachhoo        		-4.6137	of             
	-16.0232	aacount        		-4.5088	in             
	-16.0232	aacs           		-4.4348	is             
	-16.0232	aadaamm        		-4.3605	for            
	-16.0232	aadam          		-4.2897	my             
	-16.0232	aaden          		-4.1297	it             
	-16.0232	aadha          		-4.1171	and            
	-16.023