In [1]:
import pandas as p
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.model_selection import GridSearchCV
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import confusion_matrix
from sklearn.metrics import classification_report
from sklearn.pipeline import Pipeline
from sklearn.ensemble import RandomForestClassifier

In [2]:
%store -r data
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1600000 entries, 0 to 1599999
Data columns (total 7 columns):
 #   Column      Non-Null Count    Dtype 
---  ------      --------------    ----- 
 0   target      1600000 non-null  int64 
 1   ids         1600000 non-null  int64 
 2   date        1600000 non-null  object
 3   flag        1600000 non-null  object
 4   user        1600000 non-null  object
 5   text        1600000 non-null  object
 6   clean_text  1600000 non-null  object
dtypes: int64(2), object(5)
memory usage: 85.4+ MB


In [3]:
X = data['clean_text'].values
y = data['target'].values
print(len(X))
print(len(y))

1600000
1600000


In [4]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.01, random_state=0)
print(X_train.shape, y_train.shape, X_test.shape, y_test.shape)

(1584000,) (1584000,) (16000,) (16000,)


In [5]:
def show_most_and_least_informative_features(vectorizer, clf, n=10):
    feature_names = vectorizer.get_feature_names()
    importances = list(clf.feature_importances_)
    coefs_with_fns = sorted(zip(importances, feature_names))
    top = zip(coefs_with_fns[:n], coefs_with_fns[-n:])
    print("Top ", n, " most and least informative features")
    for (coef_1, fn_1), (coef_2, fn_2) in top:
        print("\t%.4f\t%-15s\t\t%.4f\t%-15s" % (coef_1, fn_1, coef_2, fn_2))

In [6]:
def model(X_train, y_train, X_test, y_test, vectorizer, clf):
    
    X_train_vec = vectorizer.fit_transform(X_train)
    print("X_train_vec shape - ", X_train_vec.shape, "\n")
    X_test_vec = vectorizer.transform(X_test)
    print("X_test_vec shape - ", X_test_vec.shape, "\n")
    
    y_pred = clf.fit(X_train_vec, y_train).predict(X_test_vec)
    show_most_and_least_informative_features(vectorizer, clf, n=20)
    cm = confusion_matrix(y_test, y_pred, labels=[0,4])
    print()
    print("Confusion matrix\n", cm, "\n")

    print("Classification report\n", classification_report(y_test, y_pred, target_names=['0','4']))
    
    return y_pred, vectorizer, clf

In [34]:
# Random Forest unigram baseline model
y_pred_rf_1, vectorizer_rf_1, clf_rf_1 = model(X_train, y_train, X_test, y_test, CountVectorizer(encoding='latin-1'), RandomForestClassifier(max_depth=48, n_jobs=-1, random_state=0, verbose=1))

X_train_vec shape -  (1584000, 243575) 

X_test_vec shape -  (16000, 243575) 



[Parallel(n_jobs=-1)]: Using backend ThreadingBackend with 12 concurrent workers.
[Parallel(n_jobs=-1)]: Done  26 tasks      | elapsed:   55.1s
[Parallel(n_jobs=-1)]: Done 100 out of 100 | elapsed:  2.7min finished
[Parallel(n_jobs=12)]: Using backend ThreadingBackend with 12 concurrent workers.
[Parallel(n_jobs=12)]: Done  26 tasks      | elapsed:    0.0s
[Parallel(n_jobs=12)]: Done 100 out of 100 | elapsed:    0.0s finished


Top  20  most and least informative features
	0.0000	aab            		0.0079	ugh            
	0.0000	aabangan       		0.0081	awesome        
	0.0000	aabar          		0.0084	hurts          
	0.0000	aabhar         		0.0084	wish           
	0.0000	aabinker       		0.0092	why            
	0.0000	aabot          		0.0095	sick           
	0.0000	aabout         		0.0098	hate           
	0.0000	aabt           		0.0101	thank          
	0.0000	aacattyisamazing		0.0106	but            
	0.0000	aaccee         		0.0106	sorry          
	0.0000	aacchhoo       		0.0110	love           
	0.0000	aacck          		0.0125	work           
	0.0000	aacckk         		0.0136	thanks         
	0.0000	aacd           		0.0154	good           
	0.0000	aach           		0.0159	my             
	0.0000	aachar         		0.0169	no             
	0.0000	aache          		0.0173	miss           
	0.0000	aachee         		0.0182	not            
	0.0000	aachei         		0.0245	you            
	0.0000	aachen         		0.0268	sad       

In [9]:
# Random Forest unigram tf tuning
pipeline = Pipeline([('tf', CountVectorizer(encoding='latin-1')),('rf', RandomForestClassifier(max_depth=48, n_jobs=-1, random_state=0, verbose=1))])
parameters = {
    'tf__max_df': (0.1, 0.2, 1.0),
    'tf__min_df': (5, 10, 20, 1),
    'rf__max_features': ("auto", 0.001, 0.005)
}

grid_search_tune = GridSearchCV(pipeline, parameters, cv=3, n_jobs=-1, verbose=3)
grid_search_tune.fit(X_train, y_train)

print("Best parameters set:")
print(grid_search_tune.best_estimator_.steps)

Fitting 3 folds for each of 36 candidates, totalling 108 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 12 concurrent workers.
[Parallel(n_jobs=-1)]: Done   8 tasks      | elapsed: 35.4min
[Parallel(n_jobs=-1)]: Done 108 out of 108 | elapsed: 252.2min finished
[Parallel(n_jobs=-1)]: Using backend ThreadingBackend with 12 concurrent workers.
[Parallel(n_jobs=-1)]: Done  26 tasks      | elapsed:   50.3s


Best parameters set:
[('tf', CountVectorizer(analyzer='word', binary=False, decode_error='strict',
                dtype=<class 'numpy.int64'>, encoding='latin-1',
                input='content', lowercase=True, max_df=0.2, max_features=None,
                min_df=1, ngram_range=(1, 1), preprocessor=None,
                stop_words=None, strip_accents=None,
                token_pattern='(?u)\\b\\w\\w+\\b', tokenizer=None,
                vocabulary=None)), ('rf', RandomForestClassifier(bootstrap=True, ccp_alpha=0.0, class_weight=None,
                       criterion='gini', max_depth=48, max_features='auto',
                       max_leaf_nodes=None, max_samples=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, n_estimators=100,
                       n_jobs=-1, oob_score=False, random_state=0, verbose=1,
                       warm_st

[Parallel(n_jobs=-1)]: Done 100 out of 100 | elapsed:  2.7min finished


In [12]:
# Random Forest unigram tf best model
y_pred_rf_2, vectorizer_rf_2, clf_rf_2 = model(X_train, y_train, X_test, y_test, CountVectorizer(encoding='latin-1', min_df=1, max_df=0.2), RandomForestClassifier(n_estimators=1000, max_depth=48, n_jobs=-1, random_state=0, verbose=1))

X_train_vec shape -  (1584000, 243573) 

X_test_vec shape -  (16000, 243573) 



[Parallel(n_jobs=-1)]: Using backend ThreadingBackend with 12 concurrent workers.
[Parallel(n_jobs=-1)]: Done  26 tasks      | elapsed:   50.7s
[Parallel(n_jobs=-1)]: Done 176 tasks      | elapsed:  4.7min
[Parallel(n_jobs=-1)]: Done 426 tasks      | elapsed: 11.2min
[Parallel(n_jobs=-1)]: Done 776 tasks      | elapsed: 20.1min
[Parallel(n_jobs=-1)]: Done 1226 tasks      | elapsed: 31.6min
[Parallel(n_jobs=-1)]: Done 1776 tasks      | elapsed: 48.6min
[Parallel(n_jobs=-1)]: Done 2426 tasks      | elapsed: 68.3min
[Parallel(n_jobs=-1)]: Done 2500 out of 2500 | elapsed: 70.4min finished
[Parallel(n_jobs=12)]: Using backend ThreadingBackend with 12 concurrent workers.
[Parallel(n_jobs=12)]: Done  26 tasks      | elapsed:    0.0s
[Parallel(n_jobs=12)]: Done 176 tasks      | elapsed:    0.2s
[Parallel(n_jobs=12)]: Done 426 tasks      | elapsed:    0.6s
[Parallel(n_jobs=12)]: Done 776 tasks      | elapsed:    1.1s
[Parallel(n_jobs=12)]: Done 1226 tasks      | elapsed:    1.7s
[Parallel(n_job

Top  20  most and least informative features
	0.0000	aab            		0.0085	sucks          
	0.0000	aabangan       		0.0086	your           
	0.0000	aabar          		0.0094	great          
	0.0000	aabhar         		0.0097	bad            
	0.0000	aabinker       		0.0103	work           
	0.0000	aabot          		0.0105	thank          
	0.0000	aabout         		0.0107	sorry          
	0.0000	aaccee         		0.0112	hate           
	0.0000	aacck          		0.0114	wish           
	0.0000	aach           		0.0119	sick           
	0.0000	aache          		0.0120	but            
	0.0000	aachei         		0.0135	my             
	0.0000	aachens        		0.0137	good           
	0.0000	aadn           		0.0143	no             
	0.0000	aae            		0.0160	love           
	0.0000	aaeeaa         		0.0175	miss           
	0.0000	aaeew          		0.0189	not            
	0.0000	aafech         		0.0216	thanks         
	0.0000	aafter         		0.0241	sad            
	0.0000	aafyh          		0.0276	you        

In [7]:
# Random Forest unigram tf-idf tuning
pipeline = Pipeline([('tfidf', TfidfVectorizer(encoding='latin-1')),('rf', RandomForestClassifier(max_depth=48, n_jobs=-1, random_state=0, verbose=1))])
parameters = {
    'tfidf__max_df': (0.1, 0.2, 1.0),
    'tfidf__min_df': (5, 10, 20, 1),
    'rf__max_features': ("auto", 0.001, 0.005)
}

grid_search_tune = GridSearchCV(pipeline, parameters, cv=3, n_jobs=-1, verbose=3)
grid_search_tune.fit(X_train, y_train)

print("Best parameters set:")
print(grid_search_tune.best_estimator_.steps)

Fitting 3 folds for each of 36 candidates, totalling 108 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 12 concurrent workers.
[Parallel(n_jobs=-1)]: Done   8 tasks      | elapsed: 33.4min
[Parallel(n_jobs=-1)]: Done 108 out of 108 | elapsed: 243.3min finished
[Parallel(n_jobs=-1)]: Using backend ThreadingBackend with 12 concurrent workers.
[Parallel(n_jobs=-1)]: Done  26 tasks      | elapsed:   47.7s


Best parameters set:
[('tfidf', TfidfVectorizer(analyzer='word', binary=False, decode_error='strict',
                dtype=<class 'numpy.float64'>, encoding='latin-1',
                input='content', lowercase=True, max_df=0.2, max_features=None,
                min_df=1, ngram_range=(1, 1), norm='l2', preprocessor=None,
                smooth_idf=True, stop_words=None, strip_accents=None,
                sublinear_tf=False, token_pattern='(?u)\\b\\w\\w+\\b',
                tokenizer=None, use_idf=True, vocabulary=None)), ('rf', RandomForestClassifier(bootstrap=True, ccp_alpha=0.0, class_weight=None,
                       criterion='gini', max_depth=48, max_features='auto',
                       max_leaf_nodes=None, max_samples=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, n_estimators=100,
                       n_jobs=-1, oob_sco

[Parallel(n_jobs=-1)]: Done 100 out of 100 | elapsed:  2.5min finished


In [8]:
# Random Forest unigram tf-idf best model
y_pred_rf_3, vectorizer_rf_3, clf_rf_3 = model(X_train, y_train, X_test, y_test, CountVectorizer(encoding='latin-1', max_df=0.2, min_df=1), RandomForestClassifier(n_estimators=1000, max_depth=48, n_jobs=-1, random_state=0, verbose=1))

X_train_vec shape -  (1584000, 243573) 

X_test_vec shape -  (16000, 243573) 



[Parallel(n_jobs=-1)]: Using backend ThreadingBackend with 12 concurrent workers.
[Parallel(n_jobs=-1)]: Done  26 tasks      | elapsed:   57.8s
[Parallel(n_jobs=-1)]: Done 176 tasks      | elapsed:  6.0min
[Parallel(n_jobs=-1)]: Done 426 tasks      | elapsed: 15.1min
[Parallel(n_jobs=-1)]: Done 776 tasks      | elapsed: 25.0min
[Parallel(n_jobs=-1)]: Done 1000 out of 1000 | elapsed: 31.6min finished
[Parallel(n_jobs=12)]: Using backend ThreadingBackend with 12 concurrent workers.
[Parallel(n_jobs=12)]: Done  26 tasks      | elapsed:    0.0s
[Parallel(n_jobs=12)]: Done 176 tasks      | elapsed:    0.1s
[Parallel(n_jobs=12)]: Done 426 tasks      | elapsed:    0.5s
[Parallel(n_jobs=12)]: Done 776 tasks      | elapsed:    1.2s
[Parallel(n_jobs=12)]: Done 1000 out of 1000 | elapsed:    1.7s finished


Top  20  most and least informative features
	0.0000	aab            		0.0084	sucks          
	0.0000	aabangan       		0.0085	your           
	0.0000	aabar          		0.0095	thank          
	0.0000	aabhar         		0.0099	work           
	0.0000	aabinker       		0.0100	bad            
	0.0000	aabot          		0.0105	sorry          
	0.0000	aabout         		0.0106	great          
	0.0000	aacattyisamazing		0.0108	wish           
	0.0000	aaccee         		0.0112	but            
	0.0000	aacchhoo       		0.0114	hate           
	0.0000	aacck          		0.0127	sick           
	0.0000	aach           		0.0139	my             
	0.0000	aachar         		0.0142	no             
	0.0000	aache          		0.0143	good           
	0.0000	aachee         		0.0159	miss           
	0.0000	aachei         		0.0165	love           
	0.0000	aachen         		0.0201	not            
	0.0000	aachens        		0.0216	thanks         
	0.0000	aachhoo        		0.0235	sad            
	0.0000	aadha          		0.0301	you       

In [15]:
# Random Forest bigram baseline model
y_pred_rf_4, vectorizer_rf_4, clf_rf_4 = model(X_train, y_train, X_test, y_test, CountVectorizer(encoding='latin-1', ngram_range=(1,2)), RandomForestClassifier(n_estimators=100, max_depth=64, n_jobs=-1, random_state=0, verbose=1))

X_train_vec shape -  (1584000, 3594521) 

X_test_vec shape -  (16000, 3594521) 



[Parallel(n_jobs=-1)]: Using backend ThreadingBackend with 12 concurrent workers.
[Parallel(n_jobs=-1)]: Done  26 tasks      | elapsed:  2.6min
[Parallel(n_jobs=-1)]: Done 100 out of 100 | elapsed:  8.1min finished
[Parallel(n_jobs=12)]: Using backend ThreadingBackend with 12 concurrent workers.
[Parallel(n_jobs=12)]: Done  26 tasks      | elapsed:    0.2s
[Parallel(n_jobs=12)]: Done 100 out of 100 | elapsed:    0.7s finished


Top  20  most and least informative features
	0.0000	aa aa          		0.0041	didn           
	0.0000	aa after       		0.0041	hurts          
	0.0000	aa against     		0.0042	ugh            
	0.0000	aa alcohol     		0.0044	want to        
	0.0000	aa all         		0.0045	hate           
	0.0000	aa allstars    		0.0047	don            
	0.0000	aa already     		0.0050	no             
	0.0000	aa am          		0.0051	still          
	0.0000	aa aml         		0.0051	thanks for     
	0.0000	aa and         		0.0054	happy          
	0.0000	aa angrezo     		0.0055	wish           
	0.0000	aa another     		0.0057	sorry          
	0.0000	aa anyone      		0.0058	miss           
	0.0000	aa apr         		0.0060	great          
	0.0000	aa are         		0.0065	your           
	0.0000	aa artifact    		0.0084	not            
	0.0000	aa as          		0.0085	you            
	0.0000	aa asyik       		0.0117	want           
	0.0000	aa at          		0.0185	sad            
	0.0000	aa baas        		0.0185	thanks     

In [16]:
# Random Forest bigram tf tuning
pipeline = Pipeline([('tf', CountVectorizer(encoding='latin-1')),('rf', RandomForestClassifier(max_depth=64, n_jobs=-1, random_state=0, verbose=1))])
parameters = {
    'tf__max_df': (0.1, 0.2, 1.0),
    'tf__min_df': (5, 10, 20, 1),
    'rf__max_features': ("auto", 0.0001, 0.0005)
}

grid_search_tune = GridSearchCV(pipeline, parameters, cv=3, n_jobs=-1, verbose=3)
grid_search_tune.fit(X_train, y_train)

print("Best parameters set:")
print(grid_search_tune.best_estimator_.steps)

Fitting 3 folds for each of 36 candidates, totalling 108 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 12 concurrent workers.
[Parallel(n_jobs=-1)]: Done 108 out of 108 | elapsed: 266.0min finished
[Parallel(n_jobs=-1)]: Using backend ThreadingBackend with 12 concurrent workers.
[Parallel(n_jobs=-1)]: Done  26 tasks      | elapsed:  1.7min


Best parameters set:
[('tf', CountVectorizer(analyzer='word', binary=False, decode_error='strict',
                dtype=<class 'numpy.int64'>, encoding='latin-1',
                input='content', lowercase=True, max_df=0.2, max_features=None,
                min_df=1, ngram_range=(1, 1), preprocessor=None,
                stop_words=None, strip_accents=None,
                token_pattern='(?u)\\b\\w\\w+\\b', tokenizer=None,
                vocabulary=None)), ('rf', RandomForestClassifier(bootstrap=True, ccp_alpha=0.0, class_weight=None,
                       criterion='gini', max_depth=64, max_features='auto',
                       max_leaf_nodes=None, max_samples=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, n_estimators=100,
                       n_jobs=-1, oob_score=False, random_state=0, verbose=1,
                       warm_st

[Parallel(n_jobs=-1)]: Done 100 out of 100 | elapsed:  5.3min finished


In [18]:
# Random Forest bigram tf best model
y_pred_rf_5, vectorizer_rf_5, clf_rf_5 = model(X_train, y_train, X_test, y_test, CountVectorizer(encoding='latin-1', max_df=0.2, min_df=1), RandomForestClassifier(n_estimators=1000, max_depth=64, n_jobs=-1, random_state=0, verbose=1))

X_train_vec shape -  (1584000, 243573) 

X_test_vec shape -  (16000, 243573) 



[Parallel(n_jobs=-1)]: Using backend ThreadingBackend with 12 concurrent workers.
[Parallel(n_jobs=-1)]: Done  26 tasks      | elapsed:  1.7min
[Parallel(n_jobs=-1)]: Done 176 tasks      | elapsed:  9.6min
[Parallel(n_jobs=-1)]: Done 426 tasks      | elapsed: 22.6min
[Parallel(n_jobs=-1)]: Done 776 tasks      | elapsed: 40.6min
[Parallel(n_jobs=-1)]: Done 1000 out of 1000 | elapsed: 52.1min finished
[Parallel(n_jobs=12)]: Using backend ThreadingBackend with 12 concurrent workers.
[Parallel(n_jobs=12)]: Done  26 tasks      | elapsed:    0.0s
[Parallel(n_jobs=12)]: Done 176 tasks      | elapsed:    0.1s
[Parallel(n_jobs=12)]: Done 426 tasks      | elapsed:    0.3s
[Parallel(n_jobs=12)]: Done 776 tasks      | elapsed:    0.6s
[Parallel(n_jobs=12)]: Done 1000 out of 1000 | elapsed:    0.9s finished


Top  20  most and least informative features
	0.0000	aabangan       		0.0074	your           
	0.0000	aabar          		0.0075	sucks          
	0.0000	aabhar         		0.0084	thank          
	0.0000	aabinker       		0.0090	bad            
	0.0000	aabout         		0.0090	work           
	0.0000	aaccee         		0.0095	great          
	0.0000	aacd           		0.0096	sorry          
	0.0000	aach           		0.0100	wish           
	0.0000	aache          		0.0102	hate           
	0.0000	aachei         		0.0108	but            
	0.0000	aachen         		0.0114	sick           
	0.0000	aadam          		0.0131	my             
	0.0000	aadha          		0.0133	good           
	0.0000	aadn           		0.0140	no             
	0.0000	aae            		0.0155	love           
	0.0000	aaeeaa         		0.0167	miss           
	0.0000	aafech         		0.0178	not            
	0.0000	aaffrriiccaa   		0.0198	thanks         
	0.0000	aafter         		0.0221	sad            
	0.0000	aafyh          		0.0288	you        

In [17]:
# Random Forest bigram tf-idf tuning
pipeline = Pipeline([('tfidf', TfidfVectorizer(encoding='latin-1')),('rf', RandomForestClassifier(max_depth=64, n_jobs=-1, random_state=0, verbose=1))])
parameters = {
    'tfidf__max_df': (0.1, 0.2, 1.0),
    'tfidf__min_df': (5, 10, 20, 1),
    'rf__max_features': ("auto", 0.0001, 0.0005)
}

grid_search_tune = GridSearchCV(pipeline, parameters, cv=3, n_jobs=-1, verbose=3)
grid_search_tune.fit(X_train, y_train)

print("Best parameters set:")
print(grid_search_tune.best_estimator_.steps)

Fitting 3 folds for each of 36 candidates, totalling 108 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 12 concurrent workers.
[Parallel(n_jobs=-1)]: Done   8 tasks      | elapsed: 63.8min
[Parallel(n_jobs=-1)]: Done 108 out of 108 | elapsed: 249.1min finished
[Parallel(n_jobs=-1)]: Using backend ThreadingBackend with 12 concurrent workers.
[Parallel(n_jobs=-1)]: Done  26 tasks      | elapsed:  1.6min


Best parameters set:
[('tfidf', TfidfVectorizer(analyzer='word', binary=False, decode_error='strict',
                dtype=<class 'numpy.float64'>, encoding='latin-1',
                input='content', lowercase=True, max_df=0.2, max_features=None,
                min_df=1, ngram_range=(1, 1), norm='l2', preprocessor=None,
                smooth_idf=True, stop_words=None, strip_accents=None,
                sublinear_tf=False, token_pattern='(?u)\\b\\w\\w+\\b',
                tokenizer=None, use_idf=True, vocabulary=None)), ('rf', RandomForestClassifier(bootstrap=True, ccp_alpha=0.0, class_weight=None,
                       criterion='gini', max_depth=64, max_features='auto',
                       max_leaf_nodes=None, max_samples=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, n_estimators=100,
                       n_jobs=-1, oob_sco

[Parallel(n_jobs=-1)]: Done 100 out of 100 | elapsed:  4.9min finished


In [19]:
# Random Forest bigram tf-idf best model
y_pred_rf_6, vectorizer_rf_6, clf_rf_6 = model(X_train, y_train, X_test, y_test, TfidfVectorizer(encoding='latin-1', max_df=0.2, min_df=1), RandomForestClassifier(n_estimators=1000, max_depth=64, n_jobs=-1, random_state=0, verbose=1))

X_train_vec shape -  (1584000, 243573) 

X_test_vec shape -  (16000, 243573) 



[Parallel(n_jobs=-1)]: Using backend ThreadingBackend with 12 concurrent workers.
[Parallel(n_jobs=-1)]: Done  26 tasks      | elapsed:  1.5min
[Parallel(n_jobs=-1)]: Done 176 tasks      | elapsed:  8.5min
[Parallel(n_jobs=-1)]: Done 426 tasks      | elapsed: 20.3min
[Parallel(n_jobs=-1)]: Done 776 tasks      | elapsed: 37.4min
[Parallel(n_jobs=-1)]: Done 1000 out of 1000 | elapsed: 55.9min finished
[Parallel(n_jobs=12)]: Using backend ThreadingBackend with 12 concurrent workers.
[Parallel(n_jobs=12)]: Done  26 tasks      | elapsed:    0.0s
[Parallel(n_jobs=12)]: Done 176 tasks      | elapsed:    0.1s
[Parallel(n_jobs=12)]: Done 426 tasks      | elapsed:    0.3s
[Parallel(n_jobs=12)]: Done 776 tasks      | elapsed:    0.7s
[Parallel(n_jobs=12)]: Done 1000 out of 1000 | elapsed:    0.9s finished


Top  20  most and least informative features
	0.0000	aab            		0.0075	sucks          
	0.0000	aabangan       		0.0084	your           
	0.0000	aabar          		0.0087	great          
	0.0000	aabhar         		0.0092	bad            
	0.0000	aabinker       		0.0092	wish           
	0.0000	aabot          		0.0098	thank          
	0.0000	aabout         		0.0103	sick           
	0.0000	aabt           		0.0105	but            
	0.0000	aacattyisamazing		0.0108	work           
	0.0000	aaccee         		0.0110	hate           
	0.0000	aacchhoo       		0.0117	sorry          
	0.0000	aacck          		0.0130	my             
	0.0000	aacckk         		0.0132	good           
	0.0000	aach           		0.0135	no             
	0.0000	aachar         		0.0142	love           
	0.0000	aache          		0.0160	miss           
	0.0000	aachee         		0.0181	not            
	0.0000	aachei         		0.0211	thanks         
	0.0000	aachen         		0.0225	sad            
	0.0000	aachhoo        		0.0272	you       