In [1]:
import pandas as p
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression

In [2]:
# Restore the variable `data` from IPython's %store persistence. It must have
# been saved earlier with `%store data` (presumably by a separate
# preprocessing notebook — verify), otherwise this cell leaves `data` undefined.
%store -r data
# Print a summary of the restored object's columns, dtypes and non-null counts.
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1600000 entries, 0 to 1599999
Data columns (total 7 columns):
 #   Column      Non-Null Count    Dtype 
---  ------      --------------    ----- 
 0   target      1600000 non-null  int64 
 1   ids         1600000 non-null  int64 
 2   date        1600000 non-null  object
 3   flag        1600000 non-null  object
 4   user        1600000 non-null  object
 5   text        1600000 non-null  object
 6   clean_text  1600000 non-null  object
dtypes: int64(2), object(5)
memory usage: 85.4+ MB


In [3]:
# Features are the cleaned text column, labels are the `target` column;
# both are pulled out of the frame as plain arrays.
X, y = (data[column].values for column in ('clean_text', 'target'))
# Sanity check: the two arrays should report the same length.
print(len(X), len(y), sep='\n')

1600000
1600000


In [4]:
# Hold out 1% of the rows for evaluation; the fixed random_state keeps the
# split reproducible between runs.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.01,
    random_state=0,
)
# Show the shape of every split on one line: train X, train y, test X, test y.
print(*(split.shape for split in (X_train, y_train, X_test, y_test)))

(1584000,) (1584000,) (16000,) (16000,)


In [7]:
def model(X_train, y_train, X_test, y_test, vectorizer, clf):
    """Vectorize both splits, fit ``clf`` on the training part and report its test score.

    The vectorizer is fitted on ``X_train`` only and then reused, as fitted,
    to transform ``X_test``. The shapes of both resulting matrices and the
    value of ``clf.score`` on the test split are printed along the way.

    Parameters
    ----------
    X_train, X_test : raw inputs handed to the vectorizer.
    y_train, y_test : labels matching ``X_train`` / ``X_test``.
    vectorizer : object exposing ``fit_transform`` and ``transform``.
    clf : object exposing ``fit``, ``score`` and ``predict``.

    Returns
    -------
    tuple
        ``(vectorizer, clf, y_pred)`` where ``y_pred`` is ``clf.predict``
        applied to the vectorized test split.
    """
    # Fit the vocabulary/weights on the training split only.
    train_matrix = vectorizer.fit_transform(X_train)
    print("X_train_vec shape - ", train_matrix.shape, "\n")

    # The test split goes through the already-fitted vectorizer.
    test_matrix = vectorizer.transform(X_test)
    print("X_test_vec shape - ", test_matrix.shape, "\n")

    clf.fit(train_matrix, y_train)
    test_score = clf.score(test_matrix, y_test)
    print("Accuracy - ", test_score)

    return vectorizer, clf, clf.predict(test_matrix)

In [8]:
# Experiment 1: raw term counts, unigrams only (no ngram_range passed).
vectorizer1, clf1, y_pred1 = model(
    X_train,
    y_train,
    X_test,
    y_test,
    vectorizer=CountVectorizer(encoding='latin-1'),
    clf=LogisticRegression(max_iter=1000, n_jobs=-1, verbose=1),
)

X_train_vec shape -  (1584000, 243575) 

X_test_vec shape -  (16000, 243575) 



[Parallel(n_jobs=-1)]: Using backend LokyBackend with 12 concurrent workers.


Accuracy -  0.801125


[Parallel(n_jobs=-1)]: Done   1 out of   1 | elapsed:  3.9min finished


In [9]:
# Experiment 2: TF-IDF weighting, unigrams only (no ngram_range passed).
vectorizer2, clf2, y_pred2 = model(
    X_train,
    y_train,
    X_test,
    y_test,
    vectorizer=TfidfVectorizer(encoding='latin-1'),
    clf=LogisticRegression(max_iter=1000, n_jobs=-1, verbose=1),
)

X_train_vec shape -  (1584000, 243575) 

X_test_vec shape -  (16000, 243575) 



[Parallel(n_jobs=-1)]: Using backend LokyBackend with 12 concurrent workers.


Accuracy -  0.8031875


[Parallel(n_jobs=-1)]: Done   1 out of   1 | elapsed:  2.0min finished


In [10]:
# Experiment 3: raw term counts over unigrams and bigrams (ngram_range=(1, 2)).
vectorizer3, clf3, y_pred3 = model(
    X_train,
    y_train,
    X_test,
    y_test,
    vectorizer=CountVectorizer(encoding='latin-1', ngram_range=(1, 2)),
    clf=LogisticRegression(max_iter=1000, n_jobs=-1, verbose=1),
)

X_train_vec shape -  (1584000, 3594521) 

X_test_vec shape -  (16000, 3594521) 



[Parallel(n_jobs=-1)]: Using backend LokyBackend with 12 concurrent workers.


Accuracy -  0.8219375


[Parallel(n_jobs=-1)]: Done   1 out of   1 | elapsed: 16.5min finished


In [11]:
# Experiment 4: TF-IDF weighting over unigrams and bigrams (ngram_range=(1, 2)).
vectorizer4, clf4, y_pred4 = model(
    X_train,
    y_train,
    X_test,
    y_test,
    vectorizer=TfidfVectorizer(encoding='latin-1', ngram_range=(1, 2)),
    clf=LogisticRegression(max_iter=1000, n_jobs=-1, verbose=1),
)

X_train_vec shape -  (1584000, 3594521) 

X_test_vec shape -  (16000, 3594521) 



[Parallel(n_jobs=-1)]: Using backend LokyBackend with 12 concurrent workers.


Accuracy -  0.824125


[Parallel(n_jobs=-1)]: Done   1 out of   1 | elapsed:  7.2min finished


In [12]:
def printErrors(y_pred, top=10, texts=None, labels=None):
    """Print the first ``top`` samples whose prediction differs from the true label.

    Each mismatch is printed on its own line as a
    ``(text, true_label, predicted_label)`` tuple, in sample order.

    Parameters
    ----------
    y_pred : indexable sequence of predicted labels, aligned with ``labels``.
    top : maximum number of mismatches to print. ``None`` prints all of them;
        a negative value keeps plain slice semantics (``[:top]``).
    texts : indexable sequence of samples. Defaults to the notebook-level
        ``X_test`` when omitted.
    labels : indexable sequence of true labels. Defaults to the
        notebook-level ``y_test`` when omitted.
    """
    # Local import keeps the cell self-contained (itertools is stdlib).
    from itertools import islice

    # Fall back to the notebook globals only when no explicit data is given,
    # so existing calls such as printErrors(y_pred, top=20) behave as before.
    if texts is None:
        texts = X_test
    if labels is None:
        labels = y_test

    # Lazy generator: samples are only inspected until enough mismatches
    # have been found, instead of scanning the whole test set every time.
    mismatches = (
        (texts[i], labels[i], y_pred[i])
        for i in range(len(labels))
        if y_pred[i] != labels[i]
    )

    if top is not None and top < 0:
        # islice rejects negative bounds; a list slice preserves the old behaviour.
        shown = list(mismatches)[:top]
    else:
        shown = islice(mismatches, top)

    print(*shown, sep='\n')

In [14]:
# y_pred4 holds the predictions of the TF-IDF bigram run (vectorizer4 / clf4),
# so the heading names that model rather than the plain term-count one.
print("Bigram tf-idf model")
printErrors(y_pred4, top=20)

Bigram tf model
('where the are my pinking shears rarararrarararr babyproofing while cutting stuff makes me stick shears random places forget them', 0, 4)
('reply me pls', 4, 0)
('morning twitter world gonna start my day with the coldest lucozade can find', 4, 0)
('song of my life now your love is lie simple plan beautifulylost', 0, 4)
('watching the last leno so glad got to go once', 0, 4)
('dropped your books off in the library', 4, 0)
('sun burns are noo fun bored sittin at home watching bride wars with my sister have good weekend everyone', 0, 4)
('do more that anything', 0, 4)
('ok was confused for moment have to agree that ur dad sure could ve timed the announcement diff time though', 0, 4)
('my daddy lives in manchester love it there hull bitch have mate there and wanna go', 0, 4)
('oh my jeebus slept all day how wonderful think hungry', 4, 0)
('umm its getting betterr than before but its still pretty bad lol', 4, 0)
('huh turns out like marmite when did stop being loved by all'