# Classification with logistic regression

## Tweets

In [1]:
import pandas as pd
import numpy as np
import re
import nltk
from nltk.corpus import stopwords

In [2]:
# Load the tweet dataset from a path relative to the notebook's working
# directory, print its (rows, columns) shape, and preview the first rows.
tweets = pd.read_csv('data/tweets_all.csv')
print(tweets.shape)
tweets.head()

(34731, 7)


Unnamed: 0,id,topic,source,text,replyCount,vaderCatLabel,vaderCat
0,1377385383168765952,,FoxNews,activists protest renaming chicago school afte...,306,high,1.0
1,1377384607969013765,,FoxNews,border patrol video shows smugglers abandoning...,108,high,1.0
2,1377384339105669122,,FoxNews,cause of tiger woods car crash determined but ...,169,low,0.0
3,1377367836046192641,,FoxNews,gop rep urges hhs to halt reported plan to rel...,80,high,1.0
4,1377358399759785987,,FoxNews,some democrats trying to stop iowa new hampshi...,96,high,1.0


In [3]:
# Class balance of the target: number of tweets for each vaderCat value.
tweets.groupby(by='vaderCat').size()

vaderCat
0.0    15062
1.0    19669
dtype: int64

### Pre-processing

In [4]:
def _english_stopwords():
    """Return NLTK's English stop words as a frozenset, reading the corpus only once."""
    cached = getattr(_english_stopwords, "_cache", None)
    if cached is None:
        cached = frozenset(stopwords.words("english"))
        _english_stopwords._cache = cached
    return cached


def tokenize_text(text, remove_stopwords=True):
    """Clean a raw tweet string and split it into a list of tokens.

    Parameters
    ----------
    text : str
        The tweet text to clean.
    remove_stopwords : bool, default True
        When True, whitespace-separated words found in NLTK's English
        stop-word list are dropped before tokenisation. The comparison is
        case-sensitive, exactly as the words appear in ``text``.

    Returns
    -------
    list of str
        Tokens produced by ``nltk.WordPunctTokenizer`` on the cleaned text.
    """
    # Remove each URL up to the next whitespace only, so words that follow a
    # link on the same line are kept.
    text = re.sub(r'https?://\S+', '', text)

    # HTML leftovers must be handled BEFORE the punctuation pass: that pass
    # turns "/" into a space, after which "<br />" could never match.
    text = re.sub(r'\<a href', ' ', text)
    text = re.sub(r'<br />', ' ', text)
    text = re.sub(r'&amp;', '', text)

    # Replace punctuation / symbol characters and apostrophes with spaces.
    text = re.sub(r'[_"\-;%()|+&=*%.,!?:#$@\[\]/]', ' ', text)
    text = re.sub(r'\'', ' ', text)

    # remove stopwords
    if remove_stopwords:
        stops = _english_stopwords()
        text = " ".join(w for w in text.split() if w not in stops)

    # Tokenize each word
    return nltk.WordPunctTokenizer().tokenize(text)

In [5]:
# Cast every cell of the text column to str, run the tokenizer element-wise,
# and keep the resulting token lists in a new 'tokens' column.
tweets['tokens'] = tweets['text'].astype(str).map(tokenize_text)
tweets['tokens']

0        [activists, protest, renaming, chicago, school...
1        [border, patrol, video, shows, smugglers, aban...
2        [cause, tiger, woods, car, crash, determined, ...
3        [gop, rep, urges, hhs, halt, reported, plan, r...
4        [democrats, trying, stop, iowa, new, hampshire...
                               ...                        
34726    [much, share, trump, celebrates, acquittal, se...
34727    [senate, majority, leader, chuck, schumer, bla...
34728    [breaking, u, senate, acquits, trump, seven, r...
34729    [outcome, clear, final, verdict, rendered, rep...
34730                         [breaking, trump, acquitted]
Name: tokens, Length: 34731, dtype: object

### Feature extraction

In [6]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.feature_extraction import text
from sklearn.feature_extraction.text import CountVectorizer,TfidfVectorizer
import warnings
# Silence every Python warning raised from here on: 'ignore' with no category
# or module filter applies to all of them.
warnings.filterwarnings('ignore')

In [7]:
# 70/30 train/test partition of the tweets; the fixed random_state keeps the
# split identical between runs.
training_data, test_data = train_test_split(
    tweets,
    train_size=0.7,
    random_state=42,
)
# Targets for each partition.
y_tr, y_te = training_data['vaderCat'], test_data['vaderCat']
print(training_data.shape, test_data.shape, sep='\n')

(24311, 8)
(10420, 8)


In [8]:
def feature_extraction(train, test, description):
    '''
    Turn pre-tokenized documents into feature matrices.

    The vectorizer is fitted on ``train`` only and then applied to ``test``.

    Parameters
    ----------
    train, test : iterables of token lists
        Each document must already be a list of tokens; the vectorizer's
        tokenizer is the identity and no lowercasing is applied.
    description : {"bow", "ngram", "tfidf"}
        "bow"   - counts of single tokens.
        "ngram" - counts of token trigrams (ngram_range (3, 3)).
        "tfidf" - the "bow" counts re-weighted by a TfidfTransformer
                  with norm=None.

    Returns
    -------
    X_tr, X_te
        Feature matrices for ``train`` and ``test``.
    transform
        The last fitted transformer: the CountVectorizer for "bow" and
        "ngram", the TfidfTransformer for "tfidf". In the "tfidf" case the
        returned object is therefore not the CountVectorizer that holds the
        vocabulary.

    Raises
    ------
    ValueError
        If ``description`` is not one of the supported names.
    '''
    ngram_ranges = {"bow": (1, 1), "tfidf": (1, 1), "ngram": (3, 3)}
    if description not in ngram_ranges:
        raise ValueError(
            "Unknown description %r; expected one of %s"
            % (description, sorted(ngram_ranges))
        )

    transform = CountVectorizer(
        tokenizer=lambda doc: doc,   # documents are already token lists
        token_pattern=None,          # unused when a tokenizer is supplied
        ngram_range=ngram_ranges[description],  # must be a tuple
        lowercase=False,
    )
    X_tr = transform.fit_transform(train)
    X_te = transform.transform(test)
    print(description, ': the size of the voc is', len(transform.vocabulary_))

    if description == "tfidf":
        # Re-weight the raw counts; IDF statistics come from the training set.
        transform = text.TfidfTransformer(norm=None)
        X_tr = transform.fit_transform(X_tr)
        X_te = transform.transform(X_te)

    print('The shape of the df is:', X_tr.shape)

    return X_tr, X_te, transform

In [9]:
# Build the three feature representations from the same token lists; each call
# fits on the training tokens and applies the fitted transform to the test tokens.
X_tr_bow, X_te_bow, tr_bow = feature_extraction(
    train=training_data['tokens'],
    test=test_data['tokens'],
    description="bow",
)
X_tr_ngram, X_te_ngram, tr_ngram = feature_extraction(
    train=training_data['tokens'],
    test=test_data['tokens'],
    description="ngram",
)
X_tr_tfidf, X_te_tfidf, tr_tfidf = feature_extraction(
    train=training_data['tokens'],
    test=test_data['tokens'],
    description="tfidf",
)

bow : the size of the voc is 23638
The shape of the df is: (24311, 23638)
ngram : the size of the voc is 261332
The shape of the df is: (24311, 261332)
tfidf : the size of the voc is 23638
The shape of the df is: (24311, 23638)


### Logistic Regression

In [11]:
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import GridSearchCV

# Default grid of C values searched by grid_logistic_classify.
param_grid_ = dict(C=[0.00001, 0.001, 0.1, 1.0, 10.0, 100.0])

def simple_logistic_classify(X_tr, y_tr, X_test, y_test, description, _C=1.0):
    """Fit a single logistic regression and report its score on the test set.

    ``_C`` is passed to ``LogisticRegression`` as ``C``. ``description`` is
    only used to label the printed score. Returns the fitted classifier.
    """
    classifier = LogisticRegression(C=_C)
    classifier.fit(X_tr, y_tr)
    test_score = classifier.score(X_test, y_test)
    print('Test Score with', description, 'features', test_score)
    return classifier

def grid_logistic_classify(X_tr, y_tr, X_test, y_test, description, param_grid=param_grid_):
    """Tune a logistic regression with 5-fold grid search and report its test score.

    Parameters
    ----------
    X_tr, y_tr : training features and labels used for the cross-validated search.
    X_test, y_test : held-out features and labels used only for the printed score.
    description : label inserted into the printed message.
    param_grid : dict, optional
        Grid handed to ``GridSearchCV``; defaults to the module-level
        ``param_grid_``.

    Returns
    -------
    The fitted ``GridSearchCV`` object (use ``.best_estimator_`` /
    ``.best_params_`` to inspect the selected model).
    """
    # Use the caller-supplied grid rather than always reading the global default.
    grid = GridSearchCV(LogisticRegression(), cv=5, param_grid=param_grid)
    model = grid.fit(X_tr, y_tr)
    score = model.best_estimator_.score(X_test, y_test)
    print('Test Score with', description, 'features', score)
    return model

In [12]:
# Baseline logistic regressions (default C) on each feature representation.
# Note: model_bow_tr holds the n-gram model despite its name.
model_bow = simple_logistic_classify(
    X_tr=X_tr_bow, y_tr=y_tr, X_test=X_te_bow, y_test=y_te, description='bow'
)
model_bow_tr = simple_logistic_classify(
    X_tr=X_tr_ngram, y_tr=y_tr, X_test=X_te_ngram, y_test=y_te, description='n-grams'
)
model_tfidf_tr = simple_logistic_classify(
    X_tr=X_tr_tfidf, y_tr=y_tr, X_test=X_te_tfidf, y_test=y_te, description='tfidf'
)

Test Score with bow features 0.6108445297504799
Test Score with n-grams features 0.5938579654510556
Test Score with tfidf features 0.572168905950096


In [13]:
# Number of test tweets per true vaderCat value, for comparison with the predictions.
test_data.groupby(by='vaderCat').size()

vaderCat
0.0    4496
1.0    5924
dtype: int64

In [14]:
# Store the baseline tf-idf model's test predictions in a 'y_pred' column
# (overwriting any previous one), then count predictions per class.
test_data['y_pred'] = model_tfidf_tr.predict(X_te_tfidf)
test_data.groupby(by='y_pred').size()

y_pred
0.0    4524
1.0    5896
dtype: int64

In [15]:
# Per-class accuracy: correctly classified tweets of each true class divided
# by that class's total (rows of the confusion matrix are the true classes).
from sklearn.metrics import confusion_matrix
matrix = confusion_matrix(y_true=y_te, y_pred=test_data['y_pred'])
print(matrix)
np.diag(matrix) / matrix.sum(axis=1)

[[2281 2215]
 [2243 3681]]


array([0.50733986, 0.6213707 ])

### Feature importance

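With only two classes (`vaderCat` 0.0 and 1.0), logistic regression fits a single coefficient vector, so `coef_` has one row; a positive coefficient pushes the prediction towards class 1.0. In the multiclass case, the training algorithm uses the one-vs-rest (OvR) scheme if the `multi_class` option is set to `ovr`, and uses the cross-entropy loss if the `multi_class` option is set to `multinomial`.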

In [16]:
# The target has two classes, so coef_ holds a single row. Positive weights
# push a tweet towards the second class (vaderCat == 1.0); the 20 largest
# coefficients listed below are therefore the terms most indicative of
# class 1.0, not class 0.
print('most important features')
print('--------> bow')
bow_coef0 = model_bow.coef_[0]
imp0 = bow_coef0.argsort()[-20:][::-1]
# get_feature_names() no longer exists in recent scikit-learn releases; prefer
# get_feature_names_out() and fall back to the old name where it is missing.
features = (getattr(tr_bow, 'get_feature_names_out', None) or tr_bow.get_feature_names)()
print([str(features[index]) for index in imp0])
print('--------> ngram')
# model_bow_tr is the model trained on the n-gram features.
ngram_coef0 = model_bow_tr.coef_[0]
imp0 = ngram_coef0.argsort()[-20:][::-1]
features = (getattr(tr_ngram, 'get_feature_names_out', None) or tr_ngram.get_feature_names)()
print([str(features[index]) for index in imp0])

most important features
--------> bow
['towns', 'everybody', 'sending', 'equality', 'pardons', 'dismissal', 'blake', 'significantly', 'moratorium', 'referring', 'panther', 'anticipation', 'tank', 'fedex', 'desantis', 'soft', 'entering', 'combination', 'vatican', 'midwest']
--------> ngram
['impeach president trump', 'house impeachment managers', 'derek chauvin trial', 'joe bidens election', 'breaking news president', 'donald trump said', 'joe bidens win', 'black lives matter', 'national security law', 'opinion joe bidens', 'former president trump', 'us mexico border', 'writes harry enten', 'president trump says', 'meghan duchess sussex', 'capitol police officer', 'tested positive coronavirus', 'president trump said', 'president trumps campaign', 'first presidential debate']


### SMOTE

In [17]:
from imblearn.pipeline import Pipeline, make_pipeline
from imblearn.over_sampling import SMOTE

In [18]:
# Bag-of-words features: SMOTE oversampling followed by logistic regression,
# with C tuned by 10-fold cross-validated grid search.
pipeline = Pipeline(steps=[
    ('smote', SMOTE(random_state=0)),
    ('logisticregression', LogisticRegression(random_state=0)),
])
# Grid over the logistic-regression step (parameters are addressed as <step>__<param>).
params = dict(
    logisticregression__penalty=['l2'],
    logisticregression__C=[0.01, 0.1, 1, 10, 100],
    logisticregression__solver=['lbfgs'],
)

grid_bow = GridSearchCV(pipeline, params, cv=10, return_train_score=True)

grid_bow.fit(X_tr_bow, y_tr)
# Score of the refitted best pipeline on the held-out test set.
score = grid_bow.best_estimator_.score(X_te_bow, y_te)
score

0.6271593090211133

In [19]:
# N-gram features: SMOTE oversampling followed by logistic regression,
# with C tuned by 10-fold cross-validated grid search.
# NOTE(review): multi_class is deprecated in recent scikit-learn releases —
# confirm against the installed version.
pipeline = Pipeline(steps=[
    ('smote', SMOTE(random_state=0)),
    ('logisticregression', LogisticRegression(multi_class="multinomial", random_state=0)),
])
# Grid over the logistic-regression step (parameters are addressed as <step>__<param>).
params = dict(
    logisticregression__penalty=['l2'],
    logisticregression__C=[0.01, 0.1, 1, 10, 100],
    logisticregression__solver=['lbfgs'],
)

grid_ngram = GridSearchCV(pipeline, params, cv=10, return_train_score=True)

grid_ngram.fit(X_tr_ngram, y_tr)
# Score of the refitted best pipeline on the held-out test set.
score = grid_ngram.best_estimator_.score(X_te_ngram, y_te)
score

0.5147792706333973

In [20]:
# Tf-idf features: SMOTE oversampling followed by logistic regression,
# with C tuned by 10-fold cross-validated grid search.
# NOTE(review): multi_class is deprecated in recent scikit-learn releases —
# confirm against the installed version.
pipeline = Pipeline(steps=[
    ('smote', SMOTE(random_state=0)),
    ('logisticregression', LogisticRegression(multi_class="multinomial", random_state=0)),
])
# Grid over the logistic-regression step (parameters are addressed as <step>__<param>).
params = dict(
    logisticregression__penalty=['l2'],
    logisticregression__C=[0.01, 0.1, 1, 10, 100],
    logisticregression__solver=['lbfgs'],
)

grid_tfidf = GridSearchCV(pipeline, params, cv=10, return_train_score=True)

grid_tfidf.fit(X_tr_tfidf, y_tr)
# Score of the refitted best pipeline on the held-out test set.
score = grid_tfidf.best_estimator_.score(X_te_tfidf, y_te)
score

0.5995201535508637

In [21]:
# Overwrite 'y_pred' with the predictions of the best SMOTE + tf-idf pipeline,
# then count predictions per class.
test_data['y_pred'] = grid_tfidf.best_estimator_.predict(X_te_tfidf)
test_data.groupby(by='y_pred').size()

y_pred
0.0    4697
1.0    5723
dtype: int64

In [22]:
# Per-class accuracy for the tf-idf predictions currently stored in 'y_pred':
# diagonal of the confusion matrix divided by each true class's row total.
matrix = confusion_matrix(y_true=y_te, y_pred=test_data['y_pred'])
print(matrix)
np.diag(matrix) / matrix.sum(axis=1)

[[2510 1986]
 [2187 3737]]


array([0.55827402, 0.63082377])

# Grid Search to tune parameters

In [23]:
# Grid-search C for each feature representation. These assignments rebind
# model_bow and model_bow_tr from the earlier plain classifiers to fitted
# GridSearchCV objects; model_bow_tr is the n-gram model.
model_bow = grid_logistic_classify(
    X_tr=X_tr_bow, y_tr=y_tr, X_test=X_te_bow, y_test=y_te, description='bow'
)
model_bow_tr = grid_logistic_classify(
    X_tr=X_tr_ngram, y_tr=y_tr, X_test=X_te_ngram, y_test=y_te, description='n-grams'
)
model_tfidf_transform = grid_logistic_classify(
    X_tr=X_tr_tfidf, y_tr=y_tr, X_test=X_te_tfidf, y_test=y_te, description='tfidf'
)

Test Score with bow features 0.6385796545105566
Test Score with n-grams features 0.5938579654510556
Test Score with tfidf features 0.6458733205374281


In [28]:
# Hyper-parameters selected by the grid search on the tf-idf features.
model_tfidf_transform.best_params_

{'C': 0.001}

In [25]:
#model_bow.best_estimator_.coef_

In [34]:
# Overwrite 'y_pred' with the predictions of the grid-searched tf-idf model,
# then count predictions per class.
test_data['y_pred'] = model_tfidf_transform.best_estimator_.predict(X_te_tfidf)
test_data.groupby(by='y_pred').size()

y_pred
0.0    3606
1.0    6814
dtype: int64

In [27]:
# Per-class accuracy for the predictions currently stored in 'y_pred':
# diagonal of the confusion matrix divided by each true class's row total.
matrix = confusion_matrix(y_true=y_te, y_pred=test_data['y_pred'])
print(matrix)
np.diag(matrix) / matrix.sum(axis=1)

[[2206 2290]
 [1400 4524]]


array([0.49065836, 0.76367319])

In [50]:
from sklearn.metrics import f1_score

# f1_score takes (y_true, y_pred): true labels first, predictions second.
f1_score(y_te, test_data['y_pred'])

0.7103155911446065

# Feature importance of the best models

In [57]:
# Rank terms by the absolute value of their coefficient in each grid-searched
# model. The target is binary, so coef_ has a single row, and taking abs()
# means a listed term may favour either class.
print('most important features')
print('--------> bow')
bow_coef0 = abs(model_bow.best_estimator_.coef_[0])
imp0 = bow_coef0.argsort()[-20:][::-1]
# get_feature_names() no longer exists in recent scikit-learn releases; prefer
# get_feature_names_out() and fall back to the old name where it is missing.
features = (getattr(tr_bow, 'get_feature_names_out', None) or tr_bow.get_feature_names)()
print([str(features[index]) for index in imp0])
print('--------> ngram')
# model_bow_tr is the model trained on the n-gram features.
ngram_coef0 = abs(model_bow_tr.best_estimator_.coef_[0])
imp0 = ngram_coef0.argsort()[-20:][::-1]
features = (getattr(tr_ngram, 'get_feature_names_out', None) or tr_ngram.get_feature_names)()
print([str(features[index]) for index in imp0])
print('--------> tfidf')
tfidf_coef0 = abs(model_tfidf_transform.best_estimator_.coef_[0])
imp0 = tfidf_coef0.argsort()[-20:][::-1]
# The tf-idf matrix was built from unigram counts over the same training
# tokens as the bag-of-words matrix (same reported vocabulary size), so the
# bag-of-words vectorizer supplies the column names.
features = (getattr(tr_bow, 'get_feature_names_out', None) or tr_bow.get_feature_names)()
print([str(features[index]) for index in imp0])

most important features
--------> bow
['opinion', 'meghan', 'trumps', 'trump', 'pardons', 'book', 'nuclear', 'sending', 'challenges', 'offices', 'violent', 'arrested', 'amid', 'reporting', 'saturday', 'original', 'ship', 'dog', 'vatican', 'recount']
--------> ngram
['watch president elect', 'impeach president trump', 'house impeachment managers', 'sputnik v vaccine', 'derek chauvin trial', 'icymi president joe', 'stuck suez canal', 'joe bidens election', 'european super league', 'breaking news president', 'donald trump said', 'joe bidens win', 'five things watch', 'black lives matter', 'national security law', 'breaking joe biden', 'british prime minister', 'indias daily covid', 'biden infrastructure plan', 'opinion joe bidens']
--------> tfidf
['trump', 'trumps', 'opinion', 'biden', 'president', 'us', 'meghan', 'arrested', 'writes', 'amid', 'analysis', 'impeachment', 'democrats', 'icymi', 'book', 'republicans', 'nuclear', 'bidens', 'pardons', 'challenges']
