# Classification with logistic regression

## Tweets

In [1]:
import pandas as pd
import numpy as np
import re
import nltk
from nltk.corpus import stopwords

In [4]:
# Load the tweet dataset from CSV, print its (rows, columns) shape, and preview the first rows.
tweets = pd.read_csv('data/tweets_all.csv')
print(tweets.shape)
tweets.head()

(20007, 9)


Unnamed: 0,id,topic,source,text,replyCount,vaderMean,vaderStd,vaderCatLabel,vaderCat
0,1377384607969013765,,FoxNews,border patrol video shows smugglers abandoning...,108,-0.045958,0.495337,high,1.0
1,1377367836046192641,,FoxNews,gop rep urges hhs to halt reported plan to rel...,80,0.043459,0.495874,high,1.0
2,1377339316616097797,,FoxNews,see it nasas curiosity rover takes mars selfie,17,0.052776,0.322931,low,0.0
3,1377319049487642625,,FoxNews,trump moving forward with plans to start socia...,1147,0.005348,0.451982,high,1.0
4,1377316510939684883,,FoxNews,san diego teachers given option to teach migra...,311,-0.051989,0.46337,high,1.0


In [5]:
# Number of tweets per value of the `vaderCat` label (class-balance check).
tweets.groupby(by='vaderCat').size()

vaderCat
0.0     6611
1.0    13396
dtype: int64

### Pre-processing

In [6]:
# Cache for the NLTK English stopword list, so the corpus is read once
# instead of once per tweet.
_STOPWORD_CACHE = {}


def _english_stopwords():
    """Return the NLTK English stopwords as a frozenset, loading them on first use."""
    if 'english' not in _STOPWORD_CACHE:
        _STOPWORD_CACHE['english'] = frozenset(stopwords.words("english"))
    return _STOPWORD_CACHE['english']


def tokenize_text(text, remove_stopwords=True):
    """Clean a raw tweet string and split it into tokens.

    Parameters
    ----------
    text : str
        Raw tweet text.
    remove_stopwords : bool, default True
        When True, whitespace-separated words found in NLTK's English
        stopword list are dropped before tokenization. Matching is
        case-sensitive.

    Returns
    -------
    list of str
        Tokens produced by ``nltk.WordPunctTokenizer`` on the cleaned text.
    """
    # Remove each URL up to the next whitespace only; a greedy `.*` would also
    # delete every word that follows the URL on the same line.
    text = re.sub(r'https?://\S+', '', text)
    text = re.sub(r'\<a href', ' ', text)
    text = re.sub(r'&amp;', '', text)
    # Strip `<br />` BEFORE the punctuation pass: that pass turns '/' into a
    # space, after which the literal '<br />' could never match.
    text = re.sub(r'<br />', ' ', text)
    text = re.sub(r'[_"\-;%()|+&=*.,!?:#$@\[\]/]', ' ', text)
    text = re.sub(r'\'', ' ', text)

    if remove_stopwords:
        stops = _english_stopwords()
        text = " ".join(word for word in text.split() if word not in stops)

    return nltk.WordPunctTokenizer().tokenize(text)

In [7]:
# Tokenize every tweet into a new `tokens` column. `astype(str)` stringifies any
# non-string cell first, so `tokenize_text` always receives a str.
tweets['tokens'] = tweets['text'].astype(str).apply(tokenize_text)
tweets['tokens']

0        [border, patrol, video, shows, smugglers, aban...
1        [gop, rep, urges, hhs, halt, reported, plan, r...
2        [see, nasas, curiosity, rover, takes, mars, se...
3        [trump, moving, forward, plans, start, social,...
4        [san, diego, teachers, given, option, teach, m...
                               ...                        
20002    [factbox, joe, biden, win, could, mean, financ...
20003    [proud, boys, emboldened, trumps, words, lgbtq...
20004    [biden, makes, campaign, pitch, miamis, little...
20005    [trump, administration, offered, public, confl...
20006    [one, person, killed, two, seriously, injured,...
Name: tokens, Length: 20007, dtype: object

### Feature extraction

In [8]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.feature_extraction import text
from sklearn.feature_extraction.text import CountVectorizer,TfidfVectorizer
import warnings
warnings.filterwarnings('ignore')

In [9]:
# 70/30 train/test split with a fixed seed; the target is the `vaderCat` column.
# NOTE(review): the split is not stratified on `vaderCat` although the classes are
# imbalanced (6611 vs 13396) — confirm this is intended.
training_data, test_data = train_test_split(tweets, train_size = 0.7, random_state=42)
y_tr = training_data['vaderCat']
y_te = test_data['vaderCat']
print(training_data.shape)
print(test_data.shape)

(14004, 10)
(6003, 10)


In [10]:
def feature_extraction(train, test, description, ngram_range=(3, 3)):
    '''
    Build a document-term matrix from pre-tokenized documents.

    Parameters
    ----------
    train, test : iterable of list of str
        Token lists; the vocabulary is fitted on ``train`` only and then
        applied to ``test``.
    description : {"bow", "ngram", "tfidf"}
        "bow"   -> unigram counts.
        "ngram" -> counts of token n-grams (see ``ngram_range``).
        "tfidf" -> unigram counts re-weighted by an un-normalised
                   ``TfidfTransformer`` (``norm=None``).
    ngram_range : (int, int), default (3, 3)
        Min/max n-gram order, used only when ``description == "ngram"``.

    Returns
    -------
    (X_tr, X_te, transform)
        The train and test matrices and the last fitted transformer. For
        "bow"/"ngram" that is the ``CountVectorizer``; for "tfidf" it is the
        ``TfidfTransformer`` (the underlying vectorizer is not returned).

    Raises
    ------
    ValueError
        If ``description`` is not one of the supported values.
    '''
    if description not in ("bow", "ngram", "tfidf"):
        raise ValueError(
            "description must be 'bow', 'ngram' or 'tfidf', got %r" % (description,)
        )

    # Documents are already token lists, so the tokenizer is the identity and
    # lowercasing is disabled (it would be applied to the list, not to strings).
    if description == "ngram":
        # scikit-learn expects a tuple here; a list is rejected by newer versions.
        vectorizer = CountVectorizer(tokenizer=lambda doc: doc,
                                     ngram_range=tuple(ngram_range),
                                     lowercase=False)
    else:
        vectorizer = CountVectorizer(tokenizer=lambda doc: doc, lowercase=False)

    X_tr = vectorizer.fit_transform(train)
    X_te = vectorizer.transform(test)
    print(description, ': the size of the voc is', len(vectorizer.vocabulary_))

    transform = vectorizer
    if description == "tfidf":
        # Re-weight the raw counts; fitted on the training matrix only.
        transform = text.TfidfTransformer(norm=None)
        X_tr = transform.fit_transform(X_tr)
        X_te = transform.transform(X_te)

    print('The shape of the df is:', X_tr.shape)

    return X_tr, X_te, transform

In [11]:
# Build the three feature sets (bag of words, n-grams, tf-idf) from the same token
# lists; each call fits on the training tokens and transforms the test tokens.
X_tr_bow, X_te_bow, tr_bow = feature_extraction(training_data['tokens'], test_data['tokens'], "bow")
X_tr_ngram, X_te_ngram, tr_ngram = feature_extraction(training_data['tokens'], test_data['tokens'], "ngram")
X_tr_tfidf, X_te_tfidf, tr_tfidf = feature_extraction(training_data['tokens'], test_data['tokens'], "tfidf")

bow : the size of the voc is 18771
The shape of the df is: (14004, 18771)
ngram : the size of the voc is 138801
The shape of the df is: (14004, 138801)
tfidf : the size of the voc is 18771
The shape of the df is: (14004, 18771)


In [12]:
# Peek at entries 1..29 of the n-gram vocabulary.
# `get_feature_names()` was removed from newer scikit-learn releases in favour of
# `get_feature_names_out()`; fall back to the old name only when the new one is absent.
ngram_terms = (tr_ngram.get_feature_names_out()
               if hasattr(tr_ngram, 'get_feature_names_out')
               else tr_ngram.get_feature_names())
list(ngram_terms[1:30])

['aapi community indeed',
 'aapi hate group',
 'aapisherose trending twitter',
 'aari mcdonald carried',
 'aaron baseball legend',
 'aaron danielson supporter',
 'aaron rodgers hilarious',
 'aaron rodgers packers',
 'aaron rodgers takes',
 'aaron wrote nasty',
 'aback erdogan colleague',
 'aback turkish president',
 'abajo durante cinco',
 'abandon claims u',
 'abandon measures global',
 'abandon talks bold',
 'abandoned apartment blocks',
 'abandoned breakaway project',
 'abandoned campfire driven',
 'abandoned contentious plan',
 'abandoned donkeys pampered',
 'abandoned oil gas',
 'abandoned pets refers',
 'abandoned president trump',
 'abandoned soccer competition',
 'abandoned town pripyat',
 'abandoned vessel adrift',
 'abandoning myanmar gas',
 'abandoning pledge work']

### Logistic Regression

In [35]:
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import GridSearchCV

# Candidate values for LogisticRegression's `C` parameter, searched by grid_logistic_classify.
param_grid_ = {'C': [1e-5, 1e-3, 1e-1, 1e0, 1e1, 1e2]}

def simple_logistic_classify(X_tr, y_tr, X_test, y_test, description, _C=1.0):
    """Fit one logistic regression and report its score on the test set.

    Parameters
    ----------
    X_tr, y_tr : training features and labels.
    X_test, y_test : held-out features and labels used for scoring.
    description : label for the feature set, used only in the printed message.
    _C : value passed to ``LogisticRegression(C=...)``.

    Returns
    -------
    The fitted ``LogisticRegression`` instance.
    """
    classifier = LogisticRegression(C=_C)
    classifier.fit(X_tr, y_tr)
    test_score = classifier.score(X_test, y_test)
    print('Test Score with', description, 'features', test_score)
    return classifier

def grid_logistic_classify(X_tr, y_tr, X_test, y_test, description, param_grid=None):
    """Tune a logistic regression with 5-fold grid search and report its test score.

    Parameters
    ----------
    X_tr, y_tr : training features and labels used for the cross-validated search.
    X_test, y_test : held-out features and labels used for the reported score.
    description : label for the feature set, used only in the printed message.
    param_grid : dict, optional
        Grid passed to ``GridSearchCV``. When omitted, the module-level
        ``param_grid_`` is used. Previously this argument was silently
        ignored and ``param_grid_`` was always used.

    Returns
    -------
    The fitted ``GridSearchCV`` object (use ``.best_estimator_`` /
    ``.best_params_`` to inspect the selected model).
    """
    if param_grid is None:
        # Resolve the default at call time, matching the original lookup of the global.
        param_grid = param_grid_
    grid = GridSearchCV(LogisticRegression(), cv=5, param_grid=param_grid)
    model = grid.fit(X_tr, y_tr)
    score = model.best_estimator_.score(X_test, y_test)
    print('Test Score with', description, 'features', score)
    return model

In [23]:
# Baseline logistic regression (default C) on each feature set.
# NOTE: despite its name, `model_bow_tr` holds the model trained on the n-gram features.
model_bow = simple_logistic_classify(X_tr_bow, y_tr, X_te_bow, y_te, 'bow')
model_bow_tr = simple_logistic_classify(X_tr_ngram, y_tr, X_te_ngram, y_te, 'n-grams')
model_tfidf_tr = simple_logistic_classify(X_tr_tfidf, y_tr, X_te_tfidf, y_te, 'tfidf')

Test Score with bow features 0.6683325004164584
Test Score with n-grams features 0.6759953356655006
Test Score with tfidf features 0.6358487422955189


In [24]:
# True class counts in the test set, for comparison with the predicted counts below.
test_data.groupby(by='vaderCat').size()

vaderCat
0.0    2003
1.0    4000
dtype: int64

In [25]:
# Store the baseline tf-idf model's test predictions in a `y_pred` column and count
# how many tweets were assigned to each class.
test_data['y_pred'] = model_tfidf_tr.predict(X_te_tfidf)
test_data.groupby(by='y_pred').size()

y_pred
0.0    1969
1.0    4034
dtype: int64

In [26]:
# Per-class recall: the confusion-matrix diagonal (correct predictions per true class)
# divided by the row sums (number of test tweets in each true class).
from sklearn.metrics import confusion_matrix
matrix = confusion_matrix(y_te, test_data['y_pred'])
print(matrix)
matrix.diagonal()/matrix.sum(axis=1)

[[ 893 1110]
 [1076 2924]]


array([0.44583125, 0.731     ])

### Feature importance

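In the multiclass case, the training algorithm uses the one-vs-rest (OvR) scheme if the ‘multi_class’ option is set to ‘ovr’, and uses the cross-entropy loss if the ‘multi_class’ option is set to ‘multinomial’. Note that the target used here (`vaderCat`) is binary, so each fitted model has a single coefficient vector (`coef_[0]`), which is what the cell below inspects.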

In [27]:
def _vocab_terms(vectorizer):
    """Return a vectorizer's feature names, on both old and new scikit-learn versions."""
    if hasattr(vectorizer, 'get_feature_names_out'):
        return list(vectorizer.get_feature_names_out())
    return vectorizer.get_feature_names()

# The target is binary, so `coef_` has a single row. Its largest positive weights
# are the features that push a prediction hardest toward the second class in
# `classes_` (vaderCat == 1.0), not toward class 0.
print('most important features')
print('--------> bow')
bow_coef0 = model_bow.coef_[0]
# argsort is ascending: take the last 20 indices and reverse for largest-first.
imp0 = bow_coef0.argsort()[-20:][::-1]
features = _vocab_terms(tr_bow)
print([features[index] for index in imp0])
print('--------> ngram')
# `model_bow_tr` is the model trained on the n-gram features.
ngram_coef0 = model_bow_tr.coef_[0]
imp0 = ngram_coef0.argsort()[-20:][::-1]
features = _vocab_terms(tr_ngram)
print([features[index] for index in imp0])

most important features
--------> bow
['guns', 'hate', 'atlanta', 'wrong', 'chauvin', 'xinjiang', 'cover', 'origin', 'levels', 'cuomos', 'freed', 'slams', 'freedom', 'vast', 'limited', 'regional', 'welcomed', 'fda', 'jacob', 'elite']
--------> ngram
['trial derek chauvin', 'democratic national convention', 'derek chauvin trial', 'judge amy coney', 'first presidential debate', 'new york times', 'president trump said', 'presidential nominee joe', 'joe biden said', 'president trump says', 'year old boy', 'black lives matter', 'president donald trump', 'trump joe biden', 'democratic presidential nominee', 'shooting jacob blake', 'tested positive coronavirus', 'united arab emirates', 'president donald trumps', 'speaker nancy pelosi']


### SMOTE

In [28]:
from imblearn.pipeline import Pipeline, make_pipeline
from imblearn.over_sampling import SMOTE

In [29]:
# Bag-of-words features: pipeline of a SMOTE resampling step followed by logistic regression.
pipeline = make_pipeline((SMOTE(random_state=0)), LogisticRegression(random_state=0))
# Hyper-parameter grid; keys are prefixed with the pipeline step name `logisticregression`.
params = {'logisticregression__penalty': ['l2'],
          'logisticregression__C': [0.01, 0.1, 1, 10, 100],
          'logisticregression__solver': ['lbfgs']}

# 10-fold cross-validated search using the estimator's default scorer.
grid_bow = GridSearchCV(estimator=pipeline,
                    param_grid=params,
                    cv=10,
                    return_train_score=True,
                    #scoring= ['accuracy', 'precision', 'recall'],
                    #refit = 'recall'
                     )

# Fit on the training matrix, then score the refitted best pipeline on the held-out test set.
grid_bow.fit(X_tr_bow, y_tr)
score = grid_bow.best_estimator_.score(X_te_bow, y_te)
score

0.6625020822921872

In [30]:
# N-gram features: pipeline of a SMOTE resampling step followed by logistic regression.
# NOTE(review): this cell passes multi_class="multinomial" while the bag-of-words
# SMOTE cell does not, so the two are not fitted with identical settings — confirm
# whether that difference is intended.
pipeline = make_pipeline((SMOTE(random_state=0)), LogisticRegression(multi_class="multinomial", random_state=0))
# Hyper-parameter grid; keys are prefixed with the pipeline step name `logisticregression`.
params = {'logisticregression__penalty': ['l2'],
          'logisticregression__C': [0.01, 0.1, 1, 10, 100],
          'logisticregression__solver': ['lbfgs']}

# 10-fold cross-validated search using the estimator's default scorer.
grid_ngram = GridSearchCV(estimator=pipeline,
                    param_grid=params,
                    cv=10,
                    return_train_score=True,
                    #scoring= ['accuracy', 'precision', 'recall'],
                    #refit = 'recall'
                     )

# Fit on the n-gram training matrix, then score the refitted best pipeline on the test set.
grid_ngram.fit(X_tr_ngram, y_tr)
score = grid_ngram.best_estimator_.score(X_te_ngram, y_te)
score

0.4586040313176745

In [31]:
# Tf-idf features: pipeline of a SMOTE resampling step followed by logistic regression.
# NOTE(review): this cell passes multi_class="multinomial" while the bag-of-words
# SMOTE cell does not, so the two are not fitted with identical settings — confirm
# whether that difference is intended.
pipeline = make_pipeline((SMOTE(random_state=0)), LogisticRegression(multi_class="multinomial", random_state=0))
# Hyper-parameter grid; keys are prefixed with the pipeline step name `logisticregression`.
params = {'logisticregression__penalty': ['l2'],
          'logisticregression__C': [0.01, 0.1, 1, 10, 100],
          'logisticregression__solver': ['lbfgs']}

# 10-fold cross-validated search using the estimator's default scorer.
grid_tfidf = GridSearchCV(estimator=pipeline,
                    param_grid=params,
                    cv=10,
                    return_train_score=True,
                    #scoring= ['accuracy', 'precision', 'recall'],
                    #refit = 'recall'
                     )

# Fit on the tf-idf training matrix, then score the refitted best pipeline on the test set.
grid_tfidf.fit(X_tr_tfidf, y_tr)
score = grid_tfidf.best_estimator_.score(X_te_tfidf, y_te)
score

0.6516741629185407

In [32]:
# Overwrite `y_pred` with the predictions of the best SMOTE + tf-idf pipeline and
# count how many tweets were assigned to each class.
test_data['y_pred'] = grid_tfidf.best_estimator_.predict(X_te_tfidf)
test_data.groupby(by='y_pred').size()

y_pred
0.0    1990
1.0    4013
dtype: int64

In [33]:
# Per-class recall for the SMOTE + tf-idf pipeline: confusion-matrix diagonal divided
# by the row sums (number of test tweets in each true class).
matrix = confusion_matrix(y_te, test_data['y_pred'])
print(matrix)
matrix.diagonal()/matrix.sum(axis=1)

[[ 951 1052]
 [1039 2961]]


array([0.47478782, 0.74025   ])

### Grid Search

In [36]:
# Re-fit each feature set with the cross-validated grid search over C.
# NOTE: this rebinds `model_bow` and `model_bow_tr` (previously plain LogisticRegression
# models) to the fitted search objects returned by grid_logistic_classify;
# `model_bow_tr` is again the n-gram model.
model_bow = grid_logistic_classify(X_tr_bow, y_tr, X_te_bow, y_te, 'bow')
model_bow_tr = grid_logistic_classify(X_tr_ngram, y_tr, X_te_ngram, y_te, 'n-grams')
model_tfidf_transform = grid_logistic_classify(X_tr_tfidf, y_tr, X_te_tfidf, y_te, 'tfidf')

Test Score with bow features 0.6873230051640846
Test Score with n-grams features 0.6759953356655006
Test Score with tfidf features 0.6898217557887723


In [37]:
# Best hyper-parameters selected by the grid search for the n-gram model.
model_bow_tr.best_params_

{'C': 1.0}

In [38]:
#model_bow.best_estimator_.coef_

In [39]:
# Overwrite `y_pred` with the predictions of the grid-searched tf-idf model and count
# how many tweets were assigned to each class.
test_data['y_pred'] = model_tfidf_transform.best_estimator_.predict(X_te_tfidf)
test_data.groupby(by='y_pred').size()

y_pred
0.0     685
1.0    5318
dtype: int64

In [40]:
# Per-class recall for the grid-searched tf-idf model: confusion-matrix diagonal
# divided by the row sums (number of test tweets in each true class).
matrix = confusion_matrix(y_te, test_data['y_pred'])
print(matrix)
matrix.diagonal()/matrix.sum(axis=1)

[[ 413 1590]
 [ 272 3728]]


array([0.20619071, 0.932     ])