# Classification with logistic regression

## Tweets

In [1]:
import pandas as pd
import numpy as np
import re
import nltk
from nltk.corpus import stopwords

In [18]:
# Load the scored tweets from CSV, then show the frame's dimensions and its first rows.
tweets = pd.read_csv('data/tweets_w_scores.csv')
print(tweets.shape)
tweets.head()

(20698, 9)


Unnamed: 0,id,topic,source,text,replyCount,vaderMean,vaderStd,vaderCatLabel,vaderCat
0,1377385383168765952,,FoxNews,activists protest renaming chicago school afte...,306,-0.05283,0.445459,medium,1.0
1,1377384607969013765,,FoxNews,border patrol video shows smugglers abandoning...,108,-0.045958,0.495337,medium,1.0
2,1377384339105669122,,FoxNews,cause of tiger woods car crash determined but ...,169,-0.034919,0.424833,medium,1.0
3,1377367836046192641,,FoxNews,gop rep urges hhs to halt reported plan to rel...,80,0.043459,0.495874,medium,1.0
4,1377358399759785987,,FoxNews,some democrats trying to stop iowa new hampshi...,96,-0.040135,0.433053,medium,1.0


In [17]:
# Class balance: number of tweets per vaderCat label.
tweets.groupby(by='vaderCat').size()

vaderCat
0.0     5804
1.0    10094
2.0     4800
dtype: int64

### Pre-processing

In [3]:
# Cache of stopword sets keyed by language, so the NLTK corpus file is read
# once instead of once per tweet.
_STOPWORDS_BY_LANGUAGE = {}


def _cached_stopwords(language="english"):
    """Return the NLTK stopword set for `language`, loading it on first use only."""
    if language not in _STOPWORDS_BY_LANGUAGE:
        _STOPWORDS_BY_LANGUAGE[language] = frozenset(stopwords.words(language))
    return _STOPWORDS_BY_LANGUAGE[language]


def tokenize_text(text, remove_stopwords=True):
    """Clean a raw tweet string and split it into tokens.

    Parameters
    ----------
    text : str
        Raw text of a single tweet.
    remove_stopwords : bool, default True
        When True, drop whitespace-separated words found in NLTK's English
        stopword list before tokenizing. Matching is case-sensitive.

    Returns
    -------
    list of str
        Tokens produced by ``nltk.WordPunctTokenizer`` on the cleaned text.
    """

    # Format words and remove unwanted characters
    # NOTE: the URL pattern is greedy to the end of the line, so any text that
    # follows a link on the same line is removed together with the link.
    text = re.sub(r'https?:\/\/.*[\r\n]*', '', text, flags=re.MULTILINE)
    text = re.sub(r'\<a href', ' ', text)
    text = re.sub(r'&amp;', '', text)
    text = re.sub(r'[_"\-;%()|+&=*%.,!?:#$@\[\]/]', ' ', text)
    text = re.sub(r'<br />', ' ', text)
    text = re.sub(r'\'', ' ', text)

    # remove stopwords
    if remove_stopwords:
        stops = _cached_stopwords("english")
        text = " ".join(w for w in text.split() if w not in stops)

    # Tokenize each word
    return nltk.WordPunctTokenizer().tokenize(text)

In [4]:
# Tokenize every tweet into a new 'tokens' column.
# The text is cast to str first, presumably to guard against non-string values (e.g. NaN) - confirm.
tweets['tokens'] = tweets['text'].astype(str).apply(tokenize_text)
tweets['tokens']

0        [activists, protest, renaming, chicago, school...
1        [border, patrol, video, shows, smugglers, aban...
2        [cause, tiger, woods, car, crash, determined, ...
3        [gop, rep, urges, hhs, halt, reported, plan, r...
4        [democrats, trying, stop, iowa, new, hampshire...
                               ...                        
20693    [u, n, special, envoy, tells, security, counci...
20694    [wisconsin, high, court, voids, governors, mas...
20695    [analysis, biden, infrastructure, plan, bets, ...
20696    [analysis, deliveroos, flop, wake, call, tech,...
20697    [defense, tells, canada, court, huawei, cfos, ...
Name: tokens, Length: 20698, dtype: object

### Feature extraction

In [5]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.feature_extraction import text
from sklearn.feature_extraction.text import CountVectorizer,TfidfVectorizer
import warnings
warnings.filterwarnings('ignore')

In [6]:
# Split the tweets 70% train / remainder test; random_state is fixed so the split is repeatable.
training_data, test_data = train_test_split(tweets, train_size = 0.7, random_state=42)
# Targets are the vaderCat class labels of each split.
y_tr = training_data['vaderCat']
y_te = test_data['vaderCat']
print(training_data.shape)
print(test_data.shape)

(14488, 10)
(6210, 10)


In [84]:
def feature_extraction(train, test, description):
    '''
    Extracts features for Bag of words ("bow"), Bag of word trigrams
    ("ngram") or Tf-Idf ("tfidf").

    Parameters
    ----------
    train, test : iterables of token lists
        Pre-tokenized documents. The vectorizer is fitted on `train` only
        and then applied to `test`.
    description : str
        One of "bow", "ngram" or "tfidf".

    Returns
    -------
    X_tr, X_te : feature matrices for `train` and `test`
    transform : the last fitted transformer. For "tfidf" this is the
        TfidfTransformer, not the CountVectorizer that produced the counts.

    Raises
    ------
    ValueError
        If `description` is not one of the supported feature types.
    '''
    # n-gram range used by the count vectorizer for each feature type.
    # Tuples are used because recent scikit-learn versions reject lists here.
    ngram_ranges = {"bow": (1, 1), "tfidf": (1, 1), "ngram": (3, 3)}
    if description not in ngram_ranges:
        raise ValueError(
            "Unknown feature description {!r}; expected one of {}".format(
                description, sorted(ngram_ranges)))

    # Documents are already token lists, so the tokenizer is the identity and
    # lowercasing (which expects strings) is disabled.
    transform = CountVectorizer(tokenizer=lambda doc: doc,
                                ngram_range=ngram_ranges[description],
                                lowercase=False)
    X_tr = transform.fit_transform(train)
    X_te = transform.transform(test)
    print(description, ': the size of the voc is', len(transform.vocabulary_))

    if description == "tfidf":
        # Re-weight the raw counts; norm=None keeps rows un-normalized.
        transform = text.TfidfTransformer(norm=None)
        X_tr = transform.fit_transform(X_tr)
        X_te = transform.transform(X_te)

    print('The shape of the df is:', X_tr.shape)

    return X_tr, X_te, transform

In [85]:
# Build the three feature sets (bag of words, trigrams, tf-idf) from the same train/test token columns.
X_tr_bow, X_te_bow, tr_bow = feature_extraction(training_data['tokens'], test_data['tokens'], "bow")
X_tr_ngram, X_te_ngram, tr_ngram = feature_extraction(training_data['tokens'], test_data['tokens'], "ngram")
X_tr_tfidf, X_te_tfidf, tr_tfidf = feature_extraction(training_data['tokens'], test_data['tokens'], "tfidf")

bow : the size of the voc is 18955
The shape of the df is: (14488, 18955)
ngram : the size of the voc is 134805
The shape of the df is: (14488, 134805)
tfidf : the size of the voc is 18955
The shape of the df is: (14488, 18955)


In [89]:
# Peek at the trigram vocabulary.
# NOTE(review): the slice starts at index 1, so the first feature is skipped - confirm this is intended.
# NOTE(review): get_feature_names() was removed in scikit-learn 1.2 in favour of get_feature_names_out().
tr_ngram.get_feature_names()[1:30]

['aapi community indeed',
 'aapi hate group',
 'aardvark local monsters',
 'aari mcdonald carried',
 'aaron donald attacked',
 'aaron donalds attorney',
 'aaron rodgers packers',
 'aaron rodgers still',
 'aaron wrote nasty',
 'ababu negash said',
 'ababu said town',
 'aback find fellow',
 'abajo durante cinco',
 'abandon claims u',
 'abandon measures global',
 'abandoned apartment blocks',
 'abandoned breakaway project',
 'abandoned campfire driven',
 'abandoned contentious plan',
 'abandoned donkeys pampered',
 'abandoned mannequins price',
 'abandoned neighbors fled',
 'abandoned pets refers',
 'abandoned plans build',
 'abandoned push neera',
 'abandoned soccer competition',
 'abandoned three years',
 'abandoned town pripyat',
 'abandoned vessel adrift']

### Logistic Regression

In [165]:
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import GridSearchCV

# Default grid of LogisticRegression C values used by grid_logistic_classify.
param_grid_ = {'C': [1e-5, 1e-3, 1e-1, 1e0, 1e1, 1e2]}

def simple_logistic_classify(X_tr, y_tr, X_test, y_test, description, _C=1.0):
    """Fit a multinomial logistic regression and report its test score.

    Parameters
    ----------
    X_tr, y_tr : training features and labels.
    X_test, y_test : held-out features and labels used only for scoring.
    description : label for the feature set, used in the printed message.
    _C : value passed to LogisticRegression as ``C``.

    Returns
    -------
    The fitted LogisticRegression model.
    """
    classifier = LogisticRegression(multi_class="multinomial", C=_C)
    classifier.fit(X_tr, y_tr)
    test_score = classifier.score(X_test, y_test)
    print('Test Score with', description, 'features', test_score)
    return classifier

def grid_logistic_classify(X_tr, y_tr, X_test, y_test, description, param_grid=param_grid_):
    """Tune a multinomial logistic regression by 5-fold grid search and report its test score.

    Parameters
    ----------
    X_tr, y_tr : training features and labels used for the cross-validated search.
    X_test, y_test : held-out features and labels used only for scoring.
    description : label for the feature set, used in the printed message.
    param_grid : parameter grid handed to GridSearchCV (defaults to ``param_grid_``).

    Returns
    -------
    The fitted GridSearchCV object (see ``best_estimator_`` / ``best_params_``).
    """
    # Use the directly imported GridSearchCV: the bare name `sklearn` is never
    # bound by the notebook's `from sklearn... import ...` statements.
    # Honour the caller's `param_grid` instead of always reading the global.
    grid = GridSearchCV(LogisticRegression(multi_class="multinomial"), cv=5, param_grid=param_grid)
    model = grid.fit(X_tr, y_tr)
    score = model.best_estimator_.score(X_test, y_test)
    print('Test Score with', description, 'features', score)
    return model

In [166]:
# Fit one default-C logistic regression per feature set and print each test score.
# NOTE(review): despite its name, model_bow_tr holds the model trained on the n-gram features.
model_bow = simple_logistic_classify(X_tr_bow, y_tr, X_te_bow, y_te, 'bow')
model_bow_tr = simple_logistic_classify(X_tr_ngram, y_tr, X_te_ngram, y_te, 'n-grams')
model_tfidf_tr = simple_logistic_classify(X_tr_tfidf, y_tr, X_te_tfidf, y_te, 'tfidf')

Test Score with bow features 0.46296296296296297
Test Score with n-grams features 0.4877616747181965
Test Score with tfidf features 0.42962962962962964


In [167]:
# True class distribution in the test split, for comparison with the predicted distribution.
test_data.groupby(by='vaderCat').size()

vaderCat
0.0    1722
1.0    3068
2.0    1420
dtype: int64

In [168]:
# Store the tf-idf model's predictions on the test frame and count predictions per class.
# NOTE: this mutates test_data in place; later cells overwrite the same 'y_pred' column.
test_data['y_pred'] = model_tfidf_tr.predict(X_te_tfidf)
test_data.groupby(by='y_pred').size()

y_pred
0.0    1717
1.0    3041
2.0    1452
dtype: int64

In [169]:
# Accuracy by category
# Confusion matrix of true labels (y_te) against the stored predictions, followed by
# the diagonal divided by each row's total, i.e. the share of each true class predicted correctly.
from sklearn.metrics import confusion_matrix
#test_data.loc[test_data['vaderCat'] != test_data['y_pred']].groupby(by='vaderCat').size()
matrix = confusion_matrix(y_te, test_data['y_pred'])
print(matrix)
matrix.diagonal()/matrix.sum(axis=1)

[[ 622  748  352]
 [ 766 1624  678]
 [ 329  669  422]]


array([0.3612079 , 0.52933507, 0.2971831 ])

### Feature importance

In the multiclass case, the training algorithm uses the one-vs-rest (OvR) scheme if the ‘multi_class’ option is set to ‘ovr’, and uses the cross-entropy loss if the ‘multi_class’ option is set to ‘multinomial’.

In [144]:
#class 0
# For the first coefficient row (class 0), list the 20 features with the largest
# coefficients, in descending order, for the bag-of-words and n-gram models.
print('most important features category 0')
print('--------> bow')
bow_coef0 = model_bow.coef_[0]
# argsort is ascending: take the last 20 indices and reverse them.
imp0 = bow_coef0.argsort()[-20:][::-1]
features = tr_bow.get_feature_names()
print([features[index] for index in imp0])
print('--------> ngram')
# model_bow_tr is the model fitted on the n-gram features.
ngram_coef0 = model_bow_tr.coef_[0]
imp0 = ngram_coef0.argsort()[-20:][::-1]
features = tr_ngram.get_feature_names()
print([features[index] for index in imp0])

most important features category 0
--------> bow
['expelled', 'trans', 'pull', 'sparked', 'distancing', 'prayers', 'snapped', 'rally', 'switzerland', 'seed', 'urging', 'placebo', 'massing', 'chains', 'drives', 'ontario', 'surviving', 'resolve', 'void', 'grandfather']
--------> ngram
['pfizer covid vaccine', 'icymi jenny narumi', 'winners bafta film', 'bafta film awards', 'indias daily covid', 'new mainland covid', 'mainland covid cases', 'reports new mainland', 'icymi greenland huge', 'covid cases hours', 'suez canal says', 'icymi microsoft talks', 'india reports record', 'tells fell horse', 'putin tells fell', 'biontech covid vaccine', 'icymi russias rugball', 'people intensive care', 'million first time', 'international space station']


In [146]:
#class 1
# For the second coefficient row (class 1), list the 20 features with the largest
# coefficients, in descending order, for the bag-of-words and n-gram models.
# The printed header now names category 1 (it was copy-pasted as "category 0").
print('most important features category 1')
print('--------> bow')
bow_coef0 = model_bow.coef_[1]
# argsort is ascending: take the last 20 indices and reverse them.
imp0 = bow_coef0.argsort()[-20:][::-1]
features = tr_bow.get_feature_names()
print([features[index] for index in imp0])
print('--------> ngram')
# model_bow_tr is the model fitted on the n-gram features.
bow_coef0 = model_bow_tr.coef_[1]
imp0 = bow_coef0.argsort()[-20:][::-1]
features = tr_ngram.get_feature_names()
print([features[index] for index in imp0])

most important features category 0
--------> bow
['driver', 'reactions', 'contaminated', 'filibuster', 'gop', 'pepper', 'cooking', 'surges', 'existing', 'bbc', 'narrow', 'quarantine', 'slams', 'seeing', 'aspiring', 'afternoon', 'prisons', 'disperse', 'might', 'kyodo']
--------> ngram
['breaking news u', 'gov andrew cuomo', 'factbox reactions prince', 'reactions prince harry', 'mainland china reports', 'election reform bill', 'president donald trump', 'secretary state blinken', 'icymi nike sued', 'bitcoin rises percent', 'need know coronavirus', 'watch feeling lonely', 'icymi indias northern', 'icymi harley davidson', 'icymi nearly million', 'watch india ready', 'meghan duchess sussex', 'watch surging demand', 'shooting daunte wright', 'icymi hundreds german']


In [148]:
#class 2
# For the third coefficient row (class 2), list the 20 features with the largest
# coefficients, in descending order, for the bag-of-words and n-gram models.
# The printed header now names category 2 (it was copy-pasted as "category 0").
print('most important features category 2')
print('--------> bow')
bow_coef0 = model_bow.coef_[2]
# argsort is ascending: take the last 20 indices and reverse them.
imp0 = bow_coef0.argsort()[-20:][::-1]
features = tr_bow.get_feature_names()
print([features[index] for index in imp0])
print('--------> ngram')
# model_bow_tr is the model fitted on the n-gram features.
bow_coef0 = model_bow_tr.coef_[2]
imp0 = bow_coef0.argsort()[-20:][::-1]
features = tr_ngram.get_feature_names()
print([features[index] for index in imp0])

most important features category 0
--------> bow
['aged', 'love', 'frustration', 'rd', 'apologises', 'mayorkas', 'programme', 'firms', 'kurdish', 'invite', 'quad', 'sagawards', 'cannon', 'disaster', 'govt', 'extremism', 'counted', 'continued', 'crackdown', 'serve']
--------> ngram
['watch president joe', 'derek chauvin trial', 'icymi berlin zoo', 'icymi year old', 'derek chauvins trial', 'near kings palace', 'helicopter crash sunday', 'todays great read', 'behold war tuba', 'experience entire series', 'says hong kong', 'saudi crown prince', 'watch heres look', 'pro kurdish party', 'italy recommends astrazeneca', 'recommends astrazeneca covid', 'icymi former bachelor', 'records new covid', 'daily record covid', 'china announces sanctions']


### SMOTE

In [72]:
from imblearn.pipeline import Pipeline, make_pipeline
from imblearn.over_sampling import SMOTE

In [75]:
#BOW
# Pipeline: SMOTE over-sampling followed by multinomial logistic regression.
pipeline = make_pipeline((SMOTE(random_state=0)), LogisticRegression(multi_class="multinomial", random_state=0))
#model
# Grid keys are prefixed with the pipeline step name 'logisticregression'.
params = {'logisticregression__penalty': ['l2'],
          'logisticregression__C': [0.01, 0.1, 1, 10, 100],
          'logisticregression__solver': ['lbfgs']}

# 10-fold cross-validated search over the grid above.
grid_bow = GridSearchCV(estimator=pipeline,
                    param_grid=params,
                    cv=10,
                    return_train_score=True,
                    #scoring= ['accuracy', 'precision', 'recall'],
                    #refit = 'recall'
                     )

# Fit on the bag-of-words training features, then score the best pipeline on the test split.
grid_bow.fit(X_tr_bow, y_tr)
score = grid_bow.best_estimator_.score(X_te_bow, y_te)
score

0.4613526570048309

In [76]:
# n-grams (same pipeline and grid as the bag-of-words cell, applied to the n-gram features)
# Pipeline: SMOTE over-sampling followed by multinomial logistic regression.
pipeline = make_pipeline((SMOTE(random_state=0)), LogisticRegression(multi_class="multinomial", random_state=0))
#model
# Grid keys are prefixed with the pipeline step name 'logisticregression'.
params = {'logisticregression__penalty': ['l2'],
          'logisticregression__C': [0.01, 0.1, 1, 10, 100],
          'logisticregression__solver': ['lbfgs']}

# 10-fold cross-validated search over the grid above.
grid_ngram = GridSearchCV(estimator=pipeline,
                    param_grid=params,
                    cv=10,
                    return_train_score=True,
                    #scoring= ['accuracy', 'precision', 'recall'],
                    #refit = 'recall'
                     )

# Fit on the n-gram training features, then score the best pipeline on the test split.
grid_ngram.fit(X_tr_ngram, y_tr)
score = grid_ngram.best_estimator_.score(X_te_ngram, y_te)
score

0.3404186795491143

In [77]:
# tf-idf (same pipeline and grid as the bag-of-words cell, applied to the tf-idf features)
# Pipeline: SMOTE over-sampling followed by multinomial logistic regression.
pipeline = make_pipeline((SMOTE(random_state=0)), LogisticRegression(multi_class="multinomial", random_state=0))
#model
# Grid keys are prefixed with the pipeline step name 'logisticregression'.
params = {'logisticregression__penalty': ['l2'],
          'logisticregression__C': [0.01, 0.1, 1, 10, 100],
          'logisticregression__solver': ['lbfgs']}

# 10-fold cross-validated search over the grid above.
grid_tfidf = GridSearchCV(estimator=pipeline,
                    param_grid=params,
                    cv=10,
                    return_train_score=True,
                    #scoring= ['accuracy', 'precision', 'recall'],
                    #refit = 'recall'
                     )

# Fit on the tf-idf training features, then score the best pipeline on the test split.
grid_tfidf.fit(X_tr_tfidf, y_tr)
score = grid_tfidf.best_estimator_.score(X_te_tfidf, y_te)
score

0.4454106280193237

In [78]:
# Overwrite 'y_pred' with predictions from the best SMOTE + tf-idf pipeline and count them per class.
test_data['y_pred'] = grid_tfidf.best_estimator_.predict(X_te_tfidf)
test_data.groupby(by='y_pred').size()

y_pred
0.0    1733
1.0    3027
2.0    1450
dtype: int64

In [79]:
# Accuracy by category tfidf
# Confusion matrix of true labels against the current 'y_pred' column, followed by
# the diagonal divided by each row's total (share of each true class predicted correctly).
matrix = confusion_matrix(y_te, test_data['y_pred'])
print(matrix)
matrix.diagonal()/matrix.sum(axis=1)

[[ 658  730  334]
 [ 738 1661  669]
 [ 337  636  447]]


array([0.38211382, 0.54139505, 0.31478873])

### Grid Search

In [159]:
# Grid-search C for each feature set and print the test score of the best estimator.
# NOTE: this rebinds model_bow and model_bow_tr to the objects returned by grid_logistic_classify,
# replacing the plain LogisticRegression models assigned earlier under the same names.
model_bow = grid_logistic_classify(X_tr_bow, y_tr, X_te_bow, y_te, 'bow')
model_bow_tr = grid_logistic_classify(X_tr_ngram, y_tr, X_te_ngram, y_te, 'n-grams')
model_tfidf_transform = grid_logistic_classify(X_tr_tfidf, y_tr, X_te_tfidf, y_te, 'tfidf')

Test Score with bow features 0.49404186795491145
Test Score with n-grams features 0.49404186795491145
Test Score with tfidf features 0.5027375201288244


In [160]:
# Best hyper-parameters found by the grid search on the n-gram features.
model_bow_tr.best_params_

{'C': 1e-05}

In [63]:
#model_bow.best_estimator_.coef_

In [65]:
# Overwrite 'y_pred' with predictions from the grid-searched tf-idf model and count them per class.
test_data['y_pred'] = model_tfidf_transform.best_estimator_.predict(X_te_tfidf)
test_data.groupby(by='y_pred').size()

y_pred
0.0    1013
1.0    4745
2.0     452
dtype: int64

In [66]:
# Accuracy by category
# Confusion matrix of true labels against the current 'y_pred' column, followed by
# the diagonal divided by each row's total (share of each true class predicted correctly).
#test_data.loc[test_data['vaderCat'] != test_data['y_pred']].groupby(by='vaderCat').size()
matrix = confusion_matrix(y_te, test_data['y_pred'])
print(matrix)
matrix.diagonal()/matrix.sum(axis=1)

[[ 443 1212   67]
 [ 382 2490  196]
 [ 188 1043  189]]


array([0.257259  , 0.81160365, 0.13309859])