In [1]:
import pandas as pd
import numpy as np
import re
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.model_selection import GridSearchCV, StratifiedKFold
from sklearn.metrics import classification_report, make_scorer, precision_score, recall_score, f1_score
import matplotlib.pyplot as plt
from tqdm import tqdm
from collections import Counter
import warnings

warnings.filterwarnings('ignore')

def strip_punct(s):
    """Normalise a string to lowercase ASCII alphanumeric words.

    Every character outside ``A-Z``, ``a-z`` and ``0-9`` is treated as a
    separator, the text is lowercased, and the surviving words are joined
    with single spaces, so leading, trailing and repeated whitespace is
    removed.
    """
    spaced = re.sub('[^A-Za-z0-9]', ' ', s)
    words = spaced.lower().split()
    return " ".join(words)

In [None]:
ls ../../data/

# Read Data

In [2]:
# Load the training set. Missing cells are replaced with 0; as the preview
# below shows, that puts an integer 0 into the keyword/location columns
# wherever they were empty.
train_data = pd.read_csv('../../data/train.csv').fillna(0)
train_data.head(2)

Unnamed: 0,id,keyword,location,text,target
0,1,0,0,Our Deeds are the Reason of this #earthquake M...,1
1,4,0,0,Forest fire near La Ronge Sask. Canada,1


In [3]:
# Cleaned tweet texts and their labels as NumPy arrays.
train_text = np.array([strip_punct(tweet) for tweet in train_data.text])
y_train = np.array(list(train_data.target))
# Class balance of the training labels.
Counter(y_train)

Counter({1: 3271, 0: 4342})

# Logistic Regression on TF-IDF features

In [None]:
# Unigram TF-IDF features; the token pattern accepts tokens of one or more
# word characters.
vectorizer = TfidfVectorizer(ngram_range=(1,1), analyzer='word', min_df = 1, token_pattern='(?u)\\b\\w+\\b')
# NOTE(review): the vectorizer is fitted on all of train_text before the
# cross-validated search below, so every validation fold has already
# contributed to the vocabulary/IDF weights.
vectorizer.fit(train_text)
X_train = vectorizer.transform(train_text)

# One sub-grid per solver so that only supported solver/penalty pairs are
# searched: liblinear handles both l1 and l2, lbfgs handles l2 only. A single
# flat grid would also schedule lbfgs + l1, which cannot be fitted and only
# produces failed candidates.
C_values = np.logspace(-4, 4, 20)
tuned_parameters = [
    {'solver': ['liblinear'], 'penalty': ['l1', 'l2'], 'C': C_values},
    {'solver': ['lbfgs'], 'penalty': ['l2'], 'C': C_values},
]

# 5-fold cross-validated search, selecting on F1 of the positive class.
lr = LogisticRegression()
clf = GridSearchCV(lr, tuned_parameters, cv=5, scoring='f1')
clf.fit(X_train, y_train)
print("Best parameters set found on the train set:")
print(clf.best_params_)

In [None]:
# Out-of-fold evaluation: for each of the five stratified folds, fit TF-IDF
# and logistic regression on the remaining folds, predict the held-out fold,
# and pool labels/predictions across folds for a single report.
kf = StratifiedKFold(n_splits=5)
y_trainCv, y_predCv = [], []

for fit_idx, held_out_idx in tqdm(kf.split(train_text, y_train)):
    # The vectorizer is re-fitted inside every fold, on that fold's
    # training texts only.
    vectorizer = TfidfVectorizer(ngram_range=(1,1), analyzer='word', min_df = 1, token_pattern='(?u)\\b\\w+\\b')
    X_tr = vectorizer.fit(train_text[fit_idx]).transform(train_text[fit_idx])
    X_ts = vectorizer.transform(train_text[held_out_idx])

    clf = LogisticRegression(C=1, penalty='l2', solver='liblinear')
    clf.fit(X_tr, y_train[fit_idx])

    y_trainCv.extend(y_train[held_out_idx])
    y_predCv.extend(clf.predict(X_ts))

# Report on the pooled out-of-fold predictions.
print(classification_report(y_trainCv, y_predCv, digits=3, zero_division=False))
print('F1: {:.3f}'.format(f1_score(y_trainCv, y_predCv, zero_division=False)))

# Transformer Embeddings

In [4]:
# Longest tweet, measured in whitespace-separated tokens.
lens = [len(tweet.split()) for tweet in train_text]
max(lens)

34

In [5]:
# Try without cleaning: rebind train_text to the raw tweet text.
# This replaces whatever train_text held before this cell ran.
train_text = np.array(list(train_data.text))

In [None]:
#%%time
from transformers import pipeline

# The same identifier selects both the model weights and the tokenizer.
model_name = "bert-base-cased"
feature_extraction = pipeline('feature-extraction', model=model_name, tokenizer=model_name, device=-1) # device=-1 for CPU, device=0 for GPU

# For every tweet keep the vector at index [0][0] of the pipeline output
# (presumably the hidden state of the first token of the single input;
# confirm against the pipeline's output layout).
X_train = [feature_extraction(sentence)[0][0] for sentence in tqdm(train_text)]

In [6]:
#%%time
from transformers import pipeline

# The same identifier selects both the model weights and the tokenizer.
model_name = "roberta-large"
feature_extraction = pipeline('feature-extraction', model=model_name, tokenizer=model_name, device=-1) # device=-1 for CPU, device=0 for GPU

# For every tweet keep the vector at index [0][0] of the pipeline output
# (presumably the hidden state of the first token of the single input;
# confirm against the pipeline's output layout).
X_train = [feature_extraction(sentence)[0][0] for sentence in tqdm(train_text)]

HBox(children=(FloatProgress(value=0.0, description='Downloading', max=1425941629.0, style=ProgressStyle(descr…




HBox(children=(FloatProgress(value=0.0, description='Downloading', max=898823.0, style=ProgressStyle(descripti…




HBox(children=(FloatProgress(value=0.0, description='Downloading', max=456318.0, style=ProgressStyle(descripti…




HBox(children=(FloatProgress(value=0.0, description='Downloading', max=1355863.0, style=ProgressStyle(descript…




100%|██████████| 7613/7613 [23:26<00:00,  5.41it/s]


In [None]:
%%time
tuned_parameters = {'penalty' : ['l1', 'l2'],
                    'C' : np.logspace(-4, 4, 20),
                    'solver' : ['liblinear', 'lbfgs']}

lr = LogisticRegression()
clf = GridSearchCV(lr, tuned_parameters, cv=5, scoring='f1')
clf.fit(X_train, y_train)
print("Best parameters set found on the train set:")
print(clf.best_params_)

In [7]:
# Out-of-fold evaluation of logistic regression on the precomputed features
# in X_train: each stratified fold is predicted by a model fitted on the
# other four, and the pooled predictions are scored once at the end.
kf = StratifiedKFold(n_splits=5)
y_trainCv, y_predCv =  list(), list()

# Convert the list of feature vectors to an array once, rather than
# rebuilding it for the split and again twice inside every fold.
X_all = np.array(X_train)

for train_index, test_index in tqdm(kf.split(X_all, y_train)):
    X_tr, X_ts = X_all[train_index], X_all[test_index]
    y_tr, y_ts = y_train[train_index], y_train[test_index]
    y_trainCv.extend(y_ts)

    clf = LogisticRegression(C=1, penalty='l2', solver='liblinear')
    clf.fit(X_tr, y_tr)
    predictions = clf.predict(X_ts)
    y_predCv.extend(predictions)

# zero_division=0 is the documented spelling of the previous False value.
print(classification_report(y_trainCv, y_predCv, digits=3, zero_division=0))
print('F1: {:.3f}'.format(f1_score(y_trainCv, y_predCv, zero_division=0)))

5it [00:12,  2.56s/it]

              precision    recall  f1-score   support

           0      0.808     0.886     0.845      4342
           1      0.826     0.721     0.770      3271

    accuracy                          0.815      7613
   macro avg      0.817     0.803     0.808      7613
weighted avg      0.816     0.815     0.813      7613

F1: 0.770





# Example

In [None]:
from transformers import pipeline, AutoTokenizer

# Shared inputs for both halves of the example.
model_name = 'roberta-base'
sample_text = "roberta is encoded"

# direct encoding: what the tokenizer alone returns for the sample sentence
tokenizer = AutoTokenizer.from_pretrained(model_name)
encoded_seq = tokenizer.encode(sample_text)

# feature extraction: the same sentence through the feature-extraction pipeline (CPU)
feature_extraction = pipeline('feature-extraction', model=model_name, tokenizer=model_name, device=-1)
features = feature_extraction(sample_text)

In [None]:
# Display the result of tokenizer.encode for the sample sentence.
encoded_seq

In [None]:
# Length of the innermost vector at features[0][0] (presumably the embedding
# size of the first token; confirm against the pipeline's output layout).
len(features[0][0])