In [1]:
import pandas as pd
import numpy as np

train_df = pd.read_csv('../data/input/train.csv')
test_df = pd.read_csv('../data/input/test.csv')

In [2]:
# tweetテキストの余計な文字を削除
import re

def preprocess_text(sentence):
    # URL
    sentence = re.sub(r"https?://[\w/:%#\$&\?\(\)~\.=\+\-]+\s*", ' ', sentence)
    # Hash Tag
    sentence = re.sub(r'#[^\s]+\s*', ' ', sentence)
    # アルファベット以外
    sentence = re.sub('[^a-zA-Z]', ' ', sentence)
    # 単一文字
    sentence = re.sub(r"\s+[a-zA-Z]\s+", ' ', sentence)
    # 連続する空白を1つの空白に
    sentence = re.sub(r'\s+', ' ', sentence)
    return sentence

In [3]:
# テキストを前処理
train_df["text"] = train_df["text"].apply(lambda x: preprocess_text(x))
display(train_df.head())

Unnamed: 0,id,keyword,location,text,target
0,1,,,Our Deeds are the Reason of this May ALLAH For...,1
1,4,,,Forest fire near La Ronge Sask Canada,1
2,5,,,All residents asked to shelter in place are be...,1
3,6,,,people receive evacuation orders in California,1
4,7,,,Just got sent this photo from Ruby as smoke fr...,1


# Test vectorizers

In [5]:
from sklearn import feature_extraction

count_vectorizer = feature_extraction.text.CountVectorizer()
## we use .todense() here because these vectors are "sparse"
print(train_df["text"][0])
print(tr_count_vec[0].todense())
print(tr_count_vec[0].todense().shape)

Our Deeds are the Reason of this May ALLAH Forgive us all
[[0 0 0 ... 0 0 0]]
(1, 15056)


In [7]:
# [referece](https://huggingface.co/sentence-transformers/bert-base-nli-mean-tokens)
from sentence_transformers import SentenceTransformer

model = SentenceTransformer('bert-base-nli-mean-tokens')
sentences = ['This framework generates embeddings for each input sentence',
    'Sentences are passed as a list of string.', 
    'The quick brown fox jumps over the lazy dog.']
sentence_embeddings = model.encode(sentences)

print("Sentence embeddings:")
print(sentence_embeddings)

100%|███████████████████████████████████████████████████████████████████████████████| 405M/405M [01:38<00:00, 4.13MB/s]


Sentence embeddings:
[[-0.10409481  0.5274764   1.1797733  ... -0.43389145 -0.69452345
   0.5386926 ]
 [-0.13118434 -0.17390312  1.1052182  ...  0.02624456 -0.00269847
   0.9161108 ]
 [-0.74899274  0.71891785 -1.0394565  ...  0.15582623  1.0202509
   0.097904  ]]


# Compare

In [6]:
from sklearn import feature_extraction, linear_model, model_selection
count_vectorizer = feature_extraction.text.CountVectorizer()
tr_count_vec = count_vectorizer.fit_transform(train_df['text'])

clf = linear_model.RidgeClassifier()
scores_count_vec = model_selection.cross_val_score(
    clf, 
    tr_count_vec, 
    train_df["target"], 
    cv=3, 
    scoring="f1"
)
print(f"count vector scores")
for i, scores_count_vec in enumerate(scores_count_vec):print(f"{i}\t{scores_count_vec:.4f}")
print(f"score average: {scores_count_vec.mean():.4f}\n")

count vector scores
0	0.6024
1	0.5295
2	0.6368
score average: 0.6368



In [18]:
from sentence_transformers import SentenceTransformer

tr_bert_vec = model.encode(train_df['text'].values)

scores_bert_vec = model_selection.cross_val_score(
    clf,
    tr_bert_vec,
    train_df['target'],
    cv=3,
    scoring='f1'
)
print(f"bert vector scores")
for i, scores_bert_vec in enumerate(scores_bert_vec):print(f"{i}\t{scores_bert_vec:.4f}")
print(f"score average: {scores_bert_vec.mean():.4f}")

bert vector scores
0	0.7133
1	0.6869
2	0.7671
score average: 0.7671
