In [1]:
import pandas as pd
import numpy as np
from pprint import pprint

train_df = pd.read_csv('../data/input/train.csv')
test_df = pd.read_csv('../data/input/test.csv')

In [2]:
# tweetテキストの余計な文字を削除
import re

def preprocess_text(sentence):
    # URL
    sentence = re.sub(r"https?://[\w/:%#\$&\?\(\)~\.=\+\-]+\s*", ' ', sentence)
    # Hash Tag
    sentence = re.sub(r'#[^\s]+\s*', ' ', sentence)
    # アルファベット以外
    sentence = re.sub('[^a-zA-Z]', ' ', sentence)
    # 単一文字
    sentence = re.sub(r"\s+[a-zA-Z]\s+", ' ', sentence)
    # 連続する空白を1つの空白に
    sentence = re.sub(r'\s+', ' ', sentence)
    return sentence

In [3]:
# テキストを前処理
train_df["text"] = train_df["text"].apply(lambda x: preprocess_text(x))
display(train_df.head())

Unnamed: 0,id,keyword,location,text,target
0,1,,,Our Deeds are the Reason of this May ALLAH For...,1
1,4,,,Forest fire near La Ronge Sask Canada,1
2,5,,,All residents asked to shelter in place are be...,1
3,6,,,people receive evacuation orders in California,1
4,7,,,Just got sent this photo from Ruby as smoke fr...,1


# Test vectorizers

In [22]:
# Count Vectorizer
from sklearn import feature_extraction

count_vectorizer = feature_extraction.text.CountVectorizer()
count_vec = count_vectorizer.fit_transform(train_df["text"])
## we use .todense() here because these vectors are "sparse"
print(train_df["text"][0])
print(count_vec.todense())
print(count_vec.todense().shape)

Our Deeds are the Reason of this May ALLAH Forgive us all
[[0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]
 ...
 [0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]]
(7613, 15056)


In [21]:
# Tf-Idf Vectorizer(Tf-Idf transformer)
# TfidfTransformer is used on an existing count matrix such as one returned by CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer, TfidfVectorizer

tfidf_transformer = TfidfTransformer()
tfidf_vectorizer = TfidfVectorizer()

tfidf_trans = tfidf_transformer.fit_transform(count_vec)
tfidf_vec = tfidf_vectorizer.fit_transform(train_df["text"])

print("tfidf_transformer:\n", tfidf_trans.todense())
print("\ntfidf_vectorizer:\n", tfidf_vec.todense())

tfidf_transformer:
 [[0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 ...
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]]

tfidf_vectorizer:
 [[0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 ...
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]]


In [12]:
# BERT vectorize
# [referece](https://huggingface.co/sentence-transformers/bert-base-nli-mean-tokens)
from sentence_transformers import SentenceTransformer

model = SentenceTransformer('bert-base-nli-mean-tokens')
sentences = ['This framework generates embeddings for each input sentence',
    'Sentences are passed as a list of string.', 
    'The quick brown fox jumps over the lazy dog.']
sentence_embeddings = model.encode(sentences)

print("Sentence embeddings:")
print(sentence_embeddings)

Sentence embeddings:
[[-0.10409481  0.5274764   1.1797733  ... -0.43389145 -0.69452345
   0.5386926 ]
 [-0.13118434 -0.17390312  1.1052182  ...  0.02624456 -0.00269847
   0.9161108 ]
 [-0.74899274  0.71891785 -1.0394565  ...  0.15582623  1.0202509
   0.097904  ]]


# Compare vectors
using classifier:Ridge

In [27]:
from sklearn import linear_model
from sklearn import feature_extraction, model_selection
from sentence_transformers import SentenceTransformer

clf = linear_model.RidgeClassifier()

# Count Vector
count_vectorizer = feature_extraction.text.CountVectorizer()
tr_count_vec = count_vectorizer.fit_transform(train_df['text'])

scores_count_vec = model_selection.cross_val_score(
    clf, 
    tr_count_vec, 
    train_df["target"], 
    cv=3, 
    scoring="f1"
)

# Tf-Idf Vector
tfidf_vectorizer = feature_extraction.text.TfidfVectorizer()
tr_tfidf_vec = tfidf_vectorizer.fit_transform(train_df['text'])

scores_tfidf_vec = model_selection.cross_val_score(
    clf, 
    tr_tfidf_vec, 
    train_df["target"], 
    cv=3, 
    scoring="f1"
)

# sentence Transformer
model = SentenceTransformer('bert-base-nli-mean-tokens')
tr_bert_vec = model.encode(train_df['text'].values)

scores_bert_vec = model_selection.cross_val_score(
    clf,
    tr_bert_vec,
    train_df['target'],
    cv=3,
    scoring='f1'
)

In [29]:
print(f"count vector scores")
for i, score in enumerate(scores_count_vec):print(f"{i}\t{score:.4f}")
print(f"score average: {scores_count_vec.mean():.4f}\n")

print(f"tf-idf vector scores")
for i, score in enumerate(scores_tfidf_vec):print(f"{i}\t{score:.4f}")
print(f"score average: {scores_tfidf_vec.mean():.4f}\n")

print(f"bert vector scores")
for i, score in enumerate(scores_bert_vec):print(f"{i}\t{score:.4f}")
print(f"score average: {scores_bert_vec.mean():.4f}")

count vector scores
0	0.6388
1	0.5973
2	0.6667
score average: 0.6343

tf-idf vector scores
0	0.6388
1	0.5973
2	0.6667
score average: 0.6343

bert vector scores
0	0.7133
1	0.6869
2	0.7671
score average: 0.7225


# Compare vectors
using classifier:SVM

In [33]:
from sklearn.svm import SVC
from sklearn import feature_extraction, model_selection
from sentence_transformers import SentenceTransformer

clf = SVC(C=1e-1)

# Count Vector
count_vectorizer = feature_extraction.text.CountVectorizer()
tr_count_vec = count_vectorizer.fit_transform(train_df['text'])

scores_count_vec = model_selection.cross_val_score(
    clf, 
    tr_count_vec, 
    train_df["target"], 
    cv=3, 
    scoring="f1"
)

# Tf-Idf Vector
tfidf_vectorizer = feature_extraction.text.TfidfVectorizer()
tr_tfidf_vec = tfidf_vectorizer.fit_transform(train_df['text'])

scores_tfidf_vec = model_selection.cross_val_score(
    clf, 
    tr_tfidf_vec, 
    train_df["target"], 
    cv=3, 
    scoring="f1"
)

# sentence Transformer
model = SentenceTransformer('bert-base-nli-mean-tokens')
tr_bert_vec = model.encode(train_df['text'].values)

scores_bert_vec = model_selection.cross_val_score(
    clf,
    tr_bert_vec,
    train_df['target'],
    cv=3,
    scoring='f1'
)

In [36]:
print(f"count vector scores")
for i, score in enumerate(scores_count_vec):print(f"{i}\t{score:.4f}")
print(f"score average: {scores_count_vec.mean():.4f}\n")

print(f"tf-idf vector scores")
for i, score in enumerate(scores_tfidf_vec):print(f"{i}\t{score:.4f}")
print(f"score average: {scores_tfidf_vec.mean():.4f}\n")

print(f"bert vector scores")
for i, score in enumerate(scores_bert_vec):print(f"{i}\t{score:.4f}")
print(f"score average: {scores_bert_vec.mean():.4f}")

count vector scores
0	0.1967
1	0.2373
2	0.2494
score average: 0.2278

tf-idf vector scores
0	0.0182
1	0.0037
2	0.0182
score average: 0.0133

bert vector scores
0	0.7246
1	0.6976
2	0.7703
score average: 0.7309


# Preprocess
stemming, remove stop words

In [26]:
# Test
# gensim preprocessing(stemming&remove stop words)
from gensim.parsing.preprocessing import preprocess_documents, preprocess_string

print(train_df["text"][0])
print(preprocess_documents(train_df['text'])[0])
preprocess_doc = train_df["text"].apply(lambda x:preprocess_string(x))
print(preprocess_doc)

Our Deeds are the Reason of this May ALLAH Forgive us all
['deed', 'reason', 'allah', 'forgiv']
0                           [deed, reason, allah, forgiv]
1                      [forest, near, rong, sask, canada]
2       [resid, ask, shelter, place, notifi, offic, ev...
3               [peopl, receiv, evacu, order, california]
4           [got, sent, photo, rubi, smoke, pour, school]
                              ...                        
7608    [giant, crane, hold, bridg, collaps, nearbi, h...
7609    [aria, ahrari, thetawniest, control, wild, fir...
7610                               [utc, volcano, hawaii]
7611    [polic, investig, bike, collid, car, littl, po...
7612    [latest, home, raze, northern, california, wil...
Name: text, Length: 7613, dtype: object


In [28]:
from gensim.parsing.preprocessing import preprocess_string
from sklearn import linear_model, feature_extraction, model_selection
from sentence_transformers import SentenceTransformer

texts = train_df["text"].apply(lambda x:preprocess_string(x))

clf = linear_model.RidgeClassifier()

# Count Vector
count_vectorizer = feature_extraction.text.CountVectorizer()
tr_count_vec = count_vectorizer.fit_transform(train_df['text'])

scores_count_vec = model_selection.cross_val_score(
    clf, 
    tr_count_vec, 
    train_df["target"], 
    cv=3, 
    scoring="f1"
)

# Tf-Idf Vector
tfidf_vectorizer = feature_extraction.text.TfidfVectorizer()
tr_tfidf_vec = tfidf_vectorizer.fit_transform(train_df['text'])

scores_tfidf_vec = model_selection.cross_val_score(
    clf, 
    tr_tfidf_vec, 
    train_df["target"], 
    cv=3, 
    scoring="f1"
)

# sentence Transformer
model = SentenceTransformer('bert-base-nli-mean-tokens')
tr_bert_vec = model.encode(train_df['text'].values)

scores_bert_vec = model_selection.cross_val_score(
    clf,
    tr_bert_vec,
    train_df['target'],
    cv=3,
    scoring='f1'
)

In [29]:
print(f"count vector scores")
for i, score in enumerate(scores_count_vec):print(f"{i}\t{score:.4f}")
print(f"score average: {scores_count_vec.mean():.4f}\n")

print(f"tf-idf vector scores")
for i, score in enumerate(scores_tfidf_vec):print(f"{i}\t{score:.4f}")
print(f"score average: {scores_tfidf_vec.mean():.4f}\n")

print(f"bert vector scores")
for i, score in enumerate(scores_bert_vec):print(f"{i}\t{score:.4f}")
print(f"score average: {scores_bert_vec.mean():.4f}")

count vector scores
0	0.6024
1	0.5295
2	0.6368
score average: 0.5896

tf-idf vector scores
0	0.6388
1	0.5973
2	0.6667
score average: 0.6343

bert vector scores
0	0.7133
1	0.6869
2	0.7671
score average: 0.7225


In [11]:
#[reference](https://www.ogis-ri.co.jp/otc/hiroba/technical/similar-document-search/part9.html)
import transformers
from sentence_transformers import models, SentenceTransformer
from sentence_transformers.losses import TripletDistanceMetric, MSELoss
from sentence_transformers.evaluation import BinaryClassificationEvaluator
from sentence_transformers.readers import TripletReader
from sentence_transformers.datasets import SentencesDataset
from torch.utils.data import DataLoader

transformer = models.BERT('bert-base-uncased')

pooling = models.Pooling(
    transformer.get_word_embedding_dimension(), 
    pooling_mode_mean_tokens=True, 
    pooling_mode_cls_token=False, 
    pooling_mode_max_tokens=False
)

model = SentenceTransformer(modules=[transformer, pooling])