In [1]:
import pandas as pd
import numpy as np
from pprint import pprint

train_df = pd.read_csv('../data/input/train.csv')

In [15]:
# tweetテキストの余計な文字を削除
import re

def preprocess_text(sentence):
    # URL
    sentence = re.sub(r"https?://[\w/:%#\$&\?\(\)~\.=\+\-]+\s*", ' ', sentence)
    # Hash Tag
    #sentence = re.sub(r'#[^\s]+\s*', ' ', sentence)
    # アルファベット以外
    sentence = re.sub('[^a-zA-Z]', ' ', sentence)
    # 単一文字
    sentence = re.sub(r"\s+[a-zA-Z]\s+", ' ', sentence)
    # 連続する空白を1つの空白に
    sentence = re.sub(r'\s+', ' ', sentence)
    return sentence

In [56]:
import re
import gensim.parsing.preprocessing as gsp

stop_words = list(gsp.STOPWORDS)
stop_words.remove('fire')

def gensim_preprocess(sentence):
    sentence = sentence.lower()# 小文字化
    sentence = re.sub(r'(https?://[a-zA-Z0-9.-]*)', ' ', sentence)# URL除去
    #sentence = gsp.strip_tags(sentence)# HTMLタグ除去
    sentence = gsp.strip_punctuation(sentence)# 句読点を空白に変える
    sentence = re.sub('[^a-zA-Z]', ' ', sentence)# アルファベット以外
    sentence = gsp.strip_multiple_whitespaces(sentence)# 複数空白、タブ、改行を1つの空白にする
    sentence = gsp.strip_numeric(sentence)# 数字を除く
    sentence = ' '.join(word for word in sentence.split(' ') if word not in stop_words)# stop words
    sentence = gsp.strip_short(sentence, minsize=3)# 指定した長さ(deault:3)より短い単語を除く
    sentence = gsp.stem_text(sentence) # 単語から語幹を除く
        
    return sentence

In [57]:
# テキストを前処理
train_re = train_df.copy()
train_gen = train_df.copy()

train_re["text"] = train_re["text"].apply(lambda x: preprocess_text(x))
train_gen["text"] = train_gen["text"].apply(lambda x: gensim_preprocess(x))
display(train_re.head())
display(train_gen.head())

Unnamed: 0,id,keyword,location,text,target
0,1,,,Our Deeds are the Reason of this earthquake Ma...,1
1,4,,,Forest fire near La Ronge Sask Canada,1
2,5,,,All residents asked to shelter in place are be...,1
3,6,,,people receive wildfires evacuation orders in...,1
4,7,,,Just got sent this photo from Ruby Alaska as s...,1


Unnamed: 0,id,keyword,location,text,target
0,1,,,deed reason earthquak allah forgiv,1
1,4,,,forest fire near rong sask canada,1
2,5,,,resid ask shelter place notifi offic evacu she...,1
3,6,,,peopl receiv wildfir evacu order california,1
4,7,,,got sent photo rubi alaska smoke wildfir pour ...,1


# Test vectorizers

In [58]:
# Count Vectorizer
from sklearn import feature_extraction

count_vectorizer = feature_extraction.text.CountVectorizer()
tr_re_count_vec = count_vectorizer.fit_transform(train_re["text"])
tr_gen_count_vec = count_vectorizer.fit_transform(train_gen["text"])
## we use .todense() here because these vectors are "sparse"
print(train_df["text"][0])
print("re preprocessed train data shape:", tr_re_count_vec.todense().shape)
print(tr_re_count_vec.todense())
print("gensim preprocessed train data shape:", tr_gen_count_vec.todense().shape)
print(tr_gen_count_vec.todense())

Our Deeds are the Reason of this #earthquake May ALLAH Forgive us all
re preprocessed train data shape: (7613, 16191)
[[0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]
 ...
 [0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]]
gensim preprocessed train data shape: (7613, 18280)
[[0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]
 ...
 [0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]]


In [59]:
# Tf-Idf Vectorizer(Tf-Idf transformer)
# TfidfTransformer is used on an existing count matrix such as one returned by CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer, TfidfVectorizer

tfidf_transformer = TfidfTransformer()
tfidf_vectorizer = TfidfVectorizer()

tfidf_re_trans = tfidf_transformer.fit_transform(tr_re_count_vec)
tfidf_gen_trans = tfidf_transformer.fit_transform(tr_gen_count_vec)
tr_re_tfidf_vec = tfidf_vectorizer.fit_transform(train_re["text"])
tr_gen_tfidf_vec = tfidf_vectorizer.fit_transform(train_gen["text"])

print("tfidf_re_transformer:\n", tfidf_re_trans.todense())
print("\ntfidf_gen_transformer:\n", tfidf_gen_trans.todense())

print("\nre preprocessed data shape:", tr_re_tfidf_vec.shape)
print("tfidf_re_vectorizer:\n", tr_re_tfidf_vec.todense())

print("\ngensim preprocessed data shape:", tr_gen_tfidf_vec.shape)
print("tfidf_gen_vectorizer:\n", tr_gen_tfidf_vec.todense())

tfidf_re_transformer:
 [[0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 ...
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]]

tfidf_gen_transformer:
 [[0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 ...
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]]

re preprocessed data shape: (7613, 16191)
tfidf_re_vectorizer:
 [[0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 ...
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]]

gensim preprocessed data shape: (7613, 18280)
tfidf_gen_vectorizer:
 [[0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 ...
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]]


In [60]:
# BERT vectorize
# [referece](https://huggingface.co/sentence-transformers/bert-base-nli-mean-tokens)
from sentence_transformers import SentenceTransformer

model = SentenceTransformer('bert-base-nli-mean-tokens')
tr_re_bert_vec = model.encode(train_re['text'])
tr_gen_bert_vec = model.encode(train_gen['text'])

print("Sentence embeddings:")
print("re preprocessed data shape:", tr_re_bert_vec.shape)
print(tr_re_bert_vec)

print("\ngensim preprocessed data shape:", tr_gen_bert_vec.shape)
print(tr_gen_bert_vec)

Sentence embeddings:
re preprocessed data shape: (7613, 768)
[[-0.09965673  1.0299886   0.85591394 ...  0.17161681 -0.75642484
   0.27112168]
 [-0.06845954  0.9203234   0.1360512  ... -0.61632764  0.07919553
   0.2655284 ]
 [-0.16675152  0.73107177  2.1624925  ...  0.44667816 -0.89984876
  -0.173441  ]
 ...
 [ 0.17871219  0.25007972  0.64667475 ... -1.1354394   0.4887453
   0.2994783 ]
 [-0.46670738 -0.23355019 -0.33531058 ... -0.5660839  -0.8959852
  -0.34056386]
 [ 0.63955176  0.12620305  0.4097528  ... -0.7496225   0.9582164
  -0.6429042 ]]

gensim preprocessed data shape: (7613, 768)
[[ 0.06911088  0.78569794  0.97117573 ... -0.44359168 -0.5179217
   0.39473927]
 [ 0.01331958  0.91980946  0.24482374 ... -0.53480923  0.13305727
   0.38670892]
 [ 0.1850915   0.34008476  1.6238809  ...  0.3299464  -1.015084
   0.33661366]
 ...
 [ 0.275324    0.06075098  0.8673379  ... -0.69132257  0.43372035
   0.13150543]
 [ 0.21746318  0.18880795  0.5193     ... -0.7530516  -1.0929471
  -0.48772135]

# Compare vectors
using classifier:Ridge

In [61]:
from sklearn import linear_model
from sklearn import feature_extraction, model_selection
from sentence_transformers import SentenceTransformer

clf = linear_model.RidgeClassifier()

# Count Vector
scores_re_count_vec = model_selection.cross_val_score(
    clf, 
    tr_re_count_vec, 
    train_df["target"], 
    cv=3, 
    scoring="f1"
)

scores_gen_count_vec = model_selection.cross_val_score(
    clf, 
    tr_gen_count_vec, 
    train_df["target"], 
    cv=3, 
    scoring="f1"
)

# Tf-Idf Vector
scores_re_tfidf_vec = model_selection.cross_val_score(
    clf, 
    tr_re_tfidf_vec, 
    train_df["target"], 
    cv=3, 
    scoring="f1"
)

scores_gen_tfidf_vec = model_selection.cross_val_score(
    clf, 
    tr_gen_tfidf_vec, 
    train_df["target"], 
    cv=3, 
    scoring="f1"
)

# sentence Transformer
scores_re_bert_vec = model_selection.cross_val_score(
    clf,
    tr_re_bert_vec,
    train_df['target'],
    cv=3,
    scoring='f1'
)

scores_gen_bert_vec = model_selection.cross_val_score(
    clf,
    tr_gen_bert_vec,
    train_df['target'],
    cv=3,
    scoring='f1'
)

print("count vector scores(re)")
for i, score in enumerate(scores_re_count_vec):print(f"{i}\t{score:.4f}")
print(f"score average: {scores_re_count_vec.mean():.4f}")
print("count vector scores(gensim)")
for i, score in enumerate(scores_gen_count_vec):print(f"{i}\t{score:.4f}")
print(f"score average: {scores_gen_count_vec.mean():.4f}")

print("\ntf-idf vector scores(re)")
for i, score in enumerate(scores_re_tfidf_vec):print(f"{i}\t{score:.4f}")
print(f"score average: {scores_re_tfidf_vec.mean():.4f}")
print("tf-idf vector scores(gensim)")
for i, score in enumerate(scores_gen_tfidf_vec):print(f"{i}\t{score:.4f}")
print(f"score average: {scores_gen_tfidf_vec.mean():.4f}")

print("\nbert vector scores(re)")
for i, score in enumerate(scores_re_bert_vec):print(f"{i}\t{score:.4f}")
print(f"score average: {scores_re_bert_vec.mean():.4f}")
print("bert vector scores(gensim)")
for i, score in enumerate(scores_gen_bert_vec):print(f"{i}\t{score:.4f}")
print(f"score average: {scores_gen_bert_vec.mean():.4f}")

count vector scores(re)
0	0.6098
1	0.5595
2	0.6203
score average: 0.5966
count vector scores(gensim)
0	0.5869
1	0.5491
2	0.6086
score average: 0.5815

tf-idf vector scores(re)
0	0.6330
1	0.6037
2	0.6812
score average: 0.6393
tf-idf vector scores(gensim)
0	0.5987
1	0.5748
2	0.6370
score average: 0.6035

bert vector scores(re)
0	0.7289
1	0.7091
2	0.7642
score average: 0.7341
bert vector scores(gensim)
0	0.6748
1	0.6422
2	0.7245
score average: 0.6805


# Preprocess
stemming, remove stop words

In [29]:
# Test
# gensim preprocessing(stemming&remove stop words)
from gensim.parsing.preprocessing import preprocess_documents, preprocess_string

print("original:\n ", train_df["text"][0])
print("stemming, remove stop words:\n ", ' '.join(preprocess_documents(train_df['text'])[0]))
preprocess_doc = train_df["text"].apply(lambda x:' '.join(preprocess_string(x)))
print("preprocessed train text:\n", preprocess_doc)

original:
  Our Deeds are the Reason of this #earthquake May ALLAH Forgive us all
stemming, remove stop words:
  deed reason earthquak allah forgiv
preprocessed train text:
 0                      deed reason earthquak allah forgiv
1                            forest near rong sask canada
2       resid ask shelter place notifi offic evacu she...
3             peopl receiv wildfir evacu order california
4       got sent photo rubi alaska smoke wildfir pour ...
                              ...                        
7608    giant crane hold bridg collaps nearbi home htt...
7609    aria ahrari thetawniest control wild fire cali...
7610                    utc volcano hawaii http zdtoydebj
7611    polic investig bike collid car littl portug bi...
7612    latest home raze northern california wildfir a...
Name: text, Length: 7613, dtype: object


In [30]:
from gensim.parsing.preprocessing import preprocess_string
from sklearn import linear_model, feature_extraction, model_selection
from sentence_transformers import SentenceTransformer

texts = train_df["text"].apply(lambda x:' '.join(preprocess_string(x)))

clf = linear_model.RidgeClassifier()

# Count Vector
count_vectorizer = feature_extraction.text.CountVectorizer()
tr_prcssd_count_vec = count_vectorizer.fit_transform(texts)

scores_count_vec = model_selection.cross_val_score(
    clf, 
    tr_prcssd_count_vec, 
    train_df["target"], 
    cv=3, 
    scoring="f1"
)

# Tf-Idf Vector
tfidf_vectorizer = feature_extraction.text.TfidfVectorizer()
tr_prcssd_tfidf_vec = tfidf_vectorizer.fit_transform(texts)

scores_tfidf_vec = model_selection.cross_val_score(
    clf, 
    tr_prcssd_tfidf_vec, 
    train_df["target"], 
    cv=3, 
    scoring="f1"
)

# sentence Transformer
model = SentenceTransformer('bert-base-nli-mean-tokens')
tr_prcssd_bert_vec = model.encode(texts)

scores_bert_vec = model_selection.cross_val_score(
    clf,
    tr_prcssd_bert_vec,
    train_df['target'],
    cv=3,
    scoring='f1'
)

print(f"count vector scores")
for i, score in enumerate(scores_count_vec):print(f"{i}\t{score:.4f}")
print(f"score average: {scores_count_vec.mean():.4f}\n")

print(f"tf-idf vector scores")
for i, score in enumerate(scores_tfidf_vec):print(f"{i}\t{score:.4f}")
print(f"score average: {scores_tfidf_vec.mean():.4f}\n")

print(f"bert vector scores")
for i, score in enumerate(scores_bert_vec):print(f"{i}\t{score:.4f}")
print(f"score average: {scores_bert_vec.mean():.4f}")

count vector scores
0	0.5879
1	0.5564
2	0.6357
score average: 0.5933

tf-idf vector scores
0	0.5975
1	0.5757
2	0.6537
score average: 0.6090

bert vector scores
0	0.6790
1	0.6413
2	0.7266
score average: 0.6823


In [37]:
preprocess_documents(
    ["<i>Hel 9lo</i> <b>Wo9 rld</b>!", "Th3     Weather_is really g00d today, isn't it?"]
)

[['hel', 'rld'], ['weather', 'todai', 'isn']]

In [36]:
preprocess_string(
    "<i>Hel 9lo</i> <b>Wo9 rld</b>! Th3  fire    Weather_is really g00d today, isn't it?"
)

['hel', 'rld', 'weather', 'todai', 'isn']

In [43]:
from gensim.parsing.preprocessing import strip_tags, strip_punctuation
s = "<i>Hel 9lo</i> <b>Wo9 rld</b>! Th3     Weather_is really g00d today, isn't it?"
CUSTOM_FILTERS = [lambda x: x.lower(), strip_tags, strip_punctuation]
preprocess_string(s, CUSTOM_FILTERS)

['hel',
 '9lo',
 'wo9',
 'rld',
 'th3',
 'weather',
 'is',
 'really',
 'g00d',
 'today',
 'isn',
 't',
 'it']