In [1]:
import re
import string

import pandas as pd
from sklearn.linear_model import LogisticRegression
from sklearn import model_selection
from sklearn.feature_extraction.text import HashingVectorizer
from sklearn.metrics import f1_score

# Import data

In [2]:
train = pd.read_csv("data/train.csv")
train.head()

Unnamed: 0,id,keyword,location,text,target
0,1,,,Our Deeds are the Reason of this #earthquake M...,1
1,4,,,Forest fire near La Ronge Sask. Canada,1
2,5,,,All residents asked to 'shelter in place' are ...,1
3,6,,,"13,000 people receive #wildfires evacuation or...",1
4,7,,,Just got sent this photo from Ruby #Alaska as ...,1


In [3]:
train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 7613 entries, 0 to 7612
Data columns (total 5 columns):
 #   Column    Non-Null Count  Dtype 
---  ------    --------------  ----- 
 0   id        7613 non-null   int64 
 1   keyword   7552 non-null   object
 2   location  5080 non-null   object
 3   text      7613 non-null   object
 4   target    7613 non-null   int64 
dtypes: int64(2), object(3)
memory usage: 297.5+ KB


# Data Exploration

# Model creation

In [4]:
# Make text lowercase, remove text in square brackets + links + special characters + words containing numbers
def text_preprocessor(text):
    text = text.lower()
    text = re.sub('\[.*?\]', '', text)
    text = re.sub("\\W"," ",text) # remove special chars
    text = re.sub('https?://\S+|www\.\S+', '', text)
    text = re.sub('<.*?>+', '', text)
    text = re.sub('[%s]' % re.escape(string.punctuation), '', text)
    text = re.sub('\n', '', text)
    text = re.sub('\w*\d\w*', '', text)
    
    return text

In [5]:
def text_transform(text_series):
    hash_vectorizer = HashingVectorizer(
        input="content",
        preprocessor=text_preprocessor,
        analyzer="word",
        n_features=10_000,
        norm="l2",
        alternate_sign=False,
    )
    hash_vectorizer.fit(text_series)
    text_vectors = hash_vectorizer.transform(text_series)

    return text_vectors

In [7]:
# (num_tokens, num_train_features)
train_vectors = text_transform(train["text"])
f"Number of tokens: {train_vectors.shape[0]}, Number of training features: {train_vectors.shape[1]}"

'Number of tokens: 7613, Number of training features: 10000'

In [8]:
model = LogisticRegression(C=1.0)

In [9]:
model.fit(
    train_vectors, 
    train["target"]
)

LogisticRegression()

# Model evaluation

In [10]:
# Preview model with cross validation on F1 score using 5 groups
scores = model_selection.cross_val_score(
    model, 
    train_vectors, 
    train["target"], 
    cv = 5, 
    scoring = "f1"
)
scores

array([0.62737987, 0.59804791, 0.63728814, 0.63273961, 0.71956009])

In [11]:
predictions = model.predict(train_vectors)
round(f1_score(train["target"], predictions), 3)

0.808

# Submission

In [12]:
submission = pd.read_csv("data/test.csv")
submission_vectors = text_transform(submission["text"])

In [13]:
submission["target"] = model.predict(submission_vectors)
submission.head()

Unnamed: 0,id,keyword,location,text,target
0,0,,,Just happened a terrible car crash,1
1,2,,,"Heard about #earthquake is different cities, s...",0
2,3,,,"there is a forest fire at spot pond, geese are...",1
3,9,,,Apocalypse lighting. #Spokane #wildfires,0
4,11,,,Typhoon Soudelor kills 28 in China and Taiwan,1


In [15]:
submission[["id", "target"]].to_csv("data/my_submission.csv", index = False)