In [71]:
import pandas as pd
import spacy
from spacy.lang.en.examples import sentences 
from sklearn.model_selection import train_test_split, GridSearchCV, RandomizedSearchCV
import re
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.compose import ColumnTransformer

In [24]:
df_train = pd.read_csv('train.csv')
df_test = pd.read_csv('test.csv')

In [4]:
df_train.shape

(7613, 5)

In [5]:
df_train.isna().value_counts()

id     keyword  location  text   target
False  False    False     False  False     5080
                True      False  False     2472
       True     True      False  False       61
Name: count, dtype: int64

# Preprocess the text

In [6]:
nlp = spacy.load("en_core_web_sm")

In [54]:
def preprocess(text):

    text = re.sub(r'http\S+|www\S+|https\S+|[^a-zA-Z0-9\s]', '', text) #remove urls and special characters
    text = re.sub(r'\s+', ' ', text) #remove extra spaces


    doc = nlp(text)
    filtered_tokens = []

    for token in doc:
        if token.is_punct or token.is_stop: #remove punctuation and stop words
            continue
        else:
            filtered_tokens.append(token.lemma_.lower()) #create a list with the tokens lemmatized and in lowercase

    return " ".join(filtered_tokens) #return a processed sentence

In [None]:
# Dropping the column location as it is not present in many cases, it shouldn't add any value to the result and it does not accurate in many cases
#df_train.drop(columns=['location'], inplace=True) 

In [55]:
df_train['processed_text'] = df_train['text'].apply(preprocess)

In [56]:
df_train.head()

Unnamed: 0,id,keyword,location,text,target,processed_text
0,1,,,Our Deeds are the Reason of this #earthquake M...,1,deed reason earthquake allah forgive
1,4,,,Forest fire near La Ronge Sask. Canada,1,forest fire near la ronge sask canada
2,5,,,All residents asked to 'shelter in place' are ...,1,resident ask shelter place notify officer evac...
3,6,,,"13,000 people receive #wildfires evacuation or...",1,13000 people receive wildfire evacuation order...
4,7,,,Just got sent this photo from Ruby #Alaska as ...,1,get send photo ruby alaska smoke wildfire pour...


# Create and test the Models

## Find the best parameters

In [None]:
# Split the training set so I can test my model and improve it before making predictions on the test data

X_train, X_test, y_train, y_test = train_test_split(
    df_train['processed_text'],
    df_train['target'],
    test_size=0.2,
    random_state=42,
    stratify=df_train['target']
)

In [None]:
# Create a pipeline with the vectorizer and the model

model_rfc = Pipeline([
    ('vectorizer', TfidfVectorizer()),
    ('classifier', RandomForestClassifier())
])

In [None]:
# Use gridsearch to tune the hyperparameters for the RandomForestClassifier Model
param_grid = {
    'vectorizer__ngram_range' : [(1,1), (1,2)],
    'vectorizer__min_df': [1, 2, 3],
    'classifier__n_estimators': [100, 200, 500],
    'classifier__max_depth': [10, 20, 30, None],
    'classifier__min_samples_split': [2, 5],
    'classifier__min_samples_leaf': [1, 2, 4],
    'classifier__max_features': ['sqrt', 'log2', None],
}

# I tried with GridSearchCV but it was taking several hours, so I changed it to the RandomizeSearchCV so I could move forward
grid_search = RandomizedSearchCV(model_rfc, param_distributions=param_grid, n_iter=100, cv=5, scoring='f1', n_jobs=-1, verbose=1, random_state=42)

grid_search.fit(X_train, y_train)

Fitting 5 folds for each of 100 candidates, totalling 500 fits


In [49]:
print("Best Parameters:", grid_search.best_params_)
print("Best Score:", grid_search.best_score_)

Best Parameters: {'vectorizer__ngram_range': (1, 1), 'vectorizer__min_df': 2, 'classifier__n_estimators': 500, 'classifier__min_samples_split': 5, 'classifier__min_samples_leaf': 1, 'classifier__max_features': 'log2', 'classifier__max_depth': None}
Best Score: 0.7194008241376485


## Create the model with the best parameters found

In [None]:
# Split the training set so I can test my model and improve it before making predictions on the test data

X_train, X_test, y_train, y_test = train_test_split(
    df_train['processed_text'],
    df_train['target'],
    test_size=0.2,
    random_state=42,
    stratify=df_train['target']
)

In [126]:
# Create a pipeline with the vectorizer and the model with the

model_rfc_final = Pipeline([
    ('vectorizer', TfidfVectorizer(ngram_range=(1,1), min_df=2)),
    ('classifier', RandomForestClassifier(n_estimators=500,min_samples_split=5, min_samples_leaf=1, max_features='log2', max_depth=None))
])

#model_rfc.fit(X_train, y_train) #Fit the model with training data

In [127]:
model_rfc_final.fit(X_train, y_train)

In [128]:
#make predicitons with the model
y_pred = model_rfc_final.predict(X_test)

In [129]:
y_pred[0]

np.int64(1)

In [130]:
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.80      0.89      0.84       869
           1       0.83      0.70      0.76       654

    accuracy                           0.81      1523
   macro avg       0.81      0.80      0.80      1523
weighted avg       0.81      0.81      0.81      1523



## Create a Model with keywords included

In [131]:
# Split the training set so I can test my model and improve it before making predictions on the test data

X_train, X_test, y_train, y_test = train_test_split(
    df_train[['processed_text', 'keyword']],
    df_train['target'],
    test_size=0.2,
    random_state=42,
    stratify=df_train['target']
)

In [132]:
X_train['keyword'].fillna('', inplace=True)
X_test['keyword'].fillna('', inplace=True)

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  X_train['keyword'].fillna('', inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  X_test['keyword'].fillna('', inplace=True)


In [133]:
X_train['keyword'].isna().value_counts()

keyword
False    6090
Name: count, dtype: int64

In [None]:
# Create a pipeline to vectorizer the text and the keywords
columns_preprocessor = ColumnTransformer(
    transformers=[
        ('text_transformer', TfidfVectorizer(ngram_range=(1,1), min_df=2), 'processed_text'),
        ('keywords_transformer', CountVectorizer(), 'keyword')
    ]
)

In [None]:
# Create a pipeline with the vectorizers processros and the random forest classifier

model_rfc_keywords = Pipeline([
    ('column_processor', columns_preprocessor),
    ('classifier', RandomForestClassifier(n_estimators=500,min_samples_split=5, min_samples_leaf=1, max_features='log2', max_depth=None))
])

In [136]:
model_rfc_keywords.fit(X_train, y_train)

In [137]:
y_pred = model_rfc_keywords.predict(X_test)

In [138]:
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.79      0.90      0.84       869
           1       0.83      0.69      0.75       654

    accuracy                           0.81      1523
   macro avg       0.81      0.79      0.80      1523
weighted avg       0.81      0.81      0.80      1523



# Using our best model

In [141]:
# Prepare all the training data to fit the model
df_train_best = df_train[['processed_text', 'keyword']]
df_train_best = df_train_best.fillna('')

In [None]:
# Create a pipeline to vectorizer the text and the keywords
columns_preprocessor = ColumnTransformer(
    transformers=[
        ('text_transformer', TfidfVectorizer(ngram_range=(1,1), min_df=2), 'processed_text'),
        ('keywords_transformer', CountVectorizer(), 'keyword')
    ]
)

In [None]:
# Create a pipeline with the vectorizers processros and the random forest classifier

model_best = Pipeline([
    ('column_processor', columns_preprocessor),
    ('classifier', RandomForestClassifier(n_estimators=500,min_samples_split=5, min_samples_leaf=1, max_features='log2', max_depth=None))
])

In [146]:
# Fit the model with all training data

model_best.fit(df_train_best, df_train['target'])

In [149]:
def make_predictions(df, output='submission_file.csv'):

    ids = df['id'].values

    df['processed_text'] = df['text'].apply(preprocess) #preprocess the text
    df_final = df[['processed_text', 'keyword']].copy() #save a DF with only the relevant columns
    df_final = df_final.fillna('') #fill null values in order to avoid errors when feeding the model

    predictions = model_best.predict(df_final)

    df_predictions = pd.DataFrame({
        'id': ids,
        'target': predictions
    })

    df_predictions.to_csv(output, index=False)

    return df_predictions

In [150]:
make_predictions(df_test)

Unnamed: 0,id,target
0,0,0
1,2,1
2,3,1
3,9,1
4,11,1
...,...,...
3258,10861,1
3259,10865,1
3260,10868,1
3261,10874,1
