### Text classification using spacy and support vector machine


In [72]:
import numpy as np
import pandas as pd
import torch
import string
import statistics
import nltk
import spacy
from sklearn.model_selection import StratifiedKFold
from sklearn.svm import SVC
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import StandardScaler

In [45]:
nltk.download('stopwords')
nltk.download('wordnet')

[nltk_data] Downloading package stopwords to
[nltk_data]     /home/bk_anupam/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     /home/bk_anupam/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

### Configuration for training

In [46]:
class MODEL_EVAL_METRIC:
    accuracy = "accuracy"
    f1_score = "f1_score"

class Config:    
    EMB_SIZE = 300    
    NUM_FOLDS = 5
    NUM_EPOCHS = 20        

DATA_PATH = "./data/"

Global seed set to 42


### Load the data

In [47]:
df_train = pd.read_csv('./data/train.csv')
df_test = pd.read_csv('./data/test.csv')
print(f"Rows in train.csv = {len(df_train)}")
print(f"Rows in test.csv = {len(df_test)}")
pd.set_option('display.max_colwidth', None)
df_train.head()

Rows in train.csv = 7613
Rows in test.csv = 3263


Unnamed: 0,id,keyword,location,text,target
0,1,,,Our Deeds are the Reason of this #earthquake May ALLAH Forgive us all,1
1,4,,,Forest fire near La Ronge Sask. Canada,1
2,5,,,All residents asked to 'shelter in place' are being notified by officers. No other evacuation or shelter in place orders are expected,1
3,6,,,"13,000 people receive #wildfires evacuation orders in California",1
4,7,,,Just got sent this photo from Ruby #Alaska as smoke from #wildfires pours into a school,1


### Some EDA

In [48]:
df_train_pos = df_train[df_train.target == 1]
df_train_neg = df_train[df_train.target == 0]
print(f"No. of positive training examples = {len(df_train_pos)}")
print(f"No. of negative training examples = {len(df_train_neg)}")
train_keywords_unique = df_train.keyword.unique()
print(f"No. of unique keywords = {len(train_keywords_unique)}")
df_train_notnull_keywords = df_train[~df_train.keyword.isnull()]
print(f"No of train examples with keyword not null = {len(df_train_notnull_keywords)}")

No. of positive training examples = 3271
No. of negative training examples = 4342
No. of unique keywords = 222
No of train examples with keyword not null = 7552


In [49]:
train_tweet_vectors = None
test_tweet_vectors = None
nlp = spacy.load("en_core_web_lg")
with nlp.disable_pipes():
    train_tweet_vectors = np.array([nlp(row.text).vector for id, row in df_train.iterrows()])
    test_tweet_vectors = np.array([nlp(row.text).vector for id, row in df_test.iterrows()])

In [50]:
train_tweet_vectors.shape

(7613, 300)

In [51]:
train_targets = df_train["target"]
vec_mean = train_tweet_vectors.mean(axis=0)
vec_std = train_tweet_vectors.std(axis=0)
print(vec_mean.shape, vec_std.shape)

(300,) (300,)


### K Fold CV

In [52]:
# for a training and label data in form of numpy arrays, return a fold_index array whose elements
# represent the fold index. The length of this fold_index array is same as length of input dataset
# and the items for which fold_index array value == cv iteration count are to be used for validation 
# in the corresponding cross validation iteration with rest of the items ( for which fold_index 
# array value != cv iteration count ) being used for training (typical ration being 80:20)
def get_skf_index(num_folds, X, y):
    skf = StratifiedKFold(n_splits=num_folds, shuffle=True, random_state = 42)
    train_fold_index = np.zeros(len(y))
    for fold, (train_index, val_index) in enumerate(skf.split(X=X, y=y)):
        train_fold_index[val_index] = [fold + 1] * len(val_index)
    return train_fold_index

k_folds = get_skf_index(num_folds=Config.NUM_FOLDS, X=train_tweet_vectors, y=train_targets)

### Model building starts from here

In [53]:
# from allennlp.modules.elmo import Elmo, batch_to_ids

# elmo = Elmo(
#     options_file="./.vector_cache/elmo_2x4096_512_2048cnn_2xhighway_5.5B_options.json",
#     weight_file="./.vector_cache/elmo_2x4096_512_2048cnn_2xhighway_5.5B_weights.hdf5",
#     num_output_representations=1
#     )

# sentences = [['First', 'sentence', '.'], ['Another', '.']]
# character_ids = batch_to_ids(sentences)

# embeddings = elmo(character_ids)    
# character_ids[0][1]
# embeddings['elmo_representations'][0].shape

### Get train and validation data for a fold

In [60]:
# Get the train and validation data loaders for a specific fold. 
# X: numpy array of input features
# y: numpy array of target labels
# fold: fold index for which to create data loaders                                     
# kfolds: Array that marks each of the data items as belonging to a specific fold
def get_fold_data(fold, kfolds, X, y):
    fold += 1                         
    train_X = X[kfolds != fold]        
    train_y = y[kfolds != fold]    
    val_X = X[kfolds == fold]
    val_y = y[kfolds == fold]    
    return train_X, train_y, val_X, val_y

In [94]:
from sklearn.metrics import f1_score, accuracy_score

def run_training(train_X, train_y, val_X, val_y, params):
    # Create the SVC model
    model = SVC(gamma='scale', C=params["C"], kernel=params["kernel"])
    # model = LogisticRegression(solver="liblinear", n_jobs=-1, max_iter=200)
    scaler = StandardScaler()
    train_X_scaled = scaler.fit_transform(train_X.astype(np.float32))
    val_X_scaled = scaler.fit_transform(val_X.astype(np.float32))
    model.fit(train_X_scaled, train_y.ravel())
    val_y_pred = model.predict(val_X_scaled)
    return f1_score(val_y, val_y_pred), model

In [95]:
# Wrapper method to run training for hyperparameter optimization as in this case the function 
# to be optimized needs to return one float value
def hyperparam_tune_run(train_X, train_y, val_X, val_y, params):
    max_val_acc, _ = run_training(train_X, train_y, val_X, val_y, params)
    return max_val_acc

In [97]:
import optuna

train_X, train_y, val_X, val_y = get_fold_data(0, k_folds, train_tweet_vectors, train_targets)

def objective(trial):   
    kernel_list = ['linear', 'poly', 'rbf', 'sigmoid', 'precomputed']
    params = {        
        "C": trial.suggest_loguniform("C", 1e-3, 1),
        "kernel": "rbf"      
    }
    loss = hyperparam_tune_run(train_X, train_y, val_X, val_y, params)
    trial_num = trial.number
    print(f"val accuracy at end of trial {trial_num} execution = {loss}")
    print(f"trial {trial_num} params = {trial.params}")
    return loss

study = optuna.create_study(direction="maximize", study_name="DisasterModelTuning")    
study.optimize(objective, n_trials=20)
print("Best trial:")
print(study.best_params)

[32m[I 2022-01-11 22:28:21,569][0m A new study created in memory with name: DisasterModelTuning[0m
[32m[I 2022-01-11 22:28:32,744][0m Trial 0 finished with value: 0.757396449704142 and parameters: {'C': 0.2337254707085589}. Best is trial 0 with value: 0.757396449704142.[0m


val accuracy at end of trial 0 execution = 0.757396449704142
trial 0 params = {'C': 0.2337254707085589}


[32m[I 2022-01-11 22:28:43,159][0m Trial 1 finished with value: 0.7720164609053498 and parameters: {'C': 0.9173821277623223}. Best is trial 1 with value: 0.7720164609053498.[0m


val accuracy at end of trial 1 execution = 0.7720164609053498
trial 1 params = {'C': 0.9173821277623223}


[32m[I 2022-01-11 22:28:56,458][0m Trial 2 finished with value: 0.0 and parameters: {'C': 0.0047665396027541706}. Best is trial 1 with value: 0.7720164609053498.[0m


val accuracy at end of trial 2 execution = 0.0
trial 2 params = {'C': 0.0047665396027541706}


[32m[I 2022-01-11 22:29:04,820][0m Trial 3 finished with value: 0.7721943048576214 and parameters: {'C': 0.5034080505590898}. Best is trial 3 with value: 0.7721943048576214.[0m


val accuracy at end of trial 3 execution = 0.7721943048576214
trial 3 params = {'C': 0.5034080505590898}


[32m[I 2022-01-11 22:29:17,592][0m Trial 4 finished with value: 0.669208770257388 and parameters: {'C': 0.01634180920054303}. Best is trial 3 with value: 0.7721943048576214.[0m


val accuracy at end of trial 4 execution = 0.669208770257388
trial 4 params = {'C': 0.01634180920054303}


[32m[I 2022-01-11 22:29:26,216][0m Trial 5 finished with value: 0.758679085520745 and parameters: {'C': 0.20651472760774242}. Best is trial 3 with value: 0.7721943048576214.[0m


val accuracy at end of trial 5 execution = 0.758679085520745
trial 5 params = {'C': 0.20651472760774242}


[32m[I 2022-01-11 22:29:34,355][0m Trial 6 finished with value: 0.7703952901597982 and parameters: {'C': 0.4285766069027228}. Best is trial 3 with value: 0.7721943048576214.[0m


val accuracy at end of trial 6 execution = 0.7703952901597982
trial 6 params = {'C': 0.4285766069027228}


[32m[I 2022-01-11 22:29:45,256][0m Trial 7 finished with value: 0.734375 and parameters: {'C': 0.07144188631357269}. Best is trial 3 with value: 0.7721943048576214.[0m


val accuracy at end of trial 7 execution = 0.734375
trial 7 params = {'C': 0.07144188631357269}


[32m[I 2022-01-11 22:29:58,969][0m Trial 8 finished with value: 0.009118541033434652 and parameters: {'C': 0.00737967471670447}. Best is trial 3 with value: 0.7721943048576214.[0m


val accuracy at end of trial 8 execution = 0.009118541033434652
trial 8 params = {'C': 0.00737967471670447}


[32m[I 2022-01-11 22:30:11,251][0m Trial 9 finished with value: 0.6335650446871897 and parameters: {'C': 0.014310678314680612}. Best is trial 3 with value: 0.7721943048576214.[0m


val accuracy at end of trial 9 execution = 0.6335650446871897
trial 9 params = {'C': 0.014310678314680612}


[32m[I 2022-01-11 22:30:23,885][0m Trial 10 finished with value: 0.0 and parameters: {'C': 0.0013430345203368644}. Best is trial 3 with value: 0.7721943048576214.[0m


val accuracy at end of trial 10 execution = 0.0
trial 10 params = {'C': 0.0013430345203368644}


[32m[I 2022-01-11 22:30:32,009][0m Trial 11 finished with value: 0.7720164609053498 and parameters: {'C': 0.8935519349710016}. Best is trial 3 with value: 0.7721943048576214.[0m


val accuracy at end of trial 11 execution = 0.7720164609053498
trial 11 params = {'C': 0.8935519349710016}


[32m[I 2022-01-11 22:30:40,796][0m Trial 12 finished with value: 0.7538461538461538 and parameters: {'C': 0.11807954138264164}. Best is trial 3 with value: 0.7721943048576214.[0m


val accuracy at end of trial 12 execution = 0.7538461538461538
trial 12 params = {'C': 0.11807954138264164}


[32m[I 2022-01-11 22:30:48,406][0m Trial 13 finished with value: 0.7713815789473685 and parameters: {'C': 0.9234546662253692}. Best is trial 3 with value: 0.7721943048576214.[0m


val accuracy at end of trial 13 execution = 0.7713815789473685
trial 13 params = {'C': 0.9234546662253692}


[32m[I 2022-01-11 22:30:57,431][0m Trial 14 finished with value: 0.7294938917975567 and parameters: {'C': 0.05700734431694458}. Best is trial 3 with value: 0.7721943048576214.[0m


val accuracy at end of trial 14 execution = 0.7294938917975567
trial 14 params = {'C': 0.05700734431694458}


[32m[I 2022-01-11 22:31:05,330][0m Trial 15 finished with value: 0.7645569620253164 and parameters: {'C': 0.33916816652095405}. Best is trial 3 with value: 0.7721943048576214.[0m


val accuracy at end of trial 15 execution = 0.7645569620253164
trial 15 params = {'C': 0.33916816652095405}


[32m[I 2022-01-11 22:31:14,028][0m Trial 16 finished with value: 0.7737704918032786 and parameters: {'C': 0.9862963148333224}. Best is trial 16 with value: 0.7737704918032786.[0m


val accuracy at end of trial 16 execution = 0.7737704918032786
trial 16 params = {'C': 0.9862963148333224}


[32m[I 2022-01-11 22:31:21,577][0m Trial 17 finished with value: 0.7720773759461733 and parameters: {'C': 0.4523207690834975}. Best is trial 16 with value: 0.7737704918032786.[0m


val accuracy at end of trial 17 execution = 0.7720773759461733
trial 17 params = {'C': 0.4523207690834975}


[32m[I 2022-01-11 22:31:30,258][0m Trial 18 finished with value: 0.7443868739205527 and parameters: {'C': 0.0970245401646819}. Best is trial 16 with value: 0.7737704918032786.[0m


val accuracy at end of trial 18 execution = 0.7443868739205527
trial 18 params = {'C': 0.0970245401646819}


[32m[I 2022-01-11 22:31:42,710][0m Trial 19 finished with value: 0.7121076233183856 and parameters: {'C': 0.03199382878922463}. Best is trial 16 with value: 0.7737704918032786.[0m


val accuracy at end of trial 19 execution = 0.7121076233183856
trial 19 params = {'C': 0.03199382878922463}
Best trial:
{'C': 0.9862963148333224}


In [96]:
fold_metrics_model = []
params = {"C": 1.0, "kernel": "rbf"}
for fold in range(Config.NUM_FOLDS):
    train_tweet_vectors = train_tweet_vectors - vec_mean
    train_X, train_y, val_X, val_y = get_fold_data(fold, k_folds, train_tweet_vectors, train_targets)    
    fold_val_metric, fold_model = run_training(train_X, train_y, val_X, val_y, params)
    fold_metrics_model.append((fold_val_metric, fold_model))

fold_metrics = [item[0] for item in fold_metrics_model]
print(fold_metrics)

[0.7744052502050862, 0.7631133671742809, 0.7426655490360435, 0.7699999999999999, 0.7878289473684211]


In [85]:
fold_metrics_model_sorted = sorted(fold_metrics_model, key=lambda x:x[0], reverse=True)    

In [86]:
best_model = fold_metrics_model_sorted[0][1]
scaler = StandardScaler()
test_tweet_vectors_scaled = scaler.fit_transform(test_tweet_vectors.astype(np.float32))
predictions = best_model.predict(test_tweet_vectors_scaled)
print(f"Completed prediction for {len(predictions)} test rows")
df_submission = pd.read_csv(DATA_PATH + 'submission.csv')
df_submission['target']= predictions
df_submission.to_csv('submission_svc.csv',index=False)

Completed prediction for 3263 test rows
