# import packages

In [1]:
import xgboost as xgb
import transformers as ppb  # pytorch transformers
import torch.nn as nn
import torch
import time
import string
import seaborn as sns
import re
import pickle
import pandas as pd
import os
import numpy as np
import nltk
import matplotlib.pyplot as plt
import lightgbm as lgb
import emoji
import catboost as catboost
from wordcloud import WordCloud
from transformers import BertModel
from transformers import BertTokenizer
from transformers import BertForSequenceClassification, AdamW
from torch.utils.data import Dataset, TensorDataset, DataLoader, SequentialSampler, RandomSampler
from torch.nn.utils.rnn import pad_sequence
from sklearn.neural_network import MLPClassifier
from sklearn.model_selection import train_test_split
from sklearn.model_selection import cross_val_score
from sklearn.linear_model import LogisticRegression
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.ensemble import RandomForestClassifier
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
from nltk.corpus import wordnet
from nltk.corpus import stopwords
from datetime import datetime

start_time = datetime.now()
# nltk.download('wordnet')
# nltk.download('stopwords')
# nltk.download('punkt')
# nltk.download('averaged_perceptron_tagger')

# Data preprocessing

In [2]:
sc = pd.read_csv("Suicide_Detection.csv", index_col=0)

In [3]:
sc.head()

Unnamed: 0,text,class
2,Ex Wife Threatening SuicideRecently I left my ...,suicide
3,Am I weird I don't get affected by compliments...,non-suicide
4,Finally 2020 is almost over... So I can never ...,non-suicide
8,i need helpjust help me im crying so hard,suicide
9,"I‚Äôm so lostHello, my name is Adam (16) and I‚Äôv...",suicide


In [4]:
sc['label'] = [1 if i == 'suicide' else 0 for i in sc['class']]

## Translate emoji to English discription

In [5]:
def emoji_change(str):
    result = emoji.demojize(str, delimiters='  ')
    return result


sc['text_deemoji'] = sc['text'].apply(emoji_change)
sc

Unnamed: 0,text,class,label,text_deemoji
2,Ex Wife Threatening SuicideRecently I left my ...,suicide,1,Ex Wife Threatening SuicideRecently I left my ...
3,Am I weird I don't get affected by compliments...,non-suicide,0,Am I weird I don't get affected by compliments...
4,Finally 2020 is almost over... So I can never ...,non-suicide,0,Finally 2020 is almost over... So I can never ...
8,i need helpjust help me im crying so hard,suicide,1,i need helpjust help me im crying so hard
9,"I‚Äôm so lostHello, my name is Adam (16) and I‚Äôv...",suicide,1,"I‚Äôm so lostHello, my name is Adam (16) and I‚Äôv..."
...,...,...,...,...
348103,If you don't like rock then your not going to ...,non-suicide,0,If you don't like rock then your not going to ...
348106,You how you can tell i have so many friends an...,non-suicide,0,You how you can tell i have so many friends an...
348107,pee probably tastes like salty teaüòèüí¶‚ÄºÔ∏è can som...,non-suicide,0,pee probably tastes like salty tea smirking_fa...
348108,The usual stuff you find hereI'm not posting t...,suicide,1,The usual stuff you find hereI'm not posting t...


## Remove stopwords

In [6]:
# Remove stop_words

stop_words = set(
    stopwords.words('english') +
    [".", '.', ",", ":", "''", "'s", "'", "``", "^", "(", ")", "-"])

# Removed the stopwords
stop_removed_list = []
for line in sc['text_deemoji']:
    tokens = nltk.word_tokenize(line)
    stopword_removed = [
        token for token in tokens if token.lower() not in stop_words
    ]
    removed_sent = (" ").join(stopword_removed)
    stop_removed_list.append(removed_sent)
# append stopwords to a new column
sc['Text'] = stop_removed_list

Unnamed: 0,text,class,label,text_deemoji,Text
2,Ex Wife Threatening SuicideRecently I left my ...,suicide,1,Ex Wife Threatening SuicideRecently I left my ...,Ex Wife Threatening SuicideRecently left wife ...
3,Am I weird I don't get affected by compliments...,non-suicide,0,Am I weird I don't get affected by compliments...,weird n't get affected compliments coming some...
4,Finally 2020 is almost over... So I can never ...,non-suicide,0,Finally 2020 is almost over... So I can never ...,Finally 2020 almost ... never hear 2020 bad ye...
8,i need helpjust help me im crying so hard,suicide,1,i need helpjust help me im crying so hard,need helpjust help im crying hard
9,"I‚Äôm so lostHello, my name is Adam (16) and I‚Äôv...",suicide,1,"I‚Äôm so lostHello, my name is Adam (16) and I‚Äôv...",‚Äô lostHello name Adam 16 ‚Äô struggling years ‚Äô ...
...,...,...,...,...,...
348103,If you don't like rock then your not going to ...,non-suicide,0,If you don't like rock then your not going to ...,n't like rock going get anything go https //mu...
348106,You how you can tell i have so many friends an...,non-suicide,0,You how you can tell i have so many friends an...,tell many friends lonely everything deprived ?...
348107,pee probably tastes like salty teaüòèüí¶‚ÄºÔ∏è can som...,non-suicide,0,pee probably tastes like salty tea smirking_fa...,pee probably tastes like salty tea smirking_fa...
348108,The usual stuff you find hereI'm not posting t...,suicide,1,The usual stuff you find hereI'm not posting t...,usual stuff find hereI 'm posting sympathy pit...


## Regex

In [7]:
# Regex


def regex_clean(comment, ree, new):
    line = re.sub(ree, new, comment, flags=re.IGNORECASE)
    return line


re1 = r"\_"
re2 = r"filler"
re3 = r"\b(?:work[\w]+|jobs?|career|intern(ship)?|position)\b"
re4 = r"\b(?:co(\-)?worker|interpersonal|managers?|boss|supervisor|colleague|employees?|staffs?|network)\b"
sc['new_Text'] = sc['Text'].apply(regex_clean, ree=re1, new=' ')
sc['new_Text'] = sc['new_Text'].apply(regex_clean, ree=re2, new=' ')
sc['new_Text'] = sc['new_Text'].apply(regex_clean, ree=re3, new=' _WORK_ ')
sc['new_Text'] = sc['new_Text'].apply(regex_clean,
                                      ree=re4,
                                      new=' _INTERPERSONAL_ ')

## Lemmatization

In [8]:
# Lemmatization
lemmatizer = WordNetLemmatizer()


# function to convert nltk tag to wordnet tag
def nltk_tag_to_wordnet_tag(nltk_tag):
    if nltk_tag.startswith('J'):
        return wordnet.ADJ
    elif nltk_tag.startswith('V'):
        return wordnet.VERB
    elif nltk_tag.startswith('N'):
        return wordnet.NOUN
    elif nltk_tag.startswith('R'):
        return wordnet.ADV
    else:
        return None


def lemmatize_sentence(sentence):
    #tokenize the sentence and find the POS tag for each token
    nltk_tagged = nltk.pos_tag(nltk.word_tokenize(
        sentence.lower()))  # I add a lower() function here since it will
    #tuple of (token, wordnet_tag)                                   # do a bad job with capital letters
    wordnet_tagged = [(x[0], nltk_tag_to_wordnet_tag(x[1]))
                      for x in nltk_tagged]
    lemmatized_sentence = []
    for word, tag in wordnet_tagged:
        if tag is None:
            #if there is no available tag, append the token as is
            lemmatized_sentence.append(word)
        else:
            #else use the tag to lemmatize the token
            lemmatized_sentence.append(lemmatizer.lemmatize(word, tag))
    return " ".join(lemmatized_sentence)


sc['lemm_Text'] = sc['new_Text'].apply(lemmatize_sentence)

## Topic Modeling

In [31]:
news_corpus_df = sc
news_corpus_df['text'] = sc['lemm_text']

In [32]:
# Regex
import re


def regex_clean(comment, ree, new):
    line = re.sub(ree, new, comment, flags=re.IGNORECASE)
    return line


# Mainly for removing '_' in the English discription after translating emojis
re1 = r"\_"

# Remove noise words
re2 = r"fil(l|t)er"
re3 = r"\b(youtube)|(reddit)|(www)|(com)|(amp)|(webp)|(https)|(x200b)|(pjpg)|(format)|(png)|(auto)\b"

# Translate oral language to written language
re4 = r"\b(wan( )?na)\b"
re5 = r"\b(gon( )?na)\b"

news_corpus_df['text'] = news_corpus_df['text'].apply(regex_clean,
                                                      ree=re1,
                                                      new=' ')
news_corpus_df['text'] = news_corpus_df['text'].apply(regex_clean,
                                                      ree=re2,
                                                      new=' ')
news_corpus_df['text'] = news_corpus_df['text'].apply(regex_clean,
                                                      ree=re3,
                                                      new=' ')
news_corpus_df['text'] = news_corpus_df['text'].apply(regex_clean,
                                                      ree=re4,
                                                      new=' want ')
news_corpus_df['text'] = news_corpus_df['text'].apply(regex_clean,
                                                      ree=re5,
                                                      new=' go ')

### Bigram

In [42]:
from sklearn.decomposition import NMF
from sklearn.feature_extraction.text import TfidfVectorizer

news_vectorizer = TfidfVectorizer(ngram_range=(2, 2),
                                  min_df=0.01,
                                  max_df=0.4,
                                  stop_words="english")

X_news, news_terms = news_vectorizer.fit_transform(
    news_corpus_df.text), news_vectorizer.get_feature_names_out()

news_tf_idf = pd.DataFrame(X_news.toarray(), columns=news_terms)
print(f"News TF-IDF: {news_tf_idf.shape}")
print(news_tf_idf.head(5))

News TF-IDF: (232074, 45)
   anymore want  best friend  ca anymore  dont know  dont want  end life  \
0           0.0          0.0         0.0        0.0        0.0       0.0   
1           0.0          0.0         0.0        0.0        0.0       0.0   
2           0.0          0.0         0.0        0.0        0.0       0.0   
3           0.0          0.0         0.0        0.0        0.0       0.0   
4           0.0          0.0         0.0        0.0        0.0       0.0   

   family friends  feel bad  feel better  feel like  ...  want die  want end  \
0             0.0       0.0          0.0        0.0  ...       0.0       0.0   
1             0.0       0.0          0.0        0.0  ...       0.0       0.0   
2             0.0       0.0          0.0        0.0  ...       0.0       0.0   
3             0.0       0.0          0.0        0.0  ...       0.0       0.0   
4             0.0       0.0          0.0        0.0  ...       0.0       0.0   

   want feel  want kill  want know  

In [45]:
nmf = NMF(n_components=2)
W_news = nmf.fit_transform(X_news)
H_news = nmf.components_
print(f"Original shape of X news is {X_news.shape}")
print(f"Decomposed W news matrix is {W_news.shape}")
print(f"Decomposed H news matrix is {H_news.shape}")
from typing import List
import numpy as np


def get_top_tf_idf_tokens_for_topic(H: np.array,
                                    feature_names: List[str],
                                    num_top_tokens: int = 5):
    """
  Uses the H matrix (K components x M original features) to identify for each
  topic the most frequent tokens.
  """
    for topic, vector in enumerate(H):
        print(f"TOPIC {topic}\n")
        total = vector.sum()
        top_scores = vector.argsort()[::-1][:num_top_tokens]
        token_names = list(map(lambda idx: feature_names[idx], top_scores))
        strengths = list(map(lambda idx: vector[idx] / total, top_scores))

        for strength, token_name in zip(strengths, token_names):
            print(f"\b{token_name} ({round(strength * 100, 1)}%)\n")
        print(f"=" * 50)


print(f"Topics:\n\n")
get_top_tf_idf_tokens_for_topic(H_news, news_tf_idf.columns.tolist(), 10)



Original shape of X news is (232074, 45)
Decomposed W news matrix is (232074, 2)
Decomposed H news matrix is (2, 45)
Topics:


TOPIC 0

feel like (51.2%)

feels like (2.8%)

high school (2.0%)

felt like (1.9%)

makes feel (1.9%)

suicidal thoughts (1.8%)

like shit (1.7%)

years ago (1.5%)

best friend (1.4%)

long time (1.4%)

TOPIC 1

want die (38.0%)

really want (4.9%)

want kill (3.3%)

want live (3.3%)

feels like (2.2%)

suicidal thoughts (2.2%)

dont want (2.2%)

want end (2.2%)

high school (2.1%)

years ago (1.8%)



In [46]:
import numpy as np


def get_top_documents_for_each_topic(W: np.array,
                                     documents: List[str],
                                     num_docs: int = 5):
    sorted_docs = W.argsort(axis=0)[::-1]
    top_docs = sorted_docs[:num_docs].T
    per_document_totals = W.sum(axis=1)
    for topic, top_documents_for_topic in enumerate(top_docs):
        print(f"Topic {topic}")
        for doc in top_documents_for_topic:
            score = W[doc][topic]
            percent_about_topic = round(score / per_document_totals[doc] * 100,
                                        1)
            print(f"{percent_about_topic}%", documents[doc])
        print("=" * 50)

In [47]:
get_top_documents_for_each_topic(W_news,
                                 news_corpus_df.text.tolist(),
                                 num_docs=5)

Topic 0
100.0% 'm 16 feel like girlfriend might pregnant . Please help me.I ca n't handle father 16 , 'll ruin life feel like one way . 've using protection period almost 3 weeks late . Edit : 'm reading replies appreciate , thank . 're helping .
100.0% Spent day tearsThis way live need . n't feel like anything live .
100.0% Idk ‚Äô even writing thisI need say . ‚Äô feeling low long feel like stop . ‚Äô scared , course ‚Äô stand anymore . many people many worse problems ‚Äô tried hard accept life live seriously . head eternal fog , ‚Äô find motivation anything every single day . ‚Äô burden ‚Äô making everyone around unhappy ‚Äô really ‚Äô stand . want stop . ‚Äô  e give advice looked today overwhelmed . snapped . ‚Äô first attempt know regretted ‚Äô convince ‚Äô happy anymore ‚Äô . , ‚Äô made deal , 28 days nothing gets better ‚Äô . ‚Äô  forting strangest way . guess ‚Äô knowing things better either way . ‚Äô even know ‚Äô writing honestly guess need someone listen maybe . ‚Äô tired .


### Trigram

In [48]:
from sklearn.decomposition import NMF
from sklearn.feature_extraction.text import TfidfVectorizer

news_vectorizer = TfidfVectorizer(ngram_range=(3, 3),
                                  min_df=10,
                                  max_df=0.4,
                                  stop_words="english")

X_news, news_terms = news_vectorizer.fit_transform(
    news_corpus_df.text), news_vectorizer.get_feature_names_out()

news_tf_idf = pd.DataFrame(X_news.toarray(), columns=news_terms)
print(f"News TF-IDF: {news_tf_idf.shape}")
print(news_tf_idf.head(5))

News TF-IDF: (232074, 28020)
   000 000 000  000 000 billion  000 000 million  000 000 quadrillion  \
0          0.0              0.0              0.0                  0.0   
1          0.0              0.0              0.0                  0.0   
2          0.0              0.0              0.0                  0.0   
3          0.0              0.0              0.0                  0.0   
4          0.0              0.0              0.0                  0.0   

   000 000 trillion  000 billion day  000 miles away  000 million day  \
0               0.0              0.0             0.0              0.0   
1               0.0              0.0             0.0              0.0   
2               0.0              0.0             0.0              0.0   
3               0.0              0.0             0.0              0.0   
4               0.0              0.0             0.0              0.0   

   000 quadrillion day  000 student loans  ...  yr old female  yr old girl  \
0              

In [49]:
W_news = nmf.fit_transform(X_news)
H_news = nmf.components_
print(f"Original shape of X news is {X_news.shape}")
print(f"Decomposed W news matrix is {W_news.shape}")
print(f"Decomposed H news matrix is {H_news.shape}")



Original shape of X news is (232074, 28020)
Decomposed W news matrix is (232074, 2)
Decomposed H news matrix is (2, 28020)


In [50]:
from typing import List
import numpy as np


def get_top_tf_idf_tokens_for_topic(H: np.array,
                                    feature_names: List[str],
                                    num_top_tokens: int = 5):
    """
  Uses the H matrix (K components x M original features) to identify for each
  topic the most frequent tokens.
  """
    for topic, vector in enumerate(H):
        print(f"TOPIC {topic}\n")
        total = vector.sum()
        top_scores = vector.argsort()[::-1][:num_top_tokens]
        token_names = list(map(lambda idx: feature_names[idx], top_scores))
        strengths = list(map(lambda idx: vector[idx] / total, top_scores))

        for strength, token_name in zip(strengths, token_names):
            print(f"\b{token_name} ({round(strength * 100, 1)}%)\n")
        print(f"=" * 50)


print(f"Topics:\n\n")
get_top_tf_idf_tokens_for_topic(H_news, news_tf_idf.columns.tolist(), 10)

Topics:


TOPIC 0

smiling face sunglasses (57.3%)

sunglasses smiling face (7.7%)

face sunglasses smiling (7.7%)

cool smiling face (1.4%)

face tears joy (0.7%)

face smiling face (0.6%)

face steam nose (0.6%)

backhand index pointing (0.6%)

face sunglasses thumbs (0.5%)

shit smiling face (0.4%)

TOPIC 1

loudly crying face (42.3%)

face loudly crying (11.5%)

crying face loudly (10.6%)

face tears joy (0.8%)

face pensive face (0.5%)

face rolling eyes (0.5%)

face water pistol (0.4%)

crying face want (0.4%)

face steam nose (0.4%)

pensive face pensive (0.4%)



In [40]:
get_top_documents_for_each_topic(W_news,
                                 news_corpus_df.text.tolist(),
                                 num_docs=5)

Topic 0
100.0% Ive failed simp September Ive failed gf thats nice think smiling face with sunglasses moai moai smiling face with sunglasses moai smiling face with sunglasses smiling face with sunglasses moai smiling face with sunglasses
100.0% Guys got PTSD pensive face got Potential suck dick fire smiling face with sunglasses hundred points hundred points
100.0% u didnt buy bobux ur  go  die didnt buy bobux guys hold dont gf buy bobux smiling face with sunglasses
100.0% heard ‚Äô smart guys work ijk plane daily smiling face with sunglasses
100.0% might be e male stripper Imagine , get thousand money EVERY NIGHT . social circle pretty big , get lots sex thots , stripping shirt smiling face with sunglasses
Topic 1
100.0% total bruh moment loudly crying face grandpa uses voice text thing send messages mom saying something like ‚Äú hate pictures ‚Äù phone caught ‚Äú sounds good ‚Äù , ‚Äú hate ‚Äù . text got sent best friend sent back ‚Äú hate , wtf ‚Äù loudly crying face loudly crying fac

# Model Exploration - Logistic (Baseline), Random Forest, LightGBM, XGBoost, CatBoost

## Bert Tokenizer

In [10]:
model_class, tokenizer_class, pretrained_weights = (BertModel, BertTokenizer,
                                                    'bert-base-uncased')

In [11]:
tokenizer = tokenizer_class.from_pretrained(pretrained_weights,
                                            do_lower_case=True)
model = model_class.from_pretrained(pretrained_weights)

Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertModel: ['cls.predictions.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.seq_relationship.weight', 'cls.seq_relationship.bias', 'cls.predictions.decoder.weight', 'cls.predictions.transform.dense.bias']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [12]:
tokenized = sc['lemm_Text'].apply((lambda x: tokenizer.encode(
    x, add_special_tokens=True, padding=True, truncation=True)))

In [14]:
max_len = 0
for i in tokenized.values:
    if len(i) > max_len:
        max_len = len(i)

padded = np.array([i + [0] * (max_len - len(i)) for i in tokenized.values])

In [15]:
feature_data = pd.DataFrame(np.array(padded))

In [16]:
feature_data.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,502,503,504,505,506,507,508,509,510,511
0,101,4654,2564,15686,5920,2890,13013,2135,2681,2564,...,0,0,0,0,0,0,0,0,0,0
1,101,6881,1050,1005,1056,2131,5360,19394,2272,2619,...,0,0,0,0,0,0,0,0,0,0
2,101,2633,12609,2471,1012,1012,1012,2196,2963,12609,...,0,0,0,0,0,0,0,0,0,0
3,101,2342,2393,29427,2393,10047,5390,2524,102,0,...,0,0,0,0,0,0,0,0,0,0
4,101,1521,2439,18223,2080,2171,4205,2385,1521,5998,...,0,0,0,0,0,0,0,0,0,0


In [17]:
scsc = sc.reset_index()

In [19]:
model_data = feature_data.join(sc)
model_data.drop([
    'text_deemoji', 'text', 'class', 'Text', 'lemm_Text', 'new_Text', 'index'
],
                axis=1,
                inplace=True)

## Train/Test Sets Split

In [22]:
X = pd.DataFrame(model_data.loc[:, [i for i in range(0, 512)]])
Y = pd.DataFrame(model_data.loc[:, 'label'])

## Logistic (Baseline) - Accuracy 71.8% (7.21s)

In [29]:
%%time
X_trn, X_tst, Y_trn, Y_tst = train_test_split(X, Y, test_size=.3)

model = LogisticRegression()
model.fit(X_trn, Y_trn)
y_pred = model.predict(X_tst)
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score

print(confusion_matrix(Y_tst, y_pred))
print(accuracy_score(Y_tst, y_pred))

  y = column_or_1d(y, warn=True)
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


[[27185  7687]
 [11968 22783]]
0.7176938655329417
CPU times: user 26 s, sys: 3.96 s, total: 30 s
Wall time: 7.21 s


## Random Forest - Accuracy 77.1% (1 min 31 s)

In [32]:
%%time
#RF

X_trn, X_tst, Y_trn, Y_tst = train_test_split(X, Y, test_size=.3)

model = RandomForestClassifier()
model.fit(X_trn, Y_trn)
y_pred = model.predict(X_tst)
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score

print(confusion_matrix(Y_tst, y_pred))
print(accuracy_score(Y_tst, y_pred))



[[27420  7433]
 [ 8531 26239]]
0.7707079557042931
CPU times: user 1min 28s, sys: 1.55 s, total: 1min 29s
Wall time: 1min 31s


## LightGBM - Accuracy 82.2% (8.45s)

In [33]:
%%time
#LGBM

X_trn, X_tst, Y_trn, Y_tst = train_test_split(X, Y, test_size=.3)

model = lgb.LGBMClassifier()
model.fit(X_trn, Y_trn)
y_pred = model.predict(X_tst)
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score

print(confusion_matrix(Y_tst, y_pred))
print(accuracy_score(Y_tst, y_pred))

  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)


[[29223  5557]
 [ 6813 28030]]
0.8223288281171452
CPU times: user 41.3 s, sys: 4.62 s, total: 45.9 s
Wall time: 8.45 s


## XGBoost - Accuracy 85.7% (1 min 40 s)

In [34]:
%%time
#XGB

X_trn, X_tst, Y_trn, Y_tst = train_test_split(X, Y, test_size=.3)

model = xgb.XGBClassifier()
model.fit(X_trn, Y_trn)
y_pred = model.predict(X_tst)
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score

print(confusion_matrix(Y_tst, y_pred))
print(accuracy_score(Y_tst, y_pred))

  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)


[[30243  4361]
 [ 5569 29450]]
0.8573747181247576
CPU times: user 11min 30s, sys: 27.1 s, total: 11min 57s
Wall time: 1min 40s


## CatBoost - Accuracy 85.7% (1 min 45 s)

In [35]:
%%time
#CatB
X_trn, X_tst, Y_trn, Y_tst = train_test_split(X, Y, test_size=.3)

model = cat.CatBoostClassifier()
model.fit(X_trn, Y_trn)
y_pred = model.predict(X_tst)
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score

print(confusion_matrix(Y_tst, y_pred))
print(accuracy_score(Y_tst, y_pred))

Learning rate set to 0.090555
0:	learn: 0.6660412	total: 201ms	remaining: 3m 21s
1:	learn: 0.6440649	total: 319ms	remaining: 2m 39s
2:	learn: 0.6255121	total: 449ms	remaining: 2m 29s
3:	learn: 0.6109721	total: 574ms	remaining: 2m 22s
4:	learn: 0.5981821	total: 702ms	remaining: 2m 19s
5:	learn: 0.5860336	total: 822ms	remaining: 2m 16s
6:	learn: 0.5771437	total: 948ms	remaining: 2m 14s
7:	learn: 0.5698121	total: 1.06s	remaining: 2m 11s
8:	learn: 0.5626728	total: 1.18s	remaining: 2m 10s
9:	learn: 0.5575743	total: 1.3s	remaining: 2m 9s
10:	learn: 0.5526531	total: 1.42s	remaining: 2m 8s
11:	learn: 0.5457297	total: 1.54s	remaining: 2m 6s
12:	learn: 0.5416407	total: 1.65s	remaining: 2m 5s
13:	learn: 0.5376438	total: 1.76s	remaining: 2m 3s
14:	learn: 0.5344963	total: 1.89s	remaining: 2m 3s
15:	learn: 0.5318715	total: 2.01s	remaining: 2m 3s
16:	learn: 0.5286555	total: 2.14s	remaining: 2m 3s
17:	learn: 0.5265897	total: 2.27s	remaining: 2m 3s
18:	learn: 0.5247753	total: 2.39s	remaining: 2m 3s
19:

157:	learn: 0.4321440	total: 16.6s	remaining: 1m 28s
158:	learn: 0.4318991	total: 16.7s	remaining: 1m 28s
159:	learn: 0.4311032	total: 16.8s	remaining: 1m 28s
160:	learn: 0.4305187	total: 16.9s	remaining: 1m 28s
161:	learn: 0.4303383	total: 17s	remaining: 1m 28s
162:	learn: 0.4295332	total: 17.1s	remaining: 1m 27s
163:	learn: 0.4292957	total: 17.2s	remaining: 1m 27s
164:	learn: 0.4287811	total: 17.4s	remaining: 1m 27s
165:	learn: 0.4284887	total: 17.5s	remaining: 1m 28s
166:	learn: 0.4282356	total: 17.6s	remaining: 1m 27s
167:	learn: 0.4280464	total: 17.8s	remaining: 1m 27s
168:	learn: 0.4278839	total: 17.8s	remaining: 1m 27s
169:	learn: 0.4274566	total: 17.9s	remaining: 1m 27s
170:	learn: 0.4270726	total: 18.1s	remaining: 1m 27s
171:	learn: 0.4268577	total: 18.3s	remaining: 1m 27s
172:	learn: 0.4266816	total: 18.4s	remaining: 1m 27s
173:	learn: 0.4265225	total: 18.5s	remaining: 1m 27s
174:	learn: 0.4262924	total: 18.6s	remaining: 1m 27s
175:	learn: 0.4255487	total: 18.8s	remaining: 1m

314:	learn: 0.3873285	total: 33.7s	remaining: 1m 13s
315:	learn: 0.3871595	total: 33.8s	remaining: 1m 13s
316:	learn: 0.3865046	total: 34s	remaining: 1m 13s
317:	learn: 0.3863225	total: 34.1s	remaining: 1m 13s
318:	learn: 0.3859425	total: 34.2s	remaining: 1m 12s
319:	learn: 0.3858763	total: 34.3s	remaining: 1m 12s
320:	learn: 0.3857528	total: 34.4s	remaining: 1m 12s
321:	learn: 0.3856309	total: 34.5s	remaining: 1m 12s
322:	learn: 0.3853929	total: 34.6s	remaining: 1m 12s
323:	learn: 0.3852002	total: 34.7s	remaining: 1m 12s
324:	learn: 0.3850875	total: 34.8s	remaining: 1m 12s
325:	learn: 0.3850164	total: 34.8s	remaining: 1m 12s
326:	learn: 0.3846971	total: 35s	remaining: 1m 12s
327:	learn: 0.3845840	total: 35.1s	remaining: 1m 11s
328:	learn: 0.3840185	total: 35.2s	remaining: 1m 11s
329:	learn: 0.3839236	total: 35.2s	remaining: 1m 11s
330:	learn: 0.3838254	total: 35.3s	remaining: 1m 11s
331:	learn: 0.3837180	total: 35.4s	remaining: 1m 11s
332:	learn: 0.3835693	total: 35.5s	remaining: 1m 1

473:	learn: 0.3607653	total: 50.1s	remaining: 55.6s
474:	learn: 0.3606709	total: 50.2s	remaining: 55.5s
475:	learn: 0.3605053	total: 50.4s	remaining: 55.5s
476:	learn: 0.3603575	total: 50.5s	remaining: 55.3s
477:	learn: 0.3601808	total: 50.6s	remaining: 55.3s
478:	learn: 0.3599198	total: 50.7s	remaining: 55.2s
479:	learn: 0.3597858	total: 50.8s	remaining: 55s
480:	learn: 0.3596581	total: 51s	remaining: 55s
481:	learn: 0.3594426	total: 51.1s	remaining: 54.9s
482:	learn: 0.3593531	total: 51.2s	remaining: 54.8s
483:	learn: 0.3592801	total: 51.2s	remaining: 54.6s
484:	learn: 0.3591923	total: 51.3s	remaining: 54.5s
485:	learn: 0.3591287	total: 51.4s	remaining: 54.4s
486:	learn: 0.3589143	total: 51.6s	remaining: 54.3s
487:	learn: 0.3587460	total: 51.7s	remaining: 54.2s
488:	learn: 0.3586785	total: 51.8s	remaining: 54.1s
489:	learn: 0.3585609	total: 51.9s	remaining: 54s
490:	learn: 0.3584633	total: 52s	remaining: 53.9s
491:	learn: 0.3583221	total: 52.1s	remaining: 53.8s
492:	learn: 0.3582287	

633:	learn: 0.3418191	total: 1m 6s	remaining: 38.6s
634:	learn: 0.3417493	total: 1m 6s	remaining: 38.5s
635:	learn: 0.3416673	total: 1m 7s	remaining: 38.4s
636:	learn: 0.3415754	total: 1m 7s	remaining: 38.3s
637:	learn: 0.3414057	total: 1m 7s	remaining: 38.2s
638:	learn: 0.3412816	total: 1m 7s	remaining: 38s
639:	learn: 0.3411607	total: 1m 7s	remaining: 37.9s
640:	learn: 0.3410383	total: 1m 7s	remaining: 37.8s
641:	learn: 0.3409430	total: 1m 7s	remaining: 37.7s
642:	learn: 0.3408939	total: 1m 7s	remaining: 37.6s
643:	learn: 0.3408406	total: 1m 7s	remaining: 37.5s
644:	learn: 0.3407563	total: 1m 7s	remaining: 37.4s
645:	learn: 0.3406704	total: 1m 8s	remaining: 37.3s
646:	learn: 0.3405993	total: 1m 8s	remaining: 37.2s
647:	learn: 0.3403150	total: 1m 8s	remaining: 37.1s
648:	learn: 0.3402657	total: 1m 8s	remaining: 37s
649:	learn: 0.3401971	total: 1m 8s	remaining: 36.8s
650:	learn: 0.3401349	total: 1m 8s	remaining: 36.7s
651:	learn: 0.3400598	total: 1m 8s	remaining: 36.6s
652:	learn: 0.33

790:	learn: 0.3268288	total: 1m 22s	remaining: 21.9s
791:	learn: 0.3266282	total: 1m 22s	remaining: 21.8s
792:	learn: 0.3265380	total: 1m 23s	remaining: 21.7s
793:	learn: 0.3264482	total: 1m 23s	remaining: 21.6s
794:	learn: 0.3264441	total: 1m 23s	remaining: 21.5s
795:	learn: 0.3263487	total: 1m 23s	remaining: 21.4s
796:	learn: 0.3263066	total: 1m 23s	remaining: 21.3s
797:	learn: 0.3262621	total: 1m 23s	remaining: 21.1s
798:	learn: 0.3261644	total: 1m 23s	remaining: 21s
799:	learn: 0.3261113	total: 1m 23s	remaining: 20.9s
800:	learn: 0.3259943	total: 1m 23s	remaining: 20.8s
801:	learn: 0.3258240	total: 1m 23s	remaining: 20.7s
802:	learn: 0.3256917	total: 1m 24s	remaining: 20.6s
803:	learn: 0.3255815	total: 1m 24s	remaining: 20.5s
804:	learn: 0.3255212	total: 1m 24s	remaining: 20.4s
805:	learn: 0.3254663	total: 1m 24s	remaining: 20.3s
806:	learn: 0.3254165	total: 1m 24s	remaining: 20.2s
807:	learn: 0.3252063	total: 1m 24s	remaining: 20.1s
808:	learn: 0.3251159	total: 1m 24s	remaining: 2

948:	learn: 0.3155834	total: 1m 38s	remaining: 5.32s
949:	learn: 0.3155316	total: 1m 39s	remaining: 5.21s
950:	learn: 0.3154673	total: 1m 39s	remaining: 5.11s
951:	learn: 0.3153995	total: 1m 39s	remaining: 5s
952:	learn: 0.3153477	total: 1m 39s	remaining: 4.9s
953:	learn: 0.3152786	total: 1m 39s	remaining: 4.79s
954:	learn: 0.3152272	total: 1m 39s	remaining: 4.69s
955:	learn: 0.3151753	total: 1m 39s	remaining: 4.58s
956:	learn: 0.3151023	total: 1m 39s	remaining: 4.48s
957:	learn: 0.3150467	total: 1m 39s	remaining: 4.38s
958:	learn: 0.3149225	total: 1m 39s	remaining: 4.27s
959:	learn: 0.3148780	total: 1m 40s	remaining: 4.17s
960:	learn: 0.3147739	total: 1m 40s	remaining: 4.06s
961:	learn: 0.3146975	total: 1m 40s	remaining: 3.96s
962:	learn: 0.3146255	total: 1m 40s	remaining: 3.85s
963:	learn: 0.3145777	total: 1m 40s	remaining: 3.75s
964:	learn: 0.3145132	total: 1m 40s	remaining: 3.65s
965:	learn: 0.3144508	total: 1m 40s	remaining: 3.54s
966:	learn: 0.3143916	total: 1m 40s	remaining: 3.4

# Model Exploration - Bert

## Check GPU

In [4]:
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
print(device)

cuda:0


## Train/Validation Sets Split

In [19]:
df = sc
train_df, val_df = train_test_split(df,
                                    test_size=0.3,
                                    stratify=df.label,
                                    random_state=42)

## Define BertTokenizer and Data loader (Set truncation for long text)

In [35]:
class MNLIDataBert(Dataset):
    def __init__(self, train_df, val_df):
        self.label_dict = {0: 0, 1: 1}

        self.train_df = train_df
        self.val_df = val_df

        self.base_path = '/content/'
        self.tokenizer = BertTokenizer.from_pretrained('bert-base-uncased',
                                                       do_lower_case=True,
                                                       truncation=True)
        self.train_data = None
        self.val_data = None
        self.init_data()

    def init_data(self):
        # Saving takes too much RAM
        #
        # if os.path.exists(os.path.join(self.base_path, 'train_data.pkl')):
        #   print("Found training data")
        #   with open(os.path.join(self.base_path, 'train_data.pkl'), 'rb') as f:
        #     self.train_data = pickle.load(f)
        # else:
        #   self.train_data = self.load_data(self.train_df)
        #   with open(os.path.join(self.base_path, 'train_data.pkl'), 'wb') as f:
        #     pickle.dump(self.train_data, f)
        # if os.path.exists(os.path.join(self.base_path, 'val_data.pkl')):
        #   print("Found val data")
        #   with open(os.path.join(self.base_path, 'val_data.pkl'), 'rb') as f:
        #     self.val_data = pickle.load(f)
        # else:
        #   self.val_data = self.load_data(self.val_df)
        #   with open(os.path.join(self.base_path, 'val_data.pkl'), 'wb') as f:
        #     pickle.dump(self.val_data, f)
        self.train_data = self.load_data(self.train_df)
        self.val_data = self.load_data(self.val_df)

    def load_data(self, df):
        MAX_LEN = 50
        token_ids = []
        mask_ids = []
        seg_ids = []
        y = []

        premise_list = df['lemm_Text'].to_list()
        label_list = df['label'].to_list()

        for (premise, label) in zip(premise_list, label_list):
            premise_id = self.tokenizer.encode(premise,
                                               add_special_tokens=True,
                                               truncation=True,
                                               max_length=50)
            pair_token_ids = [self.tokenizer.cls_token_id
                              ] + premise_id + [self.tokenizer.sep_token_id]
            premise_len = len(premise_id)

            segment_ids = torch.tensor(
                [0] * (premise_len + 2))  # sentence 0 and sentence 1
            attention_mask_ids = torch.tensor(
                [1] * (premise_len + 2))  # mask padded values

            token_ids.append(torch.tensor(pair_token_ids))
            seg_ids.append(segment_ids)
            mask_ids.append(attention_mask_ids)
            y.append(self.label_dict[label])

        token_ids = pad_sequence(token_ids, batch_first=True)
        mask_ids = pad_sequence(mask_ids, batch_first=True)
        seg_ids = pad_sequence(seg_ids, batch_first=True)
        y = torch.tensor(y)
        dataset = TensorDataset(token_ids, mask_ids, seg_ids, y)
        print((len(dataset)))
        return dataset

    def get_data_loaders(self, batch_size=8, shuffle=True):
        train_loader = DataLoader(self.train_data,
                                  shuffle=shuffle,
                                  batch_size=batch_size)

        val_loader = DataLoader(self.val_data,
                                shuffle=shuffle,
                                batch_size=batch_size)

        return train_loader, val_loader

In [36]:
mnli_dataset = MNLIDataBert(train_df, val_df)

162447
69621


In [37]:
train_loader, val_loader = mnli_dataset.get_data_loaders(batch_size=32)

## Load Bert from pretrained

In [38]:
model = BertForSequenceClassification.from_pretrained("bert-base-uncased",
                                                      num_labels=2)
model.to(device)

Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertForSequenceClassification: ['cls.seq_relationship.weight', 'cls.seq_relationship.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.decoder.weight', 'cls.predictions.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.dense.bias']
- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at

BertForSequenceClassification(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(30522, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0): BertLayer(
          (attention): BertAttention(
            (self): BertSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e-12, element

## Parameters

In [39]:
param_optimizer = list(model.named_parameters())
no_decay = ['bias', 'gamma', 'beta']
optimizer_grouped_parameters = [{
    'params':
    [p for n, p in param_optimizer if not any(nd in n for nd in no_decay)],
    'weight_decay_rate':
    0.01
}, {
    'params':
    [p for n, p in param_optimizer if any(nd in n for nd in no_decay)],
    'weight_decay_rate':
    0.0
}]

## Optimizer - AdamW

In [40]:
# This variable contains all of the hyperparemeter information our training loop needs
optimizer = AdamW(optimizer_grouped_parameters, lr=2e-5, correct_bias=False)



## Number of Trainable Parameters - 109,483,778

In [41]:
def count_parameters(model):
    return sum(p.numel() for p in model.parameters() if p.requires_grad)


print(f'The model has {count_parameters(model):,} trainable parameters')

The model has 109,483,778 trainable parameters


## Define train and evaluation functions

In [42]:
def multi_acc(y_pred, y_test):
    acc = (torch.log_softmax(y_pred, dim=1).argmax(dim=1)
           == y_test).sum().float() / float(y_test.size(0))
    return acc

In [43]:
# criterion = nn.CrossEntropyLoss().to(device)
EPOCHS = 3


def train(model, train_loader, val_loader, optimizer):
    total_step = len(train_loader)

    for epoch in range(EPOCHS):
        start = time.time()
        model.train()
        total_train_loss = 0
        total_train_acc = 0
        for batch_idx, (pair_token_ids, mask_ids, seg_ids,
                        y) in enumerate(train_loader):
            optimizer.zero_grad()
            pair_token_ids = pair_token_ids.to(device)
            mask_ids = mask_ids.to(device)
            seg_ids = seg_ids.to(device)
            labels = y.to(device)
            # prediction = model(pair_token_ids, mask_ids, seg_ids)
            loss, prediction = list(
                model(pair_token_ids,
                      token_type_ids=seg_ids,
                      attention_mask=mask_ids,
                      labels=labels).values())

            # loss = criterion(prediction, labels.view(-1, 1))
            acc = multi_acc(prediction, labels)

            loss.backward()
            optimizer.step()

            total_train_loss += loss.item()
            total_train_acc += acc.item()

        train_acc = total_train_acc / len(train_loader)
        train_loss = total_train_loss / len(train_loader)
        model.eval()
        total_val_acc = 0
        total_val_loss = 0
        with torch.no_grad():
            for batch_idx, (pair_token_ids, mask_ids, seg_ids,
                            y) in enumerate(val_loader):
                optimizer.zero_grad()
                pair_token_ids = pair_token_ids.to(device)
                mask_ids = mask_ids.to(device)
                seg_ids = seg_ids.to(device)
                labels = y.to(device)

                # prediction = model(pair_token_ids, mask_ids, seg_ids)
                loss, prediction = list(
                    model(pair_token_ids,
                          token_type_ids=seg_ids,
                          attention_mask=mask_ids,
                          labels=labels).values())

                # loss = criterion(prediction, labels.view(-1, 1))
                acc = multi_acc(prediction, labels)

                total_val_loss += loss.item()
                total_val_acc += acc.item()

        val_acc = total_val_acc / len(val_loader)
        val_loss = total_val_loss / len(val_loader)
        end = time.time()
        hours, rem = divmod(end - start, 3600)
        minutes, seconds = divmod(rem, 60)

        print(
            f'Epoch {epoch+1}: train_loss: {train_loss:.4f} train_acc: {train_acc:.4f} | val_loss: {val_loss:.4f} val_acc: {val_acc:.4f}'
        )
        print(("{:0>2}:{:0>2}:{:05.2f}".format(int(hours), int(minutes),
                                               seconds)))

## Result - Accuracy 97.4% (28 min 23 s)

In [44]:
train(model, train_loader, val_loader, optimizer)

Epoch 1: train_loss: 0.1015 train_acc: 0.9620 | val_loss: 0.0754 val_acc: 0.9740
00:28:23.19
Epoch 2: train_loss: 0.0511 train_acc: 0.9817 | val_loss: 0.0768 val_acc: 0.9724
00:28:22.07
Epoch 3: train_loss: 0.0274 train_acc: 0.9906 | val_loss: 0.0907 val_acc: 0.9713
00:28:22.96


# Code Resource (Bert)

https://github.com/dh1105/Sentence-Entailment/blob/main/Sentence_Entailment_BERT.ipynb