In [None]:
! pip install transformers

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting transformers
  Downloading transformers-4.19.2-py3-none-any.whl (4.2 MB)
[K     |████████████████████████████████| 4.2 MB 8.7 MB/s 
Collecting pyyaml>=5.1
  Downloading PyYAML-6.0-cp37-cp37m-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_12_x86_64.manylinux2010_x86_64.whl (596 kB)
[K     |████████████████████████████████| 596 kB 51.1 MB/s 
Collecting huggingface-hub<1.0,>=0.1.0
  Downloading huggingface_hub-0.7.0-py3-none-any.whl (86 kB)
[K     |████████████████████████████████| 86 kB 5.5 MB/s 
[?25hCollecting tokenizers!=0.11.3,<0.13,>=0.11.1
  Downloading tokenizers-0.12.1-cp37-cp37m-manylinux_2_12_x86_64.manylinux2010_x86_64.whl (6.6 MB)
[K     |████████████████████████████████| 6.6 MB 45.8 MB/s 
Installing collected packages: pyyaml, tokenizers, huggingface-hub, transformers
  Attempting uninstall: pyyaml
    Found existing installation: PyYAML 3.13
    Uninstallin

In [None]:
import pandas as pd
import numpy as np
from tqdm.auto import tqdm
import tensorflow as tf
from transformers import BertTokenizer
from transformers import TFBertModel

In [None]:
# Mount Google Drive so the dataset stored under MyDrive is readable from this runtime.
from google.colab import drive
drive.mount('/content/drive')

# Load the tab-separated training set, then turn the string labels into integer class ids:
# "not depression" -> 0, "moderate" -> 1, "severe" -> 2.
df = pd.read_csv("/content/drive/MyDrive/ml/train_depression.tsv", sep='\t')
for class_id, class_name in enumerate(("not depression", "moderate", "severe")):
  df['Label'] = df['Label'].replace(class_name, class_id)
df.head(5)

Mounted at /content/drive


Unnamed: 0,PID,Text_data,Label
0,train_pid_1,Waiting for my mind to have a breakdown once t...,1
1,train_pid_2,My new years resolution : I'm gonna get my ass...,1
2,train_pid_3,New year : Somone else Feeling like 2020 will ...,1
3,train_pid_4,"My story I guess : Hi, Im from Germany and my ...",1
4,train_pid_5,Sat in the dark and cried myself going into th...,1


In [None]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 8891 entries, 0 to 8890
Data columns (total 3 columns):
 #   Column     Non-Null Count  Dtype 
---  ------     --------------  ----- 
 0   PID        8891 non-null   object
 1   Text_data  8891 non-null   object
 2   Label      8891 non-null   int64 
dtypes: int64(1), object(2)
memory usage: 208.5+ KB


In [None]:
df['Label'].value_counts()

1    6004
0    1985
2     902
Name: Label, dtype: int64

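Depression labels (integer codes assigned to the dataset's `Label` column above):
*   0 : no symptoms (`not depression`)
*   1 : moderate symptoms (`moderate`; the prediction messages at the end of this notebook call this class "mild")
*   2 : severe symptoms (`severe`)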


# **initialize tokenizer from the BERT model**

In [None]:
# Load the vocabulary/tokenizer of the 'bert-base-cased' checkpoint (the same checkpoint name is used for the model later).
tokenizer = BertTokenizer.from_pretrained('bert-base-cased')

Downloading:   0%|          | 0.00/208k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/29.0 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/570 [00:00<?, ?B/s]

In [None]:
df['Text_data'].iloc[1]

"My new years resolution : I'm gonna get my ass into a therapists office, and if I dont become even a little bit happy, then I'm not dealing with this shit anymore.\n\nI'm not asking for a lot, just a little bit of serotonin is all I want"

In [None]:
# Tokenize a single example to inspect what the tokenizer returns:
# the text is truncated/padded to exactly 256 positions and returned as TensorFlow tensors.
token = tokenizer.encode_plus(
    df['Text_data'].iloc[1],
    add_special_tokens = True,
    truncation = True,
    padding = 'max_length',
    max_length = 256,
    return_tensors = 'tf'
)

In [None]:
token

{'input_ids': <tf.Tensor: shape=(1, 256), dtype=int32, numpy=
array([[  101,  1422,  1207,  1201,  6021,   131,   146,   112,   182,
         6100,  1243,  1139,  3919,  1154,   170, 22573,  1116,  1701,
          117,  1105,  1191,   146,  1274,  1204,  1561,  1256,   170,
         1376,  2113,  2816,   117,  1173,   146,   112,   182,  1136,
         6705,  1114,  1142,  4170,  4169,   119,   146,   112,   182,
         1136,  4107,  1111,   170,  1974,   117,  1198,   170,  1376,
         2113,  1104, 14516, 10595, 11153,  1179,  1110,  1155,   146,
         1328,   102,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,
            0, 

# **make zero vectors/ arrays**

*The BERT model needs two inputs for every example: the `input_ids` and the `attention_mask`.*

In [None]:
# Pre-allocate one row per example and one column per token position (256 is the
# max_length passed to the tokenizer). Token ids and mask values are integers and the
# model's Input layers are declared as int32, so allocate int32 rather than NumPy's
# float64 default (half the memory, and no implicit float -> int cast at the model input).
X_input_ids = np.zeros((len(df), 256), dtype = np.int32)
X_attn_masks = np.zeros((len(df), 256), dtype = np.int32)

In [None]:
print("X_input_ids shape: ", X_input_ids.shape)
print("X_attn_masks shape: ", X_attn_masks.shape)

X_input_ids shape:  (8891, 256)
X_attn_masks shape:  (8891, 256)


# **generate training data**

We now tokenize every text with the `tokenizer.encode_plus()` method shown above and write the resulting token ids and attention masks into the `X_input_ids` and `X_attn_masks` zero arrays.

In [None]:
def generate_training_data(df, ids, masks, tokenizer):
  """Tokenize every text in df['Text_data'] and fill `ids` / `masks` in place.

  Args:
    df: DataFrame with a 'Text_data' column holding the raw texts.
    ids: pre-allocated 2-D array with one row per text; receives the token ids.
    masks: pre-allocated array of the same shape; receives the attention masks.
    tokenizer: object exposing `encode_plus` (here a BertTokenizer).

  Returns:
    The same `ids` and `masks` arrays, now populated row by row.
  """
  # Pad/truncate to the width of the destination array so the tokenizer length and the
  # buffer shape cannot drift apart (256 for the buffers allocated in this notebook).
  max_length = ids.shape[1]
  # enumerate() has no len(), so give tqdm the total explicitly; otherwise the bar
  # only shows a running counter ("0it") instead of real progress.
  for i, text in tqdm(enumerate(df['Text_data']), total = len(df)):
    tokenized_text = tokenizer.encode_plus(
        text,
        max_length = max_length,
        truncation = True,
        padding = 'max_length',
        add_special_tokens = True,
        return_tensors = 'tf'
    )
    ids[i, :] = tokenized_text.input_ids
    masks[i, :] = tokenized_text.attention_mask
  return ids, masks

In [None]:
# Fill the pre-allocated arrays; the function writes into them in place and returns the same objects.
X_input_ids, X_attn_masks = generate_training_data(df, X_input_ids, X_attn_masks, tokenizer)

0it [00:00, ?it/s]

Define the label array with shape (number of rows in `df`, number of classes).

In [None]:
labels = np.zeros((len(df), 3)) # one-hot encoded target matrix: one row per example, one column per class
# Example with 2 classes,
# cat (0) and dog (1):
# [
#    --> first image  [1, 0] (a cat, because the value at index 0 is 1),
#    --> second image [0, 1] (a dog, because the value at index 1 is 1)
# ]

In [None]:
labels.shape

(8891, 3)

In [None]:
# For every row i, set the column selected by that row's integer label to 1 (NumPy integer-array indexing).
labels[np.arange(len(df)), df['Label'].values] = 1

In [None]:
labels

array([[0., 1., 0.],
       [0., 1., 0.],
       [0., 1., 0.],
       ...,
       [0., 0., 1.],
       [0., 0., 1.],
       [0., 0., 1.]])

# **data creation step**
Build a `tf.data.Dataset` from the arrays with TensorFlow's `from_tensor_slices` utility.

In [None]:
# Slice the three arrays along their first axis: each dataset element is one (input_ids, attention_mask, label) triple.
dataset = tf.data.Dataset.from_tensor_slices((X_input_ids, X_attn_masks, labels))

In [None]:
dataset.take(1)

<TakeDataset element_spec=(TensorSpec(shape=(256,), dtype=tf.float64, name=None), TensorSpec(shape=(256,), dtype=tf.float64, name=None), TensorSpec(shape=(3,), dtype=tf.float64, name=None))>

Map each element into the structure the model expects: a dictionary of named inputs plus the label.



In [None]:
def DepressionDatasetMapFunction(input_ids, attn_masks, labels):
  """Repack one dataset element as a (features, labels) pair.

  The feature dictionary is keyed by 'input_ids' and 'attention_mask', which are
  the names given to the model's two Input layers; `labels` is passed through unchanged.
  """
  features = dict(input_ids = input_ids, attention_mask = attn_masks)
  return features, labels

In [None]:
# Apply the repacking function to every element. Note: this rebinds `dataset`, so the cell is meant to be run once per kernel session.
dataset = dataset.map(DepressionDatasetMapFunction)

In [None]:
dataset.take(1)

<TakeDataset element_spec=({'input_ids': TensorSpec(shape=(256,), dtype=tf.float64, name=None), 'attention_mask': TensorSpec(shape=(256,), dtype=tf.float64, name=None)}, TensorSpec(shape=(3,), dtype=tf.float64, name=None))>

**do data shuffling**

Data shuffling is a common step performed before model training so that the training and validation sets are both representative of the whole dataset.

Feel free to experiment with the training setup, e.g. the batch size or the number of epochs.

In [None]:
# Shuffle ONCE over the whole dataset and keep that order for every later iteration.
# - buffer_size = len(df) makes the shuffle uniform over all rows (a buffer of 8000 is smaller
#   than the dataset, so the last rows could never be drawn early).
# - reshuffle_each_iteration = False is essential: the take()/skip() split further down is
#   re-evaluated every epoch, and with the default (True) each epoch would draw a different
#   split, leaking validation examples into training.
# - a fixed seed makes the split reproducible across runs.
dataset = dataset.shuffle(len(df), seed = 42, reshuffle_each_iteration = False).batch(16, drop_remainder = True)

In [None]:
# Share of the batches used for training; the remaining batches are held out for validation.
training_percentage = 0.8
# len(df)//16 is the number of full batches of 16 (incomplete batches are dropped upstream);
# keep the integer part of 80% of them.
train_size = int(training_percentage * (len(df) // 16))

In [None]:
train_size

444

In [None]:
# The first train_size batches are used for training, all remaining batches for validation.
# NOTE(review): take()/skip() only yield disjoint, stable splits if `dataset` produces its
# batches in the same order on every iteration — confirm the upstream shuffle does not
# reshuffle between epochs.
training_dataset = dataset.take(train_size)
validation_dataset = dataset.skip(train_size)

## **Model Creation**

In [None]:
# Load the pretrained encoder from the same 'bert-base-cased' checkpoint the tokenizer came from.
bert_model = TFBertModel.from_pretrained('bert-base-cased')

Downloading:   0%|          | 0.00/502M [00:00<?, ?B/s]

Some layers from the model checkpoint at bert-base-cased were not used when initializing TFBertModel: ['nsp___cls', 'mlm___cls']
- This IS expected if you are initializing TFBertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing TFBertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
All the layers of TFBertModel were initialized from the model checkpoint at bert-base-cased.
If your task is similar to the task the model of the checkpoint was trained on, you can already use TFBertModel for predictions without further training.


In [None]:
# Two integer inputs of length 256; their names match the keys of the feature dict produced by the dataset.
input_ids = tf.keras.layers.Input(shape = (256,), name = 'input_ids', dtype = 'int32')
attention_masks = tf.keras.layers.Input(shape = (256,), name = 'attention_mask', dtype = 'int32')

# Run the inputs through the main layer of the BERT model loaded above
bert_embds = bert_model.bert(input_ids, attention_mask = attention_masks)[1] # the layer returns several outputs: index 0 is the last hidden state (per token), index 1 is the pooled output (one vector per text), which is used here
intermediate_layer = tf.keras.layers.Dense(512, activation = 'relu', name = 'intermediate_layer')(bert_embds) # takes the pooled BERT output as its input
output_layer = tf.keras.layers.Dense(3, activation = 'softmax', name = 'output_layer')(intermediate_layer) # softmax because we need a probability for each of the 3 classes; takes the intermediate layer as input

model = tf.keras.Model([input_ids, attention_masks], outputs = output_layer)
model.summary()

Model: "model"
__________________________________________________________________________________________________
 Layer (type)                   Output Shape         Param #     Connected to                     
 input_ids (InputLayer)         [(None, 256)]        0           []                               
                                                                                                  
 attention_mask (InputLayer)    [(None, 256)]        0           []                               
                                                                                                  
 bert (TFBertMainLayer)         TFBaseModelOutputWi  108310272   ['input_ids[0][0]',              
                                thPoolingAndCrossAt               'attention_mask[0][0]']         
                                tentions(last_hidde                                               
                                n_state=(None, 256,                                           

We can use more than one intermediate layer; feel free to experiment with that.
We can also add further layers such as batch normalization or dropout (see the Keras layers documentation if these are unfamiliar).


The next step is to define the loss function, the accuracy metric, and the optimizer.

*   https://www.tensorflow.org/api_docs/python/tf/keras/optimizers documentation of the optimizer algorithms in the tf.keras library
*   https://www.tensorflow.org/api_docs/python/tf/keras/losses documentation of the loss functions in the tf.keras library
*   https://www.tensorflow.org/api_docs/python/tf/keras/metrics/CategoricalAccuracy guide to the categorical accuracy metric in tf.keras



Loss function: we use categorical cross-entropy. Cross-entropy loss is what is minimized when the model weights are adjusted during training, i.e. the smaller the loss, the better the model fits the training targets.

Accuracy metric: we use the categorical accuracy metric. This metric creates two local variables, total and count, that are used to compute the frequency with which y_pred matches y_true. This frequency is ultimately returned as categorical accuracy: an idempotent operation that simply divides total by count.


In [None]:
# Adam with a small learning rate (1e-5) and a small learning-rate decay.
# NOTE(review): the `decay` keyword may not be accepted by newer Keras optimizer implementations — confirm against the installed TensorFlow version.
optim = tf.keras.optimizers.Adam(learning_rate = 1e-05, decay = 1e-06)
# The targets are one-hot vectors, hence categorical cross-entropy.
loss_func = tf.keras.losses.CategoricalCrossentropy()
# Categorical accuracy, reported under the name 'accuracy'.
acc_mtrx = tf.keras.metrics.CategoricalAccuracy('accuracy')

compile our model

In [None]:
# Attach the optimizer, loss and metric defined above to the model.
model.compile(optimizer = optim, loss = loss_func, metrics = [acc_mtrx])

Time to train the model on the training dataset and evaluate it on the validation dataset after every epoch.

https://stackoverflow.com/questions/37973005/what-do-model-predict-and-model-fit-do -> explanation of what model.fit and model.predict do

In [None]:
# Train for 10 epochs on the training batches; the validation batches are evaluated at the end of each epoch.
# `hist` keeps the object returned by fit().
hist = model.fit (
    training_dataset,
    validation_data = validation_dataset,
    epochs = 10
)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


Save the model so that we can later test it on raw text. To save the model we use the simple method provided by TensorFlow:

`model.save('name_of_our_file or the directory')`

e.g. `model.save('depression_analysis_model')`

In [None]:
# Write the trained model to a directory on the mounted Google Drive (the log below shows an assets/ folder being created inside it).
model.save('/content/drive/MyDrive/ml/depression_analysis_model')



INFO:tensorflow:Assets written to: /content/drive/MyDrive/ml/depression_analysis_model/assets


INFO:tensorflow:Assets written to: /content/drive/MyDrive/ml/depression_analysis_model/assets


load the model

In [None]:
# Reload the model from the directory it was saved to above; predictions below use this reloaded copy.
loaded_model = tf.keras.models.load_model('/content/drive/MyDrive/ml/depression_analysis_model') # path of the saved-model directory

# **let's do prediction with input given by user**

I think this part could live in a separate notebook, because here we only predict whether a given input is a depressive text or not (roughly speaking, it is like the main class of a Java project).

* take the input
* tokenize the input as in the data pre-processing step for the model, but change the return type (so there is a small update)

preparing the input_text

In [None]:
def prepare_data(input_text, tokenizer):
  """Tokenize one raw text into the pair of tensors fed to the model.

  The text is truncated/padded to 256 positions, exactly as during training.

  Returns:
    A tuple (input_ids, attention_mask), each cast to tf.float64.
  """
  encoded = tokenizer.encode_plus (
    input_text,
    add_special_tokens = True,
    truncation = True,
    padding = 'max_length',
    max_length = 256,
    return_tensors = 'tf'
  )

  ids_as_float = tf.cast(encoded.input_ids, tf.float64)
  mask_as_float = tf.cast(encoded.attention_mask, tf.float64)
  return (ids_as_float, mask_as_float)

In [None]:
def prediction(text):
  """Classify `text` with the reloaded model and print a diagnosis and a recommendation.

  Args:
    text: raw user text to classify.

  Relies on the notebook-level `tokenizer` and `loaded_model`. Prints the result
  and returns nothing.
  """
  # Bug fix: tokenize the `text` argument. The original read the global `input_text`,
  # so the parameter was ignored and the function failed (or silently classified a
  # stale value) whenever it was called with anything other than that global.
  tokenized_input_text = prepare_data(text, tokenizer)
  probs = loaded_model.predict(tokenized_input_text)      # feed the tokenized input into the model
  output_index = np.argmax(probs[0])                      # index of the most probable class
  print()

  if output_index == 0:
    print('Diagnosis       : You are showing no symptoms of depression')
    print('Recommendation  : You are in a good condition. Keep it up! :)')
  elif output_index == 1:
    print('Diagnosis       : You are showing mild symptoms of depression.')
    print('Recommendation  : You are recommended to under going a social engagement. Don\'t worry! You will be ok :)')
  else:
    print('Diagnosis       : You are showing severe symptoms of depression.')
    print('Recommendation  : You are recommended to seek medical treatment (e.g. consult a psychiatrist or psychologist). Hang in there! Everything will be fine :)')


In [None]:
# Read one free-text entry from the user and print the model's diagnosis for it.
input_text = input("Enter your sentiment: ")
prediction(input_text)