# Import librairies

In [None]:
!pip3 install --pre torch torchvision -f https://download.pytorch.org/whl/nightly/cu110/torch_nightly.html

In [1]:
import pandas as pd
import nltk
nltk.download('stopwords')
nltk.download('punkt')
from nltk.corpus import stopwords
import re
import pickle
from emot.emo_unicode import UNICODE_EMOJI # For emojis
from emot.emo_unicode import EMOTICONS_EMO # For EMOTICONS
import string
import torch
import torch.nn as nn
import torch.nn.functional as F

import random
import numpy as np

from nltk.tokenize import word_tokenize

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [2]:
from imblearn.over_sampling import SMOTE

In [55]:
if torch.cuda.is_available():       
    device = torch.device("cuda")
    print(f'There are {torch.cuda.device_count()} GPU(s) available.')
    print('Device name:', torch.cuda.get_device_name(0))

else:
    print('No GPU available, using the CPU instead.')
    device = torch.device("cpu")

There are 1 GPU(s) available.
Device name: NVIDIA GeForce RTX 3060 Ti


# Import data

In [28]:
df = pd.read_csv("../data/mbti_1.csv", encoding='latin-1')

In [31]:
df['posts'] = df['posts'].apply(lambda x: x[1:-2])

In [32]:
for lab in ["INFP","INFJ","INTP","INTJ","ENTP","ENFP"]:
    a = df[df["type"]==lab].sample(250)
    df = df[df["type"]!=lab]
    df = pd.concat([df,a])
    df.reset_index(drop=True, inplace=True)

# Pre-processing text

## Lowercase

In [34]:
df["posts"] = df["posts"].str.lower()

## Remove stopwords

In [35]:
def filter_preprocess(x):
    # Delete links
    x = " ".join(filter(lambda y: not(y.startswith("http")), x.split()))
    x = " ".join(filter(lambda y: not(y.endswith(".com")), x.split()))
    
    # Remove stop words
    x = " ".join(filter(lambda y: not(y in stopwords.words('english')), x.split()))
    
    x = " ".join(filter(lambda y: y[0]!="@", x.split()))
    
    return x

df["posts"] = df["posts"].apply(filter_preprocess)

## Replace emojis with text

https://www.oreilly.com/library/view/python-cookbook/0596001673/ch03s15.html

https://medium.com/geekculture/text-preprocessing-how-to-handle-emoji-emoticon-641bbfa6e9e7

In [36]:
# 'Emoji_Dict.p'- download link https://drive.google.com/open?id=1G1vIkkbqPBYPKHcQ8qy0G2zkoab2Qv4v


with open('Emoji_Dict.p', 'rb') as fp:
    Emoji_Dict = pickle.load(fp)
Emoji_Dict = {v: k for k, v in Emoji_Dict.items()}



# def convert_emojis_to_word(text):
#     regex = re.compile("|".join(map(re.escape, Emoji_Dict.keys(  ))))
#     # For each match, look up the corresponding value in the dictionary
#     text = regex.sub(lambda match: " ".join(Emoji_Dict[match.group(0).replace("_"," ").replace(",","").replace(":","").split()]), text)
#     text = re.sub("\s\s+" , " ", text)
#     # for emot in Emoji_Dict:
#     #     text = re.sub(emot,r' '+emot, text)
#     #     text = re.sub("\s\s+" , " ", text)
#     #     text = re.sub(r'('+emot+')', " ".join(Emoji_Dict[emot].replace("_"," ").replace(",","").replace(":","").split()), text)
#     return text

def convert_emojis_to_word(text):
    # print(text)
    regex = re.compile("|".join(map(re.escape, Emoji_Dict.keys(  ))))
    # print(regex.sub(lambda match: Emoji_Dict[match.group(0)], text))
    # print(regex.sub(lambda match: Emoji_Dict[match.group(0)].replace("_"," ").replace(","," ").replace(":"," "), text))
    text = regex.sub(lambda match: Emoji_Dict[match.group(0)].replace("_"," ").replace(","," ").replace(":"," "), text)
    # text = regex.sub(lambda match: " ".join(Emoji_Dict[match.group(0)].replace("_"," ").replace(","," ").replace(":"," ").split()), text)
    text = re.sub("\s\s+" , " ", text)
    return text

df["posts"] = df["posts"].apply(convert_emojis_to_word)

## Replace abbreviations

In [37]:

abbr_dict={
    "what's":"what is",
    "what're":"what are",
    "who's":"who is",
    "who're":"who are",
    "where's":"where is",
    "where're":"where are",
    "when's":"when is",
    "when're":"when are",
    "how's":"how is",
    "how're":"how are",

    "i'm":"i am",
    "we're":"we are",
    "you're":"you are",
    "they're":"they are",
    "it's":"it is",
    "he's":"he is",
    "she's":"she is",
    "that's":"that is",
    "there's":"there is",
    "there're":"there are",

    "i've":"i have",
    "we've":"we have",
    "you've":"you have",
    "they've":"they have",
    "who've":"who have",
    "would've":"would have",
    "not've":"not have",

    "i'll":"i will",
    "we'll":"we will",
    "you'll":"you will",
    "he'll":"he will",
    "she'll":"she will",
    "it'll":"it will",
    "they'll":"they will",

    "isn't":"is not",
    "wasn't":"was not",
    "aren't":"are not",
    "weren't":"were not",
    "can't":"can not",
    "couldn't":"could not",
    "don't":"do not",
    "didn't":"did not",
    "shouldn't":"should not",
    "wouldn't":"would not",
    "doesn't":"does not",
    "haven't":"have not",
    "hasn't":"has not",
    "hadn't":"had not",
    "won't":"will not",
    
    'shoulda': 'should have',
    'gonna': 'going to', 
    'wanna': 'wanting to',
    "ain't": "is not",
    "wana": 'wanting to',
   
    
    'ngl': 'not going to lie',
    'idk': 'i do not know',
    'fyi': 'for your information',
    'tbh': 'to be honest',
    'asap': 'as soon as possible',
    'bbiab': 'be back in a bit',
    'bbl': 'be back later',
    'bbs': 'be back soon',
    'bf': 'boyfriend',
    'bff': 'best friend forever',
    'brb': 'be right back',
    'cya': 'see you',
    'faq': 'frequently asked questions',
    'ftw': 'for the win',
    'g2g': 'got to go',
    'gf': 'girlfriend',
    'gr8': 'great',
    'hru': 'how are you',
    'ight': 'alright',
    'imo': 'in my opinion',
    'imy': 'i miss you',
    'irl': 'in real life',
    'istg': 'i swear',
    'lmao': 'laughing',
    'lmk': 'let me know',
    'lol': 'laughing',
    'nvd': 'nevermind',
    'noob': 'amateur',
    ' np ': ' no problem ',
    'ofc': 'of course',
    'omg': 'i can not believe it',
    'rn': 'right now',
    'ttyl': 'talk to you later',
    ' u ': ' you ',
    'wym': 'what do you mean ?',
    ' y ': ' why ',
    'yw': 'you are welcome'
    
}


def replace_abbreviations(text):
    regex = re.compile("|".join(map(re.escape, abbr_dict.keys(  ))))
    text = regex.sub(lambda match: abbr_dict[match.group(0)], text)
    return text
df["posts"] = df["posts"].apply(replace_abbreviations)

## Handle punctiation

In [38]:
punctiations = {}
for elem in string.punctuation:
    punctiations[elem] = f" {elem} "
punctiations["..."] = " ... "
def handle_punctiation(text):
    regex = re.compile("|".join(map(re.escape, punctiations.keys(  ))))
    text = regex.sub(lambda match: punctiations[match.group(0)], text)
    text = re.sub("\s\s+" , " ", text)
    return text
df["posts"] = df["posts"].apply(handle_punctiation)

## Handle label

In [39]:
df["extraversion"] = df["type"].apply(lambda x: x[0])
df["sensing"] = df["type"].apply(lambda x: x[1])
df["thinking"] = df["type"].apply(lambda x: x[2])
df["judging"] = df["type"].apply(lambda x: x[3])

In [40]:
df["extraversion"] = df["extraversion"].apply(lambda x: 0 if x=='I' else 1)
df["sensing"] = df["sensing"].apply(lambda x: 0 if x=='N' else 1)
df["thinking"] = df["thinking"].apply(lambda x: 0 if x=='F' else 1)
df["judging"] = df["judging"].apply(lambda x: 0 if x=='P' else 1)

In [41]:
df.head()

Unnamed: 0,type,posts,extraversion,sensing,thinking,judging
0,ENTJ,fired . | | | that is another silly misconcept...,1,0,1,1
1,ENFJ,: o | | | i went break months ago . together ...,1,0,0,1
2,ENTJ,"i am interested . lazy go research it , time -...",1,0,1,1
3,ENTJ,still going strong two year mark . made notice...,1,0,1,1
4,ISFP,paint without numbers | | | i ' d guess istp t...,0,1,0,0


In [42]:
# one_hot = pd.get_dummies(df['type'])
# # Drop column B as it is now encoded
# df = df.drop('type',axis = 1)
# # Join the encoded df
# df = df.join(one_hot)

# Tokenize and encode words

In [43]:
from tqdm import tqdm
from transformers import BertTokenizer

# Load the BERT tokenizer
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased', do_lower_case=True)

In [44]:
from transformers import BertTokenizer

# Load the BERT tokenizer
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased', do_lower_case=True)
# Create empty lists to store outputs
global input_ids
input_ids = []
global attention_masks
attention_masks = []
MAX_LEN = 100
# Create a function to tokenize a set of texts
def preprocessing_for_bert(out,x):
    """Perform required preprocessing steps for pretrained BERT.
    @param    df (dataframe): Array of texts to be processed.
    @return   input_ids (torch.Tensor): Tensor of token ids to be fed to a model.
    @return   attention_masks (torch.Tensor): Tensor of indices specifying which
                  tokens should be attended to by the model.
    """
    
    # `encode_plus` will:
    #    (1) Tokenize the sentence
    #    (2) Add the `[CLS]` and `[SEP]` token to the start and end
    #    (3) Truncate/Pad sentence to max length
    #    (4) Map tokens to their IDs
    #    (5) Create attention mask
    #    (6) Return a dictionary of outputs
    x = tokenizer.encode_plus(
        x,
        max_length=MAX_LEN,                  # Max length to truncate/pad
        pad_to_max_length=True, 
        add_special_tokens=True,   
        # pad_to_max_length=True,         # Pad sentence to max length
        #return_tensors='pt',           # Return PyTorch tensor
        return_attention_mask=True      # Return attention mask
        )  
    return x.get(out)
 

df["ids"] = df["posts"].apply(lambda x:  preprocessing_for_bert('input_ids',x))
df["masks"] = df["posts"].apply(lambda x:  preprocessing_for_bert('attention_mask',x))
# Convert lists to tensors
# input_ids = torch.tensor(input_ids)
# attention_masks = torch.tensor(attention_masks)

Truncation was not explicitly activated but `max_length` is provided a specific value, please use `truncation=True` to explicitly truncate examples to max length. Defaulting to 'longest_first' truncation strategy. If you encode pairs of sequences (GLUE-style) with the tokenizer you can select this strategy more precisely by providing a specific strategy to `truncation`.


In [45]:
df.head()

Unnamed: 0,type,posts,extraversion,sensing,thinking,judging,ids,masks
0,ENTJ,fired . | | | that is another silly misconcept...,1,0,1,1,"[101, 5045, 1012, 1064, 1064, 1064, 2008, 2003...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ..."
1,ENFJ,: o | | | i went break months ago . together ...,1,0,0,1,"[101, 1024, 1051, 1064, 1064, 1064, 1045, 2253...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ..."
2,ENTJ,"i am interested . lazy go research it , time -...",1,0,1,1,"[101, 1045, 2572, 4699, 1012, 13971, 2175, 247...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ..."
3,ENTJ,still going strong two year mark . made notice...,1,0,1,1,"[101, 2145, 2183, 2844, 2048, 2095, 2928, 1012...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ..."
4,ISFP,paint without numbers | | | i ' d guess istp t...,0,1,0,0,"[101, 6773, 2302, 3616, 1064, 1064, 1064, 1045...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ..."


# Train/test split

In [46]:
from sklearn.model_selection import train_test_split

# X = df["text"]
# y = df["target"]
train_inputs, val_inputs, train_masks, val_masks,  y_train, y_val = train_test_split(df["ids"],df["masks"], df.drop(columns=["posts","ids","masks","type"]), test_size=0.1, random_state=202)
# X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=202)

# Dataset to feed to NN

In [47]:
from torch.utils.data import TensorDataset, DataLoader, RandomSampler, SequentialSampler
# Convert other data types to torch.Tensor
train_labels = torch.tensor(y_train.to_numpy()).float()
val_labels = torch.tensor(y_val.to_numpy()).float()


# For fine-tuning BERT, the authors recommend a batch size of 16 or 32.
batch_size = 32

# Create the DataLoader for our training set
train_data = TensorDataset( torch.tensor(np.array(train_inputs.to_list())), torch.tensor(np.array(train_masks.to_list())), train_labels)
train_sampler = RandomSampler(train_data)
train_dataloader = DataLoader(train_data, sampler=train_sampler, batch_size=batch_size)

# Create the DataLoader for our validation set
val_data = TensorDataset(torch.tensor(np.array(val_inputs.to_list())), torch.tensor(np.array(val_masks.to_list())), val_labels)
val_sampler = SequentialSampler(val_data)
val_dataloader = DataLoader(val_data, sampler=val_sampler, batch_size=batch_size)

In [48]:
val_labels.shape

torch.Size([312, 4])

In [49]:
train_labels.shape

torch.Size([2806, 4])

# Model

In [50]:
from transformers import BertModel
# Create the BertClassfier class
class BertClassifier(nn.Module):
    """
    Bert Model for Classification Tasks.
    """
    def __init__(self, freeze_bert=False):
        """
        @param    bert: a BertModel object
        @param    classifier: a torch.nn.Module classifier
        @param    freeze_bert (bool): Set `False` to fine-tune the BERT model
        """
        super(BertClassifier, self).__init__()
        # Specify hidden size of BERT, hidden size of our classifier, and number of labels
        D_in, H, D_out = 768, 50, 4

        # Instantiate BERT model
        self.bert = BertModel.from_pretrained('bert-base-uncased')

        # Instantiate an one-layer feed-forward classifier
        self.classifier = nn.Sequential(
            nn.Linear(D_in, 3*H),
            nn.ReLU(),
            nn.Linear(3*H, 2*H),
            nn.Dropout(0.3),
            nn.ReLU(),
            nn.Linear(2*H, H),
            nn.ReLU(),
            nn.Linear(H, D_out),
            nn.Sigmoid()
        )

        # Freeze the BERT model
        if freeze_bert:
            for param in self.bert.parameters():
                param.requires_grad = False
        
    def forward(self, input_ids, attention_mask):
        """
        Feed input to BERT and the classifier to compute logits.
        @param    input_ids (torch.Tensor): an input tensor with shape (batch_size,
                      max_length)
        @param    attention_mask (torch.Tensor): a tensor that hold attention mask
                      information with shape (batch_size, max_length)
        @return   logits (torch.Tensor): an output tensor with shape (batch_size,
                      num_labels)
        """
        # Feed input to BERT
        outputs = self.bert(input_ids=input_ids,
                            attention_mask=attention_mask)
        
        # Extract the last hidden state of the token `[CLS]` for classification task
        last_hidden_state_cls = outputs[0][:, 0, :]

        # Feed input to classifier to compute logits
        logits = self.classifier(last_hidden_state_cls)

        return logits

In [51]:
from transformers import AdamW, get_linear_schedule_with_warmup

def initialize_model(epochs=4):
    """Initialize the Bert Classifier, the optimizer and the learning rate scheduler.
    """
    # Instantiate Bert Classifier
    bert_classifier = BertClassifier(freeze_bert=False)

    # Tell PyTorch to run the model on GPU
    bert_classifier.to(device)

    # Create the optimizer
    optimizer = AdamW(bert_classifier.parameters(),
                      lr=5e-5,    # Default learning rate
                      eps=1e-8    # Default epsilon value
                      )

    # Total number of training steps
    total_steps = len(train_dataloader) * epochs

    # Set up the learning rate scheduler
    scheduler = get_linear_schedule_with_warmup(optimizer,
                                                num_warmup_steps=0, # Default value
                                                num_training_steps=total_steps)
    return bert_classifier, optimizer, scheduler

In [52]:
import random
import time

# Specify loss function
loss_fn = torch.nn.BCELoss()

def set_seed(seed_value=42):
    """Set seed for reproducibility.
    """
    random.seed(seed_value)
    np.random.seed(seed_value)
    torch.manual_seed(seed_value)
    torch.cuda.manual_seed_all(seed_value)

def train(model, train_dataloader, val_dataloader=None, epochs=4, evaluation=False):
    """Train the BertClassifier model.
    """
    # Start training loop
    print("Start training...\n")
    for epoch_i in range(epochs):
        # =======================================
        #               Training
        # =======================================
        # Print the header of the result table
        print(f"{'Epoch':^7} | {'Batch':^7} | {'Train Loss':^12} | {'Val Loss':^10} | {'Val Acc':^9} | {'Elapsed':^9}")
        print("-"*70)

        # Measure the elapsed time of each epoch
        t0_epoch, t0_batch = time.time(), time.time()

        # Reset tracking variables at the beginning of each epoch
        total_loss, batch_loss, batch_counts = 0, 0, 0

        # Put the model into the training mode
        model.train()

        # For each batch of training data...
        for step, batch in enumerate(train_dataloader):
            batch_counts +=1
            # Load batch to GPU
            b_input_ids, b_attn_mask, b_labels = tuple(t.to(device) for t in batch)

            # Zero out any previously calculated gradients
            model.zero_grad()

            # Perform a forward pass. This will return logits.
            logits = model(b_input_ids, b_attn_mask)
            # print(logits,b_labels)
            # Compute loss and accumulate the loss values
            loss = loss_fn(logits, b_labels)
            batch_loss += loss.item()
            total_loss += loss.item()

            # Perform a backward pass to calculate gradients
            loss.backward()

            # Clip the norm of the gradients to 1.0 to prevent "exploding gradients"
            torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)

            # Update parameters and the learning rate
            optimizer.step()
            scheduler.step()

            # Print the loss values and time elapsed for every 20 batches
            if (step % 10 == 0 and step != 0) or (step == len(train_dataloader) - 1):
                # Calculate time elapsed for 20 batches
                time_elapsed = time.time() - t0_batch

                # Print training results
                print(f"{epoch_i + 1:^7} | {step:^7} | {batch_loss / batch_counts:^12.6f} | {'-':^10} | {'-':^9} | {time_elapsed:^9.2f}")

                # Reset batch tracking variables
                batch_loss, batch_counts = 0, 0
                t0_batch = time.time()

        # Calculate the average loss over the entire training data
        avg_train_loss = total_loss / len(train_dataloader)

        print("-"*70)
        # =======================================
        #               Evaluation
        # =======================================
        if evaluation == True:
            # After the completion of each training epoch, measure the model's performance
            # on our validation set.
            # 
            val_loss, val_accuracy = evaluate(model, val_dataloader)

            # Print performance over the entire training data
            time_elapsed = time.time() - t0_epoch
            #
            print(f"{epoch_i + 1:^7} | {'-':^7} | {avg_train_loss:^12.6f} | {val_loss:^10.6f} | {val_accuracy:^9.2f} | {time_elapsed:^9.2f}")
            print("-"*70)
        print("\n")
    
    print("Training complete!")


def evaluate(model, val_dataloader):
    """After the completion of each training epoch, measure the model's performance
    on our validation set.
    """
    # Put the model into the evaluation mode. The dropout layers are disabled during
    # the test time.
    model.eval()

    # Tracking variables
    val_accuracy = []
    val_loss = []

    # For each batch in our validation set...
    for batch in val_dataloader:
        # Load batch to GPU
        b_input_ids, b_attn_mask, b_labels = tuple(t.to(device) for t in batch)

        # Compute logits
        with torch.no_grad():
            logits = model(b_input_ids, b_attn_mask)

        # Compute loss
        loss = loss_fn(logits, b_labels)
        val_loss.append(loss.item())

        # Get the predictions
        # preds = nn.Softmax(logits)
        
        preds = torch.cuda.FloatTensor(np.array([[1 if elem>=0.6 else 0 for elem in rows]for rows in logits.cpu().numpy()]),device=device)
        # preds.to(torch.device('cuda:0'))
        # print(preds,b_labels)
        # Calculate the accuracy rate
        accuracy = 100*(preds == b_labels).cpu().numpy().mean()  
        val_accuracy.append(accuracy)

    # Compute the average accuracy and loss over the validation set.
    val_loss = np.mean(val_loss)
    val_accuracy = np.mean(val_accuracy)

    return val_loss , val_accuracy

In [53]:
# !pip3 install torch==1.7.1+cu110 torchvision==0.8.2+cu110 torchaudio===0.7.2 -f https://download.pytorch.org/whl/torch_stable.html

In [56]:
set_seed(42)    # Set seed for reproducibility
bert_classifier, optimizer, scheduler = initialize_model(epochs=2)
train(bert_classifier, train_dataloader,val_dataloader, epochs=5, evaluation=True)

Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertModel: ['cls.predictions.decoder.weight', 'cls.seq_relationship.weight', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.bias', 'cls.predictions.transform.LayerNorm.bias', 'cls.seq_relationship.bias']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


Start training...

 Epoch  |  Batch  |  Train Loss  |  Val Loss  |  Val Acc  |  Elapsed 
----------------------------------------------------------------------
   1    |   10    |   0.683420   |     -      |     -     |   3.03   
   1    |   20    |   0.676014   |     -      |     -     |   2.22   
   1    |   30    |   0.678745   |     -      |     -     |   2.26   
   1    |   40    |   0.678685   |     -      |     -     |   2.19   
   1    |   50    |   0.679940   |     -      |     -     |   2.10   
   1    |   60    |   0.674629   |     -      |     -     |   2.28   
   1    |   70    |   0.679295   |     -      |     -     |   2.29   
   1    |   80    |   0.673437   |     -      |     -     |   2.17   
   1    |   87    |   0.674700   |     -      |     -     |   1.47   
----------------------------------------------------------------------
   1    |    -    |   0.677818   |  0.677519  |   55.65   |   20.61  
---------------------------------------------------------------------

# Experimental (stemming)

In [65]:
a = df[["extraversion","sensing","thinking","judging"]].apply(lambda x: [x["extraversion"],x["sensing"],x["thinking"],x["judging"]], axis=1)

In [67]:
a.value_counts()

TypeError: unhashable type: 'list'

Exception ignored in: 'pandas._libs.index.IndexEngine._call_map_locations'
Traceback (most recent call last):
  File "pandas/_libs/hashtable_class_helper.pxi", line 5231, in pandas._libs.hashtable.PyObjectHashTable.map_locations
TypeError: unhashable type: 'list'


[0, 1, 1, 0]    337
[0, 1, 0, 0]    271
[0, 0, 0, 0]    250
[0, 0, 0, 1]    250
[0, 0, 1, 0]    250
[0, 0, 1, 1]    250
[1, 0, 1, 0]    250
[1, 0, 0, 0]    250
[1, 0, 1, 1]    231
[0, 1, 1, 1]    205
[1, 0, 0, 1]    190
[0, 1, 0, 1]    166
[1, 1, 1, 0]     89
[1, 1, 0, 0]     48
[1, 1, 0, 1]     42
[1, 1, 1, 1]     39
dtype: int64

In [57]:
df["extraversion"].value_counts()

0    1979
1    1139
Name: extraversion, dtype: int64

In [58]:
df["sensing"].value_counts()

0    1921
1    1197
Name: sensing, dtype: int64

In [59]:
df["thinking"].value_counts()

1    1651
0    1467
Name: thinking, dtype: int64

In [60]:
df["judging"].value_counts()

0    1745
1    1373
Name: judging, dtype: int64

In [None]:
from nltk.stem.snowball import SnowballStemmer

englishStemmer=SnowballStemmer("english")

In [None]:
englishStemmer.keys()

In [None]:
!pip3 install emot