In [50]:
!pip install transformers

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


In [51]:
import numpy as np
import random
import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader
import re
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from nltk.stem.porter import PorterStemmer
from transformers import AutoTokenizer, TFAutoModel
import pandas as pd
#nltk.download('punkt')
#nltk.download('all')

In [52]:
# Hugging Face checkpoint shared by the tokenizer and the encoder.
FINBERT_CHECKPOINT = "FinanceInc/finbert-pretrain"

# `from_pt=True` asks the TF model loader to convert PyTorch weights; the
# tokenizer has no weights to convert, so it is loaded without that flag.
finbert_tokenizer = AutoTokenizer.from_pretrained(FINBERT_CHECKPOINT)
finbert_model = TFAutoModel.from_pretrained(FINBERT_CHECKPOINT, from_pt=True)

Some weights of the PyTorch model were not used when initializing the TF 2.0 model TFBertModel: ['cls.predictions.decoder.weight', 'cls.seq_relationship.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.dense.bias', 'cls.seq_relationship.weight', 'cls.predictions.bias']
- This IS expected if you are initializing TFBertModel from a PyTorch model trained on another task or with another architecture (e.g. initializing a TFBertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing TFBertModel from a PyTorch model that you expect to be exactly identical (e.g. initializing a TFBertForSequenceClassification model from a BertForSequenceClassification model).
All the weights of TFBertModel were initialized from the PyTorch model.
If your task is similar to the task the model of the checkpoint was trained on, you can already

In [53]:
# Print the layer / parameter-count summary of the loaded FinBERT encoder.
finbert_model.summary()

Model: "tf_bert_model_2"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 bert (TFBertMainLayer)      multiple                  109751808 
                                                                 
Total params: 109,751,808
Trainable params: 109,751,808
Non-trainable params: 0
_________________________________________________________________


In [54]:
# WordNet-based lemmatizer instance.
lemmatizer = WordNetLemmatizer()

# NLTK's English stop-word list, held as a set.
stop_words = {word for word in stopwords.words('english')}

def preprocess_text(text):
    """Lower-case ``text``, strip URLs and blank out non-word characters.

    Steps, in order:
      1. lower-case the text;
      2. delete every run of non-whitespace characters that starts with
         ``http`` or ``www``;
      3. replace each remaining non-word character (anything ``\\W`` matches,
         which includes punctuation, apostrophes and whitespace) with a
         single space.

    Runs of spaces are not collapsed and the result is not stripped.

    Args:
        text: the string to clean.

    Returns:
        The cleaned, lower-case string.
    """
    # Lower-case first: the URL patterns below are written in lower case, so
    # links such as "HTTP://..." or "WWW...." would otherwise be missed and
    # then broken into stray words by the \W substitution.
    text = text.lower()
    # "http\S+" already covers "https..." links, so no separate alternative.
    text = re.sub(r'http\S+|www\S+', '', text)
    text = re.sub(r'\W', ' ', text)
    return text

# Sample input (a markdown table with embedded reddit.com links) used to
# eyeball what preprocess_text does to a real comment.
input_text = '\n**User Report**| | | |\n:--|:--|:--|:--\n**Total Submissions**|10|**First Seen In WSB**|2 years ago\n**Total Comments**|332|**Previous Best DD**|\n**Account Age**|3 years|[^scan ^comment ](https://www.reddit.com/message/compose/?to=VisualMod&subject=scan_comment&message=Replace%20this%20text%20with%20a%20comment%20ID%20(which%20looks%20like%20h26cq3k\\)%20to%20have%20the%20bot%20scan%20your%20comment%20and%20correct%20your%20first%20seen%20date.)|[^scan ^submission ](https://www.reddit.com/message/compose/?to=VisualMod&subject=scan_submission&message=Replace%20this%20text%20with%20a%20submission%20ID%20(which%20looks%20like%20h26cq3k\\)%20to%20have%20the%20bot%20scan%20your%20submission%20and%20correct%20your%20first%20seen%20date.)'
preprocessed_text = preprocess_text(input_text)
# Print the cleaned text for inspection.
print(preprocessed_text)

# Either the ASCII apostrophe or the typographic one (U+2019).
_APOSTROPHE = "['\u2019]"

# (compiled pattern, replacement) pairs, applied in this order. The two
# irregular forms come first so the generic "n't" rule does not turn them
# into "wo not" / "ca not".
_CONTRACTION_RULES = tuple(
    (re.compile(pattern, re.IGNORECASE), replacement)
    for pattern, replacement in (
        ("won" + _APOSTROPHE + "t", "will not"),
        ("can" + _APOSTROPHE + "t", "can not"),
        ("n" + _APOSTROPHE + "t", " not"),
        (_APOSTROPHE + "re", " are"),
        (_APOSTROPHE + "s", " is"),
        (_APOSTROPHE + "d", " would"),
        (_APOSTROPHE + "ll", " will"),
        (_APOSTROPHE + "t", " not"),
        (_APOSTROPHE + "ve", " have"),
        (_APOSTROPHE + "m", " am"),
    )
)


def decontractions(phrase):
    """Expand English contractions in ``phrase`` into their full form.

    ref: https://stackoverflow.com/questions/19790188/expanding-english-language-contractions-in-python/47091490#47091490

    Matching is case-insensitive and accepts both ``'`` and ``’`` as the
    apostrophe, so "Won't" and "can’t" are expanded as well. Replacements
    are always emitted in lower case. ``'s`` is always expanded to " is",
    including for possessives.

    Args:
        phrase: the text to expand.

    Returns:
        ``phrase`` with every matched contraction replaced.
    """
    for pattern, replacement in _CONTRACTION_RULES:
        phrase = pattern.sub(replacement, phrase)
    return phrase

   user report                            total submissions   10   first seen in wsb   2 years ago   total comments   332   previous best dd      account age   3 years   scan  comment     submission   


In [55]:
# Fixed seed so the shuffle below gives the same row order on every run.
SHUFFLE_SEED = 42

qna_df_reddit = pd.read_csv('qna_df_reddit.csv')
# sample(frac=1) returns all rows in random order; reset_index(drop=True)
# renumbers them 0..n-1 and discards the old index.
qna_df_reddit = qna_df_reddit.sample(frac=1, random_state=SHUFFLE_SEED).reset_index(drop=True)
qna_df_reddit

Unnamed: 0,questions,answers,tags
0,trader who made money with tight stoploss ther...,cut losses early take profits always simple c...,stockmarket
1,leaked email from microsoft ceo says salaried ...,we have exceeded all profit expectations tha...,stockmarket
2,does anyone know where sbf is now,i ve worked with a few people that have gotten...,wallstreetbets
3,december buys cvna 182k to 2 500 000,user report tota...,wallstreetbets
4,the dumbest reason in the world to buy a stoc...,i don t think those quotes are that conflictin...,stockmarket
...,...,...,...
41008,next week on stock market,it seems to be mimicking my total account value,wallstreetbets
41009,whatcha doin here huh,this is funny the guy with glass actually got...,stockmarket
41010,commerzbank ag down 12 credit suisse group a...,my portfolio is up 1 08 today so far also ...,stockmarket
41011,coca cola ko vs pepsi pep are either wo...,izbrisano,dividends


In [56]:
# Drop rows with a missing question or answer *before* casting to str:
# str(NaN) is the literal text "nan", which a later whitespace-only check
# would not recognise as empty, so such rows would otherwise survive as "nan".
qna_df_reddit = (
    qna_df_reddit
    .dropna(subset=['questions', 'answers'])
    .reset_index(drop=True)
    .map(str)  # cast every cell to str
)


In [57]:
# Turn empty / whitespace-only questions and answers into NaN ...
blank_text = re.compile(r'^\s*$')
for text_column in ('questions', 'answers'):
    qna_df_reddit[text_column] = qna_df_reddit[text_column].apply(
        lambda value: np.nan if blank_text.match(value) else value
    )

# ... then drop every row containing a NaN and renumber the index.
qna_df_reddit = qna_df_reddit.dropna().reset_index(drop=True)
qna_df_reddit

Unnamed: 0,questions,answers,tags
0,trader who made money with tight stoploss ther...,cut losses early take profits always simple c...,stockmarket
1,leaked email from microsoft ceo says salaried ...,we have exceeded all profit expectations tha...,stockmarket
2,does anyone know where sbf is now,i ve worked with a few people that have gotten...,wallstreetbets
3,december buys cvna 182k to 2 500 000,user report tota...,wallstreetbets
4,the dumbest reason in the world to buy a stoc...,i don t think those quotes are that conflictin...,stockmarket
...,...,...,...
40801,next week on stock market,it seems to be mimicking my total account value,wallstreetbets
40802,whatcha doin here huh,this is funny the guy with glass actually got...,stockmarket
40803,commerzbank ag down 12 credit suisse group a...,my portfolio is up 1 08 today so far also ...,stockmarket
40804,coca cola ko vs pepsi pep are either wo...,izbrisano,dividends


In [58]:
# (text column, prefix of the derived token / id columns)
TEXT_COLUMNS = (('questions', 'question'), ('answers', 'answer'))

# 1. Clean the text. Contractions are expanded BEFORE preprocess_text:
#    preprocess_text replaces every non-word character, apostrophes included,
#    with a space, after which decontractions has nothing left to match
#    ("don't" would end up as "don t" instead of "do not").
#    The text is lower-cased up front (preprocess_text lower-cases anyway) so
#    that contraction patterns written in lower case also apply to
#    capitalised words.
for text_col, _ in TEXT_COLUMNS:
    qna_df_reddit[text_col] = qna_df_reddit[text_col].apply(
        lambda text: preprocess_text(decontractions(text.lower()))
    )

# 2. Tokenize and wrap every sequence in the [CLS] ... [SEP] markers.
for text_col, prefix in TEXT_COLUMNS:
    qna_df_reddit[prefix + '_tokens'] = qna_df_reddit[text_col].apply(
        lambda text: ['[CLS]'] + finbert_tokenizer.tokenize(text) + ['[SEP]']
    )

# 3. Map the tokens to vocabulary ids.
for _, prefix in TEXT_COLUMNS:
    qna_df_reddit[prefix + '_ids'] = qna_df_reddit[prefix + '_tokens'].apply(
        finbert_tokenizer.convert_tokens_to_ids
    )


In [59]:
# Display the frame, now including the token and id columns.
qna_df_reddit

Unnamed: 0,questions,answers,tags,question_tokens,answer_tokens,question_ids,answer_ids
0,trader who made money with tight stoploss ther...,cut losses early take profits always simple c...,stockmarket,"[[CLS], trade, ##r, who, made, money, with, ti...","[[CLS], cut, losses, early, take, profits, alw...","[3, 582, 437, 412, 295, 1247, 20, 4763, 4001, ...","[3, 2726, 274, 634, 362, 1458, 1433, 4665, 499..."
1,leaked email from microsoft ceo says salaried ...,we have exceeded all profit expectations tha...,stockmarket,"[[CLS], leak, ##ed, email, from, microsoft, ce...","[[CLS], we, have, exceeded, all, profit, expec...","[3, 12641, 268, 4679, 23, 3466, 2303, 8276, 19...","[3, 13, 29, 2365, 69, 358, 746, 15, 304, 960, ..."
2,does anyone know where sbf is now,i ve worked with a few people that have gotten...,wallstreetbets,"[[CLS], does, anyone, know, where, sb, ##f, is...","[[CLS], i, ve, worked, with, a, few, people, t...","[3, 262, 7355, 695, 214, 7775, 1423, 17, 212, 4]","[3, 44, 829, 4080, 20, 11, 806, 1043, 15, 29, ..."
3,december buys cvna 182k to 2 500 000,user report tota...,wallstreetbets,"[[CLS], december, buys, cv, ##na, 182, ##k, to...","[[CLS], user, report, total, submissions, 6, f...","[3, 109, 7233, 7314, 2886, 17104, 994, 9, 513,...","[3, 2502, 125, 87, 13777, 1146, 78, 921, 10, 7..."
4,the dumbest reason in the world to buy a stoc...,i don t think those quotes are that conflictin...,stockmarket,"[[CLS], the, dum, ##bes, ##t, reason, in, the,...","[[CLS], i, don, t, think, those, quotes, are, ...","[3, 6, 18997, 25137, 463, 2040, 10, 6, 907, 9,...","[3, 44, 4950, 599, 135, 151, 12244, 21, 15, 24..."
...,...,...,...,...,...,...,...
40801,next week on stock market,it seems to be mimicking my total account value,wallstreetbets,"[[CLS], next, week, on, stock, market, [SEP]]","[[CLS], it, seems, to, be, mimi, ##cki, ##ng, ...","[3, 165, 1952, 19, 93, 52, 4]","[3, 41, 1998, 9, 25, 27272, 26497, 1071, 657, ..."
40802,whatcha doin here huh,this is funny the guy with glass actually got...,stockmarket,"[[CLS], what, ##cha, do, ##in, here, hu, ##h, ...","[[CLS], this, is, fun, ##ny, the, guy, with, g...","[3, 163, 9118, 123, 1419, 1094, 10374, 1078, 4]","[3, 26, 17, 15782, 5233, 6, 10037, 20, 4639, 8..."
40803,commerzbank ag down 12 credit suisse group a...,my portfolio is up 1 08 today so far also ...,stockmarket,"[[CLS], commer, ##zb, ##ank, ag, down, 12, cre...","[[CLS], my, portfolio, is, up, 1, 08, today, s...","[3, 27903, 21596, 8515, 2708, 269, 315, 97, 11...","[3, 657, 318, 17, 129, 428, 7694, 1163, 96, 12..."
40804,coca cola ko vs pepsi pep are either wo...,izbrisano,dividends,"[[CLS], coc, ##a, cola, ko, vs, pepsi, pep, ar...","[[CLS], i, ##zb, ##ris, ##ano, [SEP]]","[3, 16632, 363, 25424, 5338, 4364, 10987, 1327...","[3, 44, 21596, 6632, 10225, 4]"
