# T5 Translation Model

Summary:
* Replace slang text in reddit posts with de-slanged text
* Manually check replaced text to ensure posts make sense
* Train a T5 model on checked de-slanged text

In [80]:
from __future__ import print_function
import ipywidgets as widgets
from transformers import pipeline
from simpletransformers.t5 import T5Model, T5Args
import pandas as pd
import logging
import numpy as np
import torch
from tqdm.notebook import tqdm
from transformers import BertTokenizer
from torch.utils.data import TensorDataset
from transformers import BertForSequenceClassification
import json
import re
# Pull reddit data from reddit api
import requests
pd.options.display.max_colwidth = 1000
pd.set_option('display.max_rows', 100)

## Pull Reddit Data

In [2]:
# note that CLIENT_ID refers to 'personal use script' and SECRET_TOKEN to 'token'
auth = requests.auth.HTTPBasicAuth('pigKA_TKnDkXcatEGcbo8g', 'nawGKK2MfPtC6vKz8TjaNEnmYfAggA')

# here we pass our login method (password), username, and password
data = {'grant_type': 'password',
        'username': 'Katsuuu100',
        'password': 'Testing159753'}

# setup our header info, which gives reddit a brief description of our app
headers = {'User-Agent': 'MyBot/0.0.1'}

# send our request for an OAuth token
res = requests.post('https://www.reddit.com/api/v1/access_token',
                    auth=auth, data=data, headers=headers)

# convert response to JSON and pull access_token value
TOKEN = res.json()['access_token']

# add authorization to our headers dictionary
headers = {**headers, **{'Authorization': f"bearer {TOKEN}"}}

# while the token is valid (~2 hours) we just add headers=headers to our requests
requests.get('https://oauth.reddit.com/api/v1/me', headers=headers)

<Response [200]>

In [3]:
# Source: https://towardsdatascience.com/how-to-use-the-reddit-api-in-python-5e05ddfd1e5c

# Pull from 5 classes. 5 classes = 5 subreddits.

my_list_of_dictionaries = []
total = 0
url_list_check = [f"https://oauth.reddit.com/r/wallstreetbets/new/",
          f"https://oauth.reddit.com/r/teenagers/new/",
          f"https://oauth.reddit.com/r/copypasta/new/",
          f"https://oauth.reddit.com/r/genz/new/",
          f"https://oauth.reddit.com/r/unpopularopinion/new/",
          f"https://oauth.reddit.com/r/frat/new/"
          ]

for i in range(len(url_list_check)):
    
  #print(url_list_check[i])
  res_check = requests.get(url_list_check[i],
                    headers=headers,
                    params={"limit": "1"})
  
  # print(json.dumps(res_check.json()["data"]["children"], indent=4))
  name = res_check.json()["data"]["children"][0]["data"]["name"]
  page_count = 25
  
  for j in range(4):

    url_list = [f"https://oauth.reddit.com/r/wallstreetbets/new/?count={page_count}&after={name}",
            f"https://oauth.reddit.com/r/teenagers/new/?count={page_count}&after={name}",
            f"https://oauth.reddit.com/r/copypasta/new/?count={page_count}&after={name}",
            f"https://oauth.reddit.com/r/genz/new/?count={page_count}&after={name}",
            f"https://oauth.reddit.com/r/unpopularopinion/new/?count={page_count}&after={name}",
            f"https://oauth.reddit.com/r/frat/new/?count={page_count}&after={name}"
            ]
    
    print("Page Count:", page_count)
    print("Name:", name)
    print("Url:", url_list[i])
    
    res = requests.get(url_list[i],
                    headers=headers)
                    # params={"limit": "100"})

    #print(json.dumps(res.json(), indent=4, sort_keys=False))

    reddit_dictionary = res.json()
    # print(json.dumps(reddit_dictionary["data"]["children"][0], indent=4, sort_keys=False))
    # print(len(reddit_dictionary["data"]["children"]))
    for k in range(len(reddit_dictionary["data"]["children"])):
      my_dictionary = {}
      my_dictionary["subreddit"] = reddit_dictionary["data"]["children"][k]["data"]["subreddit"]
      my_dictionary["text"] = reddit_dictionary["data"]["children"][k]["data"]["selftext"]
      my_dictionary["title"] = reddit_dictionary["data"]["children"][k]["data"]["title"]
      my_dictionary["url"] = reddit_dictionary["data"]["children"][k]["data"]["url"]
      # print(reddit_dictionary["data"]["children"][k]["data"]["subreddit"])
      # print(reddit_dictionary["data"]["children"][k]["data"]["selftext"])
      # print(reddit_dictionary["data"]["children"][k]["data"]["url"])
      my_list_of_dictionaries.append(my_dictionary)
      total += 1
      name = reddit_dictionary["data"]["children"][k]["data"]["name"]
    # print(json.dumps(my_list_of_dictionaries, indent=4, sort_keys=False))
    
    page_count += 25

print("Total gathered:", total)

Page Count: 25
Name: t3_tqmo3g
Url: https://oauth.reddit.com/r/wallstreetbets/new/?count=25&after=t3_tqmo3g
Page Count: 50
Name: t3_tqk09e
Url: https://oauth.reddit.com/r/wallstreetbets/new/?count=50&after=t3_tqk09e
Page Count: 75
Name: t3_tqijrv
Url: https://oauth.reddit.com/r/wallstreetbets/new/?count=75&after=t3_tqijrv
Page Count: 100
Name: t3_tqgpun
Url: https://oauth.reddit.com/r/wallstreetbets/new/?count=100&after=t3_tqgpun
Page Count: 25
Name: t3_tqmyw3
Url: https://oauth.reddit.com/r/teenagers/new/?count=25&after=t3_tqmyw3
Page Count: 50
Name: t3_tqmq7o
Url: https://oauth.reddit.com/r/teenagers/new/?count=50&after=t3_tqmq7o
Page Count: 75
Name: t3_tqmjqb
Url: https://oauth.reddit.com/r/teenagers/new/?count=75&after=t3_tqmjqb
Page Count: 100
Name: t3_tqmbxq
Url: https://oauth.reddit.com/r/teenagers/new/?count=100&after=t3_tqmbxq
Page Count: 25
Name: t3_tqmqtv
Url: https://oauth.reddit.com/r/copypasta/new/?count=25&after=t3_tqmqtv
Page Count: 50
Name: t3_tqj4ab
Url: https://oauth

In [4]:
# Source: https://github.com/susanli2016/NLP-with-Python/blob/master/Text_Classification_With_BERT.ipynb
# Convert list of dictionaries into pandas df
df = pd.DataFrame(my_list_of_dictionaries)
df.head()

Unnamed: 0,subreddit,text,title,url
0,wallstreetbets,"My ""aggressive"" portfolio that my automated in...",Banks should have an automated YOLO portfolio,https://www.reddit.com/r/wallstreetbets/commen...
1,wallstreetbets,,Do I belong?!?,https://i.redd.it/qxbg306fi7q81.jpg
2,wallstreetbets,Preface: I don’t know a ton about options &amp...,What’s actually driving GME?,https://www.reddit.com/r/wallstreetbets/commen...
3,wallstreetbets,,Stopped trading options for a few months. Put ...,https://www.reddit.com/gallery/tqlxre
4,wallstreetbets,,I do not know when to pull out.,https://i.redd.it/vtrjq2aih7q81.jpg


In [5]:
df['subreddit'].value_counts()

GenZ                100
wallstreetbets      100
Frat                100
teenagers           100
unpopularopinion    100
copypasta           100
Name: subreddit, dtype: int64

In [7]:
possible_labels = df.subreddit.unique()

label_dict = {}
for index, possible_label in enumerate(possible_labels):
    label_dict[possible_label] = index
label_dict

{'wallstreetbets': 0,
 'teenagers': 1,
 'copypasta': 2,
 'GenZ': 3,
 'unpopularopinion': 4,
 'Frat': 5}

In [8]:
df['label'] = df.subreddit.replace(label_dict)
df.head()

Unnamed: 0,subreddit,text,title,url,label
0,wallstreetbets,"My ""aggressive"" portfolio that my automated in...",Banks should have an automated YOLO portfolio,https://www.reddit.com/r/wallstreetbets/commen...,0
1,wallstreetbets,,Do I belong?!?,https://i.redd.it/qxbg306fi7q81.jpg,0
2,wallstreetbets,Preface: I don’t know a ton about options &amp...,What’s actually driving GME?,https://www.reddit.com/r/wallstreetbets/commen...,0
3,wallstreetbets,,Stopped trading options for a few months. Put ...,https://www.reddit.com/gallery/tqlxre,0
4,wallstreetbets,,I do not know when to pull out.,https://i.redd.it/vtrjq2aih7q81.jpg,0


In [21]:
#combine title & text to make one column
df['title_and_text'] = df['title'] + ' ' +  df['text']
df['title_and_text'].head()

0    Banks should have an automated YOLO portfolio ...
1                                      Do I belong?!? 
2    What’s actually driving GME? Preface: I don’t ...
3    Stopped trading options for a few months. Put ...
4                     I do not know when to pull out. 
Name: title_and_text, dtype: object

In [10]:
from sklearn.model_selection import train_test_split

X_train, X_val, y_train, y_val = train_test_split(df.index.values, 
                                                  df.label.values, 
                                                  test_size=0.15, 
                                                  random_state=42, 
                                                  stratify=df.label.values)

In [11]:
df['data_type'] = ['not_set']*df.shape[0]

df.loc[X_train, 'data_type'] = 'train'
df.loc[X_val, 'data_type'] = 'val'

df.groupby(['subreddit', 'label', 'data_type']).count()

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,text,title,url
subreddit,label,data_type,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
Frat,5,train,85,85,85
Frat,5,val,15,15,15
GenZ,3,train,85,85,85
GenZ,3,val,15,15,15
copypasta,2,train,85,85,85
copypasta,2,val,15,15,15
teenagers,1,train,85,85,85
teenagers,1,val,15,15,15
unpopularopinion,4,train,85,85,85
unpopularopinion,4,val,15,15,15


## Load Slangit Data

Slangit is a direct translation of slang data

In [12]:
slang = pd.read_csv('slangit.csv')
slang.head(20)

Unnamed: 0,Slang Term,Meaning
0,*$,Starbucks
1,*$$,Starbucks
2,2,Two cents
3,0773H,Hello
4,10m,Ten man
5,10q,Thank you
6,10x,Thanks
7,1174,Meet in person at
8,121,One to one
9,1337,Leet


In [46]:
slangit_dict = slang.set_index('Slang Term').to_dict()
slangit_dict = slangit_dict['Meaning']

keys_values = d.items()
slangit_dict = {str(key): str(value) for key, value in slangit_dict}

ValueError: too many values to unpack (expected 2)

In [47]:
keys_values = slangit_dict.items()
slangit_dict = {str(key): str(value) for key, value in keys_values}

In [48]:
for i in slangit_dict:
    print(i)

*$
*$$
2
0773H
10m
10q
10x
1174
121
1337
143
1432
143444
1437
14AA41
182
187
1aat
1B
1ce
1D
1F
1i
1L
1NAM
1TG
1up
2.5D
20
22
224
24/7
241
25m
262
26y4u
2B
2B@
2BZ4UQT
2C
2C4U
2day
2DFM
2EZ
2FA
2FB
2G2BT
2H2H
2k
2l8
2M2H
2M2M
2MFM
2MI
2moro
2morrow
2moz
2mr
2mrw
2nite
2nt
2nte
2TG
2U2
2zda
303
360-noscope
3arc
3B
3EB
3G
3q
3rzda
3st
3sum
3TG
4
404
411
420
44
45
458
459
46
4AO
4AYN
4COL
4EAE
4eva
4ever
4G
4get
4gm
4got
4MTK
4n
4nr
4OTS
4q
4rl
4sale
4sho
4U
4U2
4ward
4WD
50/50
511
5150
53x
555
5e
5FDP
5FS
5G
5Head
5SOS
6y
7DAW
8
831
8TB
9
99
@@-o
@TEOTD
^5
{}
A
a&f
A/S/L
A/S/L/P
A2D
A2T
A3
A7X
AA
aa
AAA
AAF
AAK
AAMOF
AAP
AAR
aar8
Aarping
AAS
AATK
AAVE
AAWC
AAYF
AB
Abandonware
abbrev
ABCP
ABD
Abominable snowman
Abow
ABP
abt
abt2
AC
ACC
acc
Accountant
ACD
Ace
ack
ACL
ACLS
ACNH
ACO
ACV
AD
AD&D
ADAD
ADAM
ADBB
ADC
add
ADD
adds
addy
ADHD
ADIH
ADIP
ADM
admin
ADN
Adorbs
Adorkable
ADP
ADPIC
ADS
Adulting
AE
AEAP
AED
AF
AFAIAA
AFAIAC
AFAIC
AFAICS
AFAICT
AFAIK
AFAIR
AFAP
AFAYC
AFB
AFC
AFDA
AFDB
AFHV


Smol
Smooch
SMP
SMS
SMTW
SMUN
Smurf
SN
Snacc
Snaccident
Snack
SNAFU
Snake draft
Snap
Snapback
Snapback culture
Snapchatter
Snapstreak
Snatched
Snatched my weave
Sneakerhead
SNERT
SNES
SNF
SNG
SNH
Snitches get stitches
SNL
SNMP
Snoozefeed
Snoozle
Snowflake
Snowman
sntnc
SO
SO8
SOA
SOAB
SOAD
Soaps
SOB
Soccer mom
Social distancing
Social engineering
Sock puppet account
SoD
Soft
Soft parenting
Softie
SOH
SOHF
sok
SOL
Solange
Somepony
SOMSW
SOMY
SOOF
SOOI
SOP
Sorority squat
sorta
SOS
SOSG
SOSO
SOTA
SOTD
SOTU
Southpaw
SOW
Soy boy
soz
sp
SP
Spacing
Spam
Spammer
Spammy
Sparrowface
spd
SPD
Speak truth to power
spec
Speedling
Speedrun
Speedrunner
Spicy
Spill the T
Spill the tea
spim
Spit
spk
spkr
Spleef
Spoopy
SPPU
Sprang break
Spreading the salt
SPROS
SPST
Squad
Squad goals
Squidward
Squinching
Squirrel
Squishy
SRO
srry
srs
srsly
Sry
SS
ss
SSA
SSD
SSDD
SSI
SSIA
SSL
SSN
SSP
ST
ST2M
stache
Stack
stahp
Stan
Stanning
STAR
Stashing
stats
Stay in your lane
Stay woke
Staycation
STBY
STD
Stee
Steeze
St

In [None]:
key, value = next((str(k), str(v)) for k, v in d.items())
print(type(key))
print(type(value))

In [50]:
def slang_lookup(text, dictionary):
    
    pattern = re.compile(r'(?<!\w)(' + '|'.join(re.escape(key) for key in slangit_dict.keys()) + r')(?!\w)')
    result = pattern.sub(lambda x: dictionary[x.group()], text)

    return result

In [51]:
my_text = 'I watched the UNC game at a bar b/c YOLO, FTW'

print(slang_lookup(my_text, slangit_dict))

I watched the UNC game at a bar Be/See You only live once, For the win


## Apply Slangit regex replace to reddit data

In [22]:
df_train = df[df.index.isin(X_train)]
len(df_train)

510

In [54]:
df_train['title_and_text_deslanged'] = df['title_and_text'].apply(lambda x: slang_lookup(x, slangit_dict))
df_train['title_and_text_deslanged'].head()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_train['title_and_text_deslanged'] = df['title_and_text'].apply(lambda x: slang_lookup(x, slangit_dict))


0    Banks should have an automated You only live o...
1                                      Do I belong?!? 
3    Stopped trading options for a few months. Put ...
4                     I do not know when to pull out. 
5    last time gme/amc went Underpowered, nok follo...
Name: title_and_text_deslanged, dtype: object

In [59]:
df_train['title_and_text_deslanged'].head(30)

0                                                                                                                                                                         Banks should have an automated You only live once portfolio My "aggressive" portfolio that my automated investments are softer than the mattress that my wife'Sarcasm boyfriend sleeps on with her. \n\nBanks should have offer a You only live once portfolio for single Intelligence quotient degenerates like me that go for moonshots.  They will look for tickers like GME and AMC and try to buy low and sell high, or even buy some FD calls that will either moon or go lower than the depth of Cramer'Sarcasm basement. \n\nAll the bank needs to do is make full disclosures and educate the "investor" that they belong on the shortbus to invest in this, and have a 30 day wait period so that the paper hands can back out. \n\nI'Di** rather risk losing all my money in this casino than to see my "aggressive" portfolio beat S&To be loud an

In [65]:
#df_train['same'] = df_train['title_and_text'].equals(df_train['title_and_text_deslanged'])
df_train['same'] = np.where(df_train['title_and_text'] == df_train['title_and_text_deslanged'] , True, False)
df_train['same'].head()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_train['same'] = np.where(df_train['title_and_text'] == df_train['title_and_text_deslanged'] , True, False)


0    False
1     True
3    False
4     True
5    False
Name: same, dtype: bool

In [66]:
df_train['same'].value_counts()

False    360
True     150
Name: same, dtype: int64

## Check regex deslang, correct examples where it is deslanged incorrectly

In [67]:
deslanged = df_train[['title_and_text', 'title_and_text_deslanged']][df_train['same'] == False]

In [82]:
deslanged[:100]

Unnamed: 0,title_and_text,title_and_text_deslanged
0,"Banks should have an automated YOLO portfolio My ""aggressive"" portfolio that my automated investments are softer than the mattress that my wife's boyfriend sleeps on with her. \n\nBanks should have offer a YOLO portfolio for single IQ degenerates like me that go for moonshots. They will look for tickers like GME and AMC and try to buy low and sell high, or even buy some FD calls that will either moon or go lower than the depth of Cramer's basement. \n\nAll the bank needs to do is make full disclosures and educate the ""investor"" that they belong on the shortbus to invest in this, and have a 30 day wait period so that the paper hands can back out. \n\nI'd rather risk losing all my money in this casino than to see my ""aggressive"" portfolio beat S&amp;P 500 by 0.1% in a bull run year.","Banks should have an automated You only live once portfolio My ""aggressive"" portfolio that my automated investments are softer than the mattress that my wife'Sarcasm boyfriend sleeps on with her. \n\nBanks should have offer a You only live once portfolio for single Intelligence quotient degenerates like me that go for moonshots. They will look for tickers like GME and AMC and try to buy low and sell high, or even buy some FD calls that will either moon or go lower than the depth of Cramer'Sarcasm basement. \n\nAll the bank needs to do is make full disclosures and educate the ""investor"" that they belong on the shortbus to invest in this, and have a 30 day wait period so that the paper hands can back out. \n\nI'Di** rather risk losing all my money in this casino than to see my ""aggressive"" portfolio beat S&To be loud and angry;Pitcher 500 by 0.1% in a bull run year."
3,"Stopped trading options for a few months. Put $500 in a WeBull account before GME earnings and 100x'd to $50,000 in 10 days. Can't wait to DRS after exercising these contracts.","Stopped trading options for a few months. Put $500 in a WeBull account before GME earnings and 100x'Di** to $50,000 in 10 days. Can't wait to DRS after exercising these contracts."
5,"last time gme/amc went up, nok followed","last time gme/amc went Underpowered, nok followed"
6,"Rite Aid isn't going Bankrupt. Rite Aid isn't going Bankrupt. They have $150 million in cash on hand. They are generating over $300 million in Free Cash Flow right now and they have another $100 million in Free Cash Flow potential. RAD has a $150 million line of credit. RAD is worth $11 Billion all day everyday. Shorts remain Trapped. Earnings will come and like any company that has been investing in growth and paying down debt. RAD's revenue growth from 2019 directly after they sold there poorest preforming stores to WBA. Until Q4 2022 revenue projected. \n\nRevenue 2019 [$21,639,557]\nRevenue 2020 [$21,928,393]\nRevenue 2021 [$24,043,240]\nRevenue 2022 [$24,419,721]\n\nTotal 4 year increase in revenue. Yes a 15% growth in revenue. \n\n$2,780,164\n\nSo why is RAD's short interest 30% of their float with only 55 million shares outstanding?","Rite Aid isn't going Bankrupt. Rite Aid isn't going Bankrupt. They have $150 million in cash on hand. They are generating over $300 million in Free Cash Flow right now and they have another $100 million in Free Cash Flow potential. RAD has a $150 million line of credit. RAD is worth $11 Billion all day everyday. Shorts remain Trapped. Earnings will come and like any company that has been investing in growth and paying down debt. RAD'Sarcasm revenue growth from 2019 directly after they sold there poorest preforming stores to WBA. Until Fourth quarter 2022 revenue projected. \n\nRevenue 2019 [$21,639,557]\nRevenue 2020 [$21,928,393]\nRevenue 2021 [$24,043,240]\nRevenue 2022 [$24,419,721]\n\nTotal For year increase in revenue. Yes a 15% growth in revenue. \n\n$To,780,164\n\nSo why is RAD'Sarcasm short interest 30% of their float with only 55 million shares outstanding?"
7,Yawn… let me know when it hits 500 I guess . Preferably by Wednesday. YOLO with updated Positions,Yawn… let me know when it hits 500 I guess . Preferably by Wednesday. You only live once with updated Positions
10,Tesla rake on 4/1 1100c. 9.4k -&gt; 26k. Thank you Daddy Elon Musk for pumping my garbage calls with a rogue 6am tweet❤️❤️,Tesla rake on For/1 1100c. Parent in room.4k -&gt; 26k. Thank you Daddy Elon Musk for pumping my garbage calls with a rogue 6am tweet❤️❤️
11,"How are so many of you fucks still in GME/AMC/TSLA Like, when do you get out? After over week straight of really really good green days, when do you decide to get out? \n\nWhy do you put 10, 20, or 50k into weeklies in these stocks when it seems extremely unlikely that they'll keep going up? Like, I have 10k in my account, I made some bucks on AMZN today, but compared to the 10x baggers some of you are making, I made peanuts today. \n\nAre you really that rich that 20k doesn't matter to you but letting it ride in GME weeklies for a chance at 400k does matter? Or are you really working shit pay jobs, and this is your yearly yolo after pinching your pennies? \n\nLike, goddamn.","How are so many of you fucks still in GME/AMC/TSLA Like, when do you get out? After over week straight of really really good green days, when do you decide to get out? \n\nWhy do you put 10, Location, or 50k into weeklies in these stocks when it seems extremely unlikely that they'll keep going Underpowered? Like, I have 10k in my account, I made some bucks on AMZN today, but compared to the Thanks baggers some of you are making, I made peanuts today. \n\nAre you really that rich that 20k doesn't matter to you but letting it ride in GME weeklies for a chance at 400k does matter? Or are you really working shit pay jobs, and this is your yearly yolo after pinching your pennies? \n\nLike, goddamn."
18,"Just another gains porn post to add to the masses. I don't know who that Ken Griffin guy is, but I bet his ass is burning and I couldn't be happier. GME, AMC, BBBY","Just another gains porn post to Address to the masses. I don't know who that Ken Griffin guy is, but I bet his ass is burning and I couldn't be happier. GME, AMC, BBBY"
19,"Built a trading bot using this subreddit as data https://imgur.com/a/Kj01TQG I built a trading bot that used portfolio optimization on 2020 (prior to the GME explosion). I believe the bot performed well, what does this say about wallstreetbets. Does this mean that wall street bets is truly representative of the market? Or is this just a fluke that is a byproduct of my algorithm? I believe that the 2020 data would be representative of the market today, I intentionally didn't include the January 2021, because it may have been a fluke and isn't representative of future data. However, that may not be the case, in that case my bot would perform even better in that context. Another problem is with the stocks chosen in my portfolio, since I picked the most popular stocks out of wallstreetbets, it may lead to some bias as wallstreetbets may have only been talking about the stock because it was growing. Does this type of growth make sense for a bot using wallstreetbets for natural language ...","Built a trading Bottom lane using this subreddit as data https://imgur.com/a/Kj01TQG I built a trading Bottom lane that used portfolio optimization on 2020 (prior to the GME explosion). I believe the Bottom lane performed well, what does this say about wallstreetbets. Does this mean that wall street bets is truly representative of the market? Or is this just a fluke that is a byproduct of my algorithm? I believe that the 2020 data would be representative of the market today, I intentionally didn't include the January 2021, because it may have been a fluke and isn't representative of future data. However, that may not be the case, in that case my Bottom lane would perform even better in that context. Another problem is with the stocks chosen in my portfolio, since I picked the most popular stocks out of wallstreetbets, it may lead to some bias as wallstreetbets may have only been talking about the stock because it was growing. Does this type of growth make sense for a Bottom lane us..."
24,"$GME calls. $500 to $20,000 in a week.","$GME calls. $500 to $Location,000 in a week."


## T5 Translation

In [None]:
import logging

import pandas as pd
from simpletransformers.t5 import T5Model, T5Args

logging.basicConfig(level=logging.INFO)
transformers_logger = logging.getLogger("transformers")
transformers_logger.setLevel(logging.WARNING)


train_data = [
    ["binary classification", "Anakin was Luke's father" , "1"],
    ["binary classification", "Luke was a Sith Lord" , "0"],
    ["generate question", "Star Wars is an American epic space-opera media franchise created by George Lucas, which began with the eponymous 1977 film and quickly became a worldwide pop-culture phenomenon", "Who created the Star Wars franchise?"],
    ["generate question", "Anakin was Luke's father" , "Who was Luke's father?"],
]
train_df = pd.DataFrame(train_data)
train_df.columns = ["prefix", "input_text", "target_text"]

eval_data = [
    ["binary classification", "Leia was Luke's sister" , "1"],
    ["binary classification", "Han was a Sith Lord" , "0"],
    ["generate question", "In 2020, the Star Wars franchise's total value was estimated at US$70 billion, and it is currently the fifth-highest-grossing media franchise of all time.", "What is the total value of the Star Wars franchise?"],
    ["generate question", "Leia was Luke's sister" , "Who was Luke's sister?"],
]
eval_df = pd.DataFrame(eval_data)
eval_df.columns = ["prefix", "input_text", "target_text"]

model_args = T5Args()
model_args.num_train_epochs = 200
model_args.no_save = True
model_args.evaluate_generated_text = True
model_args.evaluate_during_training = True
model_args.evaluate_during_training_verbose = True

model = T5Model("t5", "t5-base", args=model_args, use_cuda=False)


def count_matches(labels, preds):
    print(labels)
    print(preds)
    return sum([1 if label == pred else 0 for label, pred in zip(labels, preds)])


model.train_model(train_df, eval_data=eval_df, matches=count_matches)

print(model.eval_model(eval_df, matches=count_matches))

INFO:filelock:Lock 140060154113184 acquired on /home/apschlissel/.cache/huggingface/transformers/91e9fe874e06c44883b535d6c950b8b89d6eaa3298d8e7fb3b2c78039e9f8b7b.66b9637a52aa11e9285cdd6e668cc0df14b3bcf0b6674cf3ba5353c542649637.lock


Downloading:   0%|          | 0.00/1.17k [00:00<?, ?B/s]

INFO:filelock:Lock 140060154113184 released on /home/apschlissel/.cache/huggingface/transformers/91e9fe874e06c44883b535d6c950b8b89d6eaa3298d8e7fb3b2c78039e9f8b7b.66b9637a52aa11e9285cdd6e668cc0df14b3bcf0b6674cf3ba5353c542649637.lock
INFO:filelock:Lock 140060154113568 acquired on /home/apschlissel/.cache/huggingface/transformers/ab4e948915b067f5cb6e5105f6f85044fd717b133f43240db67899a8fc7b29a2.26934c75adf19ceac3c268b721ba353356b7609c45f5627550326f275a2163b4.lock


Downloading:   0%|          | 0.00/850M [00:00<?, ?B/s]

INFO:filelock:Lock 140060154113568 released on /home/apschlissel/.cache/huggingface/transformers/ab4e948915b067f5cb6e5105f6f85044fd717b133f43240db67899a8fc7b29a2.26934c75adf19ceac3c268b721ba353356b7609c45f5627550326f275a2163b4.lock
INFO:filelock:Lock 140060153121904 acquired on /home/apschlissel/.cache/huggingface/transformers/684a47ca6257e4ca71f0037771464c5b323e945fbc58697d2fad8a7dd1a2f8ba.3b69006860e7b5d0a63ffdddc01ddcd6b7c318a6f4fd793596552c741734c62d.lock


Downloading:   0%|          | 0.00/773k [00:00<?, ?B/s]

INFO:filelock:Lock 140060153121904 released on /home/apschlissel/.cache/huggingface/transformers/684a47ca6257e4ca71f0037771464c5b323e945fbc58697d2fad8a7dd1a2f8ba.3b69006860e7b5d0a63ffdddc01ddcd6b7c318a6f4fd793596552c741734c62d.lock
INFO:simpletransformers.t5.t5_utils: Creating features from dataset file at cache_dir/


  0%|          | 0/4 [00:00<?, ?it/s]

`prepare_seq2seq_batch` is deprecated and will be removed in version 5 of HuggingFace Transformers. Use the regular
`__call__` method to prepare your inputs and the tokenizer under the `as_target_tokenizer` context manager to prepare
your targets.

Here is a short example:

model_inputs = tokenizer(src_texts, ...)
with tokenizer.as_target_tokenizer():
    labels = tokenizer(tgt_texts, ...)
model_inputs["labels"] = labels["input_ids"]

See the documentation of your specific tokenizer for more details on the specific arguments to the tokenizer of choice.
For a more complete example, see the implementation of `prepare_seq2seq_batch`.

INFO:simpletransformers.t5.t5_utils: Saving features into cached file cache_dir/t5-base_cached_1284
INFO:simpletransformers.t5.t5_model: Training started


Epoch:   0%|          | 0/200 [00:00<?, ?it/s]

Running Epoch 0 of 200:   0%|          | 0/1 [00:00<?, ?it/s]

In [None]:
#need to run on colab
#download deslanged data & use T5 code above on data