In [1]:
import pandas as pd
import flair

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# Load the 'en-sentiment' TextClassifier once; get_sentiment() reuses this module-level `model`.
model = flair.models.TextClassifier.load('en-sentiment')

In [5]:
# Score a piece of text with flair's sentiment model (module-level `model`).
def get_sentiment(text, signed=False):
    """Return the score of the top label flair's sentiment model assigns to ``text``.

    Args:
        text: Raw text to classify; it is wrapped in a ``flair.data.Sentence``.
        signed: When False (the default, matching the original behaviour) the
            top label's score is returned as-is. That number alone does NOT
            tell whether the label was positive or negative, because the
            label's ``value`` is discarded. When True, the score is negated if
            the top label's value is ``'NEGATIVE'``, so the sign carries the
            polarity and callers can bucket on ``score < 0``.

    Returns:
        float: The top label's score, optionally signed by polarity.

    Raises:
        IndexError: If the model attached no label to the sentence.
    """
    sentence = flair.data.Sentence(text)
    model.predict(sentence)
    top_label = sentence.labels[0]
    if signed and top_label.value == 'NEGATIVE':
        return -top_label.score
    return top_label.score

In [6]:
# Load the pipe-delimited Reddit export; it already carries an 'organizations' column.
df = pd.read_csv('reddit_investing_ner.csv', delimiter='|')
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 804 entries, 0 to 803
Data columns (total 10 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   name           804 non-null    object 
 1   created_utc    804 non-null    float64
 2   subreddit      804 non-null    object 
 3   title          804 non-null    object 
 4   selftext       804 non-null    object 
 5   upvote_ratio   804 non-null    float64
 6   ups            804 non-null    int64  
 7   downs          804 non-null    int64  
 8   score          804 non-null    int64  
 9   organizations  804 non-null    object 
dtypes: float64(2), int64(3), object(5)
memory usage: 62.9+ KB


In [7]:
# Score every thread body with get_sentiment and store the result in a 'sentiment' column.
df['sentiment'] = df['selftext'].map(get_sentiment)
df.head()

Unnamed: 0,name,created_utc,subreddit,title,selftext,upvote_ratio,ups,downs,score,organizations,sentiment
0,t3_13qkfat,1684932000.0,investing,Exploration of the Arbitrage Co-movement Effec...,\n\nThere is very little doubt that exchange-...,0.5,0,0,0,[],0.927749
1,t3_13qiqm6,1684927000.0,investing,What’s the best artificial intelligence ETF to...,"Hi,\n\ni want to add an AI ETF to my portfolio...",0.2,0,0,0,[],0.823243
2,t3_13qh6aq,1684922000.0,investing,When will the US debt become a real issue?,"Hello all, \n\nI noticed the USD 31 trillion ...",0.5,0,0,0,[],0.509022
3,t3_13qh3p2,1684922000.0,investing,I just put 50k on a roboadvisor,I am a (27m) very average basic investor with ...,0.6,1,0,1,[],0.99801
4,t3_13qgtpy,1684921000.0,investing,"The Impact of Financial Crisis, Behavioral Bia...",In a hypothetical scenario where the financial...,0.22,0,0,0,[],0.988111


In [9]:
import ast

In [10]:
# The CSV stores each organizations list as text (e.g. "[]"); parse it back into a Python object.
# Values that are no longer strings are left alone, so re-running this cell is a no-op instead of
# a ValueError from ast.literal_eval being handed an already-parsed list.
df['organizations'] = df['organizations'].apply(
    lambda value: ast.literal_eval(value) if isinstance(value, str) else value
)

In [27]:
# Extract the sentiment score of every thread and file it under each organization the thread
# mentions: {org: {'POSITIVE': [scores], 'NEGATIVE': [scores]}}.
# NOTE(review): the bucket is chosen purely by a 0.5 threshold on the score, and scores ABOVE 0.5
# are filed under 'NEGATIVE'. get_sentiment returns only the top label's score (the label value is
# dropped), so if that score is the classifier's confidence this rule cannot recover polarity and
# nearly everything lands in 'NEGATIVE'. Confirm the intended rule and bucket by the label value.
sentiment = {}
for score, organizations in zip(df['sentiment'], df['organizations']):
    direction = 'POSITIVE' if score <= 0.5 else 'NEGATIVE'
    for org in organizations:
        # setdefault creates the two empty buckets the first time an organization is seen.
        buckets = sentiment.setdefault(org, {'POSITIVE': [], 'NEGATIVE': []})
        buckets[direction].append(score)

In [28]:
# Display the per-organization score dictionary.
sentiment

{'SPY': {'POSITIVE': [],
  'NEGATIVE': [0.687796950340271,
   0.9998940229415894,
   0.9999594688415527,
   0.9985275268554688,
   0.9998131394386292]},
 'the Treasury Department': {'POSITIVE': [], 'NEGATIVE': [0.9709482192993164]},
 'Title': {'POSITIVE': [], 'NEGATIVE': [0.9999594688415527]},
 'VMRXX': {'POSITIVE': [], 'NEGATIVE': [0.8877052664756775]},
 'ETF': {'POSITIVE': [],
  'NEGATIVE': [0.9689712524414062,
   0.9991532564163208,
   0.9870947003364563,
   0.9740957617759705]},
 'Dept of Treasury': {'POSITIVE': [], 'NEGATIVE': [0.999907374382019]},
 'Coke': {'POSITIVE': [], 'NEGATIVE': [0.868907630443573]},
 'YouTube-video](https://www.youtube.com': {'POSITIVE': [],
  'NEGATIVE': [0.5230844616889954]},
 'PYPL': {'POSITIVE': [], 'NEGATIVE': [0.996052622795105]},
 'Apple': {'POSITIVE': [],
  'NEGATIVE': [0.9992120265960693, 0.9987714886665344]},
 'Minneapolis Fed': {'POSITIVE': [], 'NEGATIVE': [0.9995951056480408]},
 'the Frankfurt School of Finance and Management': {'POSITIVE': [],

In [29]:
# Spot-check the entry stored for a single organization.
sentiment['Nasdaq']

{'POSITIVE': [], 'NEGATIVE': [0.9750022292137146, 0.9993575215339661]}

In [31]:
# Per organization: mean positive score, mean negative score, number of scores, and a net score
# (sum of positive scores minus sum of negative scores, divided by the number of scores).
# The lists inside `sentiment` are only read here, never overwritten, so the raw scores survive
# and this cell can be re-run without len() failing on a previously stored float.
avg_sentiment = []

for org, buckets in sentiment.items():
    pos_scores = buckets['POSITIVE']
    neg_scores = buckets['NEGATIVE']
    pos_freq = len(pos_scores)
    neg_freq = len(neg_scores)
    frequency = pos_freq + neg_freq
    if frequency == 0:
        # No scores recorded for this organization; skip it rather than divide by zero.
        continue
    pos_total = sum(pos_scores)
    neg_total = sum(neg_scores)
    avg_sentiment.append({
        'entity': org,
        # 'postive' (sic) is kept as spelled because it becomes a DataFrame column name.
        'postive': pos_total / pos_freq if pos_freq != 0 else 0,
        'negative': neg_total / neg_freq if neg_freq != 0 else 0,
        'frequency': frequency,
        'score': (pos_total - neg_total) / frequency,
    })

In [33]:
# Display the list of per-organization summary records.
avg_sentiment

[{'entity': 'SPY',
  'postive': 0,
  'negative': 0.9371982216835022,
  'frequency': 5,
  'score': -0.9371982216835022},
 {'entity': 'the Treasury Department',
  'postive': 0,
  'negative': 0.9709482192993164,
  'frequency': 1,
  'score': -0.9709482192993164},
 {'entity': 'Title',
  'postive': 0,
  'negative': 0.9999594688415527,
  'frequency': 1,
  'score': -0.9999594688415527},
 {'entity': 'VMRXX',
  'postive': 0,
  'negative': 0.8877052664756775,
  'frequency': 1,
  'score': -0.8877052664756775},
 {'entity': 'ETF',
  'postive': 0,
  'negative': 0.9823287427425385,
  'frequency': 4,
  'score': -0.9823287427425385},
 {'entity': 'Dept of Treasury',
  'postive': 0,
  'negative': 0.999907374382019,
  'frequency': 1,
  'score': -0.999907374382019},
 {'entity': 'Coke',
  'postive': 0,
  'negative': 0.868907630443573,
  'frequency': 1,
  'score': -0.868907630443573},
 {'entity': 'YouTube-video](https://www.youtube.com',
  'postive': 0,
  'negative': 0.5230844616889954,
  'frequency': 1,
  's

In [44]:
# Turn the summary records into a DataFrame, one row per organization.
sentiment_df = pd.DataFrame(data=avg_sentiment)
sentiment_df.head(n=5)

Unnamed: 0,entity,postive,negative,frequency,score
0,SPY,0,0.937198,5,-0.937198
1,the Treasury Department,0,0.970948,1,-0.970948
2,Title,0,0.999959,1,-0.999959
3,VMRXX,0,0.887705,1,-0.887705
4,ETF,0,0.982329,4,-0.982329


In [46]:
# Ten rows with the highest 'score', highest first.
sentiment_df.sort_values(by='score', ascending=False).head(n=10)

Unnamed: 0,entity,postive,negative,frequency,score
44,TBill,0,0.516542,1,-0.516542
7,YouTube-video](https://www.youtube.com,0,0.523084,1,-0.523084
42,DIS,0,0.523847,1,-0.523847
32,Microsoft,0,0.525636,1,-0.525636
36,RoboForex,0,0.539639,1,-0.539639
90,Business News Channels,0,0.573321,1,-0.573321
96,LPL Financial,0,0.595666,1,-0.595666
141,Augusta Precious Metals,0,0.621383,1,-0.621383
111,Dow Jones,0,0.639209,1,-0.639209
108,Nintendo,0,0.663114,1,-0.663114


In [41]:
# Only keep scores for organizations whose frequency is at least MIN_FREQUENCY
# (equivalent to the earlier `frequency > 2`). The filter is idempotent, so re-running
# this cell leaves sentiment_df unchanged.
MIN_FREQUENCY = 3
sentiment_df = sentiment_df[sentiment_df['frequency'] >= MIN_FREQUENCY]
sentiment_df.head()

Unnamed: 0,entity,postive,negative,frequency,score
0,SPY,0,0.937198,5,-0.937198
4,ETF,0,0.982329,4,-0.982329
12,Fidelity,0,0.907109,5,-0.907109
16,VOO,0,0.960915,5,-0.960915
26,Capcom,0,0.994123,3,-0.994123


In [42]:
# Ten rows with the lowest 'score', lowest first.
sentiment_df.sort_values(by='score', ascending=True).head(n=10)

Unnamed: 0,entity,postive,negative,frequency,score
50,MSFT,0,0.999371,3,-0.999371
65,SPAXX,0,0.997313,3,-0.997313
26,Capcom,0,0.994123,3,-0.994123
43,Vanguard,0,0.983083,6,-0.983083
4,ETF,0,0.982329,4,-0.982329
16,VOO,0,0.960915,5,-0.960915
0,SPY,0,0.937198,5,-0.937198
12,Fidelity,0,0.907109,5,-0.907109
