# Imports

In [50]:
import pandas as pd
import numpy as np
import re
from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer

#!pip install vaderSentiment



# Data cleansing

In [51]:
#Link to the dataset https://www.kaggle.com/datasets/injek0626/reddit-stock-related-posts
data = pd.read_csv("datasets/Reddit Post/posts.csv")

In [52]:
df1 = data

In [53]:
s = df1['selftext']
total = len(s)

# proportion of NaN
prop_na = s.isna().mean()

# proportion of “[deleted]”
prop_deleted = (s == '[deleted]').mean()

# proportion of “[removed]”
prop_removed = (s == '[removed]').mean()

# proportion of any of the three
prop_any = (s.isna() | s.isin(['[deleted]', '[removed]'])).mean()

print(f"NaN:       {prop_na:.2%}")
print(f"Deleted:   {prop_deleted:.2%}")
print(f"Removed:   {prop_removed:.2%}")
print(f"Any three: {prop_any:.2%}")


NaN:       31.77%
Deleted:   11.66%
Removed:   22.05%
Any three: 65.49%


In [64]:
df_clean = df1[~(s.isna() | s.isin(['[deleted]', '[removed]']))]

In [69]:
df_clean.loc[:,'created_utc'] = pd.to_datetime(df_clean['created_utc'], unit='s')

In [56]:
# pattern = r'\b(aapl|apple)\b'

# combined = df1['title'].fillna('') + ' ' + df1['selftext'].fillna('')
# mask = combined.str.contains(pattern, case=False, regex=True)

# df_aapl = df1[mask]


In [57]:
# pattern = r'\b(msft|microsoft)\b'

# combined = df1['title'].fillna('') + ' ' + df1['selftext'].fillna('')
# mask = combined.str.contains(pattern, case=False, regex=True)

# df_msft = df1[mask]

# Sentiment analysis

In [71]:
analyzer = SentimentIntensityAnalyzer()

In [72]:
def get_sentiment(text):
    if pd.isna(text):
        return None
    return analyzer.polarity_scores(text)['compound']


In [73]:
df_clean['sentiment'] = df_clean['selftext'].apply(get_sentiment)


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_clean['sentiment'] = df_clean['selftext'].apply(get_sentiment)


In [75]:
df_clean.to_csv('clean_data.csv', index=False)


In [76]:
def label_sentiment(score):
    if pd.isna(score):
        return 'neutral'
    if score >= 0.05:
        return 'positive'
    elif score <= -0.05:
        return 'negative'
    else:
        return 'neutral'

df_clean.loc[:, 'sentiment_label'] = df_clean['sentiment'].apply(label_sentiment)


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_clean.loc[:, 'sentiment_label'] = df_clean['sentiment'].apply(label_sentiment)


In [78]:
df_clean.to_csv('clean_data_sentiment_label.csv', index=False)

In [79]:
df_final = df_clean[['created_utc', 'title', 'selftext', 'sentiment', 'sentiment_label']]


In [81]:
df_final.to_csv('final_dataset.csv', index=False)