In [None]:
# importing the libraries used throughout the notebook
import os
from dotenv import load_dotenv
from pynytimes import NYTAPI
import pandas as pd
import nltk
from nltk.sentiment.vader import SentimentIntensityAnalyzer
from sklearn.preprocessing import MinMaxScaler
from datetime import datetime
import numpy as np
import glob

In [None]:
# Build the dotcom bubble corpus from every CSV file in one directory.

# directory that holds the dotcom bubble CSV files
path = "/Users/alexszabo/Project_2/Sentiment_Data/DCB"

# glob.glob returns its matches in arbitrary order, so the paths are sorted
# to make the row order of the combined frame reproducible between runs
files = sorted(glob.glob(path + "/*.csv"))

# pd.concat fails with an opaque "No objects to concatenate" ValueError on an
# empty list; raise the same exception type early with a message that names
# the directory that was searched
if not files:
    raise ValueError(f"No CSV files found in {path!r}")

# one DataFrame per CSV file, in the same order as `files`
content = [pd.read_csv(filename, index_col=None) for filename in files]

# stack the per-file frames into a single corpus; every frame keeps its own
# row labels because ignore_index is not set
dcb_corpus = pd.concat(content)

In [None]:
# Tidy the dcb corpus: order it by date and discard incomplete rows.

dcb_corpus = (
    dcb_corpus
    # chronological order by publication date
    .sort_values(by=['pub_date'])
    # cells whose whole value is a single period become NaN ...
    .replace('.', np.nan)
    # ... and every row holding a NaN in any column is removed
    .dropna(axis=0, how='any')
    # rows without 'snippet' text are removed (none can remain here, because
    # the previous step already dropped every row that contains a NaN)
    .dropna(subset=['snippet'])
    # renumber the remaining rows from 0
    .reset_index(drop=True)
)


In [None]:
# Build the 2008 crash corpus from every CSV file in one directory.

# directory that holds the 2008 crash CSV files
path = "/Users/alexszabo/Project_2/Sentiment_Data/CRSH"

# glob.glob returns its matches in arbitrary order, so the paths are sorted
# to make the row order of the combined frame reproducible between runs
files = sorted(glob.glob(path + "/*.csv"))

# pd.concat fails with an opaque "No objects to concatenate" ValueError on an
# empty list; raise the same exception type early with a message that names
# the directory that was searched
if not files:
    raise ValueError(f"No CSV files found in {path!r}")

# one DataFrame per CSV file, in the same order as `files`
content = [pd.read_csv(filename, index_col=None) for filename in files]

# stack the per-file frames into a single corpus; every frame keeps its own
# row labels because ignore_index is not set
crsh_corpus = pd.concat(content)

In [None]:
# Tidy the crsh corpus: order it by date and discard incomplete rows.

crsh_corpus = (
    crsh_corpus
    # chronological order by publication date
    .sort_values(by=['pub_date'])
    # cells whose whole value is a single period become NaN ...
    .replace('.', np.nan)
    # ... and every row holding a NaN in any column is removed
    .dropna(axis=0, how='any')
    # rows without 'snippet' text are removed (none can remain here, because
    # the previous step already dropped every row that contains a NaN)
    .dropna(subset=['snippet'])
    # renumber the remaining rows from 0
    .reset_index(drop=True)
)

In [None]:
# Build the cvd corpus from every CSV file in one directory.

# directory that holds the cvd CSV files
path = "/Users/alexszabo/Project_2/Sentiment_Data/CVD"

# glob.glob returns its matches in arbitrary order, so the paths are sorted
# to make the row order of the combined frame reproducible between runs
files = sorted(glob.glob(path + "/*.csv"))

# pd.concat fails with an opaque "No objects to concatenate" ValueError on an
# empty list; raise the same exception type early with a message that names
# the directory that was searched
if not files:
    raise ValueError(f"No CSV files found in {path!r}")

# one DataFrame per CSV file, in the same order as `files`
content = [pd.read_csv(filename, index_col=None) for filename in files]

# stack the per-file frames into a single corpus; every frame keeps its own
# row labels because ignore_index is not set
cvd_corpus = pd.concat(content)

In [None]:
# Tidy the cvd corpus: order it by date and discard incomplete rows.

cvd_corpus = (
    cvd_corpus
    # chronological order by publication date
    .sort_values(by=['pub_date'])
    # cells whose whole value is a single period become NaN ...
    .replace('.', np.nan)
    # ... and every row holding a NaN in any column is removed
    .dropna(axis=0, how='any')
    # rows without 'snippet' text are removed (none can remain here, because
    # the previous step already dropped every row that contains a NaN)
    .dropna(subset=['snippet'])
    # renumber the remaining rows from 0
    .reset_index(drop=True)
)

In [None]:
# Fetch the VADER lexicon through NLTK's downloader (per the captured output
# below, it reports "already up-to-date" when the package is present).
nltk.download("vader_lexicon")

# Shared analyzer instance used by the scoring cells that follow.
analyzer = SentimentIntensityAnalyzer()

[nltk_data] Downloading package vader_lexicon to
[nltk_data]     /Users/alexszabo/nltk_data...
[nltk_data]   Package vader_lexicon is already up-to-date!


In [None]:
# Score every dcb snippet with VADER and export the compound scores to csv.

# column order of the exported sentiment table
cols = ["date", "text", "compound"]

market_sentiments = []
skipped = 0

# only the 'snippet' and 'pub_date' columns are read, so iterate over those
# two columns directly instead of materialising every row with iterrows()
for text, date in zip(dcb_corpus["snippet"], dcb_corpus["pub_date"]):
    try:
        # only the compound score is kept in the output
        compound = analyzer.polarity_scores(text)["compound"]
    except AttributeError:
        # presumably raised when the snippet is not a string (TODO confirm);
        # the row is skipped as before, but counted so the loss is visible
        skipped += 1
        continue

    market_sentiments.append({
        "text": text,
        "date": date,
        "compound": compound,
    })

if skipped:
    print(f"Skipped {skipped} dcb rows whose snippet could not be scored")

# passing `columns` fixes the column order and still yields the three expected
# (empty) columns when no row was scored, instead of failing with a KeyError
dcb_sentiment = pd.DataFrame(market_sentiments, columns=cols)

# parse the publication dates into datetime values
dcb_sentiment['date'] = pd.to_datetime(dcb_sentiment['date'])

# exporting the data to a csv file
dcb_sentiment.to_csv('/Users/alexszabo/Project_2/Sentiment_Scores/VADER/dcb_VADER_sentiment.csv', index=False)

In [None]:
# Score every crsh snippet with VADER and export the compound scores to csv.

# column order of the exported sentiment table
cols = ["date", "text", "compound"]

market_sentiments = []
skipped = 0

# only the 'snippet' and 'pub_date' columns are read, so iterate over those
# two columns directly instead of materialising every row with iterrows()
for text, date in zip(crsh_corpus["snippet"], crsh_corpus["pub_date"]):
    try:
        # only the compound score is kept in the output
        compound = analyzer.polarity_scores(text)["compound"]
    except AttributeError:
        # presumably raised when the snippet is not a string (TODO confirm);
        # the row is skipped as before, but counted so the loss is visible
        skipped += 1
        continue

    market_sentiments.append({
        "text": text,
        "date": date,
        "compound": compound,
    })

if skipped:
    print(f"Skipped {skipped} crsh rows whose snippet could not be scored")

# passing `columns` fixes the column order and still yields the three expected
# (empty) columns when no row was scored, instead of failing with a KeyError
crsh_sentiment = pd.DataFrame(market_sentiments, columns=cols)

# parse the publication dates into datetime values
crsh_sentiment['date'] = pd.to_datetime(crsh_sentiment['date'])

# exporting the data to a csv
crsh_sentiment.to_csv('/Users/alexszabo/Project_2/Sentiment_Scores/VADER/crsh_VADER_sentiment.csv', index=False)

In [None]:
# Score every cvd snippet with VADER and export the compound scores to csv.

# column order of the exported sentiment table
cols = ["date", "text", "compound"]

market_sentiments = []
skipped = 0

# only the 'snippet' and 'pub_date' columns are read, so iterate over those
# two columns directly instead of materialising every row with iterrows()
for text, date in zip(cvd_corpus["snippet"], cvd_corpus["pub_date"]):
    try:
        # only the compound score is kept in the output
        compound = analyzer.polarity_scores(text)["compound"]
    except AttributeError:
        # presumably raised when the snippet is not a string (TODO confirm);
        # the row is skipped as before, but counted so the loss is visible
        skipped += 1
        continue

    market_sentiments.append({
        "text": text,
        "date": date,
        "compound": compound,
    })

if skipped:
    print(f"Skipped {skipped} cvd rows whose snippet could not be scored")

# passing `columns` fixes the column order and still yields the three expected
# (empty) columns when no row was scored, instead of failing with a KeyError
cvd_sentiment = pd.DataFrame(market_sentiments, columns=cols)

# parse the publication dates into datetime values
cvd_sentiment['date'] = pd.to_datetime(cvd_sentiment['date'])

# exporting the data to a csv file
cvd_sentiment.to_csv('/Users/alexszabo/Project_2/Sentiment_Scores/VADER/cvd_VADER_sentiment.csv', index=False)