## Text Analysis (Sentiment Analysis & Topic Modeling)

Import the necessary libraries

In [17]:
import pandas as pd
from textblob import TextBlob
import matplotlib.pyplot as plt
import spacy
from collections import Counter
import string

Load the CSV file into a pandas DataFrame

In [3]:
# Relative path to the raw analyst-ratings CSV (resolved against the current
# working directory).
file_path = '../data/raw_analyst_ratings.csv'
# Read the whole file into a DataFrame; later cells read its 'headline' column.
df = pd.read_csv(file_path)

### Sentiment Analysis

Define a function to perform sentiment analysis on headlines

In [4]:
def get_sentiment(text):
    """Classify a piece of text by the sign of its TextBlob polarity.

    Args:
        text: Value to analyse. It is converted with ``str()`` before being
            handed to TextBlob, so non-string values are accepted.

    Returns:
        'Positive' when the polarity is above zero, 'Negative' when it is
        below zero, and 'Neutral' when it is exactly zero.
    """
    polarity = TextBlob(str(text)).sentiment.polarity

    if polarity > 0:
        return 'Positive'
    if polarity < 0:
        return 'Negative'
    return 'Neutral'

Apply the sentiment analysis function to the 'headline' column

In [5]:
# Run get_sentiment on each headline and store the resulting
# 'Positive' / 'Negative' / 'Neutral' labels in the 'sentiment' column.
df['sentiment'] = df['headline'].apply(get_sentiment)

Print or visualize the results

### Keyword and Phrase Identification

Load the spaCy model, define helper functions for keyword extraction, then count, print, and plot keyword frequencies across all headlines

In [22]:
# Load the "en_core_web_sm" spaCy pipeline once at module level so that every
# get_keywords call reuses the same loaded model.
nlp = spacy.load("en_core_web_sm")

def get_keywords(headline):
    """Count the keyword lemmas that occur in a single headline.

    The headline is run through the module-level spaCy pipeline ``nlp``.
    Stop words, punctuation and whitespace-only tokens are discarded, and
    the lemma of every remaining token is counted.

    Args:
        headline: Headline text. Any non-string value (for example the NaN
            that pandas uses for a missing cell) is treated as containing
            no keywords instead of being passed to spaCy, which would raise.

    Returns:
        collections.Counter mapping each lemma to the number of times it
        occurs in the headline; empty when there is nothing to count.
    """
    if not isinstance(headline, str):
        return Counter()

    doc = nlp(headline)
    keywords = [
        token.lemma_
        for token in doc
        # is_space filters the tokens spaCy emits for runs of extra
        # whitespace; they are neither stop words nor punctuation and would
        # otherwise be counted as keywords.
        if not (token.is_stop or token.is_punct or token.is_space)
    ]
    return Counter(keywords)

def process_data(df):
    """Return the values of the frame's 'headline' column as a plain list.

    Args:
        df: Table-like object that supports ``df['headline']`` and whose
            column offers ``tolist()`` (a pandas DataFrame in this notebook).

    Returns:
        List with one entry per row, in row order.
    """
    headline_column = df['headline']
    return headline_column.tolist()

headlines = process_data(df)

# How many of the highest-frequency keywords to chart. Drawing one bar per
# distinct keyword produces an unreadable figure and contradicts the
# "Most Common" title.
TOP_N_KEYWORDS = 20

all_keywords = Counter()

for headline in headlines:
    keywords = get_keywords(headline)
    # Counter.update adds the counts in place. `all_keywords += keywords`
    # yields the same totals but also rescans the whole accumulated counter
    # on every iteration to drop non-positive entries, which makes the loop
    # quadratic on a large corpus.
    all_keywords.update(keywords)

# Print all keywords with their frequency of occurrence
for keyword, frequency in all_keywords.items():
    print(f"{keyword}: {frequency}")

# Plotting the most common keywords for all headlines
top_keywords = all_keywords.most_common(TOP_N_KEYWORDS)
top_labels = [keyword for keyword, _ in top_keywords]
top_counts = [frequency for _, frequency in top_keywords]

plt.figure(figsize=(10, 6))
plt.bar(top_labels, top_counts)
plt.title('Most Common Keywords in All Headlines')
plt.xlabel('Keyword')
plt.ylabel('Frequency')
plt.xticks(rotation=45)
plt.grid(axis='y')
plt.show()