# Obtaining basic statistics for textual lengths

In [1]:
import pandas as pd

In [2]:
import nltk 

In [3]:
import spacy

In [4]:
nltk.download('vader_lexicon')

[nltk_data] Downloading package vader_lexicon to
[nltk_data]     /home/yadasa/nltk_data...
[nltk_data]   Package vader_lexicon is already up-to-date!


True

In [5]:

from nltk.tokenize import word_tokenize

In [6]:
from nltk.corpus import stopwords

In [7]:
from nltk.probability import FreqDist

In [8]:
from nltk.sentiment.vader import SentimentIntensityAnalyzer

In [9]:
import nltk
nltk.download('averaged_perceptron_tagger')


[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /home/yadasa/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!


True

In [10]:
df = pd.read_csv("/home/yadasa/Desktop/Finance-Data-Analysis/data/raw_analyst_ratings.csv",)

In [11]:

# Calculate the length of each headline and create a new column
df['headline_length'] = df['headline'].apply(lambda x: len(x))

# Basic statistics for headline length
headline_stats = df['headline_length'].describe()

# Print the statistics
print("\nStatistics of headlines")

headline_stats



Statistics of headlines


count    1.407328e+06
mean     7.312051e+01
std      4.073531e+01
min      3.000000e+00
25%      4.700000e+01
50%      6.400000e+01
75%      8.700000e+01
max      5.120000e+02
Name: headline_length, dtype: float64

In [12]:
# Convert the 'date' column to datetime format
df['date'] = pd.to_datetime(df['date'], format='ISO8601')

# Extract relevant date components for analysis
df['year'] = df['date'].dt.year
df['month'] = df['date'].dt.month
df['day'] = df['date'].dt.day
df['weekday'] = df['date'].dt.weekday  
# Monday is 0, Sunday is 6

# Analyze trends over time, such as news frequency by year, month, day, and weekday
news_frequency_by_year = df['year'].value_counts().sort_index()
news_frequency_by_month = df.groupby('month')['headline'].count()
news_frequency_by_day = df.groupby('day')['headline'].count()
news_frequency_by_weekday = df.groupby('weekday')['headline'].count()

# Print descriptive output
print("News frequency by year:")
print(news_frequency_by_year)
print("\nExplanation: This shows the count of news articles published each year.")

print("\nNews frequency by month:")
print(news_frequency_by_month)
print("\nExplanation: This shows the count of news articles published for each month of the year.")

print("\nNews frequency by day:")
print(news_frequency_by_day)
print("\nExplanation: This shows the count of news articles published for each day of the month.")

print("\nNews frequency by weekday:")
print(news_frequency_by_weekday)
print("\nExplanation: This shows the count of news articles published for each day of the week (Monday to Sunday). Monday is represented as 0, Sunday as 6.")


News frequency by year:
year
2009     11489
2010     81319
2011    131322
2012    122655
2013    121523
2014    134859
2015    135295
2016    141892
2017    124456
2018    146924
2019    150380
2020    105214
Name: count, dtype: int64

Explanation: This shows the count of news articles published each year.

News frequency by month:
month
1     121541
2     122835
3     121948
4     121815
5     130339
6     106600
7     110762
8     124042
9      96087
10    124800
11    121431
12    105128
Name: headline, dtype: int64

Explanation: This shows the count of news articles published for each month of the year.

News frequency by day:
day
1     46573
2     46905
3     45782
4     45358
5     47123
6     50039
7     50073
8     50682
9     48692
10    47793
11    46064
12    48610
13    47769
14    46714
15    44633
16    44206
17    44610
18    43857
19    44918
20    46435
21    44597
22    43161
23    46515
24    44040
25    43007
26    45571
27    47574
28    45049
29    41787
30    420

In [13]:
# Initialize the VADER sentiment analyzer
sia = SentimentIntensityAnalyzer()

# Define a function to get the sentiment score for each headline
def get_sentiment_score(text):
    sentiment = sia.polarity_scores(text)
    return sentiment

# Apply the function to each headline and store the results in a new column
df['sentiment'] = df['headline'].apply(get_sentiment_score)

# Define a function to classify the sentiment based on the compound score
def classify_sentiment(score):
    if score >= 0.05:
        return 'Positive'
    elif score <= -0.05:
        return 'Negative'
    else:
        return 'Neutral'

# Apply the function to the compound score and store the results in a new column
df['sentiment_class'] = df['sentiment'].apply(lambda x: classify_sentiment(x['compound']))

# Count the number of headlines per sentiment class
sentiment_counts = df['sentiment_class'].value_counts()

# Print the sentiment counts in a more descriptive format
print("Sentiment Analysis Results:")
print("-------------------------")
for sentiment, count in sentiment_counts.items():
    print(f"{sentiment}: {count} headlines")



Sentiment Analysis Results:
-------------------------
Neutral: 741194 headlines
Positive: 441858 headlines
Negative: 224276 headlines


In [14]:

# Load the English tokenizer, tagger, parser, NER, and stop words
nlp = spacy.load("en_core_web_sm")
stop_words = set(stopwords.words('english'))

# with a column named 'headline' containing the headlines.

# Concatenate all headlines into a single string
all_headlines = ' '.join(df['headline'])

# Tokenize the text
tokens = word_tokenize(all_headlines)

# Filter out stop words and punctuation
filtered_tokens = [word for word in tokens if word.isalnum() and word.lower() not in stop_words]

# Perform part-of-speech tagging to extract nouns and proper nouns
pos_tags = nltk.pos_tag(filtered_tokens)
nouns = [word for word, pos in pos_tags if pos.startswith('NN')]

# Use spaCy for named entity recognition (NER)
doc = nlp(all_headlines)
entities = [(ent.text, ent.label_) for ent in doc.ents]

# Filter entities to extract only significant ones
significant_entities = [entity[0] for entity in entities if entity[1] in ['ORG', 'PERSON', 'GPE', 'EVENT']]

# Combine both noun phrases and significant named entities
keywords = nouns + significant_entities

# Calculate the frequency distribution of keywords
fdist = FreqDist(keywords)

# Get the most common keywords
common_keywords = fdist.most_common(10)

# Print the most common keywords
print("Most common keywords or phrases:")
for keyword, frequency in common_keywords:
    print(f"{keyword}: {frequency}")


ValueError: [E088] Text of length 104311875 exceeds maximum of 1000000. The parser and NER models require roughly 1GB of temporary memory per 100,000 characters in the input. This means long texts may cause memory allocation errors. If you're not using the parser or NER, it's probably safe to increase the `nlp.max_length` limit. The limit is in number of characters, so you can check whether your inputs are too long by checking `len(text)`.