# Obtaining basic statistics for textual lengths

In [22]:
import pandas as pd
import yfinance as yf
import talib as ta
import nltk 
import spacy
import matplotlib.pyplot as plt
import plotly.express as px

from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.probability import FreqDist
from nltk.sentiment.vader import SentimentIntensityAnalyzer

import nltk
nltk.download('averaged_perceptron_tagger')
nltk.download('vader_lexicon')
nlp = spacy.load('en_core_web_sm')



ModuleNotFoundError: No module named 'talib'

In [12]:
df = pd.read_csv("/home/yadasa/Desktop/Finance-Data-Analysis/data/raw_analyst_ratings.csv",)

In [None]:
df.head(10)

In [14]:

# Calculate the length of each headline and create a new column
df['headline_length'] = df['headline'].apply(lambda x: len(x))

# Basic statistics for headline length
headline_stats = df['headline_length'].describe()

# Print the statistics
print("\nStatistics of headlines")

headline_stats



Statistics of headlines


count    1.407328e+06
mean     7.312051e+01
std      4.073531e+01
min      3.000000e+00
25%      4.700000e+01
50%      6.400000e+01
75%      8.700000e+01
max      5.120000e+02
Name: headline_length, dtype: float64

In [15]:
# Convert the 'date' column to datetime format
df['date'] = pd.to_datetime(df['date'], format='ISO8601')

# Extract relevant date components for analysis
df['year'] = df['date'].dt.year
df['month'] = df['date'].dt.month
df['day'] = df['date'].dt.day
df['weekday'] = df['date'].dt.weekday  
# Monday is 0, Sunday is 6

# Analyze trends over time, such as news frequency by year, month, day, and weekday
news_frequency_by_year = df['year'].value_counts().sort_index()
news_frequency_by_month = df.groupby('month')['headline'].count()
news_frequency_by_day = df.groupby('day')['headline'].count()
news_frequency_by_weekday = df.groupby('weekday')['headline'].count()

# Print descriptive output
print("News frequency by year:")
print(news_frequency_by_year)
print("\nExplanation: This shows the count of news articles published each year.")

print("\nNews frequency by month:")
print(news_frequency_by_month)
print("\nExplanation: This shows the count of news articles published for each month of the year.")

print("\nNews frequency by day:")
print(news_frequency_by_day)
print("\nExplanation: This shows the count of news articles published for each day of the month.")

print("\nNews frequency by weekday:")
print(news_frequency_by_weekday)
print("\nExplanation: This shows the count of news articles published for each day of the week (Monday to Sunday). Monday is represented as 0, Sunday as 6.")


News frequency by year:
year
2009     11489
2010     81319
2011    131322
2012    122655
2013    121523
2014    134859
2015    135295
2016    141892
2017    124456
2018    146924
2019    150380
2020    105214
Name: count, dtype: int64

Explanation: This shows the count of news articles published each year.

News frequency by month:
month
1     121541
2     122835
3     121948
4     121815
5     130339
6     106600
7     110762
8     124042
9      96087
10    124800
11    121431
12    105128
Name: headline, dtype: int64

Explanation: This shows the count of news articles published for each month of the year.

News frequency by day:
day
1     46573
2     46905
3     45782
4     45358
5     47123
6     50039
7     50073
8     50682
9     48692
10    47793
11    46064
12    48610
13    47769
14    46714
15    44633
16    44206
17    44610
18    43857
19    44918
20    46435
21    44597
22    43161
23    46515
24    44040
25    43007
26    45571
27    47574
28    45049
29    41787
30    420

In [16]:
# Initialize the VADER sentiment analyzer
sia = SentimentIntensityAnalyzer()

# Define a function to get the sentiment score for each headline
def get_sentiment_score(text):
    sentiment = sia.polarity_scores(text)
    return sentiment

# Apply the function to each headline and store the results in a new column
df['sentiment'] = df['headline'].apply(get_sentiment_score)

# Define a function to classify the sentiment based on the compound score
def classify_sentiment(score):
    if score >= 0.05:
        return 'Positive'
    elif score <= -0.05:
        return 'Negative'
    else:
        return 'Neutral'

# Apply the function to the compound score and store the results in a new column
df['sentiment_class'] = df['sentiment'].apply(lambda x: classify_sentiment(x['compound']))

# Count the number of headlines per sentiment class
sentiment_counts = df['sentiment_class'].value_counts()

# Print the sentiment counts in a more descriptive format
print("Sentiment Analysis Results:")
print("-------------------------")
for sentiment, count in sentiment_counts.items():
    print(f"{sentiment}: {count} headlines")



Sentiment Analysis Results:
-------------------------
Neutral: 741194 headlines
Positive: 441858 headlines
Negative: 224276 headlines


In [17]:
import spacy
from nltk import word_tokenize, pos_tag
from nltk.corpus import stopwords
from nltk.probability import FreqDist

# Load the English tokenizer, tagger, parser, and NER
nlp = spacy.load("en_core_web_sm")
nlp.max_length = 1500000  # Increase max_length to handle longer texts

# Load English stopwords
stop_words = set(stopwords.words('english'))

# Set chunk size
chunk_size = 100000  # Adjust as needed

# Function to process text in chunks
def process_text_chunks(text):
    # Tokenize the text
    tokens = word_tokenize(text)

    # Filter out stop words and punctuation
    filtered_tokens = [word for word in tokens if word.isalnum() and word.lower() not in stop_words]

    # Perform part-of-speech tagging to extract nouns and proper nouns
    pos_tags = pos_tag(filtered_tokens)
    nouns = [word for word, pos in pos_tags if pos.startswith('NN')]

    # Use spaCy for named entity recognition (NER)
    doc = nlp(text)
    entities = [(ent.text, ent.label_) for ent in doc.ents]

    # Filter entities to extract only significant ones
    significant_entities = [entity[0] for entity in entities if entity[1] in ['ORG', 'PERSON', 'GPE', 'EVENT']]

    # Combine both noun phrases and significant named entities
    keywords = nouns + significant_entities

    return keywords

# Concatenate all headlines into a single string
all_headlines = ' '.join(df['headline'])

# Process the text in chunks
chunks = [all_headlines[i:i+chunk_size] for i in range(0, len(all_headlines), chunk_size)]

# Initialize a list to store keywords from all chunks
all_keywords = []

# Process each chunk separately
for chunk in chunks:
    keywords = process_text_chunks(chunk)
    all_keywords.extend(keywords)

# Calculate the frequency distribution of keywords
fdist = FreqDist(all_keywords)

# Get the most common keywords
common_keywords = fdist.most_common(10)

# Print the most common keywords
print("Most common keywords or phrases:")
for keyword, frequency in common_keywords:
    print(f"{keyword}: {frequency}")


Most common keywords or phrases:
Stocks: 157529
EPS: 136718
Est: 120858
Reports: 106667
vs: 104958
Benzinga: 92032
Shares: 91631
Earnings: 84910
Market: 83721
Top: 75682


In [23]:
# Convert the 'date' column to datetime format
f['date'] = pd.to_datetime(df['date'], format='ISO8601')

# Set the 'date' column as the index
df.set_index('date', inplace=True)

# Resample the data to get the count of articles per day
article_count_per_day = df.resample('D').size()

# Plot the publication frequency over time
plt.figure(figsize=(12, 6))
article_count_per_day.plot()
plt.title('Publication Frequency Over Time')
plt.xlabel('Date')
plt.ylabel('Number of Articles Published')
plt.grid(True)
plt.show()


KeyError: 'date'

 # Analysis of publishing times might reveal if there’s a specific time when most news is released, which could be crucial for traders and automated trading systems

In [19]:
# with a column named 'date' containing the publication dates.

# Convert the 'date' column to datetime format
df['date'] = pd.to_datetime(df['date'])

# Extract the hour of the day from the 'date' column
df['hour'] = df['date'].dt.hour

# Count the number of articles published during each hour
article_count_by_hour = df.groupby('hour').size()

# Plot the distribution of publishing times
plt.figure(figsize=(10, 6))
article_count_by_hour.plot(kind='bar')
plt.title('Distribution of Publishing Times')
plt.xlabel('Hour of the Day')
plt.ylabel('Number of Articles Published')
plt.xticks(range(24), rotation=0)
plt.grid(axis='y')
plt.show()


KeyError: 'date'

# Which publishers contribute most to the news feed? Is there a difference in the type of news they report?

# To answer the above two question, we use 
# 1)findind number of article publishe by each publisher
# 2)Number of stop words in each publishing

In [None]:
import pandas as pd
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.probability import FreqDist

# Count the number of articles published by each publisher
publisher_counts = df['publisher'].value_counts()

# Sort publishers by the number of articles published (in ascending order)
sorted_publishers = publisher_counts.sort_values().index

# Filter out stop words
stop_words = set(stopwords.words('english'))

# Tokenize and process the headlines to identify common keywords
keywords_by_publisher = {}
for publisher in sorted_publishers:
    publisher_data = df[df['publisher'] == publisher]
    all_headlines = ' '.join(publisher_data['headline'])
    tokens = word_tokenize(all_headlines)
    filtered_tokens = [word.lower() for word in tokens if word.isalnum() and word.lower() not in stop_words]
    fdist = FreqDist(filtered_tokens)
    keywords_by_publisher[publisher] = fdist.most_common(10)  # Get top 10 keywords for each publisher

# Print the number of articles published by each publisher
print("Number of articles published by each publisher (in ascending order):")
print(publisher_counts)
print("\n")

# Print the top keywords associated with each publisher
print("Top keywords associated with each publisher:")
for publisher in sorted_publishers:
    print(f"Publisher: {publisher}")
    keywords = keywords_by_publisher.get(publisher, [])
    for keyword, frequency in keywords:
        print(f"{keyword}: {frequency}")
    print("\n")


# What if the publisher is an Orginization which contributes most to news?
# if eamil is used as publisher name we can extract unique domains

In [None]:
# Extract domain from email addresses
df['domain'] = df['publisher'].str.extract(r'@(.*)')

# Count the number of articles published by each domain
domain_counts = df['domain'].value_counts()

# Print the unique domains and their corresponding publication counts
print("Unique domains and their publication counts:")
print(domain_counts)

# Find the domain with the highest number of articles published
most_contributor_domain = domain_counts.idxmax()
articles_published = domain_counts.max()

# Print the most contributing domain
print(f"The most contributing domain is '{most_contributor_domain}' with {articles_published} articles published.")


# The most contributing domain is 'benzinga.com' with 7937 articles published.

In [None]:
import pandas as pd
import yfinance as yf

# Load the dataset containing stock symbols and dates from a CSV file into a pandas DataFrame

# Convert the 'date' column to datetime objects to facilitate date manipulation
df['date'] = pd.to_datetime(df['date'])

# Determine date ranges and fetch stock data
date_ranges = df.groupby('symbol').agg({'date': ['min', 'max']})

# Function to fetch stock data for a given symbol and date range
def fetch_stock_data(symbol, start_date, end_date):
    try:
        # Fetch stock data using yfinance
        stock_data = yf.download(symbol, start=start_date, end=end_date)
        return stock_data
    except Exception as e:
        print(f"Error fetching data for {symbol}: {e}")
        return None

# Iterate through each symbol and fetch stock data based on the calculated date ranges
for symbol, (start_date, end_date) in date_ranges.iterrows():
    print(f"Fetching data for {symbol} from {start_date} to {end_date}")
    stock_data = fetch_stock_data(symbol, start_date, end_date)
    stock_data
    
    # Process fetched data (you can save it, manipulate it, etc.)
    if stock_data is not None:
        # Perform operations on the fetched data here
        pass
