<a href="https://colab.research.google.com/github/akash1629/NLP_Powered_Competitive_Market_Intelligence/blob/main/NLP_Powered_Competitive_Market_Intelligence.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [2]:
# ===========================
#  NLP-Powered Competitive Market Intelligence
# ===========================

# ==========================================
#  1. SETUP: Install Dependencies in Colab
# ==========================================
# Uncomment the lines below if you're running in Colab and need to install them:
# !pip install spacy transformers plotly==5.15.0
# !python -m spacy download en_core_web_sm

# ==========================================
#  2. IMPORTS & GLOBAL SETTINGS
# ==========================================
import os
import numpy as np
import pandas as pd
import plotly.express as px
import plotly.graph_objects as go

import spacy
from transformers import pipeline
import random
import datetime
import warnings
warnings.filterwarnings("ignore")

# Load a small English model for spaCy
nlp = spacy.load("en_core_web_sm")

# Initialize a sentiment analysis pipeline from Hugging Face
sentiment_pipeline = pipeline("sentiment-analysis")

# ==========================================
#  3. SYNTHETIC DATA GENERATION
# ==========================================
# We will generate:
#   1) competitor press releases
#   2) competitor reviews
#   3) competitor financial statements (mock text)

# Let's define some competitor names:
competitors = ["CompA", "CompB", "CompC"]

# We'll create a date range for ~10 press releases or statements
num_docs = 10
start_date = datetime.date(2023, 1, 1)
dates = [start_date + datetime.timedelta(days=i*7) for i in range(num_docs)]  # Weekly press releases

press_releases = []
for i in range(num_docs):
    competitor = random.choice(competitors)
    date_issued = dates[i]
    # Synthetic text with some positive/negative cues
    # We'll keep it short for demonstration
    base_text = (
        f"{competitor} announced a new product line today. The market response has been "
        f"{random.choice(['positive', 'neutral', 'slightly negative', 'extremely positive'])}. "
        f"Financial analysts expect improvements in quarterly revenue."
    )
    press_releases.append((competitor, str(date_issued), "press_release", base_text))

reviews = []
for i in range(num_docs):
    competitor = random.choice(competitors)
    date_issued = dates[i]
    # Synthetic user feedback
    feedback_text = (
        f"Customers of {competitor} reported "
        f"{random.choice(['excellent', 'average', 'poor'])} satisfaction with recent services. "
        f"Online reviews mention {random.choice(['fast shipping', 'delayed deliveries', 'great support'])}."
    )
    reviews.append((competitor, str(date_issued), "review", feedback_text))

financials = []
for i in range(num_docs):
    competitor = random.choice(competitors)
    date_issued = dates[i]
    # Mock financial statement snippet
    finance_text = (
        f"{competitor} declared quarterly earnings of {random.randint(50, 200)} million USD, "
        f"with {random.choice(['increased marketing spend', 'cost-cutting measures', 'R&D investment'])} "
        f"highlighted as a key factor."
    )
    financials.append((competitor, str(date_issued), "financial_statement", finance_text))

# Combine into one DataFrame
all_docs = press_releases + reviews + financials
df_competitor = pd.DataFrame(all_docs, columns=["competitor", "date", "doc_type", "content"])
df_competitor["date"] = pd.to_datetime(df_competitor["date"])

# Optional: sort by date
df_competitor.sort_values(by=["date","competitor"], inplace=True)

print("Sample competitor data:")
print(df_competitor.head(10))

# ==========================================
#  4. NLP PIPELINE: spaCy + Hugging Face
# ==========================================
# We'll do entity recognition (NER) with spaCy and sentiment analysis with Hugging Face.

def analyze_text_spacy(text):
    """
    Uses spaCy to extract named entities from the text.
    Returns a list of (entity_text, entity_label).
    """
    doc = nlp(text)
    entities = [(ent.text, ent.label_) for ent in doc.ents]
    return entities

def analyze_text_sentiment(text):
    """
    Uses a Hugging Face sentiment pipeline.
    Returns (label, score).
    """
    result = sentiment_pipeline(text[:512])  # limit to first 512 characters if text is long
    # result is a list of dict: [{'label': 'POSITIVE'/'NEGATIVE', 'score': 0.999...}]
    return result[0]["label"], float(result[0]["score"])

# Process each document
entity_results = []
sentiment_results = []
for idx, row in df_competitor.iterrows():
    text = row["content"]
    competitor_name = row["competitor"]
    doc_type = row["doc_type"]
    date_issued = row["date"]

    # spaCy NER
    ents = analyze_text_spacy(text)
    # Sentiment
    label, score = analyze_text_sentiment(text)

    entity_results.append(ents)
    sentiment_results.append((label, score))

df_competitor["entities"] = entity_results
df_competitor["sentiment_label"] = [sr[0] for sr in sentiment_results]
df_competitor["sentiment_score"] = [sr[1] for sr in sentiment_results]

print("\nNLP Analysis Complete. Example rows:")
print(df_competitor[["competitor","date","doc_type","content","entities","sentiment_label","sentiment_score"]].head())

# ==========================================
#  5. TREND & METRICS CALCULATIONS
# ==========================================
# We'll assign a numeric polarity: POSITIVE => +1, NEGATIVE => -1, NEUTRAL => 0 (approx).
def sentiment_to_numeric(label):
    if label == "POSITIVE":
        return 1
    elif label == "NEGATIVE":
        return -1
    else:
        return 0

df_competitor["sentiment_numeric"] = df_competitor["sentiment_label"].apply(sentiment_to_numeric)

# Summarize average sentiment per competitor per week
sentiment_trends = df_competitor.groupby(["competitor","date"]).agg({
    "sentiment_numeric":"mean",
    "sentiment_score":"mean"
}).reset_index()

print("\nSentiment trend data (head):")
print(sentiment_trends.head())

# ==========================================
#  6. VISUALIZATION (Plotly)
# ==========================================
# We'll create a line chart showing average sentiment over time for each competitor.

fig = px.line(
    sentiment_trends,
    x="date",
    y="sentiment_numeric",
    color="competitor",
    title="Average Sentiment Trend by Competitor",
    markers=True
)
fig.update_yaxes(title="Avg Sentiment (POS=1, NEG=-1)")
fig.show()

# Let's also show a bar chart of doc types and their sentiment distribution
doc_type_sentiment = df_competitor.groupby(["doc_type","sentiment_label"]).size().reset_index(name="count")
fig2 = px.bar(
    doc_type_sentiment,
    x="doc_type",
    y="count",
    color="sentiment_label",
    title="Sentiment Distribution by Document Type"
)
fig2.show()

# ==========================================
#  7. OPTIONAL: TOP ENTITIES EXTRACTED
# ==========================================
# Flatten the list of entity tuples
all_entities = []
for idx, row in df_competitor.iterrows():
    for (ent_text, ent_label) in row["entities"]:
        all_entities.append((row["competitor"], row["date"], ent_text, ent_label))

df_entities = pd.DataFrame(all_entities, columns=["competitor","date","entity_text","entity_label"])
common_ents = df_entities.groupby(["entity_text","entity_label"]).size().reset_index(name="count")
common_ents.sort_values("count", ascending=False, inplace=True)

print("\nMost common entities in competitor documents (top 10):")
print(common_ents.head(10))

# ==========================================
#  8. MOCK "DASHBOARD" INSIGHT
# ==========================================
# If you were to integrate with Power BI, you'd typically export data to CSV
# or connect a data source. For demonstration, we show how to
# visualize in Python with Plotly.

# We'll create a multi-competitor timeline of doc counts
doc_counts = df_competitor.groupby(["competitor","date"]).size().reset_index(name="num_docs")
fig3 = px.bar(
    doc_counts,
    x="date",
    y="num_docs",
    color="competitor",
    title="Number of Documents (Releases/Reviews/Statements) Over Time",
)
fig3.show()

print("\nDone! We have demonstrated a basic NLP pipeline using spaCy (NER) and Hugging Face Transformers (sentiment),")
print("along with example visualizations that mimic a competitive intelligence dashboard.")


No model was supplied, defaulted to distilbert/distilbert-base-uncased-finetuned-sst-2-english and revision 714eb0f (https://huggingface.co/distilbert/distilbert-base-uncased-finetuned-sst-2-english).
Using a pipeline without specifying a model name and revision in production is not recommended.
Device set to use cpu


Sample competitor data:
   competitor       date             doc_type  \
20      CompA 2023-01-01  financial_statement   
0       CompC 2023-01-01        press_release   
10      CompC 2023-01-01               review   
21      CompA 2023-01-08  financial_statement   
1       CompC 2023-01-08        press_release   
11      CompC 2023-01-08               review   
2       CompA 2023-01-15        press_release   
22      CompB 2023-01-15  financial_statement   
12      CompC 2023-01-15               review   
3       CompB 2023-01-22        press_release   

                                              content  
20  CompA declared quarterly earnings of 61 millio...  
0   CompC announced a new product line today. The ...  
10  Customers of CompC reported poor satisfaction ...  
21  CompA declared quarterly earnings of 122 milli...  
1   CompC announced a new product line today. The ...  
11  Customers of CompC reported average satisfacti...  
2   CompA announced a new product line today


Most common entities in competitor documents (top 10):
    entity_text entity_label  count
12    quarterly         DATE     20
13        today         DATE     10
10          USD          ORG      3
0   103 million     CARDINAL      1
1   121 million     CARDINAL      1
2   122 million     CARDINAL      1
3   129 million     CARDINAL      1
4   137 million     CARDINAL      1
5   152 million     CARDINAL      1
6   166 million     CARDINAL      1



Done! We have demonstrated a basic NLP pipeline using spaCy (NER) and Hugging Face Transformers (sentiment),
along with example visualizations that mimic a competitive intelligence dashboard.
