In [29]:
# Import necessary libraries
import spacy
import pandas as pd
from collections import defaultdict
import re

In [30]:
# load sample data
# Load spaCy model
nlp = spacy.load("en_core_web_sm")

# Sample earnings call transcript
sample_transcript = """
Good afternoon, everyone. This is Tim Cook, CEO of TechCorp.
We're pleased to report strong Q4 results with revenue of $90.1 billion.
Our cloud segment saw tremendous growth of 35% year-over-year.
We now have 2.5 billion monthly active users across our platforms.
Despite macro challenges, our AI initiatives are showing promising results.
"""

### Define the TechEarningsAnalyzer Class

In [31]:
class TechEarningsAnalyzer:
    def __init__(self):
        self.nlp = spacy.load("en_core_web_sm")
        
        # Custom financial metrics patterns
        self.metric_patterns = {
            'revenue': r'\$?\d+\.?\d*\s*(billion|million|B|M)',
            'growth': r'\d+\.?\d*\s*%',
            'margin': r'\d+\.?\d*\s*%\s*margin',
            'users': r'\d+\.?\d*\s*(million|billion)?\s*(monthly active users|MAU|daily active users|DAU)',
        }
        
        # Tech-specific keywords
        self.tech_segments = {
            'cloud': ['cloud', 'aws', 'azure', 'gcp'],
            'ai': ['ai', 'artificial intelligence', 'machine learning', 'ml'],
            'hardware': ['iphone', 'device', 'hardware', 'macbook', 'surface'],
            'services': ['subscription', 'saas', 'services'],
        }
        
        self.exec_titles = ['CEO', 'CTO', 'CFO', 'COO', 'President', 'Director']

    def preprocess_text(self, text):
        """Clean and normalize the transcript text"""
        text = re.sub(r'\s+', ' ', text)
        return text.strip()
        
    def extract_metrics(self, text):
        """Extract key financial and operational metrics"""
        metrics = defaultdict(list)
        doc = self.nlp(text)
        
        for metric_type, pattern in self.metric_patterns.items():
            matches = re.finditer(pattern, text, re.IGNORECASE)
            for match in matches:
                metrics[metric_type].append(match.group())
        
        return dict(metrics)
        
    def analyze_segment_mentions(self, text):
        """Analyze mentions and context of different tech segments"""
        segment_analysis = defaultdict(lambda: {'mentions': 0, 'context': []})
        doc = self.nlp(text)
        
        for segment, keywords in self.tech_segments.items():
            for sent in doc.sents:
                sent_text = sent.text.lower()
                for keyword in keywords:
                    if keyword in sent_text:
                        segment_analysis[segment]['mentions'] += 1
                        segment_analysis[segment]['context'].append(sent.text)
        
        return dict(segment_analysis)
        
    def extract_speakers(self, text):
        """Extract speakers and their quotes"""
        speakers = defaultdict(list)
        doc = self.nlp(text)
        
        for sent in doc.sents:
            for ent in sent.ents:
                if ent.label_ == "PERSON":
                    next_words = ent.doc[ent.end:ent.end + 3].text
                    for title in self.exec_titles:
                        if title in next_words:
                            speakers[f"{ent.text} ({title})"].append(sent.text)
        
        return dict(speakers)

    def sentiment_analysis(self, text):
        """Analyze sentiment in the earnings call"""
        doc = self.nlp(text)
        
        positive_words = ['growth', 'increase', 'improved', 'strong', 'success']
        negative_words = ['decline', 'decrease', 'challenge', 'difficult', 'loss']
        
        sentiment = {
            'positive_sentences': [],
            'negative_sentences': [],
            'overall_tone': 'neutral'
        }
        
        for sent in doc.sents:
            sent_text = sent.text.lower()
            pos_count = sum(1 for word in positive_words if word in sent_text)
            neg_count = sum(1 for word in negative_words if word in sent_text)
            
            if pos_count > neg_count:
                sentiment['positive_sentences'].append(sent.text)
            elif neg_count > pos_count:
                sentiment['negative_sentences'].append(sent.text)
        
        sentiment['overall_tone'] = 'positive' if len(sentiment['positive_sentences']) > len(sentiment['negative_sentences']) else 'negative'
        return sentiment
        
    def generate_report(self, text):
        """Generate a comprehensive analysis report"""
        text = self.preprocess_text(text)
        
        report = {
            'metrics': self.extract_metrics(text),
            'segment_analysis': self.analyze_segment_mentions(text),
            'speakers': self.extract_speakers(text),
            'sentiment': self.sentiment_analysis(text)
        }
        
        return report
        
    def save_report(self, report, filename):
        """Save the analysis report to Excel"""
        with pd.ExcelWriter(filename) as writer:
            pd.DataFrame(report['metrics']).to_excel(writer, sheet_name='Metrics')
            pd.DataFrame(report['segment_analysis']).to_excel(writer, sheet_name='Segment Analysis')
            pd.DataFrame(report['speakers']).to_excel(writer, sheet_name='Speakers')
            pd.DataFrame(report['sentiment']).to_excel(writer, sheet_name='Sentiment')

### Implement Metric Extraction

In [32]:
def extract_metrics(self, text):
        """Extract key financial and operational metrics"""
        metrics = defaultdict(list)
        doc = self.nlp(text)
        
        for metric_type, pattern in self.metric_patterns.items():
            matches = re.finditer(pattern, text, re.IGNORECASE)
            for match in matches:
                metrics[metric_type].append(match.group())
        
        return dict(metrics)

# Test metric extraction
analyzer = TechEarningsAnalyzer()
metrics = analyzer.extract_metrics(sample_transcript)
print("Extracted Metrics:")
pd.DataFrame(metrics)

Extracted Metrics:


ValueError: All arrays must be of the same length

### Implement Segment Analysis

In [None]:
def analyze_segment_mentions(self, text):
        """Analyze mentions and context of different tech segments"""
        segment_analysis = defaultdict(lambda: {'mentions': 0, 'context': []})
        doc = self.nlp(text)
        
        for segment, keywords in self.tech_segments.items():
            for sent in doc.sents:
                sent_text = sent.text.lower()
                for keyword in keywords:
                    if keyword in sent_text:
                        segment_analysis[segment]['mentions'] += 1
                        segment_analysis[segment]['context'].append(sent.text)
        
        return dict(segment_analysis)

# Test segment analysis
segments = analyzer.analyze_segment_mentions(sample_transcript)
print("\nSegment Analysis:")
pd.DataFrame(segments).T

### Implement Speaker and Sentiment Analysis 

In [None]:
def extract_speakers(self, text):
        """Extract speakers and their quotes"""
        speakers = defaultdict(list)
        doc = self.nlp(text)
        
        for sent in doc.sents:
            for ent in sent.ents:
                if ent.label_ == "PERSON":
                    next_words = ent.doc[ent.end:ent.end + 3].text
                    for title in self.exec_titles:
                        if title in next_words:
                            speakers[f"{ent.text} ({title})"].append(sent.text)
        
        return dict(speakers)

    def sentiment_analysis(self, text):
        """Analyze sentiment in the earnings call"""
        doc = self.nlp(text)
        
        positive_words = ['growth', 'increase', 'improved', 'strong', 'success']
        negative_words = ['decline', 'decrease', 'challenge', 'difficult', 'loss']
        
        sentiment = {
            'positive_sentences': [],
            'negative_sentences': [],
            'overall_tone': 'neutral'
        }
        
        for sent in doc.sents:
            sent_text = sent.text.lower()
            pos_count = sum(1 for word in positive_words if word in sent_text)
            neg_count = sum(1 for word in negative_words if word in sent_text)
            
            if pos_count > neg_count:
                sentiment['positive_sentences'].append(sent.text)
            elif neg_count > pos_count:
                sentiment['negative_sentences'].append(sent.text)
        
        sentiment['overall_tone'] = 'positive' if len(sentiment['positive_sentences']) > len(sentiment['negative_sentences']) else 'negative'
        return sentiment

# Test speaker and sentiment analysis
speakers = analyzer.extract_speakers(sample_transcript)
sentiment = analyzer.sentiment_analysis(sample_transcript)

print("\nSpeaker Analysis:")
pd.DataFrame(speakers)

print("\nSentiment Analysis:")
print(f"Overall Tone: {sentiment['overall_tone']}")
print("\nPositive Sentences:")
for sent in sentiment['positive_sentences']:
    print(f"- {sent}")

### Generate Complete Report

In [None]:
def generate_report(self, text):
        """Generate a comprehensive analysis report"""
        text = self.preprocess_text(text)
        
        report = {
            'metrics': self.extract_metrics(text),
            'segment_analysis': self.analyze_segment_mentions(text),
            'speakers': self.extract_speakers(text),
            'sentiment': self.sentiment_analysis(text)
        }
        
        return report

# Generate complete report
report = analyzer.generate_report(sample_transcript)

# Display results in a notebook-friendly way
print("Complete Analysis Report:")
print("\n1. Metrics:")
pd.DataFrame(report['metrics'])

print("\n2. Segment Analysis:")
pd.DataFrame(report['segment_analysis']).T

print("\n3. Speakers:")
pd.DataFrame(report['speakers'])

print("\n4. Sentiment:")
print(f"Overall Tone: {report['sentiment']['overall_tone']}")

In [34]:
import spacy
import pandas as pd
from collections import defaultdict
import re

class TechEarningsAnalyzer:
    def __init__(self):
        self.nlp = spacy.load("en_core_web_sm")
        
        # Custom financial metrics patterns
        self.metric_patterns = {
            'revenue': r'\$?\d+\.?\d*\s*(billion|million|B|M)',
            'growth': r'\d+\.?\d*\s*%',
            'margin': r'\d+\.?\d*\s*%\s*margin',
            'users': r'\d+\.?\d*\s*(million|billion)?\s*(monthly active users|MAU|daily active users|DAU)',
        }
        
        # Tech-specific keywords
        self.tech_segments = {
            'cloud': ['cloud', 'aws', 'azure', 'gcp'],
            'ai': ['ai', 'artificial intelligence', 'machine learning', 'ml'],
            'hardware': ['iphone', 'device', 'hardware', 'macbook', 'surface'],
            'services': ['subscription', 'saas', 'services'],
        }
        
        self.exec_titles = ['CEO', 'CTO', 'CFO', 'COO', 'President', 'Director']

    def preprocess_text(self, text):
        """Clean and normalize the transcript text"""
        text = re.sub(r'\s+', ' ', text)
        return text.strip()
        
    def extract_metrics(self, text):
        """Extract key financial and operational metrics"""
        metrics = defaultdict(list)
        doc = self.nlp(text)
        
        for metric_type, pattern in self.metric_patterns.items():
            matches = re.finditer(pattern, text, re.IGNORECASE)
            for match in matches:
                metrics[metric_type].append(match.group())
        
        # Convert lists to padded lists of equal length
        max_length = max(len(v) for v in metrics.values()) if metrics else 0
        for key in metrics:
            metrics[key] = metrics[key] + [None] * (max_length - len(metrics[key]))
            
        return dict(metrics)
        
    def analyze_segment_mentions(self, text):
        """Analyze mentions and context of different tech segments"""
        segment_analysis = defaultdict(lambda: {'mentions': 0, 'context': []})
        doc = self.nlp(text)
        
        for segment, keywords in self.tech_segments.items():
            for sent in doc.sents:
                sent_text = sent.text.lower()
                for keyword in keywords:
                    if keyword in sent_text:
                        segment_analysis[segment]['mentions'] += 1
                        segment_analysis[segment]['context'].append(sent.text)
        
        return dict(segment_analysis)
        
    def extract_speakers(self, text):
        """Extract speakers and their quotes"""
        speakers = defaultdict(list)
        doc = self.nlp(text)
        
        for sent in doc.sents:
            for ent in sent.ents:
                if ent.label_ == "PERSON":
                    next_words = ent.doc[ent.end:ent.end + 3].text
                    for title in self.exec_titles:
                        if title in next_words:
                            speakers[f"{ent.text} ({title})"].append(sent.text)
        
        return dict(speakers)

    def sentiment_analysis(self, text):
        """Analyze sentiment in the earnings call"""
        doc = self.nlp(text)
        
        positive_words = ['growth', 'increase', 'improved', 'strong', 'success']
        negative_words = ['decline', 'decrease', 'challenge', 'difficult', 'loss']
        
        sentiment = {
            'positive_sentences': [],
            'negative_sentences': [],
            'overall_tone': 'neutral'
        }
        
        for sent in doc.sents:
            sent_text = sent.text.lower()
            pos_count = sum(1 for word in positive_words if word in sent_text)
            neg_count = sum(1 for word in negative_words if word in sent_text)
            
            if pos_count > neg_count:
                sentiment['positive_sentences'].append(sent.text)
            elif neg_count > pos_count:
                sentiment['negative_sentences'].append(sent.text)
        
        sentiment['overall_tone'] = 'positive' if len(sentiment['positive_sentences']) > len(sentiment['negative_sentences']) else 'negative'
        return sentiment
        
    def generate_report(self, text):
        """Generate a comprehensive analysis report"""
        text = self.preprocess_text(text)
        
        metrics = self.extract_metrics(text)
        segments = self.analyze_segment_mentions(text)
        speakers = self.extract_speakers(text)
        sentiment_results = self.sentiment_analysis(text)
        
        return {
            'metrics': pd.DataFrame(metrics),
            'segment_analysis': pd.DataFrame({k: v['mentions'] for k, v in segments.items()}, index=['mentions']).T,
            'speakers': pd.DataFrame(speakers),
            'sentiment': pd.DataFrame({
                'tone': [sentiment_results['overall_tone']],
                'positive_count': [len(sentiment_results['positive_sentences'])],
                'negative_count': [len(sentiment_results['negative_sentences'])]
            })
        }

# Test the analyzer
sample_transcript = """
Good afternoon, everyone. This is Tim Cook, CEO of TechCorp.
We're pleased to report strong Q4 results with revenue of $910.1 billion.
Our cloud segment saw tremendous growth of 35% year-over-year.
We now have 2.5 billion monthly active users across our platforms.
Despite macro challenges, our AI initiatives are showing promising results.
"""

# Create analyzer instance and generate report
analyzer = TechEarningsAnalyzer()
report = analyzer.generate_report(sample_transcript)

# Print results
print("Analysis Report:")
print("\n1. Metrics:")
print(report['metrics'])

print("\n2. Segment Analysis:")
print(report['segment_analysis'])

print("\n3. Speakers:")
print(report['speakers'])

print("\n4. Sentiment:")
print(report['sentiment'])

Analysis Report:

1. Metrics:
          revenue growth                             users
0  $910.1 billion    35%  2.5 billion monthly active users
1     2.5 billion   None                              None

2. Segment Analysis:
       mentions
cloud         1
ai            1

3. Speakers:
                       Tim Cook (CEO)
0  This is Tim Cook, CEO of TechCorp.

4. Sentiment:
       tone  positive_count  negative_count
0  positive               2               1
