# Advanced Analysis Examples

This notebook demonstrates how to use the advanced_analysis module for analyzing and optimizing machine learning code.

In [None]:
# Import the necessary modules
import sys
import os

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from IPython.display import HTML, display

# Add the parent directory to the path to import advanced_analysis
sys.path.append('..')
from advanced_analysis.analyzer import Analyzer
from advanced_analysis.algorithm_complexity import StaticAnalyzer, DynamicAnalyzer, ComplexityAnalyzer
from advanced_analysis.data_quality import DataGuardian, DataQualityReport
from advanced_analysis.ml_advisor import MLAlgorithmRecognizer, InefficiencyDetector

## 1. Analyzing Code Complexity

Let's start by analyzing the complexity of some Python code.

In [None]:
# Sample source text with different complexity patterns. It is only stored as
# a string here (nothing in it is executed by this cell) and is passed to the
# static analyzer further down. It defines three functions:
#   - linear_search: a single loop over the indices of arr
#   - bubble_sort:   two nested loops that swap adjacent out-of-order items
#   - binary_search: a while loop that narrows left/right around the midpoint
sample_code = """
def linear_search(arr, target):
    for i in range(len(arr)):
        if arr[i] == target:
            return i
    return -1

def bubble_sort(arr):
    n = len(arr)
    for i in range(n):
        for j in range(0, n - i - 1):
            if arr[j] > arr[j + 1]:
                arr[j], arr[j + 1] = arr[j + 1], arr[j]
    return arr

def binary_search(arr, target):
    left, right = 0, len(arr) - 1
    while left <= right:
        mid = (left + right) // 2
        if arr[mid] == target:
            return mid
        elif arr[mid] < target:
            left = mid + 1
        else:
            right = mid - 1
    return -1
"""

# Run the static analyzer over the sample source text
static_analyzer = StaticAnalyzer()
results = static_analyzer.analyze_code(sample_code)

# Per-function time complexity
print("Functions detected:")
for name, info in results["functions"].items():
    complexity = info['time_complexity']
    print(f"  {name}: {complexity}")

# Aggregate figures reported for the snippet as a whole
print("\nOverall complexity:")
overall_time = results['overall_time_complexity']
print(f"  Time complexity: {overall_time}")
overall_space = results['overall_space_complexity']
print(f"  Space complexity: {overall_space}")

# Patterns the analyzer reported, each with its own time complexity
print("\nDetected patterns:")
for entry in results["detected_patterns"]:
    print(f"  {entry['pattern']}: {entry['time_complexity']}")

## 2. Dynamic Analysis of Functions

Now, let's perform dynamic analysis to measure the actual performance of functions.

In [None]:
# Define functions with different complexities
def constant_time(n):
    """Return 1 regardless of ``n``; the amount of work never depends on the input."""
    return 1

def linear_time(n):
    """Add up 0, 1, ..., n - 1 and return the sum.

    Uses an explicit loop: one addition per element of ``range(n)``.
    An empty range (``n <= 0``) yields 0.
    """
    running_sum = 0
    for value in range(n):
        running_sum = running_sum + value
    return running_sum

def quadratic_time(n):
    """Return the sum of ``i * j`` over every pair (i, j) with i, j in ``range(n)``.

    Two nested passes over ``range(n)`` perform n * n multiplications.
    An empty range (``n <= 0``) yields 0.
    """
    running_sum = 0
    for outer in range(n):
        for inner in range(n):
            running_sum = running_sum + outer * inner
    return running_sum

# Create a dynamic analyzer; its analyze_function method is called later in
# this cell once per function defined above, with a list of input sizes
dynamic_analyzer = DynamicAnalyzer()

# Define an input generator
def input_generator(size):
    """Turn a requested input size into the benchmark argument: the size itself."""
    return size

# Analyze the functions, in the order constant -> linear -> quadratic
sizes = [10, 100, 1000]
constant_results, linear_results, quadratic_results = [
    dynamic_analyzer.analyze_function(func, input_generator, sizes=sizes)
    for func in (constant_time, linear_time, quadratic_time)
]

# Pair each result with the label shared by the printout and the plot legend
labelled_results = [
    ('Constant', constant_results),
    ('Linear', linear_results),
    ('Quadratic', quadratic_results),
]

# Display the results (a blank line precedes every heading except the first)
for position, (label, analysis) in enumerate(labelled_results):
    separator = "" if position == 0 else "\n"
    print(f"{separator}{label} time function:")
    print(f"  Estimated complexity: {analysis['summary']['time_complexity']}")

# Plot the results: one line per function, average runtime at each input size
fig, ax = plt.subplots(figsize=(10, 6))
for label, analysis in labelled_results:
    average_runtime = analysis['runtime']['average']
    ax.plot(sizes, [average_runtime[s] for s in sizes], 'o-', label=label)
ax.set_xlabel('Input Size')
ax.set_ylabel('Runtime (seconds)')
ax.set_title('Runtime vs. Input Size')
ax.legend()
ax.grid(True)
plt.show()

## 3. Comprehensive Function Analysis

Let's use the ComplexityAnalyzer to perform a comprehensive analysis of a function.

In [None]:
# Wrap the quadratic time function in a complexity analyzer
complexity_analyzer = ComplexityAnalyzer(quadratic_time)

# Run the analysis for three input sizes
analysis_results = complexity_analyzer.analyze(inputs=[10, 100, 1000])

# Name of the analyzed function
print("Function name:", analysis_results["function_name"])

# Theoretical bounds, printed in Big-O / Big-Theta / Big-Omega order
print("\nTheoretical complexity:")
bound_labels = (("Big-O", "big_o"), ("Big-Theta", "big_theta"), ("Big-Omega", "big_omega"))
for bound_label, bound_key in bound_labels:
    print(f"  {bound_label}: {analysis_results['theoretical_complexity'][bound_key]}")

# Measured runtimes: each entry unpacks into (input size, seconds)
print("\nEmpirical performance:")
print("  Time measurements:")
for input_size, elapsed in analysis_results["empirical_performance"]["time_measurements"]:
    print(f"    Input size {input_size}: {elapsed:.6f} seconds")

# Suggestions; the code example is optional and printed only when present
print("\nOptimization suggestions:")
for tip in analysis_results["optimization_suggestions"]:
    print(f"  [{tip['severity']}] {tip['message']}")
    print(f"    {tip['details']}")
    if "code_example" not in tip:
        continue
    print(f"    Example:\n{tip['code_example']}")

## 4. Data Quality Analysis

Let's analyze the quality of a dataset.

In [None]:
# Create a sample DataFrame with various data quality issues.
# Each issue is planted on purpose so the report below has something to find.
df = pd.DataFrame({
    'id': [1, 2, 3, 4, 5, 5, 7],  # Duplicate value in id
    'name': ['Alice', 'Bob', 'Charlie', 'David', 'Eve', 'Frank', None],  # Missing value in name
    'age': [25, 30, 35, 40, 45, 50, 200],  # Outlier in age
    'salary': [50000, 60000, 70000, 80000, 90000, 100000, 110000],
    'department': ['HR', 'IT', 'Finance', 'IT', 'HR', 'Finance', 'IT']
})

# Display the DataFrame (rich HTML table rather than plain-text print)
print("Sample DataFrame:")
display(df)

# Create a data guardian and build the quality report for the DataFrame
guardian = DataGuardian()
report = guardian.generate_report(df)

# Display the report, one section per attribute read from the report object
print("\nMissing values:")
print(f"  Total missing: {report.missing_values['total_missing']}")
print(f"  Missing by column: {report.missing_values['missing_by_column']}")

print("\nDuplicates:")
print(f"  Total duplicates: {report.duplicates['total_duplicates']}")

print("\nOutliers:")
for column, outliers in report.outliers['outliers_by_column'].items():
    print(f"  {column}: {outliers}")

print("\nData types:")
for column, dtype in report.data_types['column_types'].items():
    print(f"  {column}: {dtype}")

print("\nValue distribution:")
for column, distribution in report.value_distribution['distribution_by_column'].items():
    print(f"  {column}: {distribution}")

# Generate an HTML report and render it as the cell's output.
# HTML(...) must remain the last expression of the cell to be displayed.
html_report = report.to_html()
HTML(html_report)

## 5. ML Algorithm Recognition

Let's identify ML algorithms in code.

In [None]:
# ML source text for the algorithm recognizer used further down. It is only
# stored as a string here; nothing in it is executed by this cell. It contains
# four scikit-learn training helpers (LinearRegression, LogisticRegression,
# RandomForestClassifier, SVC) and a torch nn.Module made of three Linear
# layers with ReLU between them.
ml_code = """
import numpy as np
import pandas as pd
from sklearn.linear_model import LinearRegression, LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
import torch
import torch.nn as nn

def train_linear_model(X, y):
    model = LinearRegression()
    model.fit(X, y)
    return model

def train_classifier(X, y):
    model = LogisticRegression(max_iter=1000)
    model.fit(X, y)
    return model

def train_forest(X, y):
    model = RandomForestClassifier(n_estimators=100)
    model.fit(X, y)
    return model

def train_svm(X, y):
    model = SVC(kernel='rbf')
    model.fit(X, y)
    return model

class SimpleNN(nn.Module):
    def __init__(self):
        super().__init__()
        self.layers = nn.Sequential(
            nn.Linear(10, 50),
            nn.ReLU(),
            nn.Linear(50, 20),
            nn.ReLU(),
            nn.Linear(20, 1)
        )
    
    def forward(self, x):
        return self.layers(x)
"""

# Build a recognizer around the ML source text and run it
recognizer = MLAlgorithmRecognizer(ml_code)
results = recognizer.analyze_code()

# One entry per identified algorithm: name first, then its details indented
print("Identified algorithms:")
detail_fields = (("Matched patterns", "matched_patterns"), ("Complexity", "std_complexity"))
for match in results["identified_algorithms"]:
    print(f"  {match['algorithm']}")
    for field_label, field_key in detail_fields:
        print(f"    {field_label}: {match[field_key]}")

# Suggestions, each tagged with the algorithm it refers to
print("\nOptimization suggestions:")
for tip in results["optimization_suggestions"]:
    print(f"  {tip['algorithm']}: {tip['suggestion']}")

## 6. Inefficiency Detection

Let's detect inefficiencies in ML code.

In [None]:
# Inefficient ML source text, stored as a string (nothing in it is executed by
# this cell) and analyzed further down. It contains:
#   - process_dataframe: iterates rows with df.iterrows() and appends to a list
#   - compute_pairwise_distances: nested loops over every (i, j) pair with i != j
#   - train_epoch: calls .cuda() on each batch's inputs and targets and moves the
#     outputs back with .cpu() on every iteration
inefficient_code = """
import pandas as pd
import numpy as np
import torch

def process_dataframe(df):
    results = []
    for index, row in df.iterrows():
        results.append(row['value'] * 2)
    return results

def compute_pairwise_distances(data):
    n = len(data)
    distances = []
    for i in range(n):
        for j in range(n):
            if i != j:
                distances.append(abs(data[i] - data[j]))
    return distances

def train_epoch(model, dataloader, criterion, optimizer):
    for batch in dataloader:
        inputs, targets = batch
        inputs = inputs.cuda()
        outputs = model(inputs)
        loss = criterion(outputs, targets.cuda())
        
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
        
        # Move back to CPU for processing
        outputs = outputs.cpu()
"""

# Create an inefficiency detector around the source text and analyze it
detector = InefficiencyDetector(inefficient_code)
results = detector.analyze_code()

# Display the detected inefficiencies: name/description, then severity and fix
print("Detected inefficiencies:")
for inefficiency in results["detected_inefficiencies"]:
    print(f"  {inefficiency['name']}: {inefficiency['description']}")
    print(f"    Severity: {inefficiency['severity']}")
    print(f"    Suggestion: {inefficiency['suggestion']}")

# Display the optimization suggestions
print("\nOptimization suggestions:")
for suggestion in results["optimization_suggestions"]:
    print(f"  [{suggestion['severity']}] {suggestion['message']}")
    print(f"    {suggestion['details']}")
    # Treat the code example as optional, the same way the ComplexityAnalyzer
    # suggestions are printed earlier in this notebook; a suggestion without
    # one is still shown instead of aborting the cell with a KeyError.
    if "code_example" in suggestion:
        print(f"    Example:\n{suggestion['code_example']}")

## 7. Using the Main Analyzer

Finally, let's use the main Analyzer to perform a comprehensive analysis.

In [None]:
# Instantiate the main analyzer
analyzer = Analyzer()

# Run it over the inefficient source text
results = analyzer.analyze_code(inefficient_code)

# Static analysis section
print("Static analysis:")
print(f"  Overall time complexity: {results['static_analysis']['overall_time_complexity']}")

# Algorithm recognition section; falls back to a notice when the key is absent
print("\nAlgorithm recognition:")
recognition = results["algorithm_recognition"]
if "identified_algorithms" not in recognition:
    print("  No algorithms identified")
else:
    for algo in recognition["identified_algorithms"]:
        print(f"  {algo['algorithm']}")

# Vectorization section: only the number of naive loops is reported
print("\nVectorization analysis:")
naive_loops = results['vectorization_analysis']['naive_loops']
print(f"  Naive loops: {len(naive_loops)}")

# Inefficiencies section; 'pattern' is optional and defaults to an empty string
print("\nInefficiencies:")
for item in results["inefficiencies"]:
    print(f"  {item['type']}: {item.get('pattern', '')}")

# Suggestions may be dicts keyed by 'message' or 'suggestion', or any other
# object; anything that matches neither dict shape is printed as-is
print("\nOptimization suggestions:")
for suggestion in results["optimization_suggestions"]:
    text = suggestion
    if isinstance(suggestion, dict):
        if "message" in suggestion:
            text = suggestion['message']
        elif "suggestion" in suggestion:
            text = suggestion['suggestion']
    print(f"  {text}")

# Generate an HTML report and render it as the cell's output
report = analyzer.generate_report(results, format="html")
HTML(report)