# Forecasting Net Prophet

## Data Validation Framework
This notebook includes integrated validation steps to ensure data quality and accurate analysis.

### Validation Checkpoints:
1. Data Loading & Structure
2. Time Series Continuity
3. Calculation Verification
4. Visualization Quality

Validation cells are marked with `[VALIDATE]` in the sections that include them.

In [None]:
# Install the required libraries
!pip install prophet
!pip install hvplot
!pip install holoviews

In [None]:
# Import the required libraries and dependencies
import pandas as pd
from prophet import Prophet
import datetime as dt
import numpy as np
import matplotlib.pyplot as plt
import hvplot.pandas
import holoviews as hv
%matplotlib inline

## Initial Data Validation

Before proceeding with analysis, we'll validate our data sources and structure.

In [None]:
# [VALIDATE] Data Loading and Structure
def validate_dataframe(df, expected_cols):
    """Print a structural summary of ``df`` and flag absent expected columns.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to inspect.
    expected_cols : iterable of str
        Column names that should be present in ``df``.

    Returns
    -------
    dict
        Summary with the keys ``rows``, ``columns``, ``missing_values``,
        ``dtypes`` and ``index_type`` (in that order).
    """
    null_counts = df.isnull().sum()
    summary = dict(
        rows=len(df),
        columns=list(df.columns),
        missing_values=null_counts.to_dict(),
        dtypes=df.dtypes.to_dict(),
        index_type=str(df.index.dtype),
    )

    # Emit the header followed by one "key: value" line per summary entry.
    report = ["Validation Results:"]
    report.extend(f"{field}: {summary[field]}" for field in summary)
    print("\n".join(report))

    # Warn about any expected column that the frame does not contain.
    absent = set(expected_cols) - set(df.columns)
    if len(absent) > 0:
        print(f"WARNING: Missing expected columns: {absent}")

    return summary

## Step 1: Find Unusual Patterns in Hourly Google Search Traffic

In [None]:
# Read the hourly search-trend CSV, using its parsed 'Date' column as the
# index, and discard every row that contains a missing value.
df_mercado_trends = pd.read_csv(
    "Resources/google_hourly_search_trends.csv",
    parse_dates=True,
    index_col='Date',
).dropna()

# [VALIDATE] Search trends data: print a structural summary and confirm
# that the expected column is present.
validate_dataframe(df_mercado_trends, ['Search Trends'])

# Preview both ends of the frame.
for label, preview in (
    ("First", df_mercado_trends.head()),
    ("Last", df_mercado_trends.tail()),
):
    print(f"\n{label} 5 rows:")
    display(preview)

In [None]:
# [VALIDATE] Time series continuity
def validate_time_series(df):
    """Report the span of an hourly time series and any missing hourly timestamps.

    Prints the earliest and latest timestamp in ``df.index`` and the number of
    rows, then compares the index against a complete hourly range between
    those two timestamps. When gaps exist, a warning with the number of
    missing timestamps and the five earliest of them is printed.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame whose index holds the timestamps to check.

    Returns
    -------
    None
        Results are printed only.
    """
    # An empty index has no min/max (both are NaT), so no range can be built.
    if len(df) == 0:
        print("WARNING: DataFrame is empty; no time range to validate")
        return

    start, end = df.index.min(), df.index.max()
    print(f"Date Range: {start} to {end}")
    print(f"Total Hours: {len(df)}")

    # Check for gaps. A Timedelta frequency is used instead of the 'H' alias
    # string, which newer pandas releases deprecate.
    expected_hours = pd.date_range(start=start, end=end, freq=pd.Timedelta(hours=1))
    # Index.difference returns the missing timestamps in sorted order, so the
    # sample printed below is deterministic (a set difference is unordered).
    missing_dates = expected_hours.difference(df.index)
    if len(missing_dates) > 0:
        print(f"WARNING: Found {len(missing_dates)} missing hourly data points")
        print("Sample missing dates:", list(missing_dates[:5]))

# Check the loaded search-trend frame for missing hourly timestamps; the
# function prints its findings.
validate_time_series(df_mercado_trends)

In [None]:
# Analyze May 2020 patterns.
# Rows are selected by partial date string through .loc: plain
# df['2020-05'] is treated as a column lookup by pandas 2.x and raises KeyError.
may_2020 = df_mercado_trends.loc['2020-05']

# Calculate total search traffic for May 2020
traffic_may_2020 = may_2020['Search Trends'].sum()
print(f"Total Search Traffic for May 2020: {traffic_may_2020}")

# Calculate monthly median across all months: sum the traffic within each
# (year, month) pair, then take the median of those monthly totals.
monthly_traffic = df_mercado_trends.groupby([df_mercado_trends.index.year, 
                                            df_mercado_trends.index.month])['Search Trends'].sum()
median_monthly_traffic = monthly_traffic.median()
print(f"\nMedian Monthly Traffic: {median_monthly_traffic}")

# Compare May 2020 to median (a ratio above 1 means May 2020 exceeded the
# median monthly total)
may_2020_ratio = traffic_may_2020 / median_monthly_traffic
print(f"\nMay 2020 vs Median Ratio: {may_2020_ratio:.2f}")

# Visualize May 2020 trends
plt.figure(figsize=(15, 7))
may_2020['Search Trends'].plot()
plt.title('Google Search Trends - May 2020')
plt.xlabel('Date')
plt.ylabel('Search Trends')
plt.grid(True)
plt.show()

**Question:** Did the Google search traffic increase during the month that MercadoLibre released its financial results?

**Answer:** [to be completed after running the analysis — compare the May 2020 total with the median monthly total printed above]

## Step 2: Mine the Search Traffic Data for Seasonality

In [None]:
# Mean search interest for each hour of the day, taken over the whole dataset
hour_of_day = df_mercado_trends.index.hour
hourly_trends = df_mercado_trends.groupby(hour_of_day)['Search Trends'].mean()

# Bar chart of the hourly averages, decorated through the returned Axes
plt.figure(figsize=(15, 7))
ax = hourly_trends.plot.bar()
ax.set_title('Average Search Traffic by Hour of Day')
ax.set_xlabel('Hour')
ax.set_ylabel('Average Search Trends')
ax.grid(True)
plt.show()

In [None]:
# Mean search interest for each day of the week (0=Monday ... 6=Sunday)
day_of_week = df_mercado_trends.index.dayofweek
daily_trends = df_mercado_trends.groupby(day_of_week)['Search Trends'].mean()

# Bar chart of the per-weekday averages, decorated through the returned Axes
plt.figure(figsize=(15, 7))
ax = daily_trends.plot.bar()
ax.set_title('Average Search Traffic by Day of Week')
ax.set_xlabel('Day of Week (0=Monday, 6=Sunday)')
ax.set_ylabel('Average Search Trends')
ax.grid(True)
plt.show()

In [None]:
# Mean search interest for each ISO week number of the year
week_of_year = df_mercado_trends.index.isocalendar().week
weekly_trends = df_mercado_trends.groupby(week_of_year)['Search Trends'].mean()

# Line chart of the per-week averages, decorated through the returned Axes
plt.figure(figsize=(15, 7))
ax = weekly_trends.plot()
ax.set_title('Average Search Traffic by Week of Year')
ax.set_xlabel('Week of Year')
ax.set_ylabel('Average Search Trends')
ax.grid(True)
plt.show()

**Question:** Are there any time-based trends that you can see in the data?

**Answer:** [to be completed after running the analysis]

## Step 3: Relate the Search Traffic to Stock Price Patterns

In [None]:
# Read the stock-price CSV, using its parsed "date" column as the index,
# and discard every row that contains a missing value.
df_mercado_stock = pd.read_csv(
    "Resources/mercado_stock_price.csv",
    parse_dates=True,
    index_col="date",
).dropna()

# Line chart of the closing price over the full period
plt.figure(figsize=(15, 7))
ax = df_mercado_stock['close'].plot()
ax.set_title('MercadoLibre Stock Price')
ax.set_xlabel('Date')
ax.set_ylabel('Close Price')
ax.grid(True)
plt.show()

In [None]:
# Concatenate the stock and search data column-wise on their timestamps.
# The concat is an outer join, so any timestamp present in only one source
# yields a row with NaN in the other source's columns (presumably the stock
# file only covers trading hours -- TODO confirm). Those incomplete rows are
# dropped so that NaN `close` values do not flow into the pct_change(),
# rolling-std and correlation calculations performed on this frame later.
mercado_stock_trends_df = pd.concat(
    [df_mercado_stock, df_mercado_trends], axis=1
).dropna()

# Slice to the first half of 2020
first_half_2020 = mercado_stock_trends_df.loc['2020-01':'2020-06']

# Plot the stock and search trends data in two stacked panels
fig, (ax1, ax2) = plt.subplots(2, 1, figsize=(15, 10))

first_half_2020['close'].plot(ax=ax1)
ax1.set_title('Stock Price - First Half 2020')
ax1.set_xlabel('Date')
ax1.set_ylabel('Close Price')
ax1.grid(True)

first_half_2020['Search Trends'].plot(ax=ax2)
ax2.set_title('Search Trends - First Half 2020')
ax2.set_xlabel('Date')
ax2.set_ylabel('Search Trends')
ax2.grid(True)

plt.tight_layout()
plt.show()

In [None]:
# Percentage change of the closing price from one row to the next; it feeds
# both the volatility and the return column below.
close_pct_change = mercado_stock_trends_df['close'].pct_change()

# Derived columns: previous row's search value, 4-row rolling standard
# deviation of the returns, and the returns themselves.
mercado_stock_trends_df['Lagged Search Trends'] = mercado_stock_trends_df['Search Trends'].shift(1)
mercado_stock_trends_df['Stock Volatility'] = close_pct_change.rolling(window=4).std()
mercado_stock_trends_df['Hourly Stock Return'] = close_pct_change

# Pairwise correlations between the three derived columns
analysis_columns = ['Stock Volatility', 'Lagged Search Trends', 'Hourly Stock Return']
correlation_matrix = mercado_stock_trends_df.loc[:, analysis_columns].corr()
print("Correlation Matrix:")
display(correlation_matrix)

**Question:** Does a predictable relationship exist between the lagged search traffic and the stock volatility or between the lagged search traffic and the stock price returns?

**Answer:** [to be completed after running the analysis]

## Step 4: Create a Time Series Model with Prophet

In [None]:
# Move the datetime index into a column and relabel the two resulting
# columns positionally as 'ds' and 'y' for Prophet.
prophet_df = df_mercado_trends.reset_index().set_axis(['ds', 'y'], axis=1)

# Instantiate the model with default settings and fit it on the history
model = Prophet()
model.fit(prophet_df)

# Extend the timeline by 2000 hourly steps and predict over it
forecast_horizon_hours = 2000
future_dates = model.make_future_dataframe(
    periods=forecast_horizon_hours,
    freq='H',
)
forecast = model.predict(future_dates)

# Draw the forecast and title the resulting figure
fig = model.plot(forecast)
plt.title('MercadoLibre Search Traffic Forecast')
plt.show()

In [None]:
# Plot the individual components of the forecast produced by the fitted
# model (Prophet's `plot_components`). The time-of-day, day-of-week and
# calendar-year questions that follow are presumably answered from these
# panels.
fig = model.plot_components(forecast)
plt.show()

**Questions:**

1. What time of day exhibits the greatest popularity?
**Answer:** [to be completed after running the analysis]

2. Which day of week gets the most search traffic?
**Answer:** [to be completed after running the analysis]

3. What's the lowest point for search traffic in the calendar year?
**Answer:** [to be completed after running the analysis]