In [None]:
# Multiple regression analysis (one OLS model per ticker) on the test/train table

from google.cloud import bigquery
import pandas as pd
import statsmodels.api as sm
from sklearn.linear_model import LinearRegression
from sklearn.metrics import r2_score

# Initialize BigQuery client
client = bigquery.Client()

# Regression design: one OLS model per ticker, target regressed on the scores.
TARGET_COL = "Avg_Next_Daily_Percent_Difference"
FEATURE_COLS = ["Avg_AI_Score", "Avg_Sentiment_Score", "Avg_Health_Score"]

# Each model estimates an intercept plus one slope per feature. Require at least
# one more row than parameters so the fit has a residual degree of freedom;
# below that, R-squared and p-values are not meaningful.
MIN_OBSERVATIONS = len(FEATURE_COLS) + 2

# Define the query to fetch relevant columns from BigQuery
query = """
SELECT 
    ticker,
    Avg_Next_Daily_Percent_Difference,  
    Avg_AI_Score,  
    Avg_Sentiment_Score,  
    Avg_Health_Score
FROM `trendsense.combined_data.step_4_test_train`
WHERE Avg_Next_Daily_Percent_Difference IS NOT NULL
"""
# Load data into a Pandas DataFrame
df = client.query(query).to_dataframe()

# Remove rows with missing values, then coerce the model columns to plain floats.
# NOTE(review): the table schema is not visible here; the cast guards against
# object/nullable columns, which would make the OLS fit fail -- confirm it is needed.
df = df.dropna().astype({col: float for col in [TARGET_COL, *FEATURE_COLS]})

# Store results, keyed by ticker
results_dict = {}

# Fit a separate regression model for each ticker (groups keep first-seen order)
for ticker, ticker_data in df.groupby("ticker", sort=False):
    n_rows = len(ticker_data)
    if n_rows < MIN_OBSERVATIONS:
        print(f"Skipping {ticker}: {n_rows} rows (need at least {MIN_OBSERVATIONS})")
        continue

    # Independent variables (with a constant column for the intercept) and target
    X = sm.add_constant(ticker_data[FEATURE_COLS])
    y = ticker_data[TARGET_COL]

    # Fit the model using OLS regression
    model = sm.OLS(y, X).fit()

    # Store plain dicts rather than Series so the results frame holds simple values
    results_dict[ticker] = {
        "r_squared": model.rsquared,
        "coefficients": model.params.to_dict(),
        "p_values": model.pvalues.to_dict()
    }

    # Print summary for each ticker
    print(f"Ticker: {ticker}")
    print(model.summary())
    print("-" * 80)

# Convert results into a DataFrame for easy analysis (one row per fitted ticker)
results_df = pd.DataFrame.from_dict(results_dict, orient="index")

# Display the summary of results. The previous `ace_tools` helper is not an
# installable package, so rely on the notebook's rich display of the cell's
# last expression instead.
results_df


In [None]:
# Multiple regression analysis on Table 2 (per article), using the 60-minute forward change as the dependent variable (the best-performing choice so far), with outlier filters.

from google.cloud import bigquery
import pandas as pd
import numpy as np
import statsmodels.api as sm

# Initialize BigQuery client
client = bigquery.Client()


DEPENDENT_VAR = "Forward_60min_Change"
FEATURE_COLS = ["AI Score", "RatingScore", "analyst_score", "article_sentiment"]

# Rows whose dependent variable falls outside [-OUTLIER_BOUND, +OUTLIER_BOUND]
# are treated as outliers and dropped.
OUTLIER_BOUND = 0.05

# Tickers with fewer rows than this are skipped.
MIN_OBSERVATIONS = 10

# Define the query to fetch relevant columns from BigQuery
# NOTE(review): the WHERE clause filters on Forward_60min_Change_Diff while the
# regression uses Forward_60min_Change (DEPENDENT_VAR) -- confirm this is intended.
query = """
SELECT 
    ticker,
    Forward_60min_Change_Diff,
    Forward_15min_Change_Diff,
    Forward_30min_Change_Diff,
    Forward_45min_Change_Diff,
    Forward_60min_Change, 
    `AI Score`,  
    RatingScore,  
    analyst_score,  
    article_sentiment
FROM `trendsense.combined_data.step_2_transform_AI`
WHERE Forward_60min_Change_Diff IS NOT NULL
"""
# Load data into a Pandas DataFrame
df = client.query(query).to_dataframe()

# Filter out outliers for the dependent variable (bounds inclusive; rows with a
# missing dependent variable fail the comparison and are dropped as well)
df = df[df[DEPENDENT_VAR].between(-OUTLIER_BOUND, OUTLIER_BOUND)]

# Drop rows where model columns have NaN or infinite values. Returning new
# frames instead of mutating in place avoids modifying a filtered view and
# keeps the cell safe to re-run.
df = (
    df.replace([np.inf, -np.inf], np.nan)
      .dropna(subset=FEATURE_COLS + [DEPENDENT_VAR])
)

# Store results, keyed by ticker
results_dict = {}

# Fit a separate regression model for each ticker (groups keep first-seen order)
for ticker, ticker_data in df.groupby("ticker", sort=False):
    # Skip tickers with too few observations before doing any model work
    if len(ticker_data) < MIN_OBSERVATIONS:
        continue

    # Independent variables (with a constant column for the intercept) and target
    X = sm.add_constant(ticker_data[FEATURE_COLS])
    y = ticker_data[DEPENDENT_VAR]

    # Fit the model using OLS regression
    model = sm.OLS(y, X).fit()

    # Store results
    results_dict[ticker] = {
        "r_squared": model.rsquared,
        "coefficients": model.params.to_dict(),
        "p_values": model.pvalues.to_dict()
    }

    # Print summary for each ticker
    print(f"Ticker: {ticker}")
    print(model.summary())
    print("-" * 80)

# Convert results into a DataFrame for easy analysis (one row per fitted ticker)
results_df = pd.DataFrame.from_dict(results_dict, orient="index")

# Save to CSV
results_df.to_csv("filtered_regression_results_per_ticker.csv", index=True)

print("Results saved to filtered_regression_results_per_ticker.csv")




In [None]:
from google.cloud import bigquery
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, r2_score

# Initialize BigQuery client
client = bigquery.Client()

# Define table references
TABLE_NAME = "trendsense.combined_data.step_4_test_train"
TARGET_TABLE = "trendsense.combined_data.step_4_predictions"

# Hold-out fraction for evaluation.
TEST_SIZE = 0.2

# R-squared is undefined on fewer than two test rows. With a 20% hold-out,
# 10 rows per ticker yields 2 test rows and 8 training rows.
MIN_OBSERVATIONS = 10

# Query to fetch relevant columns
query = f"""
SELECT ticker, Avg_Aggregated_Score, Avg_Next_Daily_Percent_Difference
FROM `{TABLE_NAME}`
WHERE Avg_Aggregated_Score IS NOT NULL 
AND Avg_Next_Daily_Percent_Difference IS NOT NULL
"""

# Load data into pandas DataFrame
df = client.query(query).to_dataframe()

# Per-ticker prediction frames, concatenated after the loop
all_results = []

# Fit and evaluate one model per ticker (groups keep first-seen order)
for ticker, df_ticker in df.groupby("ticker", sort=False):
    if len(df_ticker) < MIN_OBSERVATIONS:
        print(f"Skipping {ticker}: {len(df_ticker)} rows (need at least {MIN_OBSERVATIONS})")
        continue

    # Define X (independent) and y (dependent) variables
    X = df_ticker[['Avg_Aggregated_Score']]
    y = df_ticker['Avg_Next_Daily_Percent_Difference']

    # Split data into training and testing sets (fixed seed for reproducibility)
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=TEST_SIZE, random_state=42
    )

    # Initialize and fit linear regression model
    model = LinearRegression()
    model.fit(X_train, y_train)

    # Generate predictions on the held-out rows
    y_pred = model.predict(X_test)

    # Evaluate model performance
    r2 = r2_score(y_test, y_pred)
    mse = mean_squared_error(y_test, y_pred)

    print(f"Ticker: {ticker} | R²: {r2:.4f} | MSE: {mse:.4f}")

    # One row per held-out observation; R2_Score is repeated on every row
    ticker_predictions = X_test.copy()
    ticker_predictions['Ticker'] = ticker
    ticker_predictions['Predicted_Next_Daily_Percent_Difference'] = y_pred
    ticker_predictions['Actual_Next_Daily_Percent_Difference'] = y_test.values
    ticker_predictions['R2_Score'] = r2

    all_results.append(ticker_predictions)

if not all_results:
    # pd.concat raises on an empty list, and WRITE_TRUNCATE would replace the
    # target table's contents, so skip the upload when there is nothing to write.
    print("No ticker had enough data; nothing was uploaded.")
else:
    # Concatenate all results
    final_df = pd.concat(all_results, ignore_index=True)

    # Save results back to BigQuery, replacing the table's previous contents
    job_config = bigquery.LoadJobConfig(write_disposition=bigquery.WriteDisposition.WRITE_TRUNCATE)

    # load_table_from_dataframe only starts the load job; wait for it so that
    # failures raise here and the success message is printed only once it is done.
    load_job = client.load_table_from_dataframe(final_df, TARGET_TABLE, job_config=job_config)
    load_job.result()

    print(f"Predictions saved to BigQuery table: {TARGET_TABLE}")


In [13]:
# Test cell: per-ticker regression analysis used to derive a predictive formula

import pandas as pd
from sklearn.linear_model import LinearRegression

# Load local test and train CSV
file_path = "test_train.csv"  # Update path if needed
df = pd.read_csv(file_path)

# Drop rows with missing values in the model columns
df = df.dropna(subset=["Avg_Aggregated_Score", "Avg_Next_Daily_Percent_Difference"])

# One record (ticker, intercept, slope) per fitted ticker
regression_results = []

# Fit a single-feature linear regression per ticker (groups keep first-seen order)
for ticker, df_ticker in df.groupby("ticker", sort=False):
    if len(df_ticker) <= 1:  # Need at least two points to fit a line
        continue

    X = df_ticker[['Avg_Aggregated_Score']]
    y = df_ticker['Avg_Next_Daily_Percent_Difference']

    model = LinearRegression()
    model.fit(X, y)

    # Store regression coefficients
    regression_results.append({
        'Ticker': ticker,
        'Intercept': model.intercept_,
        'Coefficient': model.coef_[0]
    })

# Convert to DataFrame. Explicit columns keep the frame (and the merge below)
# well-formed even when no ticker had enough rows.
regression_df = pd.DataFrame(
    regression_results, columns=["Ticker", "Intercept", "Coefficient"]
)

# Save regression results locally
regression_df.to_csv("regression_coefficients.csv", index=False)
print("Regression coefficients saved to regression_coefficients.csv")

# Use the in-memory coefficients rather than re-reading the CSV just written;
# the `coefficients` name is kept for any later cell that refers to it.
coefficients = regression_df

# Attach each row's ticker-level coefficients. The inner join drops rows whose
# ticker was not fitted. The rows scored here are the same rows the models
# were trained on, so these are in-sample predictions.
merged_df = df.merge(coefficients, left_on="ticker", right_on="Ticker", how="inner")

# Apply regression formula: Prediction = Intercept + (Coefficient * Avg_Aggregated_Score)
merged_df["Predicted_Next_Daily_Percent_Difference"] = (
    merged_df["Intercept"] + merged_df["Coefficient"] * merged_df["Avg_Aggregated_Score"]
)

# Save predictions locally
merged_df.to_csv("test_train_with_predictions.csv", index=False)
print("Predictions saved to test_train_with_predictions.csv")


Regression coefficients saved to regression_coefficients.csv
Predictions saved to test_train_with_predictions.csv


In [15]:
import pandas as pd
from sklearn.linear_model import LinearRegression

# Load local test/train dataset
file_path = "test_train.csv"  # Ensure correct path
df = pd.read_csv(file_path)

# Keep only rows with a non-missing, non-zero Avg_Aggregated_Score and a
# non-missing Avg_Next_Daily_Percent_Difference
score_is_usable = df["Avg_Aggregated_Score"].notna() & (df["Avg_Aggregated_Score"] != 0)
target_is_present = df["Avg_Next_Daily_Percent_Difference"].notna()
df = df[score_is_usable & target_is_present]

# One record (ticker, intercept, slope) per fitted ticker
regression_results = []

# Iterate over each unique ticker and train regression model
for ticker in df["ticker"].unique():
    df_ticker = df[df["ticker"] == ticker]

    # Need at least two points to fit a line
    if len(df_ticker) <= 1:
        print(f"⚠️ Skipping {ticker} (Not enough data points)")
        continue

    X = df_ticker[['Avg_Aggregated_Score']]
    y = df_ticker['Avg_Next_Daily_Percent_Difference']

    # Train regression model
    model = LinearRegression()
    model.fit(X, y)

    # Store regression coefficients for future predictions
    regression_results.append({
        'Ticker': ticker,
        'Intercept': model.intercept_,
        'Coefficient': model.coef_[0]
    })

# Convert results into a DataFrame and save coefficients. Explicit columns keep
# the "Ticker" merge key present even when every ticker was skipped; without
# them the merge below would raise a KeyError.
regression_df = pd.DataFrame(
    regression_results, columns=["Ticker", "Intercept", "Coefficient"]
)
regression_df.to_csv("regression_coefficients.csv", index=False)
print("✅ Regression coefficients saved to regression_coefficients.csv")

# Merge test/train data with regression coefficients; the inner join drops rows
# whose ticker was skipped above
merged_df = df.merge(regression_df, left_on="ticker", right_on="Ticker", how="inner")

# Apply regression formula to predict Avg_Next_Daily_Percent_Difference:
# Prediction = Intercept + (Coefficient * Avg_Aggregated_Score)
merged_df["Predicted_Next_Daily_Percent_Difference"] = (
    merged_df["Intercept"] + merged_df["Coefficient"] * merged_df["Avg_Aggregated_Score"]
)

# Save updated test/train dataset with predictions
merged_df.to_csv("test_train_with_predictions.csv", index=False)
print("✅ Updated dataset saved to test_train_with_predictions.csv with predicted values.")



✅ Regression coefficients saved to regression_coefficients.csv
✅ Updated dataset saved to test_train_with_predictions.csv with predicted values.
