<a href="https://colab.research.google.com/github/brotheramin/MachineLearning/blob/main/St_JohnsClimate.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [3]:
import pandas as pd
from scipy.stats import chisquare, norm
import numpy as np

# Load the dataset
climate_data = pd.read_csv('Canadian_climate_history.csv')

# Drop missing values from the 'MEAN_TEMPERATURE_STJOHNS' column
stjohns_temps = climate_data['MEAN_TEMPERATURE_STJOHNS'].dropna()

# Fit a normal distribution to the data
stjohns_mean, stjohns_std = norm.fit(stjohns_temps)

# Define bins and calculate observed frequencies
num_bins = 10  # You can adjust the number of bins
observed_freq, bin_edges = np.histogram(stjohns_temps, bins=num_bins)

# Calculate expected frequencies for each bin using the normal distribution
cdf_vals = norm.cdf(bin_edges, stjohns_mean, stjohns_std)
expected_freq = np.diff(cdf_vals) * len(stjohns_temps)

# Normalize frequencies to have the same sum
expected_freq = expected_freq * (observed_freq.sum() / expected_freq.sum())

# Perform the Chi-Square Goodness-of-Fit Test
chi2_stat, p_value = chisquare(observed_freq, expected_freq)

# Print the results
print(f"Chi-Square Statistic: {chi2_stat}, p-value: {p_value}")

Chi-Square Statistic: 1233.064626832027, p-value: 8.812274424332014e-260


In [4]:
import pandas as pd
from scipy.stats import chisquare, norm, gamma
import numpy as np

# Load the dataset
climate_data = pd.read_csv('Canadian_climate_history.csv')

# Drop missing values from the 'MEAN_TEMPERATURE_STJOHNS' column
stjohns_temps = climate_data['MEAN_TEMPERATURE_STJOHNS'].dropna()

# Define the number of bins
num_bins = 15  # Increased for smoother results

# Calculate observed frequencies
observed_freq, bin_edges = np.histogram(stjohns_temps, bins=num_bins)

# Fit a normal distribution and calculate expected frequencies
normal_mean, normal_std = norm.fit(stjohns_temps)
cdf_normal = norm.cdf(bin_edges, normal_mean, normal_std)
expected_freq_normal = np.diff(cdf_normal) * len(stjohns_temps)
expected_freq_normal *= observed_freq.sum() / expected_freq_normal.sum()

# Fit a gamma distribution and calculate expected frequencies
gamma_shape, gamma_loc, gamma_scale = gamma.fit(stjohns_temps)
cdf_gamma = gamma.cdf(bin_edges, gamma_shape, loc=gamma_loc, scale=gamma_scale)
expected_freq_gamma = np.diff(cdf_gamma) * len(stjohns_temps)
expected_freq_gamma *= observed_freq.sum() / expected_freq_gamma.sum()

# Perform the Chi-Square Goodness-of-Fit Test for both distributions
chi2_stat_normal, p_value_normal = chisquare(observed_freq, expected_freq_normal)
chi2_stat_gamma, p_value_gamma = chisquare(observed_freq, expected_freq_gamma)

# Display the results
print(f"Normal Distribution - Chi-Square Statistic: {chi2_stat_normal}, p-value: {p_value_normal}")
print(f"Gamma Distribution - Chi-Square Statistic: {chi2_stat_gamma}, p-value: {p_value_gamma}")

Normal Distribution - Chi-Square Statistic: 1444.307371201838, p-value: 4.684927642626088e-300
Gamma Distribution - Chi-Square Statistic: 1437.3617364506179, p-value: 1.4668461124952655e-298


In [6]:
import pandas as pd
import numpy as np
from scipy.stats import chisquare, norm
import matplotlib.pyplot as plt

# Load and preprocess the data
climate_data = pd.read_csv('Canadian_climate_history.csv')
climate_data['LOCAL_DATE'] = pd.to_datetime(climate_data['LOCAL_DATE'], errors='coerce')
climate_data = climate_data.set_index('LOCAL_DATE')
stjohns_temps = climate_data['MEAN_TEMPERATURE_STJOHNS'].dropna()

# Define seasons: Winter (Dec-Feb), Spring (Mar-May), Summer (Jun-Aug), Fall (Sep-Nov)
seasons = {
    "Winter": stjohns_temps[(stjohns_temps.index.month == 12) | (stjohns_temps.index.month <= 2)],
    "Spring": stjohns_temps[(stjohns_temps.index.month >= 3) & (stjohns_temps.index.month <= 5)],
    "Summer": stjohns_temps[(stjohns_temps.index.month >= 6) & (stjohns_temps.index.month <= 8)],
    "Fall": stjohns_temps[(stjohns_temps.index.month >= 9) & (stjohns_temps.index.month <= 11)]
}

# Initialize results dictionary
results = {}

# Perform Chi-Square Goodness-of-Fit Test for each season
num_bins = 10  # Define number of bins for smoother results

for season, temps in seasons.items():
    # Observed frequencies
    observed_freq, bin_edges = np.histogram(temps, bins=num_bins)

    # Fit a normal distribution and calculate expected frequencies
    season_mean, season_std = norm.fit(temps)
    cdf_vals = norm.cdf(bin_edges, season_mean, season_std)
    expected_freq = np.diff(cdf_vals) * len(temps)
    expected_freq *= observed_freq.sum() / expected_freq.sum()  # Normalize frequencies

    # Perform the Chi-Square Goodness-of-Fit Test
    chi2_stat, p_value = chisquare(observed_freq, expected_freq)

    # Store results
    results[season] = {'Chi-Square Statistic': chi2_stat, 'p-value': p_value}

# Display the results for each season
for season, result in results.items():
    print(f"{season} - Chi-Square Statistic: {result['Chi-Square Statistic']}, p-value: {result['p-value']}")

Winter - Chi-Square Statistic: 17.29694107141906, p-value: 0.04426395170078908
Spring - Chi-Square Statistic: 88.54632183426334, p-value: 3.1853402009453265e-15
Summer - Chi-Square Statistic: 264.49405267426823, p-value: 8.641937271720324e-52
Fall - Chi-Square Statistic: 108.74598908472113, p-value: 2.6462923056958793e-19


In [8]:
import pandas as pd
import numpy as np
from scipy.stats import chisquare, weibull_min, lognorm
import matplotlib.pyplot as plt

# Load and preprocess the data
climate_data = pd.read_csv('Canadian_climate_history.csv')
climate_data['LOCAL_DATE'] = pd.to_datetime(climate_data['LOCAL_DATE'], errors='coerce')
climate_data = climate_data.set_index('LOCAL_DATE')
stjohns_temps = climate_data['MEAN_TEMPERATURE_STJOHNS'].dropna()

# Define seasons: Winter (Dec-Feb), Spring (Mar-May), Summer (Jun-Aug), Fall (Sep-Nov)
seasons = {
    "Spring": stjohns_temps[(stjohns_temps.index.month >= 3) & (stjohns_temps.index.month <= 5)],
    "Summer": stjohns_temps[(stjohns_temps.index.month >= 6) & (stjohns_temps.index.month <= 8)],
    "Fall": stjohns_temps[(stjohns_temps.index.month >= 9) & (stjohns_temps.index.month <= 11)]
}

# Initialize results dictionary
results = {}

# Define number of bins
num_bins = 10
shift_value = abs(stjohns_temps.min()) + 1  # Shift to make all values positive

# Perform Chi-Square Goodness-of-Fit Test for shifted log-normal and Weibull distributions
for season, temps in seasons.items():
    # Observed frequencies
    observed_freq, bin_edges = np.histogram(temps, bins=num_bins)

    # Shifted Log-Normal distribution fitting and expected frequencies
    shifted_temps = temps + shift_value
    shape, loc, scale = lognorm.fit(shifted_temps, floc=0)  # floc=0 to fix location
    cdf_lognorm = lognorm.cdf(bin_edges + shift_value, shape, loc=loc, scale=scale)
    expected_freq_lognorm = np.diff(cdf_lognorm) * len(shifted_temps)
    expected_freq_lognorm *= observed_freq.sum() / expected_freq_lognorm.sum()  # Normalize frequencies

    # Weibull distribution fitting and expected frequencies
    c, loc, scale = weibull_min.fit(temps)
    cdf_weibull = weibull_min.cdf(bin_edges, c, loc=loc, scale=scale)
    expected_freq_weibull = np.diff(cdf_weibull) * len(temps)
    expected_freq_weibull *= observed_freq.sum() / expected_freq_weibull.sum()  # Normalize frequencies

    # Perform the Chi-Square Goodness-of-Fit Test for both distributions
    chi2_stat_lognorm, p_value_lognorm = chisquare(observed_freq, expected_freq_lognorm)
    chi2_stat_weibull, p_value_weibull = chisquare(observed_freq, expected_freq_weibull)

    # Store results
    results[season] = {
        'Shifted Log-Normal Chi-Square': chi2_stat_lognorm, 'Shifted Log-Normal p-value': p_value_lognorm,
        'Weibull Chi-Square': chi2_stat_weibull, 'Weibull p-value': p_value_weibull
    }

# Display the results for each season and distribution
for season, result in results.items():
    print(f"{season} - Shifted Log-Normal Chi-Square: {result['Shifted Log-Normal Chi-Square']}, p-value: {result['Shifted Log-Normal p-value']}")
    print(f"{season} - Weibull Chi-Square: {result['Weibull Chi-Square']}, p-value: {result['Weibull p-value']}")

Spring - Shifted Log-Normal Chi-Square: 18554.309404712047, p-value: 0.0
Spring - Weibull Chi-Square: 252.55526151872323, p-value: 2.8802355594216445e-49
Summer - Shifted Log-Normal Chi-Square: 565.494395806019, p-value: 5.297758309248969e-116
Summer - Weibull Chi-Square: 81.98030036029499, p-value: 6.526463864189011e-14
Fall - Shifted Log-Normal Chi-Square: 221.20607321022626, p-value: 1.1670439293628948e-42
Fall - Weibull Chi-Square: 64.48937665598817, p-value: 1.8132663532513147e-10


In [9]:
import pandas as pd
import numpy as np
from scipy.stats import chisquare
from sklearn.neighbors import KernelDensity
import matplotlib.pyplot as plt

# Load and preprocess the data
climate_data = pd.read_csv('Canadian_climate_history.csv')
climate_data['LOCAL_DATE'] = pd.to_datetime(climate_data['LOCAL_DATE'], errors='coerce')
climate_data = climate_data.set_index('LOCAL_DATE')
stjohns_temps = climate_data['MEAN_TEMPERATURE_STJOHNS'].dropna()

# Define a dictionary to store results for each month
results = {}

# Perform Chi-Square Goodness-of-Fit Test using KDE for each month
for month in range(1, 13):
    # Extract temperature data for each month
    monthly_temps = stjohns_temps[stjohns_temps.index.month == month]

    # Observed frequencies
    num_bins = 10
    observed_freq, bin_edges = np.histogram(monthly_temps, bins=num_bins)

    # KDE to estimate the density
    kde = KernelDensity(bandwidth=1.0, kernel='gaussian')
    kde.fit(monthly_temps.values.reshape(-1, 1))

    # Generate expected frequencies based on KDE
    bin_centers = (bin_edges[:-1] + bin_edges[1:]) / 2
    log_densities = kde.score_samples(bin_centers.reshape(-1, 1))
    expected_freq = np.exp(log_densities) * len(monthly_temps) * (bin_edges[1] - bin_edges[0])
    expected_freq *= observed_freq.sum() / expected_freq.sum()  # Normalize frequencies

    # Perform the Chi-Square Goodness-of-Fit Test
    chi2_stat, p_value = chisquare(observed_freq, expected_freq)

    # Store results
    results[f"Month {month}"] = {'Chi-Square Statistic': chi2_stat, 'p-value': p_value}

# Display the results for each month
for month, result in results.items():
    print(f"{month} - Chi-Square Statistic: {result['Chi-Square Statistic']}, p-value: {result['p-value']}")

Month 1 - Chi-Square Statistic: 3.902590857314743, p-value: 0.9177058916584641
Month 2 - Chi-Square Statistic: 1.8038407792149909, p-value: 0.994203394000153
Month 3 - Chi-Square Statistic: 1.6430617331293145, p-value: 0.9959397890339731
Month 4 - Chi-Square Statistic: 4.6990698742020935, p-value: 0.8597127426519187
Month 5 - Chi-Square Statistic: 6.64608438811871, p-value: 0.6739127181453697
Month 6 - Chi-Square Statistic: 5.347213585136231, p-value: 0.8030515863664108
Month 7 - Chi-Square Statistic: 8.567278165804748, p-value: 0.47813701877192316
Month 8 - Chi-Square Statistic: 7.103638648091529, p-value: 0.6263299439975035
Month 9 - Chi-Square Statistic: 2.124620992612135, p-value: 0.9893399695472903
Month 10 - Chi-Square Statistic: 3.6344106157856517, p-value: 0.9337877669159877
Month 11 - Chi-Square Statistic: 2.209588535007845, p-value: 0.9877030107772983
Month 12 - Chi-Square Statistic: 8.624608934460445, p-value: 0.472621984150961
