In [1]:
import pandas as pd
import numpy as np
from scipy.stats import t, pearsonr
import statsmodels.api as sm
from statsmodels.formula.api import ols
from scipy.stats import chisquare, poisson 

In [2]:
df = pd.read_csv('./Datasets/Hurricane.csv')
df.head()

Unnamed: 0,Name,Season,Month,Max. sustained winds(mph),Minimum pressure(mbar)
0,Hurricane #3,1853,"August, September",150,924
1,"""1856 Last Island Hurricane""",1856,August,150,934
2,Hurricane #6,1866,"September, October",140,938
3,Hurricane #7,1878,"September, October",140,938
4,Hurricane #2,1880,August,150,931


In [3]:
df.shape

(101, 5)

In [4]:
df = df.assign(Month=df['Month'].str.split(', ')).explode('Month').reset_index(drop=True)
df.head()

Unnamed: 0,Name,Season,Month,Max. sustained winds(mph),Minimum pressure(mbar)
0,Hurricane #3,1853,August,150,924
1,Hurricane #3,1853,September,150,924
2,"""1856 Last Island Hurricane""",1856,August,150,934
3,Hurricane #6,1866,September,140,938
4,Hurricane #6,1866,October,140,938


In [5]:
df.shape

(137, 5)

In [6]:
month_mapping = {
    'July' : 0,
    'August': 1,
    'September': 2,
    'October': 3,
    'November': 4,
    'December': 5
}

# Create a new column with the integer mapping
df['Month'] = df['Month'].map(month_mapping)

In [7]:
df['Month'].value_counts()

Month
2    68
1    35
3    30
4     3
0     1
Name: count, dtype: int64

In [8]:
df.head()

Unnamed: 0,Name,Season,Month,Max. sustained winds(mph),Minimum pressure(mbar)
0,Hurricane #3,1853,1,150,924
1,Hurricane #3,1853,2,150,924
2,"""1856 Last Island Hurricane""",1856,1,150,934
3,Hurricane #6,1866,2,140,938
4,Hurricane #6,1866,3,140,938


In [9]:
df

Unnamed: 0,Name,Season,Month,Max. sustained winds(mph),Minimum pressure(mbar)
0,Hurricane #3,1853,1,150,924
1,Hurricane #3,1853,2,150,924
2,"""1856 Last Island Hurricane""",1856,1,150,934
3,Hurricane #6,1866,2,140,938
4,Hurricane #6,1866,3,140,938
...,...,...,...,...,...
132,Hurricane Fabian,2003,2,145,939
133,Hurricane Charley,2004,1,150,941
134,Hurricane Frances,2004,1,145,935
135,Hurricane Frances,2004,2,145,935


a. With a 1% level of significance conduct t-test for correlation coefficient between “Max. sustained winds(mph)” and “Minimum pressure (mbar)”.

In [10]:
alpha = 0.01
n = df.shape[0]
r = np.corrcoef(df["Max. sustained winds(mph)"], df["Minimum pressure(mbar)"])[0, 1]
t_value = r / np.sqrt((1 - r**2) / (n - 2))
t_value

-6.06772854436473

In [11]:
r

-0.4629058408860278

In [12]:
degrees = n-2

critical_value = t.ppf(1 - alpha/2, degrees)

# Print the critical value
print(f"The critical value is: {critical_value:.3f}")

The critical value is: 2.613


In [13]:
if (t_value < critical_value):
    print("Rejecting the NULL Hypothesis")
else:
    print("Failed to reject the NULL Hypothesis")

Rejecting the NULL Hypothesis


In [14]:
test_result = pearsonr(df["Max. sustained winds(mph)"], df["Minimum pressure(mbar)"])
test_result

PearsonRResult(statistic=-0.46290584088602715, pvalue=1.2303904634740846e-08)

In [15]:
test_result.pvalue

1.2303904634740846e-08

In [16]:
test_result.confidence_interval()

ConfidenceInterval(low=-0.5851892925012568, high=-0.3200368904005088)

In [17]:
# Extract relevant columns
winds = df['Max. sustained winds(mph)']
pressure = df['Minimum pressure(mbar)']

# Calculate Pearson correlation
correlation, p_value = pearsonr(winds, pressure)

# Display the result
print(f"Pearson correlation: {correlation}")
print(f"P-value: {p_value}")

# Test the significance at 1% level
if p_value < 0.01:
    print("Reject the null hypothesis: Significant correlation.")
else:
    print("Fail to reject the null hypothesis: No significant correlation.")


Pearson correlation: -0.46290584088602715
P-value: 1.2303904634740846e-08
Reject the null hypothesis: Significant correlation.


In [18]:
df.drop(columns=['Name', 'Month']).corr()['Max. sustained winds(mph)'].sort_values(ascending=False)

Max. sustained winds(mph)    1.000000
Season                       0.066874
Minimum pressure(mbar)      -0.462906
Name: Max. sustained winds(mph), dtype: float64

b. With a 5% level of significance test if the “Max. sustained winds(mph)” of hurricane depends on the month of its occurrence.

In [19]:
df

Unnamed: 0,Name,Season,Month,Max. sustained winds(mph),Minimum pressure(mbar)
0,Hurricane #3,1853,1,150,924
1,Hurricane #3,1853,2,150,924
2,"""1856 Last Island Hurricane""",1856,1,150,934
3,Hurricane #6,1866,2,140,938
4,Hurricane #6,1866,3,140,938
...,...,...,...,...,...
132,Hurricane Fabian,2003,2,145,939
133,Hurricane Charley,2004,1,150,941
134,Hurricane Frances,2004,1,145,935
135,Hurricane Frances,2004,2,145,935


In [20]:
df

Unnamed: 0,Name,Season,Month,Max. sustained winds(mph),Minimum pressure(mbar)
0,Hurricane #3,1853,1,150,924
1,Hurricane #3,1853,2,150,924
2,"""1856 Last Island Hurricane""",1856,1,150,934
3,Hurricane #6,1866,2,140,938
4,Hurricane #6,1866,3,140,938
...,...,...,...,...,...
132,Hurricane Fabian,2003,2,145,939
133,Hurricane Charley,2004,1,150,941
134,Hurricane Frances,2004,1,145,935
135,Hurricane Frances,2004,2,145,935


In [21]:
df['Month'].value_counts

<bound method IndexOpsMixin.value_counts of 0      1
1      2
2      1
3      2
4      3
      ..
132    2
133    1
134    1
135    2
136    2
Name: Month, Length: 137, dtype: int64>

In [22]:
df

Unnamed: 0,Name,Season,Month,Max. sustained winds(mph),Minimum pressure(mbar)
0,Hurricane #3,1853,1,150,924
1,Hurricane #3,1853,2,150,924
2,"""1856 Last Island Hurricane""",1856,1,150,934
3,Hurricane #6,1866,2,140,938
4,Hurricane #6,1866,3,140,938
...,...,...,...,...,...
132,Hurricane Fabian,2003,2,145,939
133,Hurricane Charley,2004,1,150,941
134,Hurricane Frances,2004,1,145,935
135,Hurricane Frances,2004,2,145,935


In [23]:
# Calculate correlation coefficient r
r = np.corrcoef(df['Max. sustained winds(mph)'],df['Month'] )[0, 1]
alpha = 0.05
# Calculate the t-value
n = df.shape[0]
t_value = r * np.sqrt((n - 2) / (1 - r**2))
degrees = n-2
# Calculate the critical value
critical_value = t.ppf(1 - alpha/2, degrees)

# Output results
print(f"Correlation coefficient (r): {r}")
print(f"Calculated t-value: {t_value}")
print(f"Critical t-value at 5% significance level: {critical_value}")

# Decision
if  abs(t_value) > critical_value:
    print("Reject the null hypothesis: Max sustained winds depend on the month of occurrence.")
else:
    print("Fail to reject the null hypothesis: No significant dependence on the month.")

Correlation coefficient (r): 0.00900011347953533
Calculated t-value: 0.10457610438545846
Critical t-value at 5% significance level: 1.977692277222804
Fail to reject the null hypothesis: No significant dependence on the month.


In [24]:
degrees = n-2

critical_value = t.ppf(1 - alpha/2, degrees)

# Print the critical value
print(f"The critical value is: {critical_value:.3f}")

The critical value is: 1.978


In [25]:
if (t_value < critical_value):
    print("Rejecting the NULL Hypothesis")
else:
    print("Failed to reject the NULL Hypothesis")

Rejecting the NULL Hypothesis


c. With a 10% level of significance conduct test if “Max. sustained winds(mph)” follows a Poisson distribution.

In [31]:
import pandas as pd
from scipy.stats import poisson, chisquare

# Assuming df is already defined and contains 'Max. sustained winds(mph)'
winds = df['Max. sustained winds(mph)']

mean_wind = winds.mean()
print(mean_wind)
# Generate observed frequencies (assuming wind speeds are integers)
observed_freq = winds.value_counts().sort_index()
print("Observed Frequencies:\n", observed_freq)

# Generate Poisson probabilities for the observed values
total = len(winds)
expected_freq = [poisson.pmf(k, mean_wind)  for k in observed_freq.index]
total2 = sum(expected_freq)
expected_freq = [prob * total/total2 for prob in expected_freq]
print("Expected Frequencies:\n", expected_freq)

# Check sums
print("Sum of Observed Frequencies:", observed_freq.sum())
print("Sum of Expected Frequencies:", sum(expected_freq))

# Ensure lengths are the same
if len(observed_freq) != len(expected_freq):
    print("Length mismatch: Adjust expected frequencies to match observed frequencies.")
else:
    chi_square_stat, p_value = chisquare(f_obs=observed_freq, f_exp=expected_freq)

    print(f"Chi-square statistic: {chi_square_stat}")
    print(f"P-value: {p_value}")

    # Check significance at 10% level
    if p_value < 0.10:
        print("Reject the null hypothesis: Winds do not follow a Poisson distribution.")
    else:
        print("Fail to reject the null hypothesis: Winds follow a Poisson distribution.")


142.33576642335765
Observed Frequencies:
 Max. sustained winds(mph)
130    28
140    39
145    33
150    24
155    13
Name: count, dtype: int64
Expected Frequencies:
 [21.14459376472458, 34.66753441054795, 33.87811106758004, 27.879178668675344, 19.430582088472068]
Sum of Observed Frequencies: 137
Sum of Expected Frequencies: 137.0
Chi-square statistic: 5.4547959472372565
P-value: 0.2437316198723859
Fail to reject the null hypothesis: Winds follow a Poisson distribution.


In [34]:
import pandas as pd
from scipy.stats import poisson,chi2
import numpy as np

# Assuming df is already defined and contains 'Max. sustained winds(mph)'
winds = df['Max. sustained winds(mph)']

mean_wind = winds.mean()

# Generate observed frequencies (assuming wind speeds are integers)
observed_freq = winds.value_counts().sort_index()
print("Observed Frequencies:\n", observed_freq)

# Generate Poisson probabilities for the observed values
total = len(winds)
expected_freq = [poisson.pmf(k, mean_wind) * total for k in observed_freq.index]

# Ensure expected_freq is a Series with the same index as observed_freq
expected_freq_series = pd.Series(expected_freq, index=observed_freq.index).fillna(0)

print("Expected Frequencies:\n", expected_freq_series)

# Check sums
print("Sum of Observed Frequencies:", observed_freq.sum())
print("Sum of Expected Frequencies:", expected_freq_series.sum())

# Calculate chi-square statistic from scratch
chi_square_stat = np.sum((observed_freq - expected_freq_series) ** 2 / expected_freq_series)

print(f"Chi-square statistic (manual calculation): {chi_square_stat}")

# Degrees of freedom
degrees_of_freedom = len(winds.value_counts()) - 1
alpha = 0.10
critical_value = chi2.ppf(1 - alpha, degrees_of_freedom)
print(f"Critical value at {alpha*100}% significance level: {critical_value}")  #change

# Calculate p-value using the chi-square distribution
from scipy.stats import chi2
p_value = 1 - chi2.cdf(chi_square_stat, degrees_of_freedom)

print(f"P-value (manual calculation): {p_value}")

# Check significance at 10% level
if p_value < 0.10:
    print("Reject the null hypothesis: Winds do not follow a Poisson distribution.")
else:
    print("Fail to reject the null hypothesis: Winds follow a Poisson distribution.")


Observed Frequencies:
 Max. sustained winds(mph)
130    28
140    39
145    33
150    24
155    13
Name: count, dtype: int64
Expected Frequencies:
 Max. sustained winds(mph)
130    2.761946
140    4.528338
145    4.425222
150    3.641630
155    2.538059
dtype: float64
Sum of Observed Frequencies: 137
Sum of Expected Frequencies: 17.895195365650657
Chi-square statistic (manual calculation): 834.4844091739852
Critical value at 10.0% significance level: 7.779440339734858
P-value (manual calculation): 0.0
Reject the null hypothesis: Winds do not follow a Poisson distribution.
