In [28]:
import pandas as pd
import numpy as np
from scipy.stats import t, pearsonr
from scipy.stats import chisquare, poisson 
import pandas as pd
from scipy.stats import poisson, chisquare
import pandas as pd
from scipy.stats import poisson,chi2
import numpy as np
import pandas as pd
import statsmodels.api as sm
from statsmodels.formula.api import ols

#### Data Preprocessing

In [29]:
'''
Reading the dataset
'''

df = pd.read_csv('./Datasets/Hurricane.csv')
df.head()

Unnamed: 0,Name,Season,Month,Max. sustained winds(mph),Minimum pressure(mbar)
0,Hurricane #3,1853,"August, September",150,924
1,"""1856 Last Island Hurricane""",1856,August,150,934
2,Hurricane #6,1866,"September, October",140,938
3,Hurricane #7,1878,"September, October",140,938
4,Hurricane #2,1880,August,150,931


In [30]:
df.shape

(101, 5)

In [31]:
'''
Month column conatinded multiple values separated by comma. We need to split the values and create a new row for each value.
'''

df = df.assign(Month=df['Month'].str.split(', ')).explode('Month').reset_index(drop=True)
df.head()

Unnamed: 0,Name,Season,Month,Max. sustained winds(mph),Minimum pressure(mbar)
0,Hurricane #3,1853,August,150,924
1,Hurricane #3,1853,September,150,924
2,"""1856 Last Island Hurricane""",1856,August,150,934
3,Hurricane #6,1866,September,140,938
4,Hurricane #6,1866,October,140,938


In [32]:
df.shape

(137, 5)

In [33]:
df_no_mapped = df.copy()

In [34]:
'''
Mapping the month names to numbers
'''

month_mapping = {
    'July' : 0,
    'August': 1,
    'September': 2,
    'October': 3,
    'November': 4,
    'December': 5
}

df['Month'] = df['Month'].map(month_mapping)

In [35]:
df['Month'].value_counts()

Month
2    68
1    35
3    30
4     3
0     1
Name: count, dtype: int64

In [36]:
df.head()

Unnamed: 0,Name,Season,Month,Max. sustained winds(mph),Minimum pressure(mbar)
0,Hurricane #3,1853,1,150,924
1,Hurricane #3,1853,2,150,924
2,"""1856 Last Island Hurricane""",1856,1,150,934
3,Hurricane #6,1866,2,140,938
4,Hurricane #6,1866,3,140,938


a. With a 1% level of significance conduct t-test for correlation coefficient between “Max. sustained winds(mph)” and “Minimum pressure (mbar)”.

Scratch Implementation

In [37]:
alpha = 0.01
n = df.shape[0]
r = np.corrcoef(df["Max. sustained winds(mph)"], df["Minimum pressure(mbar)"])[0, 1]
t_value = r / np.sqrt((1 - r**2) / (n - 2))

In [38]:
degrees_of_freedom = n-2

critical_value = t.ppf(1 - alpha/2, degrees_of_freedom)

print(f"Correlation Coeffient :{r}")
print(f"T value: {t_value}")
print(f"The critical value is: {critical_value}")

Correlation Coeffient :-0.4629058408860278
T value: -6.06772854436473
The critical value is: 2.612737907503725


We define our hypothesis as follows: <br>
- H0: ρ = 0 (There is no correlation between the maximum sustained winds and the minimum pressure) <br>
- H1: ρ != 0 (There is correlation between the maximum sustained winds and the minimum pressure)

In [39]:
if (t_value < critical_value):
    print("Rejecting the NULL Hypothesis: There is significant correlation")
else:
    print("Failed to reject the NULL Hypothesis: There is no significant correlation")

Rejecting the NULL Hypothesis: There is significant correlation


Verifying using inbuilt libraries

In [40]:
winds = df['Max. sustained winds(mph)']
pressure = df['Minimum pressure(mbar)']

correlation, p_value = pearsonr(winds, pressure)

print(f"Pearson correlation: {correlation}")
print(f"P-value: {p_value}")

if p_value < 0.01:
    print("Reject the null hypothesis: Significant correlation.")
else:
    print("Fail to reject the null hypothesis: No significant correlation.")


Pearson correlation: -0.46290584088602715
P-value: 1.2303904634740846e-08
Reject the null hypothesis: Significant correlation.


b. With a 5% level of significance test if the “Max. sustained winds(mph)” of hurricane depends on the month of its occurrence.

We define our hypothesis as follows: <br>
- H0: ρ = 0 (There is no correlation between the maximum sustained winds and the month of occurence) <br>
- H1: ρ != 0 (There is correlation between the maximum sustained winds and the month of occurence)

We perform ANOVA test on our original dataset, where the months are not mapped to integers. This is because month is categorical and wind speed is numeric. For testing statistical dependence between a qualitative and quantitative feature, we use ANOVA test.

<b>Reference: </b>
- https://towardsdatascience.com/every-statistical-test-to-check-feature-dependence-773a21cd6722

In [41]:
# Conduct ANOVA test
model = ols('Q("Max. sustained winds(mph)") ~ C(Month)', data=df_no_mapped).fit()
anova_table = sm.stats.anova_lm(model, typ=2)

print(anova_table)

# Check p-value at 5% significance level
p_value = anova_table['PR(>F)'][0]
if p_value < 0.05:
    print("Reject the null hypothesis: Max sustained winds depend on the month of occurrence.")
else:
    print("Fail to reject the null hypothesis: No significant dependence on the month.")

               sum_sq     df         F    PR(>F)
C(Month)    94.000823    4.0  0.382562  0.820795
Residual  8108.553922  132.0       NaN       NaN
Fail to reject the null hypothesis: No significant dependence on the month.


In [42]:
df.head()

Unnamed: 0,Name,Season,Month,Max. sustained winds(mph),Minimum pressure(mbar)
0,Hurricane #3,1853,1,150,924
1,Hurricane #3,1853,2,150,924
2,"""1856 Last Island Hurricane""",1856,1,150,934
3,Hurricane #6,1866,2,140,938
4,Hurricane #6,1866,3,140,938


Now, we perform t-test on our data where the months were mapped to integers. This is because t-test is used to test statistical dependence between two quantitative features. After mapping to integers, we convert months, a qualitative feature to a quantitative feature.

In [43]:
r = np.corrcoef(df['Max. sustained winds(mph)'],df['Month'] )[0, 1]

alpha = 0.05
n = df.shape[0]
t_value = r * np.sqrt((n - 2) / (1 - r**2))
degrees = n-2


critical_value = t.ppf(1 - alpha/2, degrees)

print(f"Correlation coefficient (r): {r}")
print(f"Calculated t-value: {t_value}")
print(f"Critical t-value at 5% significance level: {critical_value}")

if  (abs(t_value) > critical_value):
    print("Reject the null hypothesis: Max sustained winds depend on the month of occurrence.")
else:
    print("Fail to reject the null hypothesis: No significant dependence on the month.")

Correlation coefficient (r): 0.00900011347953533
Calculated t-value: 0.10457610438545846
Critical t-value at 5% significance level: 1.977692277222804
Fail to reject the null hypothesis: No significant dependence on the month.


c. With a 10% level of significance conduct test if “Max. sustained winds(mph)” follows a Poisson distribution.

<b> Scaling the expected frequencies for chi square test </b>

The inbuilt function `scipy.stats.chisquare` performs the chi-square test on a vector of observed and expected frequencies. However, for the test to work, the observed and expected frequencies must have nearly the same sum otherwise, it will raise an error due to incompatible inputs. To solve this, we rescaled the sum of expected frequencies so that they are nearly equal to the sum of the observed frequencies and then did the further calculations.

Reference:
- [Scipy Github Issues](https://github.com/scipy/scipy/issues/12282)

In [44]:
winds = df['Max. sustained winds(mph)']

mean_wind = winds.mean()
print("Lambda :"  , mean_wind)
print()

observed_freq = winds.value_counts().sort_index()
print("Observed Frequencies:\n", observed_freq)
print()
total = len(winds)
expected_freq = [poisson.pmf(k, mean_wind) for k in observed_freq.index]
total2 = sum(expected_freq)
expected_freq = [prob * total/total2 for prob in expected_freq]
print("Expected Frequencies:\n", expected_freq)
print()
# Checking if the sums of observed and expected frequencies are the same
print("Sum of Observed Frequencies:", observed_freq.sum())
print("Sum of Expected Frequencies:", sum(expected_freq))
print()
chi_square_stat, p_value = chisquare(f_obs=observed_freq, f_exp=expected_freq)

print(f"Chi-square statistic: {chi_square_stat}")
print(f"P-value: {p_value}")
print()

if p_value < 0.10:
    print("Reject the null hypothesis: Winds do not follow a Poisson distribution.")
else:
    print("Fail to reject the null hypothesis: Winds follow a Poisson distribution.")


Lambda : 142.33576642335765

Observed Frequencies:
 Max. sustained winds(mph)
130    28
140    39
145    33
150    24
155    13
Name: count, dtype: int64

Expected Frequencies:
 [21.14459376472458, 34.66753441054795, 33.87811106758004, 27.879178668675344, 19.430582088472068]

Sum of Observed Frequencies: 137
Sum of Expected Frequencies: 137.0

Chi-square statistic: 5.4547959472372565
P-value: 0.2437316198723859

Fail to reject the null hypothesis: Winds follow a Poisson distribution.


<b> Without Scaling the observed frequencies for chi square test </b>

In [45]:
winds = df['Max. sustained winds(mph)']
mean_wind = winds.mean()
print("Lambda : ", mean_wind)
observed_freq = winds.value_counts().sort_index()
print("Observed Frequencies:\n", observed_freq)
print()

total = len(winds)
expected_freq = [poisson.pmf(k, mean_wind) * total for k in observed_freq.index]

expected_freq_series = pd.Series(expected_freq, index=observed_freq.index).fillna(0)

print("Expected Frequencies:\n", expected_freq_series)
print()
print("Sum of Observed Frequencies:", observed_freq.sum())
print("Sum of Expected Frequencies:", expected_freq_series.sum())

# Calculating chi-square statistic from scratch
chi_square_stat = np.sum((observed_freq - expected_freq_series) ** 2 / expected_freq_series)

print(f"Chi-square statistic (manual calculation): {chi_square_stat}")

# Degrees of freedom
degrees_of_freedom = len(winds.value_counts()) - 1
alpha = 0.10
critical_value = chi2.ppf(1 - alpha, degrees_of_freedom)
print(f"Critical value at {alpha*100}% significance level: {critical_value}")  #change

# Calculating p-value using the chi-square distribution
p_value = 1 - chi2.cdf(chi_square_stat, degrees_of_freedom)

print(f"P-value (manual calculation): {p_value}")
print()
if p_value < 0.10:
    print("Reject the null hypothesis: Winds do not follow a Poisson distribution.")
else:
    print("Fail to reject the null hypothesis: Winds follow a Poisson distribution.")


Lambda :  142.33576642335765
Observed Frequencies:
 Max. sustained winds(mph)
130    28
140    39
145    33
150    24
155    13
Name: count, dtype: int64

Expected Frequencies:
 Max. sustained winds(mph)
130    2.761946
140    4.528338
145    4.425222
150    3.641630
155    2.538059
dtype: float64

Sum of Observed Frequencies: 137
Sum of Expected Frequencies: 17.895195365650657
Chi-square statistic (manual calculation): 834.4844091739852
Critical value at 10.0% significance level: 7.779440339734858
P-value (manual calculation): 0.0

Reject the null hypothesis: Winds do not follow a Poisson distribution.
