In [126]:
import pandas as pd
import numpy as np
from scipy.stats import t, pearsonr
import statsmodels.api as sm
from statsmodels.formula.api import ols
from scipy.stats import chisquare, poisson 

In [127]:
df = pd.read_csv('./Datasets/Hurricane.csv')
df.head()

Unnamed: 0,Name,Season,Month,Max. sustained winds(mph),Minimum pressure(mbar)
0,Hurricane #3,1853,"August, September",150,924
1,"""1856 Last Island Hurricane""",1856,August,150,934
2,Hurricane #6,1866,"September, October",140,938
3,Hurricane #7,1878,"September, October",140,938
4,Hurricane #2,1880,August,150,931


In [128]:
df.shape

(101, 5)

In [129]:
df = df.assign(Month=df['Month'].str.split(', ')).explode('Month').reset_index(drop=True)
df.head()

Unnamed: 0,Name,Season,Month,Max. sustained winds(mph),Minimum pressure(mbar)
0,Hurricane #3,1853,August,150,924
1,Hurricane #3,1853,September,150,924
2,"""1856 Last Island Hurricane""",1856,August,150,934
3,Hurricane #6,1866,September,140,938
4,Hurricane #6,1866,October,140,938


In [130]:
df_normalize = df.copy()

In [131]:
df.shape

(137, 5)

In [132]:
df_normalize.shape

(137, 5)

In [133]:
from sklearn.preprocessing import LabelEncoder

In [134]:
# Initialize LabelEncoder
label_encoder = LabelEncoder()

# Apply LabelEncoder to the 'Month' column
df_normalize['Month'] = label_encoder.fit_transform(df_normalize['Month'])

In [135]:
df_normalize.head()

Unnamed: 0,Name,Season,Month,Max. sustained winds(mph),Minimum pressure(mbar)
0,Hurricane #3,1853,0,150,924
1,Hurricane #3,1853,4,150,924
2,"""1856 Last Island Hurricane""",1856,0,150,934
3,Hurricane #6,1866,4,140,938
4,Hurricane #6,1866,3,140,938


In [136]:
def mean_of_data(column):
    total = 0
    for i in column:
        total += i
    return total/len(column)

def variance_of_data(column):
    mean = mean_of_data(column)
    total = 0
    for i in column:
        total += (i - mean)**2
    return total/len(column)

In [137]:
def normalize_feature():
    for column in df_normalize.columns[1:]:
        mean = mean_of_data(df_normalize[column])
        variance = variance_of_data(df_normalize[column])
        standard_dev = variance**0.5
        df_normalize[column] = (df_normalize[column] - mean) / standard_dev

In [138]:
normalize_feature()

In [139]:
df

Unnamed: 0,Name,Season,Month,Max. sustained winds(mph),Minimum pressure(mbar)
0,Hurricane #3,1853,August,150,924
1,Hurricane #3,1853,September,150,924
2,"""1856 Last Island Hurricane""",1856,August,150,934
3,Hurricane #6,1866,September,140,938
4,Hurricane #6,1866,October,140,938
...,...,...,...,...,...
132,Hurricane Fabian,2003,September,145,939
133,Hurricane Charley,2004,August,150,941
134,Hurricane Frances,2004,August,145,935
135,Hurricane Frances,2004,September,145,935


In [140]:
df_normalize

Unnamed: 0,Name,Season,Month,Max. sustained winds(mph),Minimum pressure(mbar)
0,Hurricane #3,-2.471014,-1.623394,0.990500,-1.487932
1,Hurricane #3,-2.471014,0.787500,0.990500,-1.487932
2,"""1856 Last Island Hurricane""",-2.395179,-1.623394,0.990500,-0.482771
3,Hurricane #6,-2.142394,0.787500,-0.301867,-0.080706
4,Hurricane #6,-2.142394,0.184777,-0.301867,-0.080706
...,...,...,...,...,...
132,Hurricane Fabian,1.320753,0.787500,0.344317,0.019810
133,Hurricane Charley,1.346031,-1.623394,0.990500,0.220842
134,Hurricane Frances,1.346031,-1.623394,0.344317,-0.382255
135,Hurricane Frances,1.346031,0.787500,0.344317,-0.382255


a. With a 1% level of significance conduct t-test for correlation coefficient between “Max. sustained winds(mph)” and “Minimum pressure (mbar)”.

In [141]:
df_normalize

Unnamed: 0,Name,Season,Month,Max. sustained winds(mph),Minimum pressure(mbar)
0,Hurricane #3,-2.471014,-1.623394,0.990500,-1.487932
1,Hurricane #3,-2.471014,0.787500,0.990500,-1.487932
2,"""1856 Last Island Hurricane""",-2.395179,-1.623394,0.990500,-0.482771
3,Hurricane #6,-2.142394,0.787500,-0.301867,-0.080706
4,Hurricane #6,-2.142394,0.184777,-0.301867,-0.080706
...,...,...,...,...,...
132,Hurricane Fabian,1.320753,0.787500,0.344317,0.019810
133,Hurricane Charley,1.346031,-1.623394,0.990500,0.220842
134,Hurricane Frances,1.346031,-1.623394,0.344317,-0.382255
135,Hurricane Frances,1.346031,0.787500,0.344317,-0.382255


In [142]:
alpha = 0.01
n = df_normalize.shape[0]
r = np.corrcoef(df_normalize["Max. sustained winds(mph)"], df_normalize["Minimum pressure(mbar)"])[0, 1]
t_value = r / np.sqrt((1 - r**2) / (n - 2))
t_value

-6.067728544364718

In [143]:
# Calculate degrees of freedom
degrees = n - 2

# Two-tailed p-value
p_value = 2 * t.cdf(-np.abs(t_value), degrees)
p_value

1.230390463474083e-08

In [144]:
p_value < alpha

True

In [145]:
alpha = 0.01
n = df.shape[0]
r = np.corrcoef(df["Max. sustained winds(mph)"], df["Minimum pressure(mbar)"])[0, 1]
t_value = r / np.sqrt((1 - r**2) / (n - 2))
t_value

-6.067728544364715

In [146]:
# Calculate degrees of freedom
degrees = n - 2

# Two-tailed p-value
p_value2 = 2 * t.cdf(-np.abs(t_value), degrees)
p_value2

1.2303904634741104e-08

In [147]:
test_result = pearsonr(df["Max. sustained winds(mph)"], df["Minimum pressure(mbar)"])
test_result

PearsonRResult(statistic=-0.4629058408860275, pvalue=1.2303904634740368e-08)

In [148]:
test_result.pvalue

1.2303904634740368e-08

In [149]:
test_result.confidence_interval()

ConfidenceInterval(low=-0.5851892925012572, high=-0.3200368904005092)

In [150]:
# Extract relevant columns
winds = df['Max. sustained winds(mph)']
pressure = df['Minimum pressure(mbar)']

# Calculate Pearson correlation
correlation, p_value = pearsonr(winds, pressure)

# Display the result
print(f"Pearson correlation: {correlation}")
print(f"P-value: {p_value}")

# Test the significance at 1% level
if p_value < 0.01:
    print("Reject the null hypothesis: Significant correlation.")
else:
    print("Fail to reject the null hypothesis: No significant correlation.")


Pearson correlation: -0.4629058408860275
P-value: 1.2303904634740368e-08
Reject the null hypothesis: Significant correlation.


In [151]:
df.drop(columns=['Name', 'Month']).corr()['Max. sustained winds(mph)'].sort_values(ascending=False)

Max. sustained winds(mph)    1.000000
Season                       0.066874
Minimum pressure(mbar)      -0.462906
Name: Max. sustained winds(mph), dtype: float64

b. With a 5% level of significance test if the “Max. sustained winds(mph)” of hurricane depends on the month of its occurrence.

In [152]:
# Extract relevant columns
winds = df_normalize['Max. sustained winds(mph)']
month = df_normalize['Month']

# Calculate Pearson correlation
correlation, p_value = pearsonr(winds, month)

# Display the result
print(f"Pearson correlation: {correlation}")
print(f"P-value: {p_value}")

# Test the significance at 1% level
if p_value < 0.05:
    print("Reject the null hypothesis: Significant correlation.")
else:
    print("Fail to reject the null hypothesis: No significant correlation.")


Pearson correlation: -0.06362164333733303
P-value: 0.4601512540636204
Fail to reject the null hypothesis: No significant correlation.


In [153]:
model = ols('Q("Max. sustained winds(mph)") ~ C(Month)', data=df).fit()
anova_table = sm.stats.anova_lm(model, typ=2)

In [154]:
anova_table

Unnamed: 0,sum_sq,df,F,PR(>F)
C(Month),94.000823,4.0,0.382562,0.820795
Residual,8108.553922,132.0,,


In [155]:
p_value = anova_table['PR(>F)'][0]
print(p_value)
if p_value < 0.05:
    print("Reject the null hypothesis: Winds depend on the month.")
else:
    print("Fail to reject the null hypothesis: Winds do not depend on the month.")

0.8207947187971213
Fail to reject the null hypothesis: Winds do not depend on the month.


c. With a 10% level of significance conduct test if “Max. sustained winds(mph)” follows a Poisson distribution.

In [156]:
df.shape

(137, 5)

In [157]:
df.head()

Unnamed: 0,Name,Season,Month,Max. sustained winds(mph),Minimum pressure(mbar)
0,Hurricane #3,1853,August,150,924
1,Hurricane #3,1853,September,150,924
2,"""1856 Last Island Hurricane""",1856,August,150,934
3,Hurricane #6,1866,September,140,938
4,Hurricane #6,1866,October,140,938


In [158]:
winds = df['Max. sustained winds(mph)']

mean_wind = winds.mean()

# Generate observed frequencies (assuming wind speeds are integers)
observed_freq = winds.value_counts().sort_index()

# Generate Poisson probabilities for the observed values
expected_freq = [poisson.pmf(k, mean_wind) for k in observed_freq.index]
total = sum(expected_freq)
expected_freq =  [poisson.pmf(k, mean_wind)*137/total for k in observed_freq.index]
# expected_freq
chi_square_stat, p_value = chisquare(f_obs=observed_freq, f_exp=expected_freq)

print(f"Chi-square statistic: {chi_square_stat}")
print(f"P-value: {p_value}")

# Check significance at 10% level
if p_value < 0.10:
    print("Reject the null hypothesis: Winds do not follow a Poisson distribution.")
else:
    print("Fail to reject the null hypothesis: Winds follow a Poisson distribution.")

Chi-square statistic: 5.4547959472372565
P-value: 0.2437316198723859
Fail to reject the null hypothesis: Winds follow a Poisson distribution.
