# Hypothesis Testing

In [None]:
import os

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import scipy
import seaborn as sns

## Hypothesis Testing on recycled waste

In [None]:
# Load the cleaned dataset, then discard the 'Unnamed: 0' column
# (presumably an index written out when the CSV was saved -- verify).
df = pd.read_csv('../data/cleaned_data/cleaned_data_final.csv')
df = df.drop(columns='Unnamed: 0')
df.head()

In [None]:
# Remove every row that contains at least one missing value (modifies df in place).
df.dropna(inplace=True)

In [None]:
# Count the missing values per column of df.
df.isna().sum()

In [None]:
# Sum total and recycled waste per location; 'location' is returned as a
# regular column rather than as the index.
waste_totals = {'total_waste': 'sum', 'recycled_waste': 'sum'}
df_grouped = df.groupby('location').agg(waste_totals).reset_index()
df_grouped

In [None]:
# Row-wise share of the total waste that was recycled.
df['recyc_waste_ratio'] = df['recycled_waste'].div(df['total_waste'])

In [None]:
# Left panel: recycled waste against total waste.
# Right panel: recycled-waste ratio against total waste.
fig, ax = plt.subplots(1, 2, figsize=(15, 5))

# Labels are set on each Axes explicitly. plt.xlabel/ylabel/title only act on
# the *current* axes (the last subplot created), which left the first panel
# unlabelled.
sns.scatterplot(data=df, y='recycled_waste', x='total_waste', ax=ax[0])
ax[0].set_xlabel('Total Waste')
ax[0].set_ylabel('Total Recycled')
ax[0].set_title('Scatterplot Total Recycled to Total Waste')

sns.scatterplot(data=df, y='recyc_waste_ratio', x='total_waste', ax=ax[1])
ax[1].set_xlabel('Total Waste')
ax[1].set_ylabel('Recycled Waste Ratio')
ax[1].set_title('Scatterplot Recycled Waste Ratio to Total Waste')

fig.tight_layout()
plt.show()

In [None]:
# Side-by-side box plots of the two raw waste columns.
fig, ax = plt.subplots(nrows=1, ncols=2, figsize=(15, 5))
for axis, column in zip(ax, ['total_waste', 'recycled_waste']):
    sns.boxplot(data=df, y=column, ax=axis)
plt.show()

In [None]:
# Pairwise correlations between the two waste columns and their ratio.
ratio_cols = ['recycled_waste', 'total_waste', 'recyc_waste_ratio']
corr_matrix = df[ratio_cols].corr()
corr_matrix

In [None]:
# Rule of thumb used in this notebook: if skewness is less than -1 or greater
# than 1, the distribution is highly skewed and needs transforming.
df[['recycled_waste','total_waste', 'recyc_waste_ratio']].skew()

In [None]:
import pickle
from sklearn.preprocessing import PowerTransformer

# Columns whose skew was inspected above; all three are transformed together.
df2 = df[['recycled_waste','total_waste', 'recyc_waste_ratio']]

transformer = PowerTransformer(method='yeo-johnson')

# fit_transform returns a plain NumPy array, so the column names are restored
# here. Note that df_transformed gets a fresh 0..n-1 index, not df's index.
transformed = transformer.fit_transform(df2)
df_transformed = pd.DataFrame(transformed, columns=df2.columns)

path = "./transformers/"
file_name = "transf_recycled_total_waste.pkl"

# Create the output directory if it is missing so the cell also works on a
# fresh checkout (open() does not create parent directories).
os.makedirs(path, exist_ok=True)
with open(os.path.join(path, file_name), "wb") as file:
    pickle.dump(transformer, file)

In [None]:
# Summary statistics of the power-transformed columns.
df_transformed.describe()

In [None]:
# Same two panels as for the raw data, now on the power-transformed columns.
fig, ax = plt.subplots(1, 2, figsize=(15, 5))

# Labels are set on each Axes explicitly. plt.xlabel/ylabel/title only act on
# the *current* axes (the last subplot created), which left the first panel
# unlabelled.
sns.scatterplot(data=df_transformed, y='recycled_waste', x='total_waste', ax=ax[0])
ax[0].set_xlabel('Total Waste')
ax[0].set_ylabel('Total Recycled')
ax[0].set_title('Scatterplot Total Recycled to Total Waste')

sns.scatterplot(data=df_transformed, y='recyc_waste_ratio', x='total_waste', ax=ax[1])
ax[1].set_xlabel('Total Waste')
ax[1].set_ylabel('Recycled Waste Ratio')
ax[1].set_title('Scatterplot Recycled Waste Ratio to Total Waste')

fig.tight_layout()
plt.show()

In [None]:
# Side-by-side box plots of the two transformed waste columns.
fig, ax = plt.subplots(nrows=1, ncols=2, figsize=(15, 5))
for axis, column in zip(ax, ['total_waste', 'recycled_waste']):
    sns.boxplot(data=df_transformed, y=column, ax=axis)
plt.show()

### Hypothesis Test if recycled waste is dependent on total waste

H0: Recycled waste is not dependent on total waste


In [None]:
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn import metrics

# Single-feature regression on the transformed data: total_waste is the
# predictor and the recycled-waste ratio is the response.
# NOTE(review): H0 above is worded in terms of "recycled waste", while the
# target used here is 'recyc_waste_ratio' -- confirm this is intended.
X = df_transformed[['total_waste']]
y = df_transformed['recyc_waste_ratio']

# Hold out 20% of the rows; the fixed seed keeps the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

model = LinearRegression()
model.fit(X_train, y_train)

path = "./models/"
file_name = "lr_recycled_waste_total_waste.pkl"

# Create the output directory if it is missing so the cell also works on a
# fresh checkout (open() does not create parent directories).
os.makedirs(path, exist_ok=True)
with open(os.path.join(path, file_name), "wb") as file:
    pickle.dump(model, file)

In [None]:
# Statistical testing: check whether the fitted slope differs significantly from zero.

In [None]:
# Slope of the fitted line (the model was fitted on the single feature total_waste).
print(f'Coefficient for total_waste: {model.coef_[0]}')

In [None]:
from scipy.stats import linregress

# Simple linear regression on the training split; the p-value tests the null
# hypothesis that the slope is zero.
fit = linregress(X_train['total_waste'], y_train)
slope, intercept, r_value, p_value, std_err = fit
print(f'P-value: {p_value}')

In [None]:
# Sanity check: sklearn's coefficient and scipy's slope come from least-squares
# fits on the same training data, so they should agree. (A p-value is a
# probability and cannot meaningfully be compared with a regression
# coefficient; the hypothesis decision compares it with the significance level.)
print(model.coef_[0])
np.isclose(slope, model.coef_[0])

In [None]:
# Decide on H0: reject it when the slope's p-value is below the 0.05
# significance level, i.e. when the model is statistically significant.
print("Reject H0" if p_value < 0.05 else "Don't reject H0")

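The coefficient for 'total_waste' is the slope of the fitted line: for every one-unit increase in the (transformed) 'total_waste' variable, the predicted value of the (transformed) response variable (recycling) changes by the value of the coefficient. Note that the two figures previously quoted here, a coefficient of 0.00046 and an increase of approximately 0.2887 units, contradict each other; the per-unit increase must equal the coefficient, so use the value printed by the cell above.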

The p-value being very close to zero suggests that this relationship is statistically significant, supporting the idea that higher total waste corresponds to increased recycling.

Because the p-value is below the significance level of 0.05, there is strong evidence against the null hypothesis. This means the null hypothesis is rejected in favor of the alternative hypothesis. Essentially, it supports the claim that there is a relationship between the 'total_waste' variable and the recycling.

### Hypothesis testing if the recycled amount varies across OECD and non OECD countries

H0: The recycled-waste ratio is not significantly higher in OECD countries in comparison to non-OECD countries.

In [None]:
# Locations treated as non-OECD; every other location is labelled as OECD.
no_oecd_member = [
    'China',
    'India',
    'Latin America',
    'Middle East & North Africa',
    'Other Africa',
    'Other EU',
    'Other Eurasia',
    'Other non-OECD Asia',
]

In [None]:
def assign_oecd_status(location):
    """Return 'no OECD' if ``location`` is listed in ``no_oecd_member``, else 'OECD'."""
    return 'no OECD' if location in no_oecd_member else 'OECD'


# Add an 'oecd_status' column derived from each row's 'location'.
df['oecd_status'] = df['location'].apply(assign_oecd_status)

In [None]:
# Recycled-waste ratio per group, as two Series.
# no transformation needed here. skew was in range
is_oecd = df['oecd_status'] == 'OECD'
df_oecd = df.loc[is_oecd, 'recyc_waste_ratio']
df_non_oecd = df.loc[~is_oecd, 'recyc_waste_ratio']

In [None]:
# Summary statistics of the recycled-waste ratio for the OECD group.
df_oecd.describe()

In [None]:
# Summary statistics of the recycled-waste ratio for the non-OECD group.
df_non_oecd.describe()

In [None]:
from scipy import stats

# Independent two-sample t-test on the recycled-waste ratio of the two groups.
# equal_var=False selects Welch's variant (no equal-variance assumption).
# NOTE(review): this call is two-sided, while H0 above is worded directionally
# ("not higher in OECD countries") -- confirm which test is intended.
welch_result = stats.ttest_ind(df_oecd, df_non_oecd, equal_var=False)
t_stat, p_value = welch_result.statistic, welch_result.pvalue

# Persist the test statistic and p-value next to the notebook.
with open('t_test_results.pkl', 'wb') as f:
    pickle.dump({'t_stat': t_stat, 'p_value': p_value}, f)

# Compare the p-value with the significance level and report the outcome.
alpha = 0.05
print(p_value)
message = (
    "There is a statistically significant difference between OECD and non-OECD countries."
    if p_value < alpha
    else "There is no statistically significant difference."
)
print(message)

With a p-value of 0.7518, which is higher than the typical significance level of 0.05, there isn't sufficient evidence to reject the null hypothesis. Therefore, based on this analysis, the recycled-waste-ratio is not significantly higher in OECD countries compared to non-OECD countries.

## Hypothesis Testing of GDP influence on Plastic Waste Generation

HYPOTHESIS TESTING: linear regression analysis

Hypothesis H0: plastic waste per capita is not dependent on GDP per Capita.

In [None]:
# Load the GDP / population dataset (file name indicates 2010 data) used for
# the plastic-waste analysis below.
gdp_df_2010 = pd.read_csv('../data/cleaned_data/2010_gdp_pop_final.csv')

In [None]:
# Correlation between GDP per capita and per capita plastic waste.
gdp_per_capita = gdp_df_2010['gdp_per_capita']
waste_per_capita = gdp_df_2010['per_capita_plastic_waste']
correlation = gdp_per_capita.corr(waste_per_capita)
correlation

A correlation of 0.37 suggests a moderate positive relationship.
Because the correlation is positive, per capita plastic waste tends to increase as GDP per capita increases.

In [None]:
# Correlation between population size and per capita plastic waste.
population = gdp_df_2010['population']
waste_per_capita = gdp_df_2010['per_capita_plastic_waste']
correlation = population.corr(waste_per_capita)
correlation

The correlation is negative and relatively weak, indicating that there is no strong linear relationship between population size and per capita plastic waste.

In [None]:
# Scatter plot of per capita plastic waste against GDP per capita.
fig, ax = plt.subplots()
ax.scatter(gdp_df_2010['gdp_per_capita'], gdp_df_2010['per_capita_plastic_waste'])
ax.set_xlabel('Independent Variable GDP')
ax.set_ylabel('Dependent Variable waste per capita')
ax.set_title('Scatter Plot')
plt.show()

In [None]:
# Box plot of the raw per capita plastic waste values.
fig, ax = plt.subplots()
ax.boxplot(gdp_df_2010['per_capita_plastic_waste'])
ax.set_title('Box Plot of per capita plastic waste')
plt.show()

In [None]:
# Rule of thumb used in this notebook: if skewness is less than -1 or greater
# than 1, the distribution is highly skewed and needs transforming.
gdp_df_2010[['gdp_per_capita', 'per_capita_plastic_waste']].skew()

In [None]:
from sklearn.preprocessing import PowerTransformer

# Variables for the GDP analysis; both are transformed together.
gdp_df2 = gdp_df_2010[['gdp_per_capita', 'per_capita_plastic_waste']]

transformer2 = PowerTransformer(method='yeo-johnson')

# Fit the dedicated transformer2. Calling the earlier `transformer` here
# refitted that object on the GDP data and left transformer2 unfitted, so the
# file written below contained an unusable transformer.
transformed2 = transformer2.fit_transform(gdp_df2)
# fit_transform returns a plain NumPy array, so the column names are restored.
df_transformed2 = pd.DataFrame(transformed2, columns=gdp_df2.columns)

path = "./transformers/"
file_name = "transf_gdp_per_capita_waste.pkl"

# Create the output directory if it is missing so the cell also works on a
# fresh checkout (open() does not create parent directories).
os.makedirs(path, exist_ok=True)
with open(os.path.join(path, file_name), "wb") as file:
    pickle.dump(transformer2, file)

In [None]:
# Box plot of the transformed per capita plastic waste values.
fig, ax = plt.subplots()
ax.boxplot(df_transformed2['per_capita_plastic_waste'])
ax.set_title('Box Plot of per capita plastic waste')
plt.show()

In [None]:
# Single-feature regression on the transformed data: GDP per capita is the
# predictor and per capita plastic waste is the response.
X = df_transformed2[['gdp_per_capita']]
y = df_transformed2['per_capita_plastic_waste']

# Split the data into training and testing sets (20% held out, fixed seed)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Create a linear regression model
model2 = LinearRegression()

# Fit the model to the training data
model2.fit(X_train, y_train)

# Print the coefficient for gdp_per_capita
print(f'Coefficient for gdp_per_capita: {model2.coef_[0]}')

# Statistical test: the p-value tests the null hypothesis that the slope is zero
from scipy.stats import linregress
slope, intercept, r_value, p_value, std_err = linregress(X_train['gdp_per_capita'], y_train)
print(f'P-value: {p_value}')

path = "./models/"
file_name = "lr_gdp_per_capita_waste.pkl"

# Create the output directory if it is missing so the cell also works on a
# fresh checkout (open() does not create parent directories).
os.makedirs(path, exist_ok=True)
with open(os.path.join(path, file_name), "wb") as file:
    pickle.dump(model2, file)

In [None]:
# Sanity check: sklearn's coefficient and scipy's slope come from least-squares
# fits on the same training data, so they should agree. (A p-value is a
# probability and cannot meaningfully be compared with a regression coefficient.)
print(np.isclose(slope, model2.coef_[0]))

# Testing if the H0 needs to be rejected and determine if the overall model is statistically significant
if p_value < 0.05:
    print("Reject H0")
else:
    print("Don't reject H0")

The coefficient for gdp_per_capita of approximately 0.612 indicates that, in the context of the analysis and after transformations, for every one-unit increase in the transformed GDP per capita, the transformed plastic waste per capita increases by approximately 0.612 units. (GDP per capita is the only predictor in this model, so there are no other variables to hold constant.)

With a p-value of 3.53e-13, which is far below the significance level of 0.05, there is strong evidence to reject the null hypothesis. The statistical analysis therefore indicates that plastic waste per capita is indeed related to, or dependent on, GDP per capita.

The statistically significant relationship found between GDP per capita and plastic waste per capita suggests that there is an association between these variables, supporting the alternative hypothesis (H1) that there is some relationship or dependence between the two.