# Importing libraries

In [None]:
import pandas as pd

In [None]:
import numpy as np

In [None]:
import statsmodels.api as sm

In [None]:
from statsmodels.formula.api import ols

In [None]:
import matplotlib.pyplot as plt

In [None]:
import seaborn as sns

# Load the dataset into a data frame

In [None]:
# Absolute location of the Excel workbook holding the lung-capacity records.
DATA_PATH = '/Users/ajnazikamir/Desktop/LungCapData2.xlsx'
lungcapacity_df = pd.read_excel(DATA_PATH)

# A bare expression on the last line of a notebook cell renders the frame.
lungcapacity_df

# Display the first few rows of the DataFrame to understand its structure

In [None]:
# Grab the leading rows first, then print them to inspect the column layout.
first_rows = lungcapacity_df.head()
print(first_rows)

# Check the data types and structure of the DataFrame

In [None]:
# DataFrame.info() writes its report (column names, non-null counts, dtypes)
# itself and returns None, so it is called directly: wrapping it in print()
# would append a stray "None" line after the report.
lungcapacity_df.info()

# Check for missing data

In [None]:
# Flag every missing cell, then add the flags up column by column.
missing_per_column = lungcapacity_df.isnull().sum()
print(missing_per_column)

# Summarize the numeric columns with descriptive statistics (count, mean, std, min, quartiles, max)

In [None]:
# Compute the summary table once, then print it.
summary_stats = lungcapacity_df.describe()
print(summary_stats)

# Check for null values

In [None]:
# Per-column tally of missing entries; isna() is the same check as isnull().
null_values = lungcapacity_df.isna().sum()
print("Null values in the dataset:\n", null_values)

# Alternative: store the descriptive statistics in a variable, then print them with a label

In [None]:
# Keep the summary table in `description` so it can be reused after printing.
description = lungcapacity_df.describe()
description_label = "Descriptive Statistical Analysis:\n"
print(description_label, description)

# Visualize the distribution of numerical variables

In [None]:
# Distribution of Age
#
# Exactly one figure is opened for this plot; an additional figure that is
# never drawn on would be displayed as a blank canvas by plt.show().
# All statements live in the same cell so they act on the same figure.
plt.figure(figsize=(8, 6))
sns.histplot(lungcapacity_df['Age'], kde=True, color='green', bins=50)
plt.title('Distribution of Age')
plt.xlabel('Age')
plt.ylabel('Frequency')
plt.show()

# Distribution of Lung Capacity

In [None]:
# Histogram of LungCap (50 bins) with a kernel-density overlay, drawn on an
# explicitly created Axes.
fig, ax = plt.subplots(figsize=(8, 6))
sns.histplot(lungcapacity_df['LungCap'], kde=True, color='blue', bins=50, ax=ax)
ax.set_title('Distribution of Lung Capacity')
ax.set_xlabel('Lung Capacity')
ax.set_ylabel('Frequency')
plt.show()

# Distribution of Height

In [None]:
# Histogram of Height (50 bins) with a kernel-density overlay.
# Axis labels and plt.show() mirror the Age and Lung Capacity plots, so this
# figure is labelled the same way and rendered here rather than at some later
# plt.show() call.
plt.figure(figsize=(8, 6))
sns.histplot(lungcapacity_df['Height'], kde=True, color='red', bins=50)
plt.title('Distribution of Height')
plt.xlabel('Height')
plt.ylabel('Frequency')
plt.show()

# Count the occurrences of each gender

In [None]:
# Tally how often each Gender value occurs.
gender_column = lungcapacity_df['Gender']
gender_counts = gender_column.value_counts()
print("Occurrences of each gender:\n", gender_counts)

# Visualize the count of occurrences for each gender

In [None]:
# Bar chart of the number of rows per Gender value.
#
# `order=gender_counts.index` tells seaborn which category goes in which bar
# position, so each bar is labelled by seaborn with its own category.
# Relabelling the ticks afterwards from value_counts() is avoided on purpose:
# value_counts() is sorted by frequency while countplot picks its own category
# order, so that relabelling could attach the wrong name to a bar.
plt.figure(figsize=(6, 4))
sns.countplot(
    x='Gender',
    data=lungcapacity_df,
    palette='viridis',
    order=gender_counts.index,
)
plt.title('Occurrences of Each Gender')
plt.xlabel('Gender')
plt.ylabel('Count')
plt.show()

# Count the occurrences of smoking status

In [None]:
# Tally how often each Smoke value occurs.
smoke_column = lungcapacity_df['Smoke']
smoking_counts = smoke_column.value_counts()
print("Occurrences of smoking status:\n", smoking_counts)

# Visualize the count of occurrences for smoking status

In [None]:
# Bar chart of the number of rows per Smoke value.
#
# `order=smoking_counts.index` fixes the bar positions so seaborn labels each
# bar with its own category. Relabelling the ticks afterwards from
# value_counts() is avoided on purpose: value_counts() is sorted by frequency
# while countplot picks its own category order, so that relabelling could
# attach the wrong name to a bar.
plt.figure(figsize=(6, 4))
sns.countplot(
    x='Smoke',
    data=lungcapacity_df,
    palette='viridis',
    order=smoking_counts.index,
)
plt.title('Occurrences of Smoking Status')
plt.xlabel('Smoking Status')
plt.ylabel('Count')
plt.show()

# Convert categorical variables to numeric using one-hot encoding

In [None]:
# Replace the listed categorical columns with indicator columns;
# drop_first=True omits the indicator for the first level of each column.
categorical_columns = ['Gender', 'Smoke']
lungcapacity_df_encoded = pd.get_dummies(
    lungcapacity_df,
    columns=categorical_columns,
    drop_first=True,
)

# Correlation matrix

In [None]:
# Pairwise correlation of the encoded columns (Pearson is corr()'s default,
# spelled out here for clarity).
correlation_matrix = lungcapacity_df_encoded.corr(method='pearson')
print("Correlation matrix:\n", correlation_matrix)

# Heatmap of correlation matrix

In [None]:
# Annotated heatmap of `correlation_matrix`; each cell shows the coefficient
# with two decimals.
fig, ax = plt.subplots(figsize=(8, 6))
sns.heatmap(
    correlation_matrix,
    annot=True,
    cmap='coolwarm',
    fmt=".2f",
    annot_kws={"size": 10},
    ax=ax,
)
ax.set_title('Correlation Matrix')
plt.show()

# Select columns 'Age', 'LungCap', and 'Height' for correlation matrix

In [None]:
# Restrict the frame to the three columns of interest before correlating.
# NOTE: this rebinds `correlation_matrix`, replacing any earlier value.
selected_columns = ['Age', 'LungCap', 'Height']
selected_frame = lungcapacity_df.loc[:, selected_columns]
correlation_matrix = selected_frame.corr()

# Print correlation matrix
print("Correlation matrix for Age, LungCap, and Height:\n", correlation_matrix)

# Heatmap of correlation matrix

In [None]:
# Annotated heatmap of the current `correlation_matrix`, two decimals per cell.
fig, ax = plt.subplots(figsize=(6, 4))
sns.heatmap(
    correlation_matrix,
    annot=True,
    cmap='coolwarm',
    fmt=".2f",
    annot_kws={"size": 10},
    ax=ax,
)
ax.set_title('Correlation Matrix (Age, LungCap, Height)')
plt.show()

# Cross-tabulations between 'Gender' and 'Smoke'

In [None]:
# Frequency table: one row per Gender value, one column per Smoke value.
cross_tab = pd.crosstab(
    index=lungcapacity_df['Gender'],
    columns=lungcapacity_df['Smoke'],
)
print("Cross-tabulations between Gender and Smoke:\n", cross_tab)

# Summary table comparing mean Lung Capacity across different genders

In [None]:
# Descriptive statistics of LungCap computed separately for each Gender value.
lungcap_by_gender = lungcapacity_df.groupby('Gender')['LungCap']
summary_gender = lungcap_by_gender.describe()
print("Summary table comparing mean LungCap across different genders:\n", summary_gender)

# Summary table comparing mean Lung Capacity across different smoking statuses

In [None]:
# Descriptive statistics of LungCap computed separately for each Smoke value.
lungcap_by_smoke = lungcapacity_df.groupby('Smoke')['LungCap']
summary_smoke = lungcap_by_smoke.describe()
print("Summary table comparing mean LungCap across different smoking statuses:\n", summary_smoke)

# Box plot of Lung Capacity by Gender

In [None]:
# Box plot of LungCap split by Gender; the Axes returned by seaborn is used
# to set the title and axis labels in one call.
plt.figure(figsize=(8, 6))
ax = sns.boxplot(data=lungcapacity_df, x='Gender', y='LungCap')
ax.set(
    title='Box plot of LungCap by Gender',
    xlabel='Gender',
    ylabel='Lung Capacity',
)
plt.show()

# Box plot of Lung Capacity by Smoking Status

In [None]:
# Box plot of LungCap split by Smoke; the Axes returned by seaborn is used
# to set the title and axis labels in one call.
plt.figure(figsize=(8, 6))
ax = sns.boxplot(data=lungcapacity_df, x='Smoke', y='LungCap')
ax.set(
    title='Box plot of LungCap by Smoking Status',
    xlabel='Smoking Status',
    ylabel='Lung Capacity',
)
plt.show()

# Scatter plot of Lung Capacity vs Height

In [None]:
# Scatter of LungCap (y) against Height (x), one point per row.
plt.figure(figsize=(8, 6))
ax = sns.scatterplot(data=lungcapacity_df, x='Height', y='LungCap')
ax.set(
    title='Scatter plot of Lung Capacity vs Height',
    xlabel='Height',
    ylabel='Lung Capacity',
)
plt.show()

# Scatter plot of Lung Capacity vs Age

In [None]:
# Scatter of LungCap (y) against Age (x), one point per row.
plt.figure(figsize=(8, 6))
ax = sns.scatterplot(data=lungcapacity_df, x='Age', y='LungCap')
ax.set(
    title='Scatter plot of Lung Capacity vs Age',
    xlabel='Age',
    ylabel='Lung Capacity',
)
plt.show()

In [None]:
from scipy.stats import chi2_contingency

# Create a contingency table between 'Gender' and 'Smoke'

In [None]:
# Create a contingency table between 'Gender' and 'Smoke'
contingency_table = pd.crosstab(
    index=lungcapacity_df['Gender'],
    columns=lungcapacity_df['Smoke'],
)

# Perform the chi-square test
# The result unpacks, in order, into: test statistic, p-value, degrees of
# freedom, and the table of expected frequencies.
chi2_stat, p_val, dof, expected = chi2_contingency(contingency_table)

# Print the results
chi2_report = (
    ("Chi-square statistic:", chi2_stat),
    ("P-value:", p_val),
    ("Degrees of freedom:", dof),
    ("Expected frequencies:\n", expected),
)
for report_label, report_value in chi2_report:
    print(report_label, report_value)

# Determine the significance of the results

In [None]:
alpha = 0.05  # significance level

# Compare the chi-square p-value with alpha and report the verdict.
# The if/else must live in one cell with indented bodies: split across cells
# (or without indentation) it is a syntax error.
if p_val < alpha:
    print("There is a significant association between Gender and Smoking Status.")
else:
    print("There is no significant association between Gender and Smoking Status.")

# Additional information on who smokes and who doesn't by gender

In [None]:
# Count the Smoke values separately within each Gender group.
smoke_grouped_by_gender = lungcapacity_df.groupby('Gender')['Smoke']
smoking_by_gender = smoke_grouped_by_gender.value_counts()
print("\nSmoking status by gender:\n", smoking_by_gender)

# Model 1: LungCap vs Height

In [None]:
# Three nested OLS fits of LungCap on increasing powers of Height.
# The power terms are stored as extra columns on `lungcapacity_df` so the
# formulas can refer to them by name.
model1 = ols(formula='LungCap ~ Height', data=lungcapacity_df).fit()

# Model 2: LungCap vs (Height + Height^2)
lungcapacity_df['Height_squared'] = lungcapacity_df['Height'].pow(2)
model2 = ols(
    formula='LungCap ~ Height + Height_squared',
    data=lungcapacity_df,
).fit()

# Model 3: LungCap vs (Height + Height^2 + Height^3)
lungcapacity_df['Height_cubed'] = lungcapacity_df['Height'].pow(3)
model3 = ols(
    formula='LungCap ~ Height + Height_squared + Height_cubed',
    data=lungcapacity_df,
).fit()

# ANOVA for Model 1 vs Model 2

In [None]:
# Compare each model with the next larger one via anova_lm.
compare_models = sm.stats.anova_lm

anova_results1_2 = compare_models(model1, model2)
print("ANOVA results for Model 1 vs Model 2:\n", anova_results1_2)

# ANOVA for Model 2 vs Model 3
anova_results2_3 = compare_models(model2, model3)
print("\nANOVA results for Model 2 vs Model 3:\n", anova_results2_3)

# Create a range of values for Height

In [None]:
# 100 evenly spaced heights spanning the observed range.
height_min = lungcapacity_df['Height'].min()
height_max = lungcapacity_df['Height'].max()
height_values = np.linspace(height_min, height_max, 100)

# Predictions for each model
# Each model is fed exactly the regressors named in its formula; the power
# terms are computed once and shared.
height_values_squared = height_values**2
height_values_cubed = height_values**3

model1_predictions = model1.predict({'Height': height_values})
model2_predictions = model2.predict({
    'Height': height_values,
    'Height_squared': height_values_squared,
})
model3_predictions = model3.predict({
    'Height': height_values,
    'Height_squared': height_values_squared,
    'Height_cubed': height_values_cubed,
})

# Plot all 3 models

In [None]:
# Observed points plus the three fitted curves on a single Axes.
fig, ax = plt.subplots(figsize=(10, 6))
ax.scatter(
    lungcapacity_df['Height'],
    lungcapacity_df['LungCap'],
    color='black',
    label='Actual',
    alpha=0.5,
)

# (predictions, line colour, legend label) for each model, in drawing order.
fitted_curves = (
    (model1_predictions, 'red', 'Model 1'),
    (model2_predictions, 'blue', 'Model 2'),
    (model3_predictions, 'green', 'Model 3'),
)
for curve_values, curve_color, curve_label in fitted_curves:
    ax.plot(height_values, curve_values, color=curve_color, label=curve_label, linewidth=2)

ax.set_xlabel('Height')
ax.set_ylabel('Lung Capacity')
ax.set_title('Comparison of Models')
ax.legend()
ax.grid(True)
plt.show()

# Create tables for coefficients and goodness-of-fit statistics

In [None]:
# Second table (index 1) of each model's summary, paired with a title.
tables = [model1.summary().tables[1], model2.summary().tables[1], model3.summary().tables[1]]
titles = ['Model 1 Coefficients', 'Model 2 Coefficients', 'Model 3 Coefficients']

# The loop header and its indented body must share one cell; split across
# cells (or without indentation) the loop is a syntax error.
for title, table in zip(titles, tables):
    print(title)
    print(table)

# Goodness of fit: R-squared of each model.
print("\nModel 1 R-squared:", model1.rsquared)
print("Model 2 R-squared:", model2.rsquared)
print("Model 3 R-squared:", model3.rsquared)