In [3]:
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
%matplotlib inline

In [4]:
# Read the height/weight records from the CSV file into a DataFrame.
# NOTE(review): the rendered preview includes an 'Unnamed: 0' column, which
# suggests the CSV carries a saved row index -- confirm whether anything
# downstream needs it before switching to index_col=0.
df = pd.read_csv('weight-height.csv')

# Bare last expression: the notebook renders the first rows as a table.
df.head()

Unnamed: 0,Gender,Height,Weight
0,Male,73.847017,241.893563
1,Male,68.781904,162.310473
2,Male,74.110105,212.740856
3,Male,71.730978,220.04247
4,Male,69.881796,206.349801


In [None]:
# Dataset overview: dimensions, per-column dtypes, then summary statistics.
print("Dataset Shape:", df.shape)

# sep="\n" places the dtype listing on the line after its header.
print("\nData Types:", df.dtypes, sep="\n")

print("\nBasic Statistics:")
# Left as the bare last expression so the notebook renders describe() as a table.
df.describe()

In [None]:
# Data-quality check: null count per column, then row count per Gender value.
# sep="\n" places each result on the line after its header.
print("Missing Values:", df.isnull().sum(), sep="\n")
print("\nGender Distribution:", df['Gender'].value_counts(), sep="\n")

In [None]:
## Data Visualization

In [None]:
# Side-by-side histograms of the Height and Weight columns.
fig, axes = plt.subplots(1, 2, figsize=(14, 5))

# One entry per panel: (column, bar colour, x-axis label, panel title).
panels = [
    ('Height', 'steelblue', 'Height (inches)', 'Distribution of Height'),
    ('Weight', 'coral', 'Weight (lbs)', 'Distribution of Weight'),
]

for ax, (column, bar_color, x_label, panel_title) in zip(axes, panels):
    ax.hist(df[column], bins=30, color=bar_color, edgecolor='black', alpha=0.7)
    ax.set_xlabel(x_label)
    ax.set_ylabel('Frequency')
    ax.set_title(panel_title)

# Adjust spacing between the two panels before rendering.
fig.tight_layout()
plt.show()

In [None]:
# Height vs. weight for every row, drawn on an explicit Axes object.
plt.figure(figsize=(10, 6))
ax = plt.gca()  # the axes pyplot's module-level helpers would draw on

ax.scatter(
    df['Height'],
    df['Weight'],
    alpha=0.5,
    c='steelblue',
    edgecolor='black',
    linewidth=0.5,
)
ax.set_xlabel('Height (inches)')
ax.set_ylabel('Weight (lbs)')
ax.set_title('Height vs Weight - Scatter Plot')
ax.grid(True, alpha=0.3)  # faint grid to help read values off the axes
plt.show()

In [None]:
# Height vs. weight scatter with one colour-coded, legend-labelled series per
# gender value found in the data.
plt.figure(figsize=(10, 6))
colors = {'Male': 'blue', 'Female': 'red'}

# unique() also yields NaN when Gender has missing entries; such a value
# matches no rows and has no colour, so missing labels are dropped up front.
for gender in df['Gender'].dropna().unique():
    subset = df[df['Gender'] == gender]
    plt.scatter(
        subset['Height'],
        subset['Weight'],
        alpha=0.5,
        # Labels absent from the colour map are drawn in gray instead of
        # aborting the whole plot with a KeyError.
        c=colors.get(gender, 'gray'),
        label=gender,
        edgecolor='black',
        linewidth=0.3,
    )

plt.xlabel('Height (inches)')
plt.ylabel('Weight (lbs)')
plt.title('Height vs Weight by Gender')
plt.legend()
plt.grid(True, alpha=0.3)
plt.show()

## Correlation Analysis

In [None]:
# Correlation between Height and Weight (Series.corr with its default method),
# printed to four decimal places.
correlation = df['Height'].corr(df['Weight'])
print(f"Correlation between Height and Weight: {correlation:.4f}")

# Correlation matrix of the two numeric columns, drawn as an annotated heatmap.
# `sns` (seaborn) comes from the notebook's top-level imports rather than being
# imported mid-notebook, so dependencies are all declared in one place.
plt.figure(figsize=(8, 6))
numeric_df = df[['Height', 'Weight']]
# center=0 anchors the diverging 'coolwarm' palette at zero correlation.
sns.heatmap(numeric_df.corr(), annot=True, cmap='coolwarm', center=0, fmt='.4f')
plt.title('Correlation Heatmap')
plt.show()