# Linear Regression

This code generates a population of 1000 random heights, draws a sample of 100 heights from that population, and calculates the sample mean, the sample standard deviation, and a 95% confidence interval for the population mean.

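It also calculates the correlation between height and weight in a synthetic dataset of 500 people and fits a linear regression model that predicts weight from height on the same dataset. Because the two columns are generated independently of each other, the correlation is expected to be close to zero.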

In [4]:
import numpy as np
import pandas as pd
from sklearn.linear_model import LinearRegression

# ---------------------------------------------------------------------------
# Part 1: estimate a population mean from a random sample
# ---------------------------------------------------------------------------

# Parameters of the simulated height distribution (inches). Naming them once
# keeps the confidence-interval formula below in sync with the data generator.
HEIGHT_MEAN = 68
HEIGHT_STD = 3
POPULATION_SIZE = 1000
SAMPLE_SIZE = 100

# Generate a population of 1000 random heights (in inches) with mean 68 and standard deviation 3
population = np.random.normal(HEIGHT_MEAN, HEIGHT_STD, POPULATION_SIZE)

# Take a sample of 100 heights from the population.
# np.random.choice draws with replacement by default; that is made explicit
# here, so the same height may appear more than once in the sample.
sample = np.random.choice(population, SAMPLE_SIZE, replace=True)

# Calculate the mean and standard deviation of the sample.
# ddof=1 applies Bessel's correction: the value is reported as a *sample*
# standard deviation, so it must divide by n - 1 rather than n.
sample_mean = np.mean(sample)
sample_std_dev = np.std(sample, ddof=1)

# Calculate a 95% confidence interval for the population mean based on the sample.
# This is a z-interval that uses the known standard deviation of the
# generating distribution (HEIGHT_STD) and the actual sample size.
z_critical = 1.96  # for a 95% confidence interval
margin_of_error = z_critical * (HEIGHT_STD / np.sqrt(sample.size))
confidence_interval = (sample_mean - margin_of_error, sample_mean + margin_of_error)

# ---------------------------------------------------------------------------
# Part 2: correlation and linear regression on a synthetic dataset
# ---------------------------------------------------------------------------

DATASET_SIZE = 500

# Generate a random dataset of 500 people with height and weight columns.
# NOTE: the two columns are drawn independently of each other, so there is no
# built-in relationship between them; a correlation near zero and an almost
# flat regression line are the expected outcome.
data = pd.DataFrame({
    "Height": np.random.normal(HEIGHT_MEAN, HEIGHT_STD, DATASET_SIZE),
    "Weight": np.random.normal(150, 20, DATASET_SIZE),
})

# Calculate the correlation between height and weight in the dataset
correlation = data["Height"].corr(data["Weight"])

# Fit a linear regression model to predict weight based on height in the dataset.
# X must be 2-D (n_samples, n_features) for scikit-learn. y is kept as a
# column vector as well, so predicted_weights has shape (n_samples, 1).
X = data["Height"].values.reshape(-1, 1)
y = data["Weight"].values.reshape(-1, 1)
regressor = LinearRegression()
regressor.fit(X, y)
predicted_weights = regressor.predict(X)

# Print the results
print("Population mean height:", np.mean(population))
print("Sample mean height:", sample_mean)
print("Sample standard deviation:", sample_std_dev)
print("95% confidence interval for population mean height:", confidence_interval)
print("Correlation between height and weight:", correlation)
print("Predicted weights based on height:")
print(predicted_weights)

Population mean height: 68.05856845004679
Sample mean height: 68.13877426145227
Sample standard deviation: 2.7725216186750425
95% confidence interval for population mean height: (67.55077426145228, 68.72677426145226)
Correlation between height and weight: -0.02940996853908253
Predicted weights based on height:
[[152.57707712]
 [152.72530972]
 [152.22844628]
 [151.57058413]
 [151.6664627 ]
 [152.40703015]
 [150.95697336]
 [151.33578348]
 [152.01651072]
 [152.06949417]
 [151.91616741]
 [151.49366741]
 [151.11497771]
 [151.90353623]
 [152.39017792]
 [152.7932614 ]
 [151.88555215]
 [152.29794391]
 [151.07939253]
 [151.5443074 ]
 [151.74443591]
 [151.01576179]
 [151.85683704]
 [151.65208739]
 [151.87396634]
 [151.88370101]
 [150.7090615 ]
 [152.77783292]
 [152.2093013 ]
 [151.74368141]
 [152.52651786]
 [152.21800681]
 [150.71893943]
 [152.42792824]
 [152.04269859]
 [153.11889807]
 [152.32775602]
 [151.45635785]
 [151.14516417]
 [152.05667103]
 [152.08700123]
 [152.19463038]
 [151.77820506]
