In [None]:
T-distribution:
# The T-distribution, also known as Student's T-distribution, is a probability distribution used in statistics, particularly in hypothesis testing and confidence interval estimation when the sample size is small and/or the population standard deviation is unknown.
# It is similar to the normal distribution but has heavier tails, meaning it is more prone to producing values far from its mean.
# As the sample size increases, the T-distribution approaches the normal distribution.

In [1]:
import numpy as np
import scipy.stats as stats

# Given data
sample_mean = 150
sample_std = 10
sample_size = 15
confidence_level = 0.95

# Calculate the standard error of the mean (SEM)
sem = sample_std / np.sqrt(sample_size)

# Calculate the t-critical value
t_critical = stats.t.ppf((1 + confidence_level) / 2, df=sample_size-1)

# Calculate the margin of error
margin_of_error = t_critical * sem

# Calculate the confidence interval
confidence_interval = (sample_mean - margin_of_error, sample_mean + margin_of_error)

# Display the results
print(f"Sample Mean: {sample_mean} grams")
print(f"T-Critical Value: {t_critical:.3f}")
print(f"Margin of Error: {margin_of_error:.3f} grams")
print(f"95% Confidence Interval: {confidence_interval} grams")


Sample Mean: 150 grams
T-Critical Value: 2.145
Margin of Error: 5.538 grams
95% Confidence Interval: (144.4621845843536, 155.5378154156464) grams


In [None]:
# Data Preprocessing:
# Data preprocessing is the process of transforming raw data into a clean and usable format before feeding it into a machine learning model. This step is crucial for improving the model's performance.
# Common steps in data preprocessing include:
# Data Cleaning: Handling missing values, removing duplicates, and correcting errors in the data.
# Data Transformation: Normalizing or standardizing data, encoding categorical variables, and transforming skewed data.
# Feature Engineering: Creating new features from existing data to improve model accuracy.
# Data Splitting: Dividing the dataset into training, validation, and test sets.

In [None]:
# Given Dataset (Hypothetical):
# Columns:
# CustomerID: Unique identifier for each customer
# Age: Age of the customer
# Gender: Gender of the customer (Male, Female)
# AnnualIncome: Annual income of the customer
# Purchased: Whether the customer made a purchase (0: No, 1: Yes)
# Steps in Data Preprocessing:
# Handling Missing Values
# Encoding Categorical Variables
# Feature Scaling
# Splitting the Dataset into Training and Test Sets

In [2]:
import pandas as pd
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.model_selection import train_test_split

# Step 1: Load the data into a DataFrame
data = {
    'CustomerID': [1, 2, 3, 4, 5],
    'Age': [25, 30, 35, None, 40],
    'Gender': ['Male', 'Female', 'Female', 'Male', 'Female'],
    'Salary': [50000, None, 60000, 70000, 80000],
    'Purchased': ['Yes', 'No', 'Yes', 'No', 'Yes']
}

df = pd.DataFrame(data)

# Step 2: Handling Missing Values
imputer = SimpleImputer(strategy='mean')
df['Age'] = imputer.fit_transform(df[['Age']])
df['Salary'] = imputer.fit_transform(df[['Salary']])

# Step 3: Encoding Categorical Variables
label_encoder_gender = LabelEncoder()
df['Gender'] = label_encoder_gender.fit_transform(df['Gender'])

label_encoder_purchased = LabelEncoder()
df['Purchased'] = label_encoder_purchased.fit_transform(df['Purchased'])

# Step 4: Feature Scaling
scaler = StandardScaler()
df[['Age', 'Salary']] = scaler.fit_transform(df[['Age', 'Salary']])

# Step 5: Splitting the Data into Training and Test Sets
X = df[['Age', 'Gender', 'Salary']]  # Features
y = df['Purchased']  # Target variable

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Display the processed data and the train/test splits
print("Processed DataFrame:")
print(df)
print("\nTraining Features (X_train):")
print(X_train)
print("\nTest Features (X_test):")
print(X_test)
print("\nTraining Labels (y_train):")
print(y_train)
print("\nTest Labels (y_test):")
print(y_test)


  from pandas.core import (


Processed DataFrame:
   CustomerID  Age  Gender  Salary  Purchased
0           1 -1.5       1    -1.5          1
1           2 -0.5       0     0.0          0
2           3  0.5       0    -0.5          1
3           4  0.0       1     0.5          0
4           5  1.5       0     1.5          1

Training Features (X_train):
   Age  Gender  Salary
4  1.5       0     1.5
2  0.5       0    -0.5
0 -1.5       1    -1.5
3  0.0       1     0.5

Test Features (X_test):
   Age  Gender  Salary
1 -0.5       0     0.0

Training Labels (y_train):
4    1
2    1
0    1
3    0
Name: Purchased, dtype: int32

Test Labels (y_test):
1    0
Name: Purchased, dtype: int32
