In [None]:
#T-distribution:
# The T-distribution, also known as Student's T-distribution, 
#is a probability distribution used in statistics, particularly in hypothesis testing
#and confidence interval estimation when the sample size is small and/or the population standard deviation is unknown.
# It is similar to the normal distribution but has heavier tails, meaning 
#it is more prone to producing values far from its mean.
# As the sample size increases, the T-distribution approaches the normal distribution.

In [6]:
import numpy as np
import scipy.stats as stats

In [7]:
# Given data: summary statistics of the sample and the desired confidence level
sample_mean, sample_std, sample_size = 150, 10, 15
confidence_level = 0.95

In [8]:
# Standard error of the mean: sample standard deviation divided by sqrt(n)
root_n = np.sqrt(sample_size)
sem = sample_std / root_n
sem

2.581988897471611

In [9]:
# T-critical value for a two-sided interval: the upper quantile (1 + c) / 2
# leaves (1 - c) / 2 of the probability in each tail; df = n - 1.
degrees_of_freedom = sample_size - 1
upper_quantile = (1 + confidence_level) / 2
t_critical = stats.t.ppf(upper_quantile, df=degrees_of_freedom)

# Margin of error: half-width of the confidence interval
margin_of_error = sem * t_critical

In [10]:
# Display the margin of error (t-critical value times the standard error).
margin_of_error

5.537815415646416

In [None]:
#calculate the confidence interval

In [11]:
# Confidence interval: sample mean minus/plus the margin of error
lower_bound = sample_mean - margin_of_error
upper_bound = sample_mean + margin_of_error
confidence_intervel = (lower_bound, upper_bound)

In [12]:
# Display the (lower, upper) bounds of the confidence interval.
confidence_intervel


(144.4621845843536, 155.5378154156464)

In [13]:
#Display the result
# Print each computed quantity next to its label, one per line.
for label, value in (
    ("sampleMean::", sample_mean),
    ("T-CriticalValue::", t_critical),
    ("Margin of error::", margin_of_error),
    ("Confidence Intervel::", confidence_intervel),
):
    print(label, value)

sampleMean:: 150
T-CriticalValue:: 2.1447866879169273
Margin of error:: 5.537815415646416
Confidence Intervel:: (144.4621845843536, 155.5378154156464)


In [None]:
# Data Preprocessing:
# Data preprocessing is the process of transforming raw data into a clean and usable format before feeding it into a machine learning model. This step is crucial for improving the model's performance.
# Common steps in data preprocessing include:
# Data Cleaning: Handling missing values, removing duplicates, and correcting errors in the data.
# Data Transformation: Normalizing or standardizing data, encoding categorical variables, and transforming skewed data.
# Feature Engineering: Creating new features from existing data to improve model accuracy.
# Data Splitting: Dividing the dataset into training, validation, and test sets.

In [23]:
import pandas as pd
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import LabelEncoder,StandardScaler
from sklearn.model_selection import train_test_split

In [24]:
# Small example dataset; None marks the missing Age and Salary entries.
data = dict(
    CustomerID=[1, 2, 3, 4, 5],
    Age=[25, 30, 35, None, 40],
    Gender=['Male', 'Female', 'Female', 'Male', 'Female'],
    Salary=[50000, None, 60000, 70000, 80000],
    Purchased=['Yes', 'No', 'Yes', 'No', 'Yes'],
)


In [25]:
# Display the raw dictionary before it is turned into a DataFrame.
data

{'CustomerID': [1, 2, 3, 4, 5],
 'Age': [25, 30, 35, None, 40],
 'Gender': ['Male', 'Female', 'Female', 'Male', 'Female'],
 'Salary': [50000, None, 60000, 70000, 80000],
 'Purchased': ['Yes', 'No', 'Yes', 'No', 'Yes']}

In [26]:
# Build the working DataFrame: one column per key of the raw dictionary.
df = pd.DataFrame(data=data)

In [27]:
# Display the DataFrame; the missing Age and Salary entries are still present here.
df

Unnamed: 0,CustomerID,Age,Gender,Salary,Purchased
0,1,25.0,Male,50000.0,Yes
1,2,30.0,Female,,No
2,3,35.0,Female,60000.0,Yes
3,4,,Male,70000.0,No
4,5,40.0,Female,80000.0,Yes


In [28]:
# Fill missing numeric values with the mean of their own column.
# One imputer is fitted on both columns in a single call: refitting the same
# imputer per column would overwrite the learned Age mean with the Salary
# mean, and would assign a 2-D (n_rows, 1) array to a single column.
# Fitted this way, imputer.statistics_ holds both column means and the
# (n_rows, 2) result maps directly onto the two columns.
numeric_columns = ['Age', 'Salary']
imputer = SimpleImputer(strategy='mean')
df[numeric_columns] = imputer.fit_transform(df[numeric_columns])


In [29]:
# Replace the Gender strings with integer class codes.
label_encoder_gender = LabelEncoder()
gender_codes = label_encoder_gender.fit_transform(df['Gender'])
df['Gender'] = gender_codes

In [30]:
# Replace the Purchased strings with integer class codes.
label_encoder_purchased = LabelEncoder()
purchased_codes = label_encoder_purchased.fit_transform(df['Purchased'])
df['Purchased'] = purchased_codes

# Feature scaling: standardize the numeric columns to zero mean and unit variance.
# NOTE(review): the scaler is fitted on the full dataset, before the
# train/test split performed later in the notebook, so the test row
# contributes to the scaling statistics. For a real model, fit the scaler on
# the training split only.
scaled_columns = ['Age', 'Salary']
scaler = StandardScaler()
df[scaled_columns] = scaler.fit_transform(df[scaled_columns])

In [35]:
# Feature matrix for the model.
X = df[['Age', 'Gender', 'Salary']]
# Target vector. The following cells (the target display and train_test_split)
# refer to it as lowercase `y`, so it must be bound under that name for the
# notebook to run on a fresh kernel. `Y` is kept as an alias for any cell
# that still uses the uppercase name.
y = df['Purchased']
Y = y

In [32]:
# Display the feature matrix. It is bound as uppercase `X`; lowercase `x` is
# never defined and raises NameError on a fresh kernel.
X

Unnamed: 0,Age,Gender,Salary
0,-1.5,1,-1.5
1,-0.5,0,0.0
2,0.5,0,-0.5
3,0.0,1,0.5
4,1.5,0,1.5


In [33]:
# Display the encoded target labels.
y

0    1
1    0
2    1
3    0
4    1
Name: Purchased, dtype: int32

In [36]:
# Hold out 20% of the rows as a test set; the fixed random_state makes the
# split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# Display the processed data and the train/test splits
print("Processed DataFrame:")
print(df)
for heading, part in (
    ("\nTraining Features (X_train):", X_train),
    ("\nTest Features (X_test):", X_test),
    ("\nTraining Labels (y_train):", y_train),
    ("\nTest Labels (y_test):", y_test),
):
    print(heading)
    print(part)

Processed DataFrame:
   CustomerID  Age  Gender  Salary  Purchased
0           1 -1.5       1    -1.5          1
1           2 -0.5       0     0.0          0
2           3  0.5       0    -0.5          1
3           4  0.0       1     0.5          0
4           5  1.5       0     1.5          1

Training Features (X_train):
   Age  Gender  Salary
4  1.5       0     1.5
2  0.5       0    -0.5
0 -1.5       1    -1.5
3  0.0       1     0.5

Test Features (X_test):
   Age  Gender  Salary
1 -0.5       0     0.0

Training Labels (y_train):
4    1
2    1
0    1
3    0
Name: Purchased, dtype: int32

Test Labels (y_test):
1    0
Name: Purchased, dtype: int32
