1. Predicting Employee Attrition Using Logistic Regression

   Dataset: HR Analytics Employee Attrition Dataset

   Preprocessing Steps:

     - Handle missing values if any.

     - Encode categorical variables (e.g., one-hot encoding for department, gender, etc.).

     - Standardize numerical features.
     
   Task: Implement logistic regression to predict employee attrition and evaluate the model using precision, recall, and F1-score.

In [61]:
import pandas as pd
df= pd.read_csv('HR-Employee-Attrition.csv')
df

Unnamed: 0,Age,Attrition,BusinessTravel,DailyRate,Department,DistanceFromHome,Education,EducationField,EmployeeCount,EmployeeNumber,...,RelationshipSatisfaction,StandardHours,StockOptionLevel,TotalWorkingYears,TrainingTimesLastYear,WorkLifeBalance,YearsAtCompany,YearsInCurrentRole,YearsSinceLastPromotion,YearsWithCurrManager
0,41,Yes,Travel_Rarely,1102,Sales,1,2,Life Sciences,1,1,...,1,80,0,8,0,1,6,4,0,5
1,49,No,Travel_Frequently,279,Research & Development,8,1,Life Sciences,1,2,...,4,80,1,10,3,3,10,7,1,7
2,37,Yes,Travel_Rarely,1373,Research & Development,2,2,Other,1,4,...,2,80,0,7,3,3,0,0,0,0
3,33,No,Travel_Frequently,1392,Research & Development,3,4,Life Sciences,1,5,...,3,80,0,8,3,3,8,7,3,0
4,27,No,Travel_Rarely,591,Research & Development,2,1,Medical,1,7,...,4,80,1,6,3,3,2,2,2,2
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1465,36,No,Travel_Frequently,884,Research & Development,23,2,Medical,1,2061,...,3,80,1,17,3,3,5,2,0,3
1466,39,No,Travel_Rarely,613,Research & Development,6,1,Medical,1,2062,...,1,80,1,9,5,3,7,7,1,7
1467,27,No,Travel_Rarely,155,Research & Development,4,3,Life Sciences,1,2064,...,2,80,1,6,0,3,6,2,0,3
1468,49,No,Travel_Frequently,1023,Sales,2,3,Medical,1,2065,...,4,80,0,17,3,2,9,6,0,8


In [62]:
df.dropna()

Unnamed: 0,Age,Attrition,BusinessTravel,DailyRate,Department,DistanceFromHome,Education,EducationField,EmployeeCount,EmployeeNumber,...,RelationshipSatisfaction,StandardHours,StockOptionLevel,TotalWorkingYears,TrainingTimesLastYear,WorkLifeBalance,YearsAtCompany,YearsInCurrentRole,YearsSinceLastPromotion,YearsWithCurrManager
0,41,Yes,Travel_Rarely,1102,Sales,1,2,Life Sciences,1,1,...,1,80,0,8,0,1,6,4,0,5
1,49,No,Travel_Frequently,279,Research & Development,8,1,Life Sciences,1,2,...,4,80,1,10,3,3,10,7,1,7
2,37,Yes,Travel_Rarely,1373,Research & Development,2,2,Other,1,4,...,2,80,0,7,3,3,0,0,0,0
3,33,No,Travel_Frequently,1392,Research & Development,3,4,Life Sciences,1,5,...,3,80,0,8,3,3,8,7,3,0
4,27,No,Travel_Rarely,591,Research & Development,2,1,Medical,1,7,...,4,80,1,6,3,3,2,2,2,2
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1465,36,No,Travel_Frequently,884,Research & Development,23,2,Medical,1,2061,...,3,80,1,17,3,3,5,2,0,3
1466,39,No,Travel_Rarely,613,Research & Development,6,1,Medical,1,2062,...,1,80,1,9,5,3,7,7,1,7
1467,27,No,Travel_Rarely,155,Research & Development,4,3,Life Sciences,1,2064,...,2,80,1,6,0,3,6,2,0,3
1468,49,No,Travel_Frequently,1023,Sales,2,3,Medical,1,2065,...,4,80,0,17,3,2,9,6,0,8


In [63]:
df.isnull().sum()

Age                         0
Attrition                   0
BusinessTravel              0
DailyRate                   0
Department                  0
DistanceFromHome            0
Education                   0
EducationField              0
EmployeeCount               0
EmployeeNumber              0
EnvironmentSatisfaction     0
Gender                      0
HourlyRate                  0
JobInvolvement              0
JobLevel                    0
JobRole                     0
JobSatisfaction             0
MaritalStatus               0
MonthlyIncome               0
MonthlyRate                 0
NumCompaniesWorked          0
Over18                      0
OverTime                    0
PercentSalaryHike           0
PerformanceRating           0
RelationshipSatisfaction    0
StandardHours               0
StockOptionLevel            0
TotalWorkingYears           0
TrainingTimesLastYear       0
WorkLifeBalance             0
YearsAtCompany              0
YearsInCurrentRole          0
YearsSince

In [64]:
# columns_to_drop = ['EmployeeCount', 'EmployeeNumber', 'Over18', 'StandardHours']

# # Drop the columns
# df = df.drop(columns=columns_to_drop)

columns_to_drop = [ 'StandardHours', 'EmployeeNumber','EmployeeCount','HourlyRate', 'JobInvolvement', 
                    'MonthlyIncome', 'MonthlyRate', 'NumCompaniesWorked',
                   'Over18', 'OverTime', 'PercentSalaryHike']

# Drop the columns
df = df.drop(columns=columns_to_drop, axis=1)
df

Unnamed: 0,Age,Attrition,BusinessTravel,DailyRate,Department,DistanceFromHome,Education,EducationField,EnvironmentSatisfaction,Gender,...,PerformanceRating,RelationshipSatisfaction,StockOptionLevel,TotalWorkingYears,TrainingTimesLastYear,WorkLifeBalance,YearsAtCompany,YearsInCurrentRole,YearsSinceLastPromotion,YearsWithCurrManager
0,41,Yes,Travel_Rarely,1102,Sales,1,2,Life Sciences,2,Female,...,3,1,0,8,0,1,6,4,0,5
1,49,No,Travel_Frequently,279,Research & Development,8,1,Life Sciences,3,Male,...,4,4,1,10,3,3,10,7,1,7
2,37,Yes,Travel_Rarely,1373,Research & Development,2,2,Other,4,Male,...,3,2,0,7,3,3,0,0,0,0
3,33,No,Travel_Frequently,1392,Research & Development,3,4,Life Sciences,4,Female,...,3,3,0,8,3,3,8,7,3,0
4,27,No,Travel_Rarely,591,Research & Development,2,1,Medical,1,Male,...,3,4,1,6,3,3,2,2,2,2
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1465,36,No,Travel_Frequently,884,Research & Development,23,2,Medical,3,Male,...,3,3,1,17,3,3,5,2,0,3
1466,39,No,Travel_Rarely,613,Research & Development,6,1,Medical,4,Male,...,3,1,1,9,5,3,7,7,1,7
1467,27,No,Travel_Rarely,155,Research & Development,4,3,Life Sciences,2,Male,...,4,2,1,6,0,3,6,2,0,3
1468,49,No,Travel_Frequently,1023,Sales,2,3,Medical,4,Male,...,3,4,0,17,3,2,9,6,0,8


In [65]:
from sklearn.preprocessing import OneHotEncoder
encoder= OneHotEncoder(sparse_output= False)
department = df['Department'].values.reshape(-1, 1)
df['Department']= encoder.fit_transform(department)


df

Unnamed: 0,Age,Attrition,BusinessTravel,DailyRate,Department,DistanceFromHome,Education,EducationField,EnvironmentSatisfaction,Gender,...,PerformanceRating,RelationshipSatisfaction,StockOptionLevel,TotalWorkingYears,TrainingTimesLastYear,WorkLifeBalance,YearsAtCompany,YearsInCurrentRole,YearsSinceLastPromotion,YearsWithCurrManager
0,41,Yes,Travel_Rarely,1102,0.0,1,2,Life Sciences,2,Female,...,3,1,0,8,0,1,6,4,0,5
1,49,No,Travel_Frequently,279,0.0,8,1,Life Sciences,3,Male,...,4,4,1,10,3,3,10,7,1,7
2,37,Yes,Travel_Rarely,1373,0.0,2,2,Other,4,Male,...,3,2,0,7,3,3,0,0,0,0
3,33,No,Travel_Frequently,1392,0.0,3,4,Life Sciences,4,Female,...,3,3,0,8,3,3,8,7,3,0
4,27,No,Travel_Rarely,591,0.0,2,1,Medical,1,Male,...,3,4,1,6,3,3,2,2,2,2
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1465,36,No,Travel_Frequently,884,0.0,23,2,Medical,3,Male,...,3,3,1,17,3,3,5,2,0,3
1466,39,No,Travel_Rarely,613,0.0,6,1,Medical,4,Male,...,3,1,1,9,5,3,7,7,1,7
1467,27,No,Travel_Rarely,155,0.0,4,3,Life Sciences,2,Male,...,4,2,1,6,0,3,6,2,0,3
1468,49,No,Travel_Frequently,1023,0.0,2,3,Medical,4,Male,...,3,4,0,17,3,2,9,6,0,8


In [66]:
BusinessTravel = df['BusinessTravel'].values.reshape(-1, 1)
df['BusinessTravel']= encoder.fit_transform(BusinessTravel)
df

Unnamed: 0,Age,Attrition,BusinessTravel,DailyRate,Department,DistanceFromHome,Education,EducationField,EnvironmentSatisfaction,Gender,...,PerformanceRating,RelationshipSatisfaction,StockOptionLevel,TotalWorkingYears,TrainingTimesLastYear,WorkLifeBalance,YearsAtCompany,YearsInCurrentRole,YearsSinceLastPromotion,YearsWithCurrManager
0,41,Yes,0.0,1102,0.0,1,2,Life Sciences,2,Female,...,3,1,0,8,0,1,6,4,0,5
1,49,No,0.0,279,0.0,8,1,Life Sciences,3,Male,...,4,4,1,10,3,3,10,7,1,7
2,37,Yes,0.0,1373,0.0,2,2,Other,4,Male,...,3,2,0,7,3,3,0,0,0,0
3,33,No,0.0,1392,0.0,3,4,Life Sciences,4,Female,...,3,3,0,8,3,3,8,7,3,0
4,27,No,0.0,591,0.0,2,1,Medical,1,Male,...,3,4,1,6,3,3,2,2,2,2
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1465,36,No,0.0,884,0.0,23,2,Medical,3,Male,...,3,3,1,17,3,3,5,2,0,3
1466,39,No,0.0,613,0.0,6,1,Medical,4,Male,...,3,1,1,9,5,3,7,7,1,7
1467,27,No,0.0,155,0.0,4,3,Life Sciences,2,Male,...,4,2,1,6,0,3,6,2,0,3
1468,49,No,0.0,1023,0.0,2,3,Medical,4,Male,...,3,4,0,17,3,2,9,6,0,8


In [67]:
EducationField = df['EducationField'].values.reshape(-1, 1)
df['EducationField']= encoder.fit_transform(EducationField)
df

Unnamed: 0,Age,Attrition,BusinessTravel,DailyRate,Department,DistanceFromHome,Education,EducationField,EnvironmentSatisfaction,Gender,...,PerformanceRating,RelationshipSatisfaction,StockOptionLevel,TotalWorkingYears,TrainingTimesLastYear,WorkLifeBalance,YearsAtCompany,YearsInCurrentRole,YearsSinceLastPromotion,YearsWithCurrManager
0,41,Yes,0.0,1102,0.0,1,2,0.0,2,Female,...,3,1,0,8,0,1,6,4,0,5
1,49,No,0.0,279,0.0,8,1,0.0,3,Male,...,4,4,1,10,3,3,10,7,1,7
2,37,Yes,0.0,1373,0.0,2,2,0.0,4,Male,...,3,2,0,7,3,3,0,0,0,0
3,33,No,0.0,1392,0.0,3,4,0.0,4,Female,...,3,3,0,8,3,3,8,7,3,0
4,27,No,0.0,591,0.0,2,1,0.0,1,Male,...,3,4,1,6,3,3,2,2,2,2
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1465,36,No,0.0,884,0.0,23,2,0.0,3,Male,...,3,3,1,17,3,3,5,2,0,3
1466,39,No,0.0,613,0.0,6,1,0.0,4,Male,...,3,1,1,9,5,3,7,7,1,7
1467,27,No,0.0,155,0.0,4,3,0.0,2,Male,...,4,2,1,6,0,3,6,2,0,3
1468,49,No,0.0,1023,0.0,2,3,0.0,4,Male,...,3,4,0,17,3,2,9,6,0,8


In [68]:
gender = df['Gender'].values.reshape(-1, 1)
df['Gender']= encoder.fit_transform(gender)
df

Unnamed: 0,Age,Attrition,BusinessTravel,DailyRate,Department,DistanceFromHome,Education,EducationField,EnvironmentSatisfaction,Gender,...,PerformanceRating,RelationshipSatisfaction,StockOptionLevel,TotalWorkingYears,TrainingTimesLastYear,WorkLifeBalance,YearsAtCompany,YearsInCurrentRole,YearsSinceLastPromotion,YearsWithCurrManager
0,41,Yes,0.0,1102,0.0,1,2,0.0,2,1.0,...,3,1,0,8,0,1,6,4,0,5
1,49,No,0.0,279,0.0,8,1,0.0,3,0.0,...,4,4,1,10,3,3,10,7,1,7
2,37,Yes,0.0,1373,0.0,2,2,0.0,4,0.0,...,3,2,0,7,3,3,0,0,0,0
3,33,No,0.0,1392,0.0,3,4,0.0,4,1.0,...,3,3,0,8,3,3,8,7,3,0
4,27,No,0.0,591,0.0,2,1,0.0,1,0.0,...,3,4,1,6,3,3,2,2,2,2
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1465,36,No,0.0,884,0.0,23,2,0.0,3,0.0,...,3,3,1,17,3,3,5,2,0,3
1466,39,No,0.0,613,0.0,6,1,0.0,4,0.0,...,3,1,1,9,5,3,7,7,1,7
1467,27,No,0.0,155,0.0,4,3,0.0,2,0.0,...,4,2,1,6,0,3,6,2,0,3
1468,49,No,0.0,1023,0.0,2,3,0.0,4,0.0,...,3,4,0,17,3,2,9,6,0,8


In [69]:
Attrition = df['Attrition'].values.reshape(-1, 1)
df['Attrition']= encoder.fit_transform(Attrition)
df

Unnamed: 0,Age,Attrition,BusinessTravel,DailyRate,Department,DistanceFromHome,Education,EducationField,EnvironmentSatisfaction,Gender,...,PerformanceRating,RelationshipSatisfaction,StockOptionLevel,TotalWorkingYears,TrainingTimesLastYear,WorkLifeBalance,YearsAtCompany,YearsInCurrentRole,YearsSinceLastPromotion,YearsWithCurrManager
0,41,0.0,0.0,1102,0.0,1,2,0.0,2,1.0,...,3,1,0,8,0,1,6,4,0,5
1,49,1.0,0.0,279,0.0,8,1,0.0,3,0.0,...,4,4,1,10,3,3,10,7,1,7
2,37,0.0,0.0,1373,0.0,2,2,0.0,4,0.0,...,3,2,0,7,3,3,0,0,0,0
3,33,1.0,0.0,1392,0.0,3,4,0.0,4,1.0,...,3,3,0,8,3,3,8,7,3,0
4,27,1.0,0.0,591,0.0,2,1,0.0,1,0.0,...,3,4,1,6,3,3,2,2,2,2
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1465,36,1.0,0.0,884,0.0,23,2,0.0,3,0.0,...,3,3,1,17,3,3,5,2,0,3
1466,39,1.0,0.0,613,0.0,6,1,0.0,4,0.0,...,3,1,1,9,5,3,7,7,1,7
1467,27,1.0,0.0,155,0.0,4,3,0.0,2,0.0,...,4,2,1,6,0,3,6,2,0,3
1468,49,1.0,0.0,1023,0.0,2,3,0.0,4,0.0,...,3,4,0,17,3,2,9,6,0,8


In [70]:
JobRole = df['JobRole'].values.reshape(-1, 1)
df['JobRole']= encoder.fit_transform(JobRole)
df

Unnamed: 0,Age,Attrition,BusinessTravel,DailyRate,Department,DistanceFromHome,Education,EducationField,EnvironmentSatisfaction,Gender,...,PerformanceRating,RelationshipSatisfaction,StockOptionLevel,TotalWorkingYears,TrainingTimesLastYear,WorkLifeBalance,YearsAtCompany,YearsInCurrentRole,YearsSinceLastPromotion,YearsWithCurrManager
0,41,0.0,0.0,1102,0.0,1,2,0.0,2,1.0,...,3,1,0,8,0,1,6,4,0,5
1,49,1.0,0.0,279,0.0,8,1,0.0,3,0.0,...,4,4,1,10,3,3,10,7,1,7
2,37,0.0,0.0,1373,0.0,2,2,0.0,4,0.0,...,3,2,0,7,3,3,0,0,0,0
3,33,1.0,0.0,1392,0.0,3,4,0.0,4,1.0,...,3,3,0,8,3,3,8,7,3,0
4,27,1.0,0.0,591,0.0,2,1,0.0,1,0.0,...,3,4,1,6,3,3,2,2,2,2
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1465,36,1.0,0.0,884,0.0,23,2,0.0,3,0.0,...,3,3,1,17,3,3,5,2,0,3
1466,39,1.0,0.0,613,0.0,6,1,0.0,4,0.0,...,3,1,1,9,5,3,7,7,1,7
1467,27,1.0,0.0,155,0.0,4,3,0.0,2,0.0,...,4,2,1,6,0,3,6,2,0,3
1468,49,1.0,0.0,1023,0.0,2,3,0.0,4,0.0,...,3,4,0,17,3,2,9,6,0,8


In [71]:
MaritalStatus = df['MaritalStatus'].values.reshape(-1, 1)
df['MaritalStatus']= encoder.fit_transform(MaritalStatus)
df

Unnamed: 0,Age,Attrition,BusinessTravel,DailyRate,Department,DistanceFromHome,Education,EducationField,EnvironmentSatisfaction,Gender,...,PerformanceRating,RelationshipSatisfaction,StockOptionLevel,TotalWorkingYears,TrainingTimesLastYear,WorkLifeBalance,YearsAtCompany,YearsInCurrentRole,YearsSinceLastPromotion,YearsWithCurrManager
0,41,0.0,0.0,1102,0.0,1,2,0.0,2,1.0,...,3,1,0,8,0,1,6,4,0,5
1,49,1.0,0.0,279,0.0,8,1,0.0,3,0.0,...,4,4,1,10,3,3,10,7,1,7
2,37,0.0,0.0,1373,0.0,2,2,0.0,4,0.0,...,3,2,0,7,3,3,0,0,0,0
3,33,1.0,0.0,1392,0.0,3,4,0.0,4,1.0,...,3,3,0,8,3,3,8,7,3,0
4,27,1.0,0.0,591,0.0,2,1,0.0,1,0.0,...,3,4,1,6,3,3,2,2,2,2
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1465,36,1.0,0.0,884,0.0,23,2,0.0,3,0.0,...,3,3,1,17,3,3,5,2,0,3
1466,39,1.0,0.0,613,0.0,6,1,0.0,4,0.0,...,3,1,1,9,5,3,7,7,1,7
1467,27,1.0,0.0,155,0.0,4,3,0.0,2,0.0,...,4,2,1,6,0,3,6,2,0,3
1468,49,1.0,0.0,1023,0.0,2,3,0.0,4,0.0,...,3,4,0,17,3,2,9,6,0,8


In [72]:
JobSatisfaction = df['JobSatisfaction'].values.reshape(-1, 1)
df['JobSatisfaction']= encoder.fit_transform(JobSatisfaction)
df

Unnamed: 0,Age,Attrition,BusinessTravel,DailyRate,Department,DistanceFromHome,Education,EducationField,EnvironmentSatisfaction,Gender,...,PerformanceRating,RelationshipSatisfaction,StockOptionLevel,TotalWorkingYears,TrainingTimesLastYear,WorkLifeBalance,YearsAtCompany,YearsInCurrentRole,YearsSinceLastPromotion,YearsWithCurrManager
0,41,0.0,0.0,1102,0.0,1,2,0.0,2,1.0,...,3,1,0,8,0,1,6,4,0,5
1,49,1.0,0.0,279,0.0,8,1,0.0,3,0.0,...,4,4,1,10,3,3,10,7,1,7
2,37,0.0,0.0,1373,0.0,2,2,0.0,4,0.0,...,3,2,0,7,3,3,0,0,0,0
3,33,1.0,0.0,1392,0.0,3,4,0.0,4,1.0,...,3,3,0,8,3,3,8,7,3,0
4,27,1.0,0.0,591,0.0,2,1,0.0,1,0.0,...,3,4,1,6,3,3,2,2,2,2
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1465,36,1.0,0.0,884,0.0,23,2,0.0,3,0.0,...,3,3,1,17,3,3,5,2,0,3
1466,39,1.0,0.0,613,0.0,6,1,0.0,4,0.0,...,3,1,1,9,5,3,7,7,1,7
1467,27,1.0,0.0,155,0.0,4,3,0.0,2,0.0,...,4,2,1,6,0,3,6,2,0,3
1468,49,1.0,0.0,1023,0.0,2,3,0.0,4,0.0,...,3,4,0,17,3,2,9,6,0,8


In [73]:
X= df.drop(columns='Attrition')
y= df['Attrition']

In [74]:
from sklearn.preprocessing import StandardScaler

scaler= StandardScaler()
standardized_features= scaler.fit_transform(X)

In [75]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test= train_test_split(standardized_features, y, test_size=0.2, random_state=42)

In [76]:
# from sklearn.preprocessing import StandardScaler

# scaler= StandardScaler()
# standardized_features= scaler.fit_transform(X)


In [77]:
from sklearn.linear_model import LogisticRegression
model= LogisticRegression(max_iter=500)
model.fit(X_train, y_train)


In [78]:
y_pred = model.predict(X_test)

In [79]:
from sklearn.metrics import precision_score, recall_score, f1_score
accuracy= precision_score(y_test, y_pred)
recall_score= recall_score(y_test, y_pred)
f1_score= f1_score(y_test, y_pred)


print(f'Accuracy: ', accuracy)
print(f'Recall Score: ', recall_score)
print(f'F1 score: ', f1_score)

Accuracy:  0.8846153846153846
Recall Score:  0.9921568627450981
F1 score:  0.9353049907578558


2. Classifying Credit Card Fraud Using Decision Trees

   Dataset: Credit Card Fraud Detection Dataset

   Preprocessing Steps:

     - Handle missing values if any.

     - Standardize features.
     
   Task: Implement a decision tree classifier to classify credit card transactions as fraud or not and evaluate the model using ROC-AUC and confusion matrix.

In [80]:
import pandas as pd
df=pd.read_csv('creditcard.csv')
df

Unnamed: 0,Time,V1,V2,V3,V4,V5,V6,V7,V8,V9,...,V21,V22,V23,V24,V25,V26,V27,V28,Amount,Class
0,0.0,-1.359807,-0.072781,2.536347,1.378155,-0.338321,0.462388,0.239599,0.098698,0.363787,...,-0.018307,0.277838,-0.110474,0.066928,0.128539,-0.189115,0.133558,-0.021053,149.62,0
1,0.0,1.191857,0.266151,0.166480,0.448154,0.060018,-0.082361,-0.078803,0.085102,-0.255425,...,-0.225775,-0.638672,0.101288,-0.339846,0.167170,0.125895,-0.008983,0.014724,2.69,0
2,1.0,-1.358354,-1.340163,1.773209,0.379780,-0.503198,1.800499,0.791461,0.247676,-1.514654,...,0.247998,0.771679,0.909412,-0.689281,-0.327642,-0.139097,-0.055353,-0.059752,378.66,0
3,1.0,-0.966272,-0.185226,1.792993,-0.863291,-0.010309,1.247203,0.237609,0.377436,-1.387024,...,-0.108300,0.005274,-0.190321,-1.175575,0.647376,-0.221929,0.062723,0.061458,123.50,0
4,2.0,-1.158233,0.877737,1.548718,0.403034,-0.407193,0.095921,0.592941,-0.270533,0.817739,...,-0.009431,0.798278,-0.137458,0.141267,-0.206010,0.502292,0.219422,0.215153,69.99,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
284802,172786.0,-11.881118,10.071785,-9.834783,-2.066656,-5.364473,-2.606837,-4.918215,7.305334,1.914428,...,0.213454,0.111864,1.014480,-0.509348,1.436807,0.250034,0.943651,0.823731,0.77,0
284803,172787.0,-0.732789,-0.055080,2.035030,-0.738589,0.868229,1.058415,0.024330,0.294869,0.584800,...,0.214205,0.924384,0.012463,-1.016226,-0.606624,-0.395255,0.068472,-0.053527,24.79,0
284804,172788.0,1.919565,-0.301254,-3.249640,-0.557828,2.630515,3.031260,-0.296827,0.708417,0.432454,...,0.232045,0.578229,-0.037501,0.640134,0.265745,-0.087371,0.004455,-0.026561,67.88,0
284805,172788.0,-0.240440,0.530483,0.702510,0.689799,-0.377961,0.623708,-0.686180,0.679145,0.392087,...,0.265245,0.800049,-0.163298,0.123205,-0.569159,0.546668,0.108821,0.104533,10.00,0


In [81]:
df.isnull().sum()

Time      0
V1        0
V2        0
V3        0
V4        0
V5        0
V6        0
V7        0
V8        0
V9        0
V10       0
V11       0
V12       0
V13       0
V14       0
V15       0
V16       0
V17       0
V18       0
V19       0
V20       0
V21       0
V22       0
V23       0
V24       0
V25       0
V26       0
V27       0
V28       0
Amount    0
Class     0
dtype: int64

In [82]:
X= df.drop(columns='Class')
y= df['Class']

In [83]:
from sklearn.preprocessing import StandardScaler

scaler= StandardScaler()
standardized_features= scaler.fit_transform(X)


In [84]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test= train_test_split(standardized_features, y, test_size=0.2, random_state=42)

In [85]:
from sklearn.tree import DecisionTreeClassifier
tree= DecisionTreeClassifier()
tree.fit(X_train, y_train)


In [86]:
y_pred=tree.predict(X_test)
print(y_pred)

[1 0 0 ... 0 0 0]


In [87]:
from sklearn.metrics import confusion_matrix, roc_auc_score

confusion_matrix= confusion_matrix(y_test, y_pred)
roc_auc_score= roc_auc_score(y_test, y_pred)


print(f'Confusion Matrix: ', confusion_matrix)
print(f'roc_auc_score: ', roc_auc_score)


Confusion Matrix:  [[56834    30]
 [   20    78]]
roc_auc_score:  0.8976953963915336


3. Predicting Heart Disease Using Logistic Regression

   Dataset: Heart Disease Dataset

   Preprocessing Steps:

     - Handle missing values (e.g., fill missing values with mean).

     - Encode categorical variables (e.g., one-hot encoding for gender, chest pain type, etc.).

     - Standardize numerical features.
     
   Task: Implement logistic regression to predict heart disease and evaluate the model using accuracy and ROC-AUC.

In [88]:
import pandas as pd
df=pd.read_csv('heart.csv')
df

Unnamed: 0,age,sex,cp,trestbps,chol,fbs,restecg,thalach,exang,oldpeak,slope,ca,thal,target
0,52,1,0,125,212,0,1,168,0,1.0,2,2,3,0
1,53,1,0,140,203,1,0,155,1,3.1,0,0,3,0
2,70,1,0,145,174,0,1,125,1,2.6,0,0,3,0
3,61,1,0,148,203,0,1,161,0,0.0,2,1,3,0
4,62,0,0,138,294,1,1,106,0,1.9,1,3,2,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1020,59,1,1,140,221,0,1,164,1,0.0,2,0,2,1
1021,60,1,0,125,258,0,0,141,1,2.8,1,1,3,0
1022,47,1,0,110,275,0,0,118,1,1.0,1,1,2,0
1023,50,0,0,110,254,0,0,159,0,0.0,2,0,2,1


In [89]:
df.isnull().sum()

age         0
sex         0
cp          0
trestbps    0
chol        0
fbs         0
restecg     0
thalach     0
exang       0
oldpeak     0
slope       0
ca          0
thal        0
target      0
dtype: int64

In [90]:
X= df.drop(columns='target')
y= df['target']

In [92]:
from sklearn.preprocessing import StandardScaler
scaler= StandardScaler()
standardized_features= scaler.fit_transform(X)

In [94]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(standardized_features, y, test_size=0.2, random_state=42)


In [96]:
from sklearn.linear_model import LogisticRegression
model= LogisticRegression()
model.fit(X_train, y_train)

In [98]:
y_pred= model.predict(X_test)
print(y_pred)

[1 1 0 1 0 1 0 0 1 0 1 0 1 1 0 1 0 1 1 0 1 0 0 0 1 1 1 1 0 1 1 1 1 1 1 1 1
 0 1 1 0 0 1 0 0 0 0 0 1 0 1 0 1 0 1 1 0 0 1 1 1 0 0 0 0 0 1 1 0 1 1 0 0 1
 1 1 0 1 1 1 0 0 0 0 1 0 1 0 0 1 0 0 1 1 1 1 1 0 0 0 0 0 1 1 0 1 0 1 0 1 1
 1 1 0 1 1 1 1 1 0 0 1 0 0 0 0 1 1 1 1 1 0 1 0 0 1 0 1 1 1 1 1 1 0 1 1 1 1
 1 0 1 0 1 1 0 0 1 1 0 0 1 1 0 0 0 0 0 0 0 1 0 1 1 0 1 1 1 0 1 1 1 0 1 1 1
 1 1 1 1 1 1 1 1 1 0 0 1 0 1 1 1 1 1 0 0]
