### Predicting Employee Attrition Using Logistic Regression
   Dataset: HR Analytics Employee Attrition Dataset

   Preprocessing Steps:

   - Handle missing values if any.
   - Encode categorical variables (e.g., one-hot encoding for department, gender, etc.).
   - Standardize numerical features.
     
   Task: Implement logistic regression to predict employee attrition and evaluate the model using precision, recall, and F1-score.


In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import precision_recall_fscore_support, accuracy_score

# Load dataset
df = pd.read_csv('Datasets/HR-Employee-Attrition.csv')

# Define features and target
X = df.drop('Attrition', axis=1)
y = df['Attrition']

# Preprocessing
preprocessor = ColumnTransformer(
    transformers=[
        ('num', StandardScaler(), X.select_dtypes(include=['int64', 'float64']).columns),
        ('cat', OneHotEncoder(), X.select_dtypes(include=['object']).columns)
    ])

# Create pipeline
model = Pipeline(steps=[('preprocessor', preprocessor),
                        ('classifier', LogisticRegression())])

# Train/test split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

# Train model
model.fit(X_train, y_train)

# Predict
y_pred = model.predict(X_test)

# Evaluate
precision, recall, f1, _ = precision_recall_fscore_support(y_test, y_pred, average='binary', pos_label='Yes')
accuracy = accuracy_score(y_test, y_pred)

print(f'Precision: {precision}')
print(f'Recall: {recall}')
print(f'F1 Score: {f1}')
print(f'Accuracy: {accuracy}')


Precision: 0.5476190476190477
Recall: 0.3770491803278688
F1 Score: 0.44660194174757284
Accuracy: 0.8707482993197279


### Classifying Credit Card Fraud Using Decision Trees

   Dataset: Credit Card Fraud Detection Dataset

   Preprocessing Steps:

   - Handle missing values if any.
   - Standardize features.
    
   Task: Implement a decision tree classifier to classify credit card transactions as fraud or not and evaluate the model using ROC-AUC and confusion matrix.


In [3]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import roc_auc_score, confusion_matrix

# Load dataset
df = pd.read_csv('Datasets/card_transdata.csv')

# Define features and target
X = df.drop('fraud', axis=1)
y = df['fraud']

# Preprocessing
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

# Train/test split
X_train, X_test, y_train, y_test = train_test_split(X_scaled, y, test_size=0.3, random_state=42)

# Train model
model = DecisionTreeClassifier()
model.fit(X_train, y_train)

# Predict
y_pred = model.predict(X_test)

# Evaluate
roc_auc = roc_auc_score(y_test, y_pred)
conf_matrix = confusion_matrix(y_test, y_pred)

print(f'ROC-AUC: {roc_auc}')
print(f'Confusion Matrix:\n{conf_matrix}')


ROC-AUC: 0.9999599026707605
Confusion Matrix:
[[273870      1]
 [     2  26127]]


### Predicting Heart Disease Using Logistic Regression
   Dataset: Heart Disease Dataset

   Preprocessing Steps:

   - Handle missing values (e.g., fill missing values with mean).
   - Encode categorical variables (e.g., one-hot encoding for gender, chest pain type, etc.).
   - Standardize numerical features.
   
   Task: Implement logistic regression to predict heart disease and evaluate the model using accuracy and ROC-AUC.


In [5]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, roc_auc_score

# Load dataset
df = pd.read_csv('Datasets/heart.csv')

# Define features and target
X = df.drop('target', axis=1)
y = df['target']

# Preprocessing
preprocessor = ColumnTransformer(
    transformers=[
        ('num', StandardScaler(), X.select_dtypes(include=['int64', 'float64']).columns),
        ('cat', OneHotEncoder(), X.select_dtypes(include=['object']).columns)
    ])

# Create pipeline
model = Pipeline(steps=[('preprocessor', preprocessor),
                        ('classifier', LogisticRegression())])

# Train/test split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

# Train model
model.fit(X_train, y_train)

# Predict
y_pred = model.predict(X_test)

# Evaluate
accuracy = accuracy_score(y_test, y_pred)
roc_auc = roc_auc_score(y_test, model.predict_proba(X_test)[:, 1])

print(f'Accuracy: {accuracy}')
print(f'ROC-AUC: {roc_auc}')


Accuracy: 0.8051948051948052
ROC-AUC: 0.8967962517411675


### Classifying Emails as Spam Using Decision Trees
  Dataset: Spam Email Dataset
  
  Preprocessing Steps:
  - Handle missing values if any.
  - Standardize features.
  - Encode categorical variables if present.

  Task: Implement a decision tree classifier to classify emails as spam or not and evaluate the model using precision, recall, and F1-score.


In [9]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import precision_recall_fscore_support

# Load dataset
df = pd.read_csv('Datasets/emails.csv')

# Define features and target
X = df.drop('Prediction', axis=1)
y = df['Prediction']

# Identify numerical and categorical columns
numerical_features = X.select_dtypes(include=['int64', 'float64']).columns
categorical_features = X.select_dtypes(include=['object']).columns

# Create transformers
categorical_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('onehot', OneHotEncoder(handle_unknown='ignore'))
])

numerical_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='mean')),
    ('scaler', StandardScaler())
])

# Preprocessor
preprocessor = ColumnTransformer(
    transformers=[
        ('num', numerical_transformer, numerical_features),
        ('cat', categorical_transformer, categorical_features)
    ]
)

# Create a pipeline
model = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('classifier', RandomForestClassifier())
])

# Split data into training and test sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Fit the model
model.fit(X_train, y_train)

# Predict
y_pred = model.predict(X_test)

# Evaluate
precision, recall, f1, _ = precision_recall_fscore_support(y_test, y_pred, average='binary')

print(f'Precision: {precision:.2f}')
print(f'Recall: {recall:.2f}')
print(f'F1 Score: {f1:.2f}')


Precision: 0.95
Recall: 0.96
F1 Score: 0.95


### Predicting Customer Satisfaction Using Logistic Regression

Dataset: Customer Satisfaction Dataset

Preprocessing Steps:
- Handle missing values (e.g., fill missing values with median).
- Encode categorical variables (e.g., one-hot encoding for region).
- Standardize numerical features.

Task: Implement logistic regression to predict customer satisfaction and evaluate the model using accuracy and confusion matrix.

In [13]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, confusion_matrix
from sklearn.impute import SimpleImputer

# Load dataset
df = pd.read_csv('Datasets/E-commerce Customer Behavior - Sheet1.csv')

# Define features and target
X = df.drop('Satisfaction Level', axis=1)
y = df['Satisfaction Level']

# Identify numerical and categorical columns
numerical_features = X.select_dtypes(include=['int64', 'float64']).columns
categorical_features = X.select_dtypes(include=['object']).columns

# Preprocessing
preprocessor = ColumnTransformer(
    transformers=[
        ('num', Pipeline(steps=[
            ('imputer', SimpleImputer(strategy='mean')),
            ('scaler', StandardScaler())
        ]), numerical_features),
        ('cat', Pipeline(steps=[
            ('imputer', SimpleImputer(strategy='most_frequent')),
            ('onehot', OneHotEncoder(handle_unknown='ignore'))
        ]), categorical_features)
    ])

# Create pipeline
model = Pipeline(steps=[('preprocessor', preprocessor),
                        ('classifier', LogisticRegression(max_iter=1000))])

# Train/test split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

# Handle any remaining NaN values in the target variable (if any)
y_train = y_train.fillna(y_train.mode()[0])
y_test = y_test.fillna(y_test.mode()[0])

# Train model
model.fit(X_train, y_train)

# Predict
y_pred = model.predict(X_test)

# Evaluate
accuracy = accuracy_score(y_test, y_pred)
conf_matrix = confusion_matrix(y_test, y_pred)

print(f'Accuracy: {accuracy}')
print(f'Confusion Matrix:\n{conf_matrix}')


Accuracy: 0.9904761904761905
Confusion Matrix:
[[41  0  0]
 [ 1 39  0]
 [ 0  0 24]]
