## Predicting Diabetes Onset Using Logistic Regression
Dataset: Diabetes Dataset   

Preprocessing Steps:
Handle missing values if any.
Standardize features.
Encode categorical variables if any.

Task:
Implement logistic regression to predict diabetes onset and evaluate the model using accuracy, precision, and recall.



In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, precision_score, recall_score

# Load dataset
df = pd.read_csv('Datasets/diabetes.csv')

# Separate the features from the target column.
X = df.drop('Outcome', axis=1)
y = df['Outcome']

# Split data FIRST so that no statistic used for preprocessing (medians,
# means, standard deviations) is computed from the held-out test rows.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Handle missing values: fill with medians learned from the training rows only
# and reuse those same medians for the test rows.
train_medians = X_train.median()
X_train = X_train.fillna(train_medians)
X_test = X_test.fillna(train_medians)

# Standardize features: fit the scaler on the training rows, then apply the
# fitted transformation (without refitting) to the test rows.
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

# Logistic Regression model
model = LogisticRegression()
model.fit(X_train, y_train)
y_pred = model.predict(X_test)

# Evaluation on the held-out test split
accuracy = accuracy_score(y_test, y_pred)
precision = precision_score(y_test, y_pred)
recall = recall_score(y_test, y_pred)

print(f"Accuracy: {accuracy}")
print(f"Precision: {precision}")
print(f"Recall: {recall}")


Accuracy: 0.7532467532467533
Precision: 0.6491228070175439
Recall: 0.6727272727272727


### Classifying Iris Species Using Decision Trees
   Dataset: 
Iris Dataset

   Preprocessing Steps:
Handle missing values if any.
Standardize features.
Encode categorical variables if any.

   Task: 
Implement a decision tree classifier to classify iris species and evaluate the model using confusion matrix and accuracy.


In [2]:
# Task 2: Classifying Iris Species Using Decision Trees

import pandas as pd
from sklearn.datasets import load_iris
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score, confusion_matrix

# Load the dataset
iris = load_iris()
X = iris.data
y = iris.target

# Train-test split FIRST so the scaler below never sees the test rows.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Standardize features: fit on the training rows only, then apply the fitted
# transformation (without refitting) to the test rows.
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

# Decision Tree Classifier model.
# random_state is fixed so repeated runs build the same tree; the same seed
# as the split is used for consistency.
model = DecisionTreeClassifier(random_state=42)
model.fit(X_train, y_train)

# Predictions
y_pred = model.predict(X_test)

# Evaluation on the held-out test split
accuracy = accuracy_score(y_test, y_pred)
conf_matrix = confusion_matrix(y_test, y_pred)

print(f'Accuracy: {accuracy:.2f}')
print('Confusion Matrix:')
print(conf_matrix)


Accuracy: 1.00
Confusion Matrix:
[[10  0  0]
 [ 0  9  0]
 [ 0  0 11]]


### Predicting Titanic Survival Using Logistic Regression
   Dataset: 
Titanic Dataset

   Preprocessing Steps:
Handle missing values (e.g., fill missing ages with median).
Encode categorical variables (e.g., one-hot encoding for embarked and gender).
Standardize numerical features.

   Task:
Implement logistic regression to predict survival on the Titanic and evaluate the model using ROC-AUC.


In [11]:
# Import necessary libraries
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LogisticRegression
from sklearn.impute import SimpleImputer
from sklearn.metrics import roc_auc_score

# Load the dataset
titanic_data = pd.read_csv('Datasets/titanic.csv')

# Missing values are handled by the SimpleImputer steps inside the pipeline
# below. Imputing here, on the full dataset, would compute the median/mode
# from rows that later end up in the test split, so it is deliberately not
# done before the split.

# Define preprocessing for numerical and categorical features
numerical_features = ['Age', 'Fare']
categorical_features = ['Embarked', 'Sex']

# Create a column transformer: median-impute and standardize the numerical
# columns; mode-impute and one-hot encode the categorical columns.
# Only the columns listed above are passed on to the classifier.
preprocessor = ColumnTransformer(
    transformers=[
        ('num', Pipeline(steps=[
            ('imputer', SimpleImputer(strategy='median')),
            ('scaler', StandardScaler())
        ]), numerical_features),
        ('cat', Pipeline(steps=[
            ('imputer', SimpleImputer(strategy='most_frequent')),
            ('onehot', OneHotEncoder(handle_unknown='ignore'))
        ]), categorical_features)
    ])

# Split the data
X = titanic_data.drop(['Survived', 'Name', 'Ticket', 'Cabin'], axis=1)
y = titanic_data['Survived']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Create a pipeline so that imputation, scaling and encoding are all fitted
# on the training rows only and then re-applied to the test rows.
pipeline = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('classifier', LogisticRegression())
])

# Train the model
pipeline.fit(X_train, y_train)

# Predict and evaluate: ROC-AUC is computed from the predicted probability of
# the positive class (column 1 of predict_proba).
# NOTE(review): the recorded run of this cell reports ROC-AUC of exactly 1.0,
# which is unusually high — verify that 'Survived' in this CSV is not derived
# from one of the features (e.g. 'Sex') before trusting the score.
y_pred_proba = pipeline.predict_proba(X_test)[:, 1]
roc_auc = roc_auc_score(y_test, y_pred_proba)

print(f'ROC-AUC: {roc_auc}')



ROC-AUC: 1.0


### Classifying Emails as Spam Using Decision Trees
Dataset: 
Spam Email Dataset

Preprocessing Steps:
Handle missing values if any.
Standardize features.
Encode categorical variables if present.

Task: 
Implement a decision tree classifier to classify emails as spam or not and evaluate the model using precision, recall, and F1-score.



In [21]:
# Import necessary libraries
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.tree import DecisionTreeClassifier
from sklearn.impute import SimpleImputer
from sklearn.metrics import precision_score, recall_score, f1_score

# Load the dataset
spam_data = pd.read_csv('Datasets/emails.csv')

target_column = 'Prediction'

# Separate the features from the target up front, so the column lists derived
# below can never contain the target.
X = spam_data.drop(target_column, axis=1)
y = spam_data[target_column]

# Partition the feature columns by dtype for the two preprocessing branches.
# NOTE(review): every object-dtype column is one-hot encoded; if the CSV has a
# per-row identifier stored as text it would be encoded too — confirm.
numerical_cols = X.select_dtypes(include=['int64', 'float64']).columns
categorical_cols = X.select_dtypes(include=['object']).columns

# Define preprocessing for numerical and categorical features: missing values
# are imputed inside the pipeline, so the imputers are fitted on training rows
# only.
preprocessor = ColumnTransformer(
    transformers=[
        ('num', Pipeline(steps=[
            ('imputer', SimpleImputer(strategy='median')),
            ('scaler', StandardScaler())
        ]), numerical_cols),
        ('cat', Pipeline(steps=[
            ('imputer', SimpleImputer(strategy='most_frequent')),
            ('onehot', OneHotEncoder(handle_unknown='ignore'))
        ]), categorical_cols)
    ])

# Split the data
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Create a pipeline.
# random_state is fixed on the tree so repeated runs produce the same model
# and therefore the same reported metrics.
pipeline = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('classifier', DecisionTreeClassifier(random_state=42))
])

# Train the model
pipeline.fit(X_train, y_train)

# Predict and evaluate, treating label 1 as the positive class
y_pred = pipeline.predict(X_test)
precision = precision_score(y_test, y_pred, pos_label=1)
recall = recall_score(y_test, y_pred, pos_label=1)
f1 = f1_score(y_test, y_pred, pos_label=1)

print(f'Precision: {precision}')
print(f'Recall: {recall}')
print(f'F1 Score: {f1}')


Precision: 0.8609271523178808
Recall: 0.8783783783783784
F1 Score: 0.8695652173913043


### Predicting Customer Satisfaction Using Logistic Regression
Dataset: 
Customer Satisfaction Dataset

Preprocessing Steps:
Handle missing values (e.g., fill missing values with median).
Encode categorical variables (e.g., one-hot encoding for region).
Standardize numerical features.

Task: 
Implement logistic regression to predict customer satisfaction and evaluate the model using accuracy and confusion matrix.


In [25]:
# Import necessary libraries
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LogisticRegression
from sklearn.impute import SimpleImputer
from sklearn.metrics import accuracy_score, confusion_matrix

# Read the customer satisfaction data
customer_data = pd.read_csv('Datasets/restaurant_customer_satisfaction.csv')

# Name of the label column to predict
target_column = 'HighSatisfaction'

# Features are every column except the target; the target becomes y
X = customer_data.drop(columns=[target_column])
y = customer_data[target_column]

# Group the feature columns by dtype. Because X no longer holds the target,
# neither list can contain it.
numerical_cols = X.select_dtypes(include=['int64', 'float64']).columns
categorical_cols = X.select_dtypes(include=['object']).columns

# Numerical branch: fill gaps with the median, then standardize
numeric_steps = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='median')),
    ('scaler', StandardScaler()),
])

# Categorical branch: fill gaps with the most frequent value, then one-hot
# encode (categories unseen during fitting are ignored at predict time)
categorical_steps = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('onehot', OneHotEncoder(handle_unknown='ignore')),
])

# Route each column group through its own branch
preprocessor = ColumnTransformer(transformers=[
    ('num', numeric_steps, numerical_cols),
    ('cat', categorical_steps, categorical_cols),
])

# Hold out 20% of the rows for evaluation
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# Chain preprocessing and the classifier into a single estimator
pipeline = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('classifier', LogisticRegression()),
])

# Fit on the training rows
pipeline.fit(X_train, y_train)

# Score the held-out rows
y_pred = pipeline.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)
conf_matrix = confusion_matrix(y_test, y_pred)

print(f'Accuracy: {accuracy}')
print(f'Confusion Matrix:\n{conf_matrix}')


Accuracy: 0.9133333333333333
Confusion Matrix:
[[251   8]
 [ 18  23]]
