Title: Classfication<br>

Task 1:<br>
Objective: Identify if an email is spam or not spam.<br>
Load the UCI Spambase Dataset.<br>
Goal: Create a model that classifies emails into two categories: "spam" and "not spam."


In [10]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report

# Load the dataset from the UCI repository
url = "https://archive.ics.uci.edu/ml/machine-learning-databases/spambase/spambase.data"
column_names = [f"feature_{i}" for i in range(48)] + \
               [f"word_freq_{word}" for word in ['make', 'address', 'all', '3d', 'our', 'over', 'remove',
                                                  'internet', 'order', 'mail', 'receive', 'will', 'people',
                                                  'report', 'addresses', 'free', 'business', 'email', 'you',
                                                  'credit', 'your', 'font', '000', 'money', 'hp', 'hpl',
                                                  'george', '650', 'lab', 'labs', 'telnet', '857', 'data',
                                                  '415', '85', 'technology', '1999', 'parts', 'pm', 'direct',
                                                  'cs', 'meeting', 'original', 'project', 're', 'edu', 'table',
                                                  'conference']] + \
               [f"char_freq_{char}" for char in [';', '(', '[', '!', '$', '#']] + \
               ['capital_run_length_average', 'capital_run_length_longest', 'capital_total', 'is_spam']

try:
    df = pd.read_csv(url, names=column_names)
    print("Dataset loaded successfully!")

    # Check for missing values
    if df.isnull().sum().any():
        print("\nWarning: Missing values found. Handling by dropping rows with NaNs.")
        df.dropna(inplace=True)
        print(f"Shape of DataFrame after dropping NaNs: {df.shape}")
    else:
        print("\nNo missing values found.")

    # Separate features (X) and target (y)
    X = df.drop('is_spam', axis=1)
    y = df['is_spam']

    # Split the data into training and testing sets
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)

    # Initialize and train a Logistic Regression model
    model = LogisticRegression(solver='liblinear', random_state=42)
    model.fit(X_train, y_train)

    # Make predictions on the test set
    y_pred = model.predict(X_test)

    # Evaluate the model
    accuracy = accuracy_score(y_test, y_pred)
    report = classification_report(y_test, y_pred, target_names=['not spam', 'spam'])

    print("\nModel trained and evaluated!")
    print(f"Accuracy on the test set: {accuracy:.4f}")
    print("\nClassification Report:")
    print(report)

except Exception as e:
    print(f"An error occurred: {e}")
    print("Please ensure you have a stable internet connection or download the dataset manually.")
    print("If downloading manually, replace the 'url' variable with the path to your local file.")

Dataset loaded successfully!

Shape of DataFrame after dropping NaNs: (0, 106)
An error occurred: With n_samples=0, test_size=0.2 and train_size=None, the resulting train set will be empty. Adjust any of the aforementioned parameters.
Please ensure you have a stable internet connection or download the dataset manually.
If downloading manually, replace the 'url' variable with the path to your local file.


Task 2:<br>
Objective: Diagnose whether a tumor is malignant or benign.<br>
Load the Breast Cancer Wisconsin dataset.<br>
Goal: Build a binary classification model to classify tumors.

In [11]:
import pandas as pd
from sklearn.datasets import load_breast_cancer
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report

# Load the Breast Cancer Wisconsin dataset
cancer = load_breast_cancer()
df = pd.DataFrame(cancer.data, columns=cancer.feature_names)
df['target'] = cancer.target

print("Breast Cancer Wisconsin dataset loaded successfully!")
print(f"Shape of features: {df.drop('target', axis=1).shape}")
print(f"Shape of target: {df['target'].shape}")
print("\nFirst few rows of the dataset:")
print(df.head())

# Separate features (X) and target (y)
X = df.drop('target', axis=1)
y = df['target']

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42, stratify=y)

# Initialize and train a Logistic Regression model
model = LogisticRegression(solver='liblinear', random_state=42)
model.fit(X_train, y_train)

# Make predictions on the test set
y_pred = model.predict(X_test)

# Evaluate the model
accuracy = accuracy_score(y_test, y_pred)
report = classification_report(y_test, y_pred, target_names=['benign', 'malignant'])

print("\nModel trained and evaluated!")
print(f"Accuracy on the test set: {accuracy:.4f}")
print("\nClassification Report:")
print(report)

Breast Cancer Wisconsin dataset loaded successfully!
Shape of features: (569, 30)
Shape of target: (569,)

First few rows of the dataset:
   mean radius  mean texture  mean perimeter  mean area  mean smoothness  \
0        17.99         10.38          122.80     1001.0          0.11840   
1        20.57         17.77          132.90     1326.0          0.08474   
2        19.69         21.25          130.00     1203.0          0.10960   
3        11.42         20.38           77.58      386.1          0.14250   
4        20.29         14.34          135.10     1297.0          0.10030   

   mean compactness  mean concavity  mean concave points  mean symmetry  \
0           0.27760          0.3001              0.14710         0.2419   
1           0.07864          0.0869              0.07017         0.1812   
2           0.15990          0.1974              0.12790         0.2069   
3           0.28390          0.2414              0.10520         0.2597   
4           0.13280          0


Task 3:<br>
Objective: Determine whether a transaction is fraudulent or legitimate.<br>
Use a credit card transaction dataset.<br>
Goal: Classify transactions into "fraudulent" and "legitimate" categories.

In [12]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
from sklearn.preprocessing import StandardScaler

# Load the Credit Card Fraud Detection dataset
# This dataset is available on Kaggle: https://www.kaggle.com/datasets/mlg-ulb/creditcardfraud
# You'll need to download 'creditcard.csv' and place it in the same directory as your script,
# or provide the correct path to the file.
try:
    df = pd.read_csv('creditcard.csv')
    print("Credit Card Fraud Detection dataset loaded successfully!")
    print(f"Shape of the dataset: {df.shape}")
    print("\nFirst few rows of the dataset:")
    print(df.head())

    # The 'Class' column is our target variable (1 for fraudulent, 0 for legitimate)
    X = df.drop('Class', axis=1)
    y = df['Class']

    # Scale the 'Amount' and 'Time' features for better model performance
    scaler = StandardScaler()
    X['Amount'] = scaler.fit_transform(X['Amount'].values.reshape(-1, 1))
    X['Time'] = scaler.fit_transform(X['Time'].values.reshape(-1, 1))

    # Split the data into training and testing sets
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42, stratify=y)

    # Initialize and train a Logistic Regression model
    # Adjust class_weight due to the imbalanced nature of the dataset
    model = LogisticRegression(solver='liblinear', random_state=42, class_weight='balanced')
    model.fit(X_train, y_train)

    # Make predictions on the test set
    y_pred = model.predict(X_test)

    # Evaluate the model
    accuracy = accuracy_score(y_test, y_pred)
    report = classification_report(y_test, y_pred, target_names=['legitimate', 'fraudulent'])
    confusion = confusion_matrix(y_test, y_pred)

    print("\nModel trained and evaluated!")
    print(f"Accuracy on the test set: {accuracy:.4f}")
    print("\nClassification Report:")
    print(report)
    print("\nConfusion Matrix:")
    print(confusion)

except FileNotFoundError:
    print("Error: 'creditcard.csv' not found. Please download the dataset from Kaggle and ensure it's in the correct directory.")
except Exception as e:
    print(f"An error occurred: {e}")

Error: 'creditcard.csv' not found. Please download the dataset from Kaggle and ensure it's in the correct directory.
