Title: Train a Classification Model (Logistic Regression)

Task 1: Email Spam Detection<br>
Dataset: Use a dataset containing labeled emails as spam or not-spam, with features such as word frequency.

In [1]:
# Write your code here
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import accuracy_score
import pandas as pd

# Load the dataset
# Assuming a dataset with columns: 'Email_Text', 'Label' (0 for not-spam, 1 for spam)
# Example dataset (replace with actual dataset)
data = {
    'Email_Text': [
        'Free money now', 'Hello, how are you?', 'Limited time offer', 
        'Meet me at 5pm', 'Congratulations, you won a prize', 'Let\'s catch up soon'
    ],
    'Label': [1, 0, 1, 0, 1, 0]
}

df = pd.DataFrame(data)

# Convert email text into feature vectors using CountVectorizer
vectorizer = CountVectorizer()
X = vectorizer.fit_transform(df['Email_Text'])

# Define target variable
y = df['Label']

# Split the dataset into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Create and train the Naive Bayes model
model = MultinomialNB()
model.fit(X_train, y_train)

# Make predictions
y_pred = model.predict(X_test)

# Calculate accuracy
accuracy = accuracy_score(y_test, y_pred)

print(f"Accuracy: {accuracy}")
print(f"Predictions on Test Set: {y_pred}")


Accuracy: 0.5
Predictions on Test Set: [1 1]


Task 2: Predicting Tumor Malignancy<br>
Dataset: Use a dataset that contains features from tumor datasets such as size and shape indicators.

In [2]:
# Write your code here
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
import pandas as pd

# Load the dataset
# Assuming a dataset with columns: 'Size', 'Shape', 'Malignancy' (0 for benign, 1 for malignant)
# Example dataset (replace with actual dataset)
data = {
    'Size': [15.5, 22.1, 18.7, 25.4, 30.2, 18.9, 28.5, 16.7, 21.2, 23.3],
    'Shape': [1.2, 1.5, 1.1, 1.7, 1.8, 1.3, 1.4, 1.2, 1.6, 1.3],
    'Malignancy': [0, 1, 0, 1, 1, 0, 1, 0, 1, 0]
}

df = pd.DataFrame(data)

# Define features and target variable
X = df[['Size', 'Shape']]
y = df['Malignancy']

# Split the dataset into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Create and train the logistic regression model
model = LogisticRegression()
model.fit(X_train, y_train)

# Make predictions
y_pred = model.predict(X_test)

# Calculate accuracy
accuracy = accuracy_score(y_test, y_pred)

print(f"Accuracy: {accuracy}")
print(f"Predictions on Test Set: {y_pred}")


Accuracy: 0.0
Predictions on Test Set: [0 0]


Task 3: Wine Quality Classification<br>
Dataset: Use a dataset that contains chemical properties of wine along with a quality rating.
Treat quality as a binary classification (high vs. low).

In [3]:
# Write your code here
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score
import pandas as pd

# Load the dataset
# Assuming a dataset with columns: 'Alcohol', 'Acidity', 'Sugar', ..., 'Quality' (0 for low, 1 for high quality)
# Example dataset (replace with actual dataset)
data = {
    'Alcohol': [12.5, 11.0, 13.5, 10.0, 12.0, 13.0, 11.5, 13.2, 11.9, 14.0],
    'Acidity': [3.5, 2.9, 3.2, 3.6, 3.1, 3.0, 3.7, 3.3, 3.4, 3.2],
    'Sugar': [5.0, 4.5, 6.0, 5.2, 4.8, 5.1, 5.3, 5.4, 5.6, 5.7],
    'Quality': [1, 0, 1, 0, 1, 1, 0, 1, 0, 1]  # 1 = high, 0 = low
}

df = pd.DataFrame(data)

# Define features and target variable
X = df[['Alcohol', 'Acidity', 'Sugar']]
y = df['Quality']

# Split the dataset into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Create and train the Random Forest model
model = RandomForestClassifier(random_state=42)
model.fit(X_train, y_train)

# Make predictions
y_pred = model.predict(X_test)

# Calculate accuracy
accuracy = accuracy_score(y_test, y_pred)

print(f"Accuracy: {accuracy}")
print(f"Predictions on Test Set: {y_pred}")


Accuracy: 0.0
Predictions on Test Set: [1 1]
