## PART 1
#### Fit A Logistic Regression Model 

In [5]:
import pandas as pd
import numpy as np
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import confusion_matrix, accuracy_score, precision_score, recall_score, f1_score

In [6]:
# Load the Smarket dataset (replace 'your_dataset.csv' with the actual file path)
smarket_data = pd.read_csv('Smarket.csv')

In [7]:
smarket_data 

Unnamed: 0,Year,Lag1,Lag2,Lag3,Lag4,Lag5,Volume,Today,Direction
0,2001,0.381,-0.192,-2.624,-1.055,5.010,1.19130,0.959,Up
1,2001,0.959,0.381,-0.192,-2.624,-1.055,1.29650,1.032,Up
2,2001,1.032,0.959,0.381,-0.192,-2.624,1.41120,-0.623,Down
3,2001,-0.623,1.032,0.959,0.381,-0.192,1.27600,0.614,Up
4,2001,0.614,-0.623,1.032,0.959,0.381,1.20570,0.213,Up
...,...,...,...,...,...,...,...,...,...
1245,2005,0.422,0.252,-0.024,-0.584,-0.285,1.88850,0.043,Up
1246,2005,0.043,0.422,0.252,-0.024,-0.584,1.28581,-0.955,Down
1247,2005,-0.955,0.043,0.422,0.252,-0.024,1.54047,0.130,Up
1248,2005,0.130,-0.955,0.043,0.422,0.252,1.42236,-0.298,Down


In [8]:
# Create feature matrix X and target variable y
X = smarket_data[['Lag1', 'Lag2', 'Lag3', 'Lag4', 'Lag5', 'Volume']]
y = smarket_data['Direction']

In [9]:
# Fit logistic regression model
logreg_model = LogisticRegression()
logreg_model.fit(X, y)

In [10]:
# Print model coefficients, standard errors, test-statistics, p-values
coefficients = pd.DataFrame(logreg_model.coef_, columns=X.columns, index=['Coefficients'])
print(coefficients)

                  Lag1      Lag2      Lag3      Lag4      Lag5    Volume
Coefficients -0.072843 -0.042235  0.011016  0.009284  0.010264  0.132112


#### Association between Lag1 and Direction


In [11]:
# Check association between Lag1 and Direction
lag1_coefficient = coefficients['Lag1']['Coefficients']
print(f"Coefficient for Lag1: {lag1_coefficient}")

Coefficient for Lag1: -0.07284345474448234


#### Predicted Probabilities for the First Ten Observations


In [12]:
# Display predicted probabilities for the first ten observations
predicted_probabilities = logreg_model.predict_proba(X)[:10]
print("Predicted Probabilities for the First Ten Observations:")
print(predicted_probabilities)

Predicted Probabilities for the First Ten Observations:
[[0.49265655 0.50734345]
 [0.51825557 0.48174443]
 [0.51870199 0.48129801]
 [0.48465057 0.51534943]
 [0.48900868 0.51099132]
 [0.49293516 0.50706484]
 [0.50725062 0.49274938]
 [0.49072089 0.50927911]
 [0.48216707 0.51783293]
 [0.51090531 0.48909469]]


#### Convert Predicted Probabilities into Class Labels


In [13]:
# Convert predicted probabilities into class labels
predicted_labels = np.where(predicted_probabilities[:, 1] > 0.5, 'Up', 'Down')
print("Class Labels for the First Ten Observations:")
print(predicted_labels)


Class Labels for the First Ten Observations:
['Up' 'Down' 'Down' 'Up' 'Up' 'Up' 'Down' 'Up' 'Up' 'Down']


#### Confusion Matrix and Performance Metrics


In [14]:
# Make predictions on the entire dataset
y_pred = logreg_model.predict(X)

# Create a confusion matrix
conf_matrix = confusion_matrix(y, y_pred)
print("Confusion Matrix:")
print(conf_matrix)

# Calculate performance metrics
accuracy = accuracy_score(y, y_pred)
precision = precision_score(y, y_pred, pos_label='Up')
recall = recall_score(y, y_pred, pos_label='Up')
f1 = f1_score(y, y_pred, pos_label='Up')

# Display performance metrics
print(f"Accuracy: {accuracy:.4f}")
print(f"Precision: {precision:.4f}")
print(f"Recall: {recall:.4f}")
print(f"F1 Score: {f1:.4f}")


Confusion Matrix:
[[144 458]
 [141 507]]
Accuracy: 0.5208
Precision: 0.5254
Recall: 0.7824
F1 Score: 0.6286


### PART 2

#### Split the Data into Training and Test Sets


In [15]:
from sklearn.model_selection import train_test_split

# Assuming X and y are already defined from the previous code
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)


### Fit a Gaussian Naive Bayes Model


In [16]:
from sklearn.naive_bayes import GaussianNB

# Fit Gaussian Naive Bayes model
naive_bayes_model = GaussianNB()
naive_bayes_model.fit(X_train, y_train)


#### Get Unique Classes and Prior Probabilities


In [17]:
# Unique classes present in the target variable
classes = naive_bayes_model.classes_
print("Unique Classes:", classes)

# Prior probabilities of each class
prior_probabilities = naive_bayes_model.class_prior_
print("Prior Probabilities:", prior_probabilities)


Unique Classes: ['Down' 'Up']
Prior Probabilities: [0.47 0.53]


#### Mean and Variance of Each Feature for Each Class


In [20]:
# Calculate feature variances using the training data
feature_variances = pd.DataFrame(X_train.var(), columns=['Variance'])
print("Variance of Each Feature in the Training Data:")
print(feature_variances)


Variance of Each Feature in the Training Data:
        Variance
Lag1    1.238739
Lag2    1.245333
Lag3    1.261594
Lag4    1.314809
Lag5    1.311143
Volume  0.129911


In [21]:
# Predict probabilities for each class for the first five samples in the test set
probabilities = naive_bayes_model.predict_proba(X_test[:5])
print("Predicted Probabilities for the First Five Samples in the Test Set:")
print(probabilities)

# Predict labels for the test set
predicted_labels = naive_bayes_model.predict(X_test)


Predicted Probabilities for the First Five Samples in the Test Set:
[[0.44486591 0.55513409]
 [0.45801237 0.54198763]
 [0.78579015 0.21420985]
 [0.46616397 0.53383603]
 [0.57562105 0.42437895]]


In [22]:
# Create a confusion matrix for the test set
conf_matrix_nb = confusion_matrix(y_test, predicted_labels)
print("Confusion Matrix for Naive Bayes Classifier:")
print(conf_matrix_nb)

# Calculate performance metrics (accuracy, precision, recall, F1-score)
accuracy_nb = accuracy_score(y_test, predicted_labels)
precision_nb = precision_score(y_test, predicted_labels, pos_label='Up')
recall_nb = recall_score(y_test, predicted_labels, pos_label='Up')
f1_nb = f1_score(y_test, predicted_labels, pos_label='Up')

# Display performance metrics for Naive Bayes Classifier
print(f"Accuracy: {accuracy_nb:.4f}")
print(f"Precision: {precision_nb:.4f}")
print(f"Recall: {recall_nb:.4f}")
print(f"F1 Score: {f1_nb:.4f}")


Confusion Matrix for Naive Bayes Classifier:
[[ 21 111]
 [ 22  96]]
Accuracy: 0.4680
Precision: 0.4638
Recall: 0.8136
F1 Score: 0.5908
