https://drive.google.com/file/d/1kKFB8IPHhAFmo59KrUAgVVU4nSCpgYVC/view?usp=drive_link

In [None]:
from google.colab import drive

# Mount Google Drive inside the Colab VM so files stored there can be read via a local path.
MOUNT_POINT = '/content/gdrive'
drive.mount(MOUNT_POINT)

Mounted at /content/gdrive


In [None]:
cd /content/gdrive/My Drive/CSE4020_ML

/content/gdrive/My Drive/CSE4020_ML


In [None]:
%matplotlib inline
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split

# Task 1 - Emails Dataset

In [None]:
# Read the e-mail dataset from the working directory and preview its first rows.
EMAIL_CSV = 'email.csv'
data = pd.read_csv(EMAIL_CSV)
data.head()

Unnamed: 0,x0,x1,x2,x3,x4,x5,class
0,1,1,1,0,1,1,1
1,1,0,0,1,1,0,0
2,1,0,1,1,0,0,1
3,1,1,0,0,1,0,0
4,1,1,0,1,0,1,1


In [None]:
# Labels come from the 'class' column; every remaining column is a feature.
# Both are pulled out as plain NumPy arrays for the hand-written trainer.
y = data['class'].values
X = data.drop(columns=['class']).values

In [None]:
def sigmoid(z):
    """Numerically stable logistic function ``1 / (1 + exp(-z))``.

    The naive formula overflows in ``np.exp(-z)`` for large negative ``z``
    (emitting a RuntimeWarning). Here the exponential is only ever taken of a
    non-positive number, so it stays in ``[0, 1]`` and cannot overflow.

    Parameters
    ----------
    z : scalar or array-like of real numbers.

    Returns
    -------
    NumPy float for scalar input, otherwise an ndarray with the shape of ``z``.
    """
    z = np.asarray(z, dtype=float)
    # exp(-|z|) <= 1 for every z, so neither branch below can overflow.
    decay = np.exp(-np.abs(z))
    # z >= 0: 1 / (1 + e^-z);  z < 0: e^z / (1 + e^z) -- the same function, rearranged.
    result = np.where(z >= 0, 1.0 / (1.0 + decay), decay / (1.0 + decay))
    # Indexing with () turns a 0-d result back into a scalar and is a no-op for arrays.
    return result[()]

def predict(X, weights):
    """Return ``sigmoid(X · weights)``: the model's sigmoid-squashed linear score per row of ``X``."""
    linear_score = np.dot(X, weights)
    return sigmoid(linear_score)

def train(X, y, learning_rate, num_iterations):
    """Fit logistic-regression weights by full-batch gradient descent.

    Starts from an all-zero weight vector (one entry per column of ``X``) and
    performs ``num_iterations`` updates. Each update moves the weights against
    ``X.T @ (predict(X, w) - y) / len(X)``, scaled by ``learning_rate``.

    Returns the weight vector after the final update.
    """
    n_samples = len(X)
    weights = np.zeros(X.shape[1])

    for _ in range(num_iterations):
        # Difference between current predicted probabilities and the labels.
        residual = predict(X, weights) - y
        step = learning_rate * (np.dot(X.T, residual) / n_samples)
        weights -= step

    return weights

In [None]:
# Train the logistic regression model
# Hyper-parameters for the hand-written gradient-descent trainer.
num_iterations = 1000
learning_rate = 0.1

# Fit the logistic regression model on the e-mail features and labels.
weights = train(X, y, learning_rate=learning_rate, num_iterations=num_iterations)

In [None]:
# Show the weight vector returned by train(): one weight per column of X.
print("Parameter values learnt after training:")
weights

Parameter values learnt after training:


array([ 0.42240117, -0.38713392,  2.82181417, -0.30363687, -3.66854201,
        3.86254666])

In [None]:
# Hand-written evaluation set: eight rows of five binary feature values each.
raw_test_features = [
    [0, 1, 0, 0, 0],
    [1, 1, 1, 0, 1],
    [0, 1, 1, 0, 0],
    [1, 0, 1, 0, 0],
    [0, 1, 0, 0, 1],
    [0, 0, 0, 1, 1],
    [0, 1, 0, 1, 1],
    [0, 0, 0, 0, 1],
]

# Expected class label for each of the rows above.
y_test = np.array([1, 1, 1, 1, 0, 0, 0, 0])

# Prepend a constant-1 column so the rows have as many entries as the learned weight vector.
bias_column = np.ones(len(raw_test_features))
X_test = np.column_stack((bias_column, np.array(raw_test_features)))

In [None]:
def accuracy(y_true, y_pred):
    """Return the fraction of predictions that equal the corresponding true label.

    Inputs are coerced to NumPy arrays first. Without that, comparing two plain
    lists with ``==`` yields a single bool and the score would be silently wrong.

    Parameters
    ----------
    y_true : array-like of true labels.
    y_pred : array-like of predicted labels, same shape as ``y_true``.

    Returns
    -------
    numpy.float64 between 0 and 1.

    Raises
    ------
    ValueError
        If the two inputs have different shapes, or if they are empty
        (accuracy is undefined for zero samples).
    """
    y_true = np.asarray(y_true)
    y_pred = np.asarray(y_pred)
    if y_true.shape != y_pred.shape:
        raise ValueError(
            f"y_true and y_pred must have the same shape, got {y_true.shape} and {y_pred.shape}"
        )
    if y_true.size == 0:
        raise ValueError("accuracy is undefined for empty inputs")
    # Mean of the element-wise match mask == (number correct) / (number of samples).
    return np.mean(y_true == y_pred)

In [None]:
# Turn the predicted probabilities into hard 0/1 labels by rounding,
# then score them against the hand-written test labels.
probabilities = predict(X_test, weights)
y_pred = np.round(probabilities)
print(y_pred)
test_accuracy = accuracy(y_test, y_pred)
test_accuracy

[1. 1. 1. 0. 1. 1. 1. 1.]


0.375

* SKLEARN IMPLEMENTATION

In [None]:
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score

In [None]:
# Fit scikit-learn's LogisticRegression (default settings) on the same X and y
# that were used for the hand-written trainer.
clf = LogisticRegression().fit(X, y)

print("Parameter values learnt after training:")
for learned_param in (clf.coef_, clf.intercept_):
    print(learned_param)

Parameter values learnt after training:
[[-0.05257611 -0.09600716  1.04681562 -0.09599496 -1.36447313  1.36446904]]
[0.20807695]


In [None]:
# Class labels predicted by the fitted scikit-learn model for the hand-written test rows.
y_pred = clf.predict(X_test)
y_pred

array([1, 1, 1, 0, 1, 1, 1, 1], dtype=int64)

In [None]:
# Keep the score under its own name: binding it to `accuracy` would overwrite the
# accuracy() helper defined earlier, so re-running the manual evaluation cell would fail.
sklearn_accuracy = accuracy_score(y_test, y_pred)
sklearn_accuracy

0.375

# Task 2 - Heart Dataset

In [None]:
# Read the heart dataset from the working directory and preview its first rows.
HEART_CSV = 'heart.csv'
data = pd.read_csv(HEART_CSV)
data.head()

Unnamed: 0,male,age,education,currentSmoker,cigsPerDay,BPMeds,prevalentStroke,prevalentHyp,diabetes,totChol,sysBP,diaBP,BMI,heartRate,glucose,TenYearCHD
0,1,39,4.0,0,0.0,0.0,0,0,0,195.0,106.0,70.0,26.97,80.0,77.0,0
1,0,46,2.0,0,0.0,0.0,0,0,0,250.0,121.0,81.0,28.73,95.0,76.0,0
2,1,48,1.0,1,20.0,0.0,0,0,0,245.0,127.5,80.0,25.34,75.0,70.0,0
3,0,61,3.0,1,30.0,0.0,0,1,0,225.0,150.0,95.0,28.58,65.0,103.0,1
4,0,46,3.0,1,23.0,0.0,0,0,0,285.0,130.0,84.0,23.1,85.0,85.0,0


In [None]:
# Replace null values with the mean of each column
# (disabled pandas one-liner; the SimpleImputer cell that follows performs the mean imputation)
# data.fillna(data.mean(), inplace=True)

In [None]:
from sklearn.impute import SimpleImputer

# Fill every missing entry with the mean of its column. fit_transform returns a
# bare array, so it is wrapped back into a DataFrame with the original column names.
imputer = SimpleImputer(strategy='mean')
data_imputed = pd.DataFrame(imputer.fit_transform(data), columns=data.columns)

# Per-column count of nulls still present after imputation.
remaining_nulls = data_imputed.isnull().sum()
print(remaining_nulls)

age         0
sex         0
cp          0
trestbps    0
chol        0
fbs         0
restecg     0
thalach     0
exang       0
oldpeak     0
slope       0
ca          0
thal        0
target      0
dtype: int64


In [None]:
# The `TenYearCHD` column is the label; all other imputed columns are features.
TARGET_COLUMN = 'TenYearCHD'
y = data_imputed[TARGET_COLUMN]
X = data_imputed.drop(columns=[TARGET_COLUMN])
X

Unnamed: 0,age,sex,cp,trestbps,chol,fbs,restecg,thalach,exang,oldpeak,slope,ca,thal
0,52,1,0,125,212,0,1,168,0,1.0,2,2,3
1,53,1,0,140,203,1,0,155,1,3.1,0,0,3
2,70,1,0,145,174,0,1,125,1,2.6,0,0,3
3,61,1,0,148,203,0,1,161,0,0.0,2,1,3
4,62,0,0,138,294,1,1,106,0,1.9,1,3,2
...,...,...,...,...,...,...,...,...,...,...,...,...,...
1020,59,1,1,140,221,0,1,164,1,0.0,2,0,2
1021,60,1,0,125,258,0,0,141,1,2.8,1,1,3
1022,47,1,0,110,275,0,0,118,1,1.0,1,1,2
1023,50,0,0,110,254,0,0,159,0,0.0,2,0,2


In [None]:
from sklearn.preprocessing import StandardScaler

# Standardise each feature column, then wrap the resulting array back into a
# DataFrame that carries the original column names.
scaler = StandardScaler()
X_norm = pd.DataFrame(scaler.fit_transform(X), columns=X.columns)
X_norm

Unnamed: 0,age,sex,cp,trestbps,chol,fbs,restecg,thalach,exang,oldpeak,slope,ca,thal
0,-0.268437,0.661504,-0.915755,-0.377636,-0.659332,-0.418878,0.891255,0.821321,-0.712287,-0.060888,0.995433,1.209221,1.089852
1,-0.158157,0.661504,-0.915755,0.479107,-0.833861,2.387330,-1.004049,0.255968,1.403928,1.727137,-2.243675,-0.731971,1.089852
2,1.716595,0.661504,-0.915755,0.764688,-1.396233,-0.418878,0.891255,-1.048692,1.403928,1.301417,-2.243675,-0.731971,1.089852
3,0.724079,0.661504,-0.915755,0.936037,-0.833861,-0.418878,0.891255,0.516900,-0.712287,-0.912329,0.995433,0.238625,1.089852
4,0.834359,-1.511706,-0.915755,0.364875,0.930822,2.387330,0.891255,-1.874977,-0.712287,0.705408,-0.624121,2.179817,-0.522122
...,...,...,...,...,...,...,...,...,...,...,...,...,...
1020,0.503520,0.661504,0.055931,0.479107,-0.484803,-0.418878,0.891255,0.647366,1.403928,-0.912329,0.995433,-0.731971,-0.522122
1021,0.613800,0.661504,-0.915755,-0.377636,0.232705,-0.418878,-1.004049,-0.352873,1.403928,1.471705,-0.624121,0.238625,1.089852
1022,-0.819834,0.661504,-0.915755,-1.234378,0.562371,-0.418878,-1.004049,-1.353113,1.403928,-0.060888,-0.624121,0.238625,-0.522122
1023,-0.488996,-1.511706,-0.915755,-1.234378,0.155137,-0.418878,-1.004049,0.429923,-0.712287,-0.912329,0.995433,-0.731971,-0.522122


Concatenate the normalized features and target variable

```
normalized_data = pd.concat([X_norm, y], axis=1)
```

In [None]:
# Split the raw (unscaled) features first, then fit the scaler on the training rows only.
# Fitting StandardScaler on the whole dataset before splitting leaks the test rows'
# mean/variance into the training features and inflates the reported score.
X_train_raw, X_test_raw, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
split_scaler = StandardScaler().fit(X_train_raw)
X_train = pd.DataFrame(split_scaler.transform(X_train_raw), columns=X.columns, index=X_train_raw.index)
# The test rows are transformed with the statistics learned from the training rows.
X_test = pd.DataFrame(split_scaler.transform(X_test_raw), columns=X.columns, index=X_test_raw.index)

In [None]:
# Fit scikit-learn's LogisticRegression (default settings) on the training split.
clf = LogisticRegression().fit(X_train, y_train)

print("Parameter values learnt after training:")
for learned_param in (clf.coef_, clf.intercept_):
    print(learned_param)

Parameter values learnt after training:
[[-0.00608848 -0.83891308  0.87743232 -0.31825342 -0.46437816 -0.06299965
   0.14045234  0.65554293 -0.4176307  -0.78697974  0.34923191 -0.84335586
  -0.67434124]]
[-0.15210384]


In [None]:
# Class labels predicted by the fitted model for the held-out test split.
y_pred = clf.predict(X_test)
y_pred

array([1, 1, 0, 1, 0, 1, 0, 0, 1, 0, 1, 0, 1, 1, 0, 1, 0, 1, 1, 0, 1, 0,
       0, 0, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 0, 0, 1, 0,
       0, 0, 0, 0, 1, 0, 1, 0, 1, 0, 1, 1, 0, 0, 1, 1, 1, 0, 0, 0, 0, 0,
       1, 1, 0, 1, 1, 0, 0, 1, 1, 1, 0, 1, 1, 1, 0, 0, 0, 0, 1, 0, 1, 0,
       0, 1, 0, 0, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 1, 1, 0, 1, 0, 1, 0, 1,
       1, 1, 1, 0, 1, 1, 1, 1, 1, 0, 0, 1, 0, 0, 0, 0, 1, 1, 1, 1, 1, 0,
       1, 0, 0, 1, 0, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 0, 1, 0, 1, 1,
       0, 0, 1, 1, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 1, 1, 0, 1, 1,
       1, 0, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 1, 0,
       1, 1, 1, 1, 1, 0, 0], dtype=int64)

In [None]:
# Keep the score under its own name: binding it to `accuracy` would overwrite the
# accuracy() helper defined in Task 1 and break any re-run of the cells that call it.
heart_accuracy = accuracy_score(y_test, y_pred)
heart_accuracy

0.7951219512195122