<a href="https://colab.research.google.com/github/azmisaud/MCALab3/blob/main/DataScienceLabQuestions.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Week11 Question5

## Generating the demo dataset.

In [None]:
import pandas as pd
import numpy as np

In [None]:
# Build a synthetic dataset: two nominal columns (x1, x2), five numeric
# columns (x3..x7) and a binary target y that is drawn independently of the
# features, then persist it as a CSV file.
num_samples=1000

# Seed NumPy's global generator so that a fresh "Restart & Run All" writes the
# same CSV every time (the same seed value is used later in this notebook).
np.random.seed(42)

# Nominal features: one of four category labels per row.
x1=np.random.choice(['A','B','C','D'],num_samples)
x2=np.random.choice(['W','X','Y','Z'],num_samples)

# Numeric features: x3/x5 are normal draws, x4/x6/x7 are uniform draws.
x3=np.random.normal(loc=50,scale=10,size=num_samples)
x4=np.random.uniform(0,100,size=num_samples)
x5=np.random.normal(loc=30,scale=5,size=num_samples)
x6=np.random.uniform(10,50,size=num_samples)
x7=np.random.uniform(0,1,size=num_samples)

# Binary target, sampled without reference to any feature column.
y=np.random.choice([0,1],num_samples)

data=pd.DataFrame({
    'x1':x1,
    'x2':x2,
    'x3':x3,
    'x4':x4,
    'x5':x5,
    'x6':x6,
    'x7':x7,
    'y':y
})

# index=False keeps the row index out of the file.
data.to_csv('demo_dataset.csv',index=False)

# The message names the file that was actually written above.
print("Dataset generated and saved as demo_dataset.csv")

## Loading the generated dataset

In [None]:
# Read the CSV written by the generation step back into a DataFrame.
data_df = pd.read_csv(
    'demo_dataset.csv',
)

## Cleaning the data

### Handling Missing Values

In [None]:
# Count the null entries in every column and show the per-column totals.
missing_values = data_df.isna().sum()
print(missing_values)

# No missing values are expected: every column was produced by a NumPy random draw.

### Handling Outliers

In [None]:
# Cap outliers with the IQR rule: values outside
# [Q1 - 1.5*IQR, Q3 + 1.5*IQR] are replaced by the nearest fence.
for col in ['x3','x4','x5','x6']:
  column=data_df[col]
  Q1=column.quantile(0.25)
  Q3=column.quantile(0.75)
  IQR=Q3-Q1
  lower_bound=Q1-1.5*IQR
  upper_bound=Q3+1.5*IQR
  # clip() applies both fences in one pass; in-range values are left unchanged.
  data_df[col]=column.clip(lower=lower_bound,upper=upper_bound)

### Label Encoding the nominal data

Importing the LabelEncoder from sklearn

In [None]:
from sklearn.preprocessing import LabelEncoder

In [None]:
# Encode each nominal column with its own LabelEncoder. Re-fitting a single
# encoder on x2 would overwrite the classes learned from x1, so the x1 codes
# could no longer be mapped back to their original labels.
label_encoders={}

for col in ['x1','x2']:
  label_encoder=LabelEncoder()
  data_df[col]=label_encoder.fit_transform(data_df[col])
  # Keep the fitted encoder so the integer codes of this column stay decodable.
  label_encoders[col]=label_encoder

# After the loop `label_encoder` refers to the encoder fitted on x2.

## Scaling the data

Importing the standard scaler from sklearn

In [None]:
from sklearn.preprocessing import StandardScaler

In [None]:
# Separate the feature columns from the target column.
X=data_df.drop(columns=['y'])
y=data_df['y']

# Only these columns are standardised. x7 was generated with uniform(0, 1) and
# the encoded x1/x2 columns are passed through unchanged.
continous_features=['x3','x4','x5','x6']

scaler=StandardScaler()

# Work on a copy so X keeps the unscaled values.
X_scaled=X.copy()
# NOTE(review): the scaler is fitted on every row, i.e. before any train/test
# split happens further down in the notebook.
X_scaled[continous_features]=scaler.fit_transform(X[continous_features])

# X_scaled and y both come from data_df and therefore share its index, so the
# column-wise concat lines the rows up as-is. Resetting the index of only one
# of them would misalign the rows whenever that index is not the default one.
final_df=pd.concat([X_scaled,y],axis=1)

## Training models on this dataset

Importing the libraries

In [None]:
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, f1_score, confusion_matrix
import seaborn as sns
import matplotlib.pyplot as plt

In [None]:
# Separate features and target again, this time from the scaled frame.
X=final_df.drop(columns=['y'])
y=final_df['y']

# Hold out 30% of the rows for evaluation; the fixed seed makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(X,y,test_size=0.3,random_state=42)

# final_df was standardised with the mean/std of ALL rows, so statistics of the
# held-out rows leaked into the training features. Re-standardise using the
# training rows only. Standardising is unaffected by an earlier positive affine
# rescale, so (up to floating-point rounding) this equals fitting the scaler on
# the raw training rows.
X_train=X_train.copy()
X_test=X_test.copy()
split_scaler=StandardScaler()
X_train[continous_features]=split_scaler.fit_transform(X_train[continous_features])
X_test[continous_features]=split_scaler.transform(X_test[continous_features])


### Logistic Regression

In [None]:
# Fit a logistic-regression classifier on the training split and score it on
# the held-out split.
log_reg=LogisticRegression(random_state=42)

log_reg.fit(X_train,y_train)

y_pred=log_reg.predict(X_test)

# Accuracy, F1 and the confusion matrix are all computed against y_test.
accuracy_reg=accuracy_score(y_test,y_pred)
f1_reg=f1_score(y_test,y_pred)
conf_matrix_reg=confusion_matrix(y_test,y_pred)

print("Accuracy:",accuracy_reg)
print("F1 Score:",f1_reg)
# Same label text as the decision-tree and random-forest cells, so the three
# reports are formatted identically (no extra indentation before the matrix).
print("Confusion Matrix : \n", conf_matrix_reg)

### Decision Tree

In [None]:
# Fit a decision tree (fixed seed) on the training split; fit() returns the
# estimator itself, so construction and fitting are chained.
decision_tree=DecisionTreeClassifier(random_state=42).fit(X_train,y_train)

# Predictions for the held-out rows.
y_pred_dt=decision_tree.predict(X_test)

# Evaluate the three metrics in a fixed order: accuracy, F1, confusion matrix.
accuracy_dt,f1_dt,conf_matrix_dt=(
    metric(y_test,y_pred_dt)
    for metric in (accuracy_score,f1_score,confusion_matrix)
)

print("Accuracy:",accuracy_dt)
print("F1 Score:",f1_dt)
print("Confusion Matrix : \n", conf_matrix_dt)

### Random Forest

In [None]:
# Fit a random forest (fixed seed) on the training split; fit() returns the
# estimator itself, so construction and fitting are chained.
random_forest=RandomForestClassifier(random_state=42).fit(X_train,y_train)

# Predictions for the held-out rows.
y_pred_rf=random_forest.predict(X_test)

# Evaluate the three metrics in a fixed order: accuracy, F1, confusion matrix.
accuracy_rf,f1_rf,conf_matrix_rf=(
    metric(y_test,y_pred_rf)
    for metric in (accuracy_score,f1_score,confusion_matrix)
)

print("Accuracy:",accuracy_rf)
print("F1 Score:",f1_rf)
print("Confusion Matrix : \n", conf_matrix_rf)

# Week12 Question5

In [None]:
import numpy as np

Generating random values for the independent variables `x1` and `x2` and for the additive term `c` (one random value per sample).

In [None]:
# Fix the global NumPy seed so the draws below are repeatable.
np.random.seed(42)

# Three consecutive draws of 1000 uniform samples each, consumed in the
# order x1, x2, c.
x1,x2,c=(np.random.rand(1000) for _ in range(3))

Defining the dependent variable.

In [None]:
# Target: quadratic in x1, linear in x2, plus the per-sample term c.
y = (x1 ** 2) + (3 * x2) + c

Combining x1 and x2 in a 2D array.

In [None]:
# Feature matrix with one row per sample: column 0 is x1, column 1 is x2.
X = np.column_stack([x1, x2])

Generating Polynomial features to train.

In [None]:
from sklearn.preprocessing import PolynomialFeatures

In [None]:
# Expand the two features into all polynomial terms up to degree 2.
poly = PolynomialFeatures(degree=2)
# Fit and transform as two explicit steps.
X_poly = poly.fit(X).transform(X)

X_poly has 6 columns [1,x1,x2,x1^2,x1*x2,x2^2]


Splitting the Dataset into Training and Testing Sets

In [None]:
from sklearn.model_selection import train_test_split

In [None]:
# Hold out 20% of the rows for testing; the fixed seed makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X_poly,
    y,
    test_size=0.2,
    random_state=42,
)

Training the model

In [None]:
from sklearn.linear_model import LinearRegression

In [None]:
# Fit ordinary least squares on the polynomial training features; fit()
# returns the estimator itself, so construction and fitting are chained.
model = LinearRegression().fit(X_train, y_train)

# Predictions for the held-out rows.
y_pred = model.predict(X_test)

Calculating Evaluation Metrics

In [None]:
from sklearn.metrics import r2_score, mean_absolute_error, mean_squared_error

In [None]:
# Coefficient of determination on the held-out rows.
r2 = r2_score(y_true=y_test, y_pred=y_pred)
print("R2 Score:", r2)

In [None]:
# Mean absolute error on the held-out rows.
mae = mean_absolute_error(y_true=y_test, y_pred=y_pred)
print("Mean Absolute Error:", mae)

In [None]:
# Mean squared error on the held-out rows.
mse = mean_squared_error(y_true=y_test, y_pred=y_pred)
print("Mean Squared Error:", mse)

# Week13 Question5

In [None]:
import numpy as np
import tensorflow as tf
from tensorflow.keras import datasets, layers, models
from sklearn.metrics import classification_report, confusion_matrix

Loading the MNIST dataset

In [None]:
# load_data() returns a (train, test) pair, each an (images, labels) tuple.
(
    (x_train, y_train),
    (x_test, y_test),
) = datasets.mnist.load_data()

### Preprocessing the data

Normalizing the images to values between 0 and 1

In [None]:
# Cast both image arrays to float32 and divide by 255.
# Note: the arrays are rebound to their scaled versions, so running this cell
# a second time would divide by 255 again.
x_train, x_test = (
    images.astype('float32') / 255.0
    for images in (x_train, x_test)
)

Converting labels to binary: 1 for the digit 8, 0 for every other digit

In [None]:
# Positive class (1) is the digit 8; every other digit becomes 0.
y_train_binary, y_test_binary = (
    np.where(labels == 8, 1, 0)
    for labels in (y_train, y_test)
)

Reshaping data to add channel dimension

In [None]:
# Append a trailing channel axis of size 1: (N, 28, 28) -> (N, 28, 28, 1).
x_train = x_train.reshape(len(x_train), 28, 28, 1)
x_test = x_test.reshape(len(x_test), 28, 28, 1)

### Building the CNN model

In [None]:
# Small CNN for the binary "is this digit an 8?" task.
model_cnn=models.Sequential([
    # Declare the input explicitly instead of passing input_shape= to the first
    # layer; newer Keras versions warn about the input_shape= form.
    layers.Input(shape=(28,28,1)),
    # Two convolution + max-pooling stages (32 then 64 filters, 3x3 kernels).
    layers.Conv2D(32,(3,3),activation='relu'),
    layers.MaxPooling2D((2,2)),
    layers.Conv2D(64,(3,3),activation='relu'),
    layers.MaxPooling2D((2,2)),
    # Flatten the feature maps and classify with a small dense head.
    layers.Flatten(),
    layers.Dense(64,activation='relu'),
    # Single sigmoid unit: one score in (0, 1) per image.
    layers.Dense(1,activation='sigmoid')
])

### Compiling the model

In [None]:
# Binary cross-entropy pairs with the single sigmoid output unit; accuracy is
# tracked as the reported metric.
model_cnn.compile(
    loss='binary_crossentropy',
    optimizer='adam',
    metrics=['accuracy'],
)

### Training the model

In [None]:
# Train for 5 epochs in batches of 64, using validation_split=0.2 to set aside
# part of the training data for validation.
model_cnn.fit(
    x_train,
    y_train_binary,
    batch_size=64,
    epochs=5,
    validation_split=0.2,
)

### Evaluating the model

In [None]:
# evaluate() returns the loss followed by the compiled metric (accuracy).
test_loss, test_acc = model_cnn.evaluate(x=x_test, y=y_test_binary)
print("Test Accuracy:", test_acc)

In [None]:
# Threshold the sigmoid outputs at 0.5 to obtain hard 0/1 predictions.
y_pred = (
    model_cnn.predict(x_test) > 0.5
).astype('int32')

In [None]:
# Confusion matrix first, then the per-class precision/recall/F1 report.
print(confusion_matrix(y_true=y_test_binary, y_pred=y_pred))
print(classification_report(y_true=y_test_binary, y_pred=y_pred))