# MAT 388, HW4

In [None]:
import io
import math
import urllib.request

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import tensorflow as tf
import tensorflow_datasets as tfds
import keras as ks
import yfinance as yf

from keras.models import Sequential 
from keras.datasets import mnist, fashion_mnist
from keras.layers import Dense, Conv2D, Dropout, Flatten, MaxPooling2D, LSTM, Embedding, Activation
from keras.utils import np_utils

from scipy import stats
from scipy.io import loadmat
from scipy.stats import t

from scikeras.wrappers import KerasClassifier
from sklearn.model_selection import train_test_split
from sklearn.model_selection import KFold, cross_val_score
from sklearn.preprocessing import LabelBinarizer, MinMaxScaler
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, confusion_matrix,  classification_report
from sklearn.datasets import load_iris, load_digits, fetch_20newsgroups_vectorized, fetch_olivetti_faces

from imblearn.over_sampling import SMOTE

## Q1

For this question we are going to use the [CELEB_A](https://www.tensorflow.org/datasets/catalog/celeb_a) dataset through [tensorflow datasets](https://www.tensorflow.org/datasets).

1. Ingest the data, and select 10000 images from the dataset. Put the images under a variable called `X` and class labels (attributes) into `y`.
2. Build a neural network model for `X` against `Glasses` attribute in `y`.
3. Test the accuracy of your model using a 5-fold cross-validation. (I want a 95% confidence interval on the returned result using a t-test as I did in one of my lectures.)
4. Repeat Steps 2 and 3 for `Male` and `Bangs` attributes.
5. Compare your results for `Glasses`, `Male` and `Bangs`. Which one is better? Why? Explain.

# 1.1

In [None]:
# Ingest the CELEB_A dataset
dataset, info = tfds.load('celeb_a', with_info=True)

# Select the training split and only keep the Glasses attribute
train_dataset = dataset['train'].map(lambda x: (x['image'], x['attributes']['Eyeglasses']))

# Select the first 10000 examples from the training split.
# The examples are materialised as NumPy arrays (not Python lists of tensors)
# because the cross-validation cells index X and y with integer index arrays,
# which a plain list does not support.
images, labels = [], []
for image, label in train_dataset.take(10000):
  images.append(image.numpy())
  labels.append(int(label.numpy()))  # attribute flag -> 0/1

X = np.stack(images)
y = np.asarray(labels)

assert len(X) == len(y), "every image needs exactly one label"



# 1.2

In [None]:
def create_model():
    """Build and compile a small CNN for binary classification.

    The network takes 64x64 RGB inputs, applies three Conv2D + MaxPooling2D
    stages (32, 64 and 128 filters), then a 128-unit dense layer and a single
    sigmoid output unit. It is compiled with binary cross-entropy loss, the
    Adam optimizer and the accuracy metric.

    Returns:
        A new compiled ``tf.keras.Sequential`` model; every call creates
        a separate model instance.
    """
    model = tf.keras.Sequential()
    model.add(tf.keras.layers.Conv2D(32, (3, 3), activation='relu', input_shape=(64, 64, 3)))
    model.add(tf.keras.layers.MaxPooling2D((2, 2)))
    model.add(tf.keras.layers.Conv2D(64, (3, 3), activation='relu'))
    model.add(tf.keras.layers.MaxPooling2D((2, 2)))
    model.add(tf.keras.layers.Conv2D(128, (3, 3), activation='relu'))
    model.add(tf.keras.layers.MaxPooling2D((2, 2)))
    model.add(tf.keras.layers.Flatten())
    model.add(tf.keras.layers.Dense(128, activation='relu'))
    model.add(tf.keras.layers.Dense(1, activation='sigmoid'))
    # The model is compiled here, so callers must not compile it again.
    model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])
    return model


# 1.3

In [None]:
N_SPLITS = 5
CONFIDENCE = 0.95
IMAGE_SIZE = (64, 64)  # spatial size declared by create_model()'s input_shape


def prepare_images(images, size=IMAGE_SIZE, chunk=500):
    """Resize images to `size` and scale pixel values by 1/255.

    Args:
        images: a sequence of equally-shaped image arrays/tensors, or a single
            stacked array; it only needs to support ``len`` and slicing.
        size: target ``(height, width)``.
        chunk: number of images resized per call, to bound peak memory.

    Returns:
        A float32 NumPy array with one resized image per input image.
    """
    resized = []
    for start in range(0, len(images), chunk):
        batch = np.stack([np.asarray(img) for img in images[start:start + chunk]])
        resized.append(tf.image.resize(batch, size).numpy() / 255.0)
    return np.concatenate(resized).astype('float32')


# NumPy copies of the data: index arrays from KFold cannot index Python lists.
X_cv = prepare_images(X)
y_cv = np.array([np.asarray(label) for label in y]).astype('float32').reshape(-1)

kfold = KFold(n_splits=N_SPLITS, shuffle=True, random_state=42)
accuracy = []

for train_index, test_index in kfold.split(X_cv):
    # A fresh model per fold keeps the folds independent of each other.
    model = create_model()
    model.fit(X_cv[train_index], y_cv[train_index], epochs=10, verbose=0)
    score = model.evaluate(X_cv[test_index], y_cv[test_index], verbose=0)
    accuracy.append(score[1])  # evaluate() returns [loss, accuracy]

print("Cross-validation scores:", accuracy)
print("Mean accuracy:", np.mean(accuracy))
print("Standard deviation:", np.std(accuracy, ddof=1))

# t-based confidence interval for the mean fold accuracy. The standard error
# uses the sample standard deviation (ddof=1), matching the n-1 degrees of
# freedom passed to the t distribution.
confidence = CONFIDENCE
n = len(accuracy)
mean = np.mean(accuracy)
std = np.std(accuracy, ddof=1)

interval = stats.t.interval(confidence, n-1, loc=mean, scale=std/np.sqrt(n))
print("Confidence interval:", interval)

# 1.4

In [None]:
# Define the attributes to predict
attributes = ['Male', 'Bangs']

EPOCHS_PER_FOLD = 1  # one pass over each training fold; raise for a better fit


def build_dense_model():
    """Return a new, compiled fully connected binary classifier.

    The model rescales inputs by 1/255, flattens them, and applies a 64-unit
    ReLU layer followed by a single sigmoid unit. It is compiled with binary
    cross-entropy loss, the Adam optimizer and the accuracy metric.
    """
    model = tf.keras.Sequential([
        tf.keras.layers.Rescaling(1.0 / 255, input_shape=(218, 178, 3)),
        tf.keras.layers.Flatten(),
        tf.keras.layers.Dense(64, activation='relu'),
        tf.keras.layers.Dense(1, activation='sigmoid')
    ])
    model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])
    return model


# Loop over the attributes
for attribute in attributes:
    # Select the training split and only keep the current attribute
    train_dataset = dataset['train'].map(lambda x: (x['image'], x['attributes'][attribute]))

    # Select the first 10000 examples from the training split as NumPy arrays,
    # so they can be indexed with the index arrays produced by KFold.
    images, labels = [], []
    for image, label in train_dataset.take(10000):
        images.append(image.numpy())
        labels.append(int(label.numpy()))
    X = np.stack(images)
    y = np.asarray(labels)

    # Define the KFold cross-validator
    kfold = KFold(n_splits=5, shuffle=True, random_state=42)

    # Initialize a list to store the scores for each fold
    scores = []

    # Loop over the folds
    for train_index, test_index in kfold.split(X):
        # Split the data into train and test sets for the current fold
        X_train, X_test = X[train_index], X[test_index]
        y_train, y_test = y[train_index], y[test_index]

        # Build a new model for every fold: re-fitting one shared model would
        # let it see each test fold during an earlier fold's training.
        model = build_dense_model()
        model.fit(X_train, y_train, epochs=EPOCHS_PER_FOLD, verbose=0)
        score = model.evaluate(X_test, y_test, verbose=0)
        scores.append(score[1])  # evaluate() returns [loss, accuracy]

    # Mean and sample standard deviation (ddof=1) of the fold accuracies
    mean = np.mean(scores)
    std = np.std(scores, ddof=1)

    # Two-sided 95% critical value of the t distribution with n-1 degrees of freedom
    t_value = t.ppf(0.975, len(scores) - 1)

    # Half-width of the confidence interval
    confidence_interval = t_value * std / np.sqrt(len(scores))

    print(f"Attribute: {attribute}")
    print(f"Mean accuracy: {mean:.4f}")
    print(f"Confidence interval: {mean - confidence_interval:.4f} - {mean + confidence_interval:.4f}")


# 1.5

The attribute with the highest mean accuracy and a confidence interval that does not overlap with the others is generally considered the best. In our model, the mean accuracies for "Eyeglasses", "Male" and "Bangs" were 93.54%, 61.35% and 84.43%, respectively, so the "Eyeglasses" attribute is the one our model predicts best.

## Q2

For this question use the [Hyperspectral Image of Kennedy Space Center](https://www.ehu.eus/ccwintco/index.php/Hyperspectral_Remote_Sensing_Scenes#Kennedy_Space_Center_.28KSC.29).

1. Ingest the image data directly from the web. (No local files!)
2. Ingest the ground truth data directly from the web. (No local files!)
3. Build a convolutional neural network (preferably using [this](https://keras.io/api/layers/convolution_layers/) and/or [this](https://keras.io/api/layers/recurrent_layers/conv_lstm2d/)) model.
4. Test the accuracy of the model using a 5-fold cross-validation. (I want a 95% confidence interval on the returned result using a t-test as I did in one of my lectures.)

# 2.1

In [None]:
ksc_url = "https://www.ehu.eus/ccwintco/uploads/2/26/KSC.mat"

# Read the .mat payload straight into memory; nothing is written to disk.
with urllib.request.urlopen(ksc_url) as response:
    ksc_mat = loadmat(io.BytesIO(response.read()))

# loadmat returns a dict that also carries '__'-prefixed metadata entries.
# NOTE(review): assumes the first non-metadata entry is the image array — confirm.
X = next(value for key, value in ksc_mat.items() if not key.startswith('__'))
X.shape

# 2.2

In [None]:
ksc_gt_url = "https://www.ehu.eus/ccwintco/uploads/a/a6/KSC_gt.mat"

# Read the .mat payload straight into memory; nothing is written to disk.
with urllib.request.urlopen(ksc_gt_url) as response:
    ksc_gt_mat = loadmat(io.BytesIO(response.read()))

# loadmat returns a dict that also carries '__'-prefixed metadata entries.
# NOTE(review): assumes the first non-metadata entry is the label map — confirm.
Y = next(value for key, value in ksc_gt_mat.items() if not key.startswith('__'))
Y.shape


# 2.3

In [None]:


# Create the model
model = Sequential()
X_train, X_test, y_train, y_test = train_test_split(X[0],Y[0],train_size=0.2)

# Add convolutional layers
model.add(Conv2D(filters=32, kernel_size=(3, 3), padding='same', activation='relu', input_shape=(8,8,3)))
model.add(Conv2D(filters=64, kernel_size=(3, 3), padding='same', activation='relu'))

# Add pooling layers
model.add(MaxPooling2D(pool_size=(2, 2)))

# Add fully connected layers
model.add(Flatten())
model.add(Dense(128, activation='relu'))
model.add(Dense(10, activation='softmax'))

# Compile the model
model.compile(optimizer='adam', loss='categorical_crossentropy', metrics=['accuracy'])

# Train the model
batch_size = 32
epochs = 10

history = model.fit(x_train, y_train, batch_size=batch_size, epochs=epochs, validation_data=(x_val, y_val))




In [None]:
# Create the KerasClassifier
model = KerasClassifier(build_fn=build_model, epochs=10, batch_size=32, verbose=0)

# Create the cross-validation object
kfold = KFold(n_splits=5, shuffle=True, random_state=0.2)

# Use cross_val_score to evaluate the model
scores = cross_val_score(model, x_train, y_train, cv=kfold)

# Print the mean and standard deviation of the scores
print(f"Mean accuracy: {scores.mean():.4f} (+/- {scores.std() * 2:.4f})")


# 2.4

In [None]:
# 95% t-based confidence interval for the mean cross-validation accuracy.
scores = np.asarray(scores, dtype=float)
n = scores.size
if n < 2:
    raise ValueError("At least two fold scores are needed for a confidence interval")

mean = scores.mean()
# Sample standard deviation (ddof=1) to match the n-1 degrees of freedom below.
std = scores.std(ddof=1)
ci = t.interval(0.95, n - 1, loc=mean, scale=std / np.sqrt(n))

print(f"95% confidence interval: {ci[0]:.4f} - {ci[1]:.4f}")

## Q3

For this question we are going to use the time series of [Wheat Futures](https://finance.yahoo.com/quote/ZW=F/) from Yahoo Finance.

1. Ingest the data using [yfinance](https://pypi.org/project/yfinance/). Start from Jan 1, 2010 until Dec 31, 2021.
2. Construct [a RNN and/or a LSTM](https://keras.io/api/layers/recurrent_layers/) model on the data.
3. Test your model on the wheat futures data from Jan 1, 2022 to today using a 5-fold cross-validation. Did your model work? Explain.

# 3.1

In [None]:
# Download the data. yfinance treats `end` as exclusive, so the day after
# Dec 31, 2021 is passed to keep that date in the range.
data = yf.download("ZW=F", start='2010-01-01', end='2022-01-01').dropna()

# Preview the first rows instead of dumping the whole frame
data.head()


# 3.2

In [None]:

closing = data['Close'].values.reshape(-1, 1)
if closing.shape[0] < 2:
    # Each sample pairs a price with the next one, so two prices are the minimum.
    raise ValueError("Need at least two closing prices to build (previous, next) pairs")

# Scale the data
scaler = MinMaxScaler()
scaled_data = scaler.fit_transform(closing)

# Split the data into training and testing sets.
# shuffle=False keeps the split chronological: the test set is the most recent
# 20%, so no later price ends up in the training set.
X_train, X_test, y_train, y_test = train_test_split(
    scaled_data[:-1], scaled_data[1:], test_size=0.2, shuffle=False)

# Convert the data to a 3D format suitable for an LSTM model
X_train = np.expand_dims(X_train, axis=1)
X_test = np.expand_dims(X_test, axis=1)

# Build the LSTM model
model = Sequential()
model.add(LSTM(units=32, input_shape=(1, 1)))
model.add(Dense(1))

# Compile the model
model.compile(optimizer='adam', loss='mean_squared_error')

# Train the model
model.fit(X_train, y_train, epochs=5, batch_size=32)

# Evaluate the model
loss = model.evaluate(X_test, y_test)
print(f"Test loss: {loss}")
predictions = model.predict(X_test)
predictions[:5]

# 3.3

In [None]:

N_FOLDS = 5

# Download the evaluation period: Jan 1, 2022 up to the most recent data.
recent_data = yf.download("ZW=F", start='2022-01-01').dropna()
recent_close = recent_data['Close'].values.reshape(-1, 1)
if recent_close.shape[0] <= N_FOLDS:
    raise ValueError(f"Need more than {N_FOLDS} closing prices for {N_FOLDS}-fold cross-validation")

# Apply the scaler already fitted on the training period, then build
# (previous price -> next price) pairs in the 3D layout the LSTM takes.
scaled_recent = scaler.transform(recent_close)
X = np.expand_dims(scaled_recent[:-1], axis=1)
y = scaled_recent[1:]

# Define the cross-validation split
kfold = KFold(n_splits=N_FOLDS, shuffle=True, random_state=42)

# Every fold starts from the same weights, so folds do not influence each other.
starting_weights = model.get_weights()

# Initialize the list of evaluation scores
scores = []

# Iterate over the folds
for train_index, test_index in kfold.split(X):
    # Split the data into training and testing sets
    X_train, X_test = X[train_index], X[test_index]
    y_train, y_test = y[train_index], y[test_index]

    # Copy the architecture and restore the starting weights for this fold
    fold_model = ks.models.clone_model(model)
    fold_model.compile(optimizer='adam', loss='mean_squared_error')
    fold_model.set_weights(starting_weights)

    # Train the model on the training set
    fold_model.fit(X_train, y_train, epochs=10, batch_size=32, verbose=0)

    # Evaluate the model on the testing set
    score = fold_model.evaluate(X_test, y_test, verbose=0)

    # Append the score to the list
    scores.append(score)

# Print the evaluation scores (mean squared error on the scaled prices)
print(scores)


## Q4

For this question, we are going to use [Consumer Complaints Dataset](https://raw.githubusercontent.com/plotly/datasets/master/26k-consumer-complaints.csv).

1. Ingest the dataset. We are only going to use the columns `Issue` and `Timely Response?`.
2. Convert the values in the issue column to vectors using [Count Vectorizer](https://scikit-learn.org/stable/modules/generated/sklearn.feature_extraction.text.CountVectorizer.html) from `scikit-learn`. Similarly, binarize the column `Timely Response` using [Label Binarizer](https://scikit-learn.org/stable/modules/generated/sklearn.preprocessing.LabelBinarizer.html).
3. Split your dataset into train and test.
4. Construct an appropriate neural network model on the train set.
5. Test your model on the test set. Did your model work? Explain.

# 4.1

In [None]:
# Read the dataset from a URL, keeping only the two columns used here
complaints_url = 'https://raw.githubusercontent.com/plotly/datasets/master/26k-consumer-complaints.csv'
df = pd.read_csv(complaints_url, usecols=['Issue', 'Timely response?'])

# Drop incomplete rows so every text has a label
df = df.dropna().reset_index(drop=True)

# Keep the columns as 1-D Series: iterating a DataFrame yields its column
# names, not its rows, so a one-column DataFrame would hand the vectorizer a
# single "document".
issue = df['Issue']
time = df['Timely response?']
df.head()

# 4.2

In [None]:
# Vectorize the "Issue" texts. The column is passed as a 1-D Series so each
# row is one document; the sparse count matrix is converted to a dense
# float32 array for the neural network.
vectorizer = CountVectorizer()
X = vectorizer.fit_transform(df['Issue']).toarray().astype('float32')

# Binarize the "Timely response?" column
binarizer = LabelBinarizer()
Y = binarizer.fit_transform(df['Timely response?'])

assert X.shape[0] == Y.shape[0], "features and labels must have one row per complaint"
X.shape, Y.shape


# 4.3

In [None]:
# Split the vectorized features X and binarized labels Y; they must not be
# replaced by the raw text columns, which a neural network cannot consume.
# stratify keeps the label proportions equal in both parts.
x_train, x_test, y_train, y_test = train_test_split(
    X, Y, test_size=0.2, random_state=42, stratify=Y)
print(x_train.shape, x_test.shape, y_train.shape, y_test.shape)

# 4.4

In [None]:
# Feed-forward binary classifier: two ReLU hidden layers (64 and 32 units)
# followed by a single sigmoid output unit.
hidden_and_output_layers = [
    Dense(units=64, activation='relu'),
    Dense(units=32, activation='relu'),
    Dense(units=1, activation='sigmoid'),
]
model = Sequential(hidden_and_output_layers)

# Binary cross-entropy loss, Adam optimizer, accuracy reported as metric
model.compile(
    optimizer='adam',
    loss='binary_crossentropy',
    metrics=['accuracy'],
)





# 4.5

In [None]:
def _dense(features):
    """Return `features` as a dense array if it offers ``toarray()``, else unchanged."""
    return features.toarray() if hasattr(features, 'toarray') else features


# Fit the model to the training data
history = model.fit(_dense(x_train), y_train, epochs=10, batch_size=32, validation_split=0.2)

# Evaluate on the held-out part of this question's split (x_test, not X_test)
loss, accuracy = model.evaluate(_dense(x_test), y_test)

print(f'Test loss: {loss:.4f}')
print(f'Test accuracy: {accuracy:.4f}')