# Exercise: Tabular data

**PREDICT THE TEMPERATURE**<br>

1. Load the dataset weatherHistory_bis.csv using pandas<br>

2. Use seaborn or matplotlib to visualize the relationship between variables and the target Temperature.<br>

3. Normalize the data using MinMaxScaler<br>

4. Split the dataset into train and test sets<br>

5. Create your own model to predict the Temperature using mean_squared_error for the loss and adam for the optimizer (make sure to save the history returned by fit)

6. Visualize the curve of loss and val_loss

7. Evaluate the model on the test dataset using loss (mse) and $r^2$ (determination coefficient).

8. Visualize the difference between the predicted and the true temperatures on the test dataset

9. Create a function that predicts the temperature for a given set of features

# Exercise: Movie review classification

**Movie review classification**

In this exercise, we will use the imdb_reviews dataset from tensorflow-datasets (tfds) and a pretrained model to encode its texts.<br>

1. Import the necessary packages (tensorflow, tensorflow-datasets and tensorflow_hub) <br>

2. Load the dataset imdb_reviews with tfds

3. Print the first 10 reviews in the dataset

4. Load the pretrained model with hub on https://tfhub.dev/google/tf2-preview/gnews-swivel-20dim/1 (don't download the model)

5. Create a model for classifying the review text, using the pretrained model as the input layer and adding other layers after it.

6. Evaluate the model on test_data

7. Give some reviews (texts) of your own and evaluate your model on them

In [None]:
# 1. Import the necessary packages (tensorflow, tensorflow-datasets and tensorflow_hub)
import tensorflow as tf
import tensorflow_datasets as tfds # to use tensorflow prebuilt datasets
import tensorflow_hub as hub # Contains presaved models for transfer learning

In [None]:
# 2. Load the imdb_reviews dataset with tfds.
# The 'train' split is cut 60% / 40% into training and validation parts, and
# the 'test' split is requested whole.
train_data, validation_data, test_data = tfds.load(
    "imdb_reviews",
    split=('train[:60%]', 'train[60%:]', 'test'),
    as_supervised=True,
)

In [None]:
# 3. Show the first 10 reviews.
# The split is a dataset object rather than a list or dict, so group it into
# batches of 10 and pull the first batch out of an iterator.
batch_iterator = iter(train_data.batch(10))
train_examples_batch, train_labels_batch = next(batch_iterator)
print('train_examples_batch= %s' % (train_examples_batch,))
print('train_labels_batch= %s' % (train_labels_batch,))

In [None]:
# 4. Load the pretrained model from TensorFlow Hub.
pretrained_model = "https://tfhub.dev/google/tf2-preview/gnews-swivel-20dim/1"

# hub_layer is our input layer: it converts text into vectors.
hub_layer = hub.KerasLayer(
    pretrained_model,
    input_shape=[],  # empty shape: each example is one scalar value
    dtype=tf.string,  # the layer is fed raw strings
    trainable=True,
)

# Correction Controle Machine Learning

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.datasets import load_wine
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report
from sklearn.decomposition import PCA
from sklearn.cluster import KMeans
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import silhouette_score

# Load the wine dataset bundled with scikit-learn (load_wine) into a DataFrame,
# with the class labels stored in a 'target' column next to the features.
data = load_wine()
wine_df = pd.DataFrame(data=data.data, columns=data.feature_names).assign(
    target=data.target
)

# Task 1: Data Loading and Exploration
# Print the first rows, then the summary statistics of every column.
for summary in (wine_df.head(), wine_df.describe()):
    print(summary)

# Task 2: Supervised Learning (Classification)
# Separate the label column from the feature columns.
y = wine_df['target']
X = wine_df.drop(columns='target')

# Hold out 20% of the rows for testing; the fixed seed keeps the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# Fit a random forest classifier on the training rows.
clf = RandomForestClassifier(random_state=42).fit(X_train, y_train)

# Score the classifier on the held-out rows: overall accuracy plus the
# per-class report.
y_pred = clf.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)
print(f"Accuracy: {accuracy:.2f}")
print(classification_report(y_test, y_pred))

# Task 3: Unsupervised Learning (Dimensionality Reduction)
# Standardize the features first, then project them with PCA
# (default settings: no component limit is passed).
scaler = StandardScaler().fit(X)
X_scaled = scaler.transform(X)
pca = PCA()
X_pca = pca.fit_transform(X_scaled)

# Bar chart of the share of variance explained by each principal component,
# with the components numbered from 1.
explained_variance_ratio = pca.explained_variance_ratio_
component_numbers = range(1, len(explained_variance_ratio) + 1)
plt.bar(component_numbers, explained_variance_ratio)
plt.xlabel('Principal Components')
plt.ylabel('Explained Variance Ratio')
plt.show()

# Task 4: Unsupervised Learning (Clustering)
# Group the standardized samples into 3 clusters with K-Means and store each
# row's cluster label in wine_df.
# n_init is set explicitly: its default changed across scikit-learn releases
# (10 -> 'auto'), so leaving it unset emits a FutureWarning on 1.2/1.3 and
# makes the cluster assignments depend on the installed version.
kmeans = KMeans(n_clusters=3, n_init=10, random_state=42)
wine_df['cluster'] = kmeans.fit_predict(X_scaled)

# Evaluate the clustering with the average silhouette score over all samples
# (computed on the same standardized features the clusters were fit on).
silhouette_avg = silhouette_score(X_scaled, wine_df['cluster'])
print(f"Silhouette Score: {silhouette_avg:.2f}")

# Task 5: Combined Supervised and Unsupervised Learning
# Append the K-Means cluster label to the feature matrix as one more column.
# NOTE(review): the labels in wine_df['cluster'] were computed from all rows
# before this split, so the test rows influenced this feature. Confirm that
# this is acceptable for the exercise.
X['cluster'] = wine_df['cluster']

# Split again with the same parameters as before, refit the same classifier on
# the widened feature set, and report the same metrics.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)
y_pred = clf.fit(X_train, y_train).predict(X_test)
accuracy = accuracy_score(y_test, y_pred)
print(f"Accuracy (with cluster feature): {accuracy:.2f}")
print(classification_report(y_test, y_pred))