In [None]:
# Data Preparation and Pre-processing Lab Exercises
---

## Exercise 1: Titanic Data — Data Loading & Inspection

**Objective:** Load a public dataset and inspect its structure.

import pandas as pd

# Read the Titanic CSV directly from its public GitHub location
url = "https://raw.githubusercontent.com/datasciencedojo/datasets/master/titanic.csv"
df = pd.read_csv(url)

# Show, in order: the first rows, each column's dtype, and the number of
# missing values per column (one item per line group, as separate prints would)
print(df.head(), df.dtypes, df.isna().sum(), sep="\n")


In [None]:

---

## Exercise 2: Titanic Data — Simple EDA

**Objective:** Visualize distributions of key features.

import matplotlib.pyplot as plt

# Age histogram; rows with a missing Age are dropped before plotting
known_ages = df['Age'].dropna()
plt.hist(known_ages, bins=30)
plt.xlabel('Age')
plt.ylabel('Count')
plt.title('Age Distribution')
plt.show()

# Number of passengers per embarkation port, drawn as a bar chart
port_counts = df['Embarked'].value_counts()
port_counts.plot.bar()
plt.xlabel('Port')
plt.ylabel('Count')
plt.title('Embarked Value Counts')
plt.show()


In [None]:

---

## Exercise 3: Bike Sharing — Data Loading & Preview

**Objective:** Download a zipped dataset, then extract and load one of its CSV files.

import pandas as pd, zipfile, io, urllib.request

# Download the bike-sharing ZIP into memory and load day.csv from it.
# Every handle (HTTP response, archive, archive member) is opened in a `with`
# block so it is closed as soon as the data has been read, even on error.
zip_url = "https://archive.ics.uci.edu/ml/machine-learning-databases/00275/Bike-Sharing-Dataset.zip"
with urllib.request.urlopen(zip_url) as resp:
    zip_bytes = resp.read()

# ZipFile needs a seekable file object, hence the in-memory BytesIO wrapper
with zipfile.ZipFile(io.BytesIO(zip_bytes)) as z, z.open("day.csv") as day_csv:
    df_day = pd.read_csv(day_csv)

print(df_day.head())
print(df_day.describe())


In [None]:

---

## Exercise 4: Bike Sharing — Scatter Plot & Correlation

**Objective:** Explore the relationship between temperature and rentals.

import matplotlib.pyplot as plt
import seaborn as sns

# `temp` against `cnt`, one semi-transparent point per row of df_day
sns.scatterplot(data=df_day, x='temp', y='cnt', alpha=0.5)
plt.title('Normalized Temperature vs. Total Rentals')
plt.show()

# Pairwise correlations of the numeric columns, annotated to two decimals
numeric_corr = df_day.select_dtypes('number').corr()
sns.heatmap(numeric_corr, cmap='coolwarm', annot=True, fmt='.2f')
plt.title('Feature Correlation Matrix')
plt.show()


In [None]:

---

## Exercise 5: Linear Regression — Sales Prediction

**Objective:** Build a regression model on synthetic sales data.

import numpy as np
import pandas as pd
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error

# Generate synthetic data: Sales = 5 + 2 * AdSpend + Gaussian noise (std 10),
# with AdSpend drawn uniformly from [0, 100)
np.random.seed(0)  # fixed seed so the random draws repeat on every run
X = np.random.rand(100,1) * 100
y = 5 + 2 * X.flatten() + np.random.randn(100)*10
df_sales = pd.DataFrame({'AdSpend':X.flatten(), 'Sales':y})

# Train/test split (80% train, 20% test)
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Fit model
model = LinearRegression().fit(X_train, y_train)
y_pred = model.predict(X_test)

# RMSE is computed as sqrt(MSE) instead of passing `squared=False`: that keyword
# was deprecated in scikit-learn 1.4 and removed in 1.6, where it raises a
# TypeError. Taking the square root gives the same value on every version.
print("RMSE:", np.sqrt(mean_squared_error(y_test, y_pred)))


In [None]:

---

## Exercise 6: Classification — Predicting Survival

**Objective:** Build a classifier on Titanic data.

from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report

# Keep only passengers with both Age and Embarked recorded, then one-hot
# encode the selected feature columns (first level of each dummy is dropped)
feature_cols = ['Pclass','Sex','Age','Embarked']
df_clf = df.dropna(subset=['Age','Embarked'])
X = pd.get_dummies(df_clf[feature_cols], drop_first=True)
y = df_clf['Survived']

# Hold out 20% of the rows for evaluation
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=0)

# Fit a random forest on the training rows and report metrics on the held-out rows
clf = RandomForestClassifier(random_state=0)
clf.fit(X_train, y_train)
survival_pred = clf.predict(X_test)
print(classification_report(y_test, survival_pred))


In [None]:

---

## Exercise 7: Time Series — ARIMA Forecasting

**Objective:** Fit an ARIMA model on daily bike rentals.

import pandas as pd
from statsmodels.tsa.arima.model import ARIMA
from sklearn.metrics import mean_absolute_error

# Prepare series
# Index the rental counts by parsed dates rather than the raw `dteday` strings,
# so the model and its forecasts work with real timestamps.
ts = df_day.set_index(pd.to_datetime(df_day['dteday']))['cnt']
# Record the index frequency when pandas can infer one (inferred_freq is None
# otherwise), so statsmodels does not have to guess it from the labels.
ts.index.freq = ts.index.inferred_freq

# Positional split: first 300 observations for fitting, the rest held out
train, test = ts.iloc[:300], ts.iloc[300:]

# Fit ARIMA with order (p, d, q) = (2, 1, 2) on the training window
model = ARIMA(train, order=(2,1,2)).fit()
# Predict over exactly the date range covered by the held-out observations
forecast = model.predict(start=test.index[0], end=test.index[-1])

# Evaluate
print("MAE:", mean_absolute_error(test, forecast))


In [None]:

---

## Exercise 8: Clustering — Customer Segmentation

**Objective:** Perform K-Means clustering on synthetic customer data.

import numpy as np
import pandas as pd
from sklearn.cluster import KMeans
import matplotlib.pyplot as plt

# Three 2-D Gaussian blobs of 50 points each (unit scale), centred at
# (0, 0), (5, 5) and (10, 10); the seed fixes the draws
np.random.seed(0)
blobs = []
for i in range(3):
    centre = (i*5, i*5)
    blobs.append(np.random.normal(loc=centre, scale=1, size=(50,2)))
data = np.vstack(blobs)
df_cust = pd.DataFrame(data, columns=['Feature1','Feature2'])

# Fit K-Means with three clusters and store each row's label next to its features
kmeans = KMeans(n_clusters=3, random_state=0)
kmeans.fit(df_cust)
df_cust['Cluster'] = kmeans.labels_

# Scatter of the two features, coloured by assigned cluster
plt.scatter(
    df_cust['Feature1'],
    df_cust['Feature2'],
    c=df_cust['Cluster'],
    cmap='tab10',
)
plt.title('K-Means Clustering (k=3)')
plt.show()
