# Day 1

# Intro to Python libraries

Further reading on the most common Python libraries for data science: https://www.projectpro.io/article/top-5-libraries-for-data-science-in-python/196

In [None]:
import pandas as pd  # For loading data into a tabular format
import numpy as np   # For manipulating data
import matplotlib.pyplot as plt  # For graph plotting
import seaborn as sns  # For graph plotting
from sklearn.preprocessing import LabelEncoder  # For label encoding in EDA
from sklearn.utils import resample   # For data sampling in EDA
from sklearn.utils import shuffle   # For shuffling the data

In [None]:
# Load the California housing CSV into a DataFrame.
# NOTE(review): the relative '../input/...' path looks like a Kaggle input directory — adjust it when running elsewhere.
housing = pd.read_csv('../input/california-housing-prices/housing.csv')

In [None]:
# Preview the first rows of the data (head() returns five rows by default)
housing.head()

Description of the dataset columns: https://developers.google.com/machine-learning/crash-course/california-housing-data-description

# Data Visualization

In [None]:
# Plot every district by its coordinates using plain matplotlib
scatter_plt = plt.scatter(housing.longitude, housing.latitude)
plt.show()

In [None]:
# Same scatterplot drawn with seaborn, addressing the columns by name
scatter_sns_1 = sns.scatterplot(data=housing, x='longitude', y='latitude')
plt.show()

In [None]:
# Seaborn scatterplot where the HUE parameter colours each point by its ocean_proximity class
scatter_sns_2 = sns.scatterplot(
    data=housing,
    x='longitude',
    y='latitude',
    hue='ocean_proximity',
)
plt.show()

In [None]:
# Same hue-coloured scatterplot, with the title set on the Axes that seaborn returns
scatter_sns_2 = sns.scatterplot(
    data=housing,
    x='longitude',
    y='latitude',
    hue='ocean_proximity',
)
scatter_sns_2.set_title('California Housing Geography')
plt.show()

In [None]:
# Boxplot of housing_median_age to inspect its spread and outliers
boxplot_sns = sns.boxplot(data=housing, x='housing_median_age')
plt.show()

In [None]:
# Boxplot of total_rooms to inspect its spread and outliers
boxplot_sns = sns.boxplot(data=housing, x='total_rooms')
plt.show()

In [None]:
# Count the number of rows in each ocean_proximity category
# (value_counts() sorts the categories from most to least frequent)
data_barplot = housing['ocean_proximity'].value_counts()

In [None]:
# Display the per-category counts
data_barplot

In [None]:
# Create barplot / barchart with pyplot
# data_barplot.sort_values().plot(kind='barh', color=['red', 'magenta','black','blue','orange'])
# plt.show()

In [None]:
# Bar chart of the category counts with seaborn: categories on x, counts on y
barplot = sns.barplot(x=data_barplot.index, y=data_barplot.values)
plt.show()

In [None]:
# Same bar chart on a larger canvas; the figure must be created before seaborn draws on it
plt.figure(figsize=(8, 6))
barplot = sns.barplot(x=data_barplot.index, y=data_barplot.values)
plt.show()

In [None]:
# Bar chart with the count written above each bar
plt.figure(figsize=(8, 6))
barplot = sns.barplot(x=data_barplot.index, y=data_barplot.values)
for i, count in enumerate(data_barplot.values):
    # bar i sits at x == i; the +100 offset lifts the label just above the bar top
    plt.text(i, count + 100, count, horizontalalignment='center')
plt.show()

In [None]:
# Annotated bar chart, now with a title and axis labels
plt.figure(figsize=(8, 6))
barplot = sns.barplot(x=data_barplot.index, y=data_barplot.values)
for i, count in enumerate(data_barplot.values):
    # bar i sits at x == i; the +100 offset lifts the label just above the bar top
    plt.text(i, count + 100, count, horizontalalignment='center')
# set title and both labels on the Axes returned by seaborn
barplot.set(
    title='Ocean proximity counts',
    ylabel='Counts',
    xlabel='Ocean proximity',
)
plt.show()

# Data Preprocessing and Exploratory Data Analysis (EDA)

In [None]:
# Drop the ISLAND rows from ocean_proximity
not_island = housing['ocean_proximity'] != 'ISLAND'
housing_filtered = housing.loc[not_island]

# Map each remaining ocean_proximity label to its row count
ocean_prox_dict = dict(housing_filtered['ocean_proximity'].value_counts())
ocean_prox_dict

In [None]:
# Find the smallest row count among the ocean_proximity classes
min_value = min(ocean_prox_dict.values())

def get_min_key(ocean_prox_dict, min_value):
    """Return the key of ``ocean_prox_dict`` whose value equals ``min_value``.

    If several keys share that value, the one that comes last in iteration
    order is returned. If no key matches, an empty string is returned.
    """
    matching_keys = [label for label, count in ocean_prox_dict.items() if count == min_value]
    return matching_keys[-1] if matching_keys else ''

# Look up the label that owns the smallest count, then display the label and the count
min_key = get_min_key(ocean_prox_dict, min_value)
min_key, min_value
# Alternative kept for reference: a comprehension that collects every key tied for the minimum (yields a list)
# min_key = [k for k, v in ocean_prox_dict.items() if v == min_value]
# min_key

In [None]:
# Downsample every class to the size of the smallest one so that all labels have the same number of rows

def downsampling(df, target_class, min_key, min_value, random_state=24):
    """Balance ``df`` by cutting every class down to ``min_value`` rows.

    Parameters
    ----------
    df : pandas.DataFrame
        Data to balance.
    target_class : str
        Name of the column holding the class labels.
    min_key : label
        The class that is kept as-is (expected to be the smallest class).
    min_value : int
        Number of rows sampled, without replacement, from every other class.
    random_state : int, default 24
        Seed used for both the per-class sampling and the final shuffle, so
        repeated runs give the same rows in the same order.

    Returns
    -------
    pandas.DataFrame
        The balanced rows in shuffled order with a fresh 0..n-1 index.
    """
    # Start with the untouched minority class, then add one sample per other class
    balanced_parts = [df[df[target_class] == min_key]]
    for target in df[target_class].unique():
        if target == min_key:
            continue
        balanced_parts.append(
            resample(
                df[df[target_class] == target],
                replace=False,
                n_samples=min_value,
                random_state=random_state,
            )
        )

    # Concatenate once instead of growing the frame inside the loop
    balanced = pd.concat(balanced_parts)

    # Seed the shuffle too; an unseeded shuffle made every later split and metric vary per run
    return shuffle(balanced, random_state=random_state).reset_index(drop=True)

# Balance the ISLAND-free data: every ocean_proximity class other than min_key is sampled down to min_value rows
housing_downsampled = downsampling(housing_filtered, 'ocean_proximity', min_key, min_value)

In [None]:
# Check for null values: info() lists each column's dtype and non-null count,
# so a non-null count below the total number of rows reveals missing values
housing_downsampled.info()

In [None]:
# Create boxplot to show the spread and outliers of total_bedrooms
sns.boxplot(x='total_bedrooms', data=housing_downsampled)
plt.show()  # render the figure without echoing the Axes repr, like the other plotting cells

In [None]:
# Calculate the mean of total_bedrooms (pandas skips missing values by default)
housing_downsampled['total_bedrooms'].mean()

In [None]:
# Calculate the median of total_bedrooms (pandas skips missing values by default)
housing_downsampled['total_bedrooms'].median()

Note: Missing values in a numerical attribute are commonly filled with its mean or median. The median is preferred when the attribute has many outliers, because extreme values pull the mean toward them while the median is barely affected.

In [None]:
# Fill the missing total_bedrooms values with the column median, leaving housing_downsampled untouched
bedrooms_median = housing_downsampled['total_bedrooms'].median()
housing_fillna = housing_downsampled.assign(
    total_bedrooms=housing_downsampled['total_bedrooms'].fillna(bedrooms_median)
)

In [None]:
# Print dataframe info to confirm that every column is now fully non-null and to check the dtypes
housing_fillna.info()

In [None]:
# Show every row that is an exact repeat of an earlier row (an empty result means no duplicates)
duplicate_mask = housing_fillna.duplicated()
housing_fillna[duplicate_mask]

In [None]:
# Encode the ocean_proximity labels as integers in a new column
le = LabelEncoder().fit(housing_fillna['ocean_proximity'])
housing_fillna['ocean_proximity_num'] = le.transform(housing_fillna['ocean_proximity'])

In [None]:
# Check how each numeric feature correlates with median_house_value (regression task).
# Only numeric columns are passed to corr(): pandas >= 2.0 no longer drops the string
# column ocean_proximity automatically and raises ValueError instead.
housing_fillna.select_dtypes(include='number').corr()['median_house_value']

In [None]:
# Feature engineering: people per household, rounded to the nearest whole number
housing_fillna['population_per_households'] = (
    housing_fillna['population'].div(housing_fillna['households']).round()
)

In [None]:
# Re-check the correlations with median_house_value now that a feature has been added.
# Only numeric columns are passed to corr(): pandas >= 2.0 raises ValueError on the
# string column ocean_proximity instead of silently dropping it.
housing_fillna.select_dtypes(include='number').corr()['median_house_value']

In [None]:
# Feature engineering: people per bedroom, rounded to the nearest whole number
housing_fillna['population_per_bedrooms'] = (
    housing_fillna['population'].div(housing_fillna['total_bedrooms']).round()
)

In [None]:
# Check the correlations with median_house_value again.
# Only numeric columns are passed to corr(): pandas >= 2.0 raises ValueError on the
# string column ocean_proximity instead of silently dropping it.
housing_fillna.select_dtypes(include='number').corr()['median_house_value']

# Day 2

# Preprocessing Data for Modelling

In [None]:
from sklearn.model_selection import train_test_split   # To split train val test set
from sklearn.pipeline import Pipeline      # To create pipeline for data transformation
from sklearn.impute import SimpleImputer     # To impute missing values
from sklearn.preprocessing import StandardScaler     # To standardize the data
from sklearn.metrics import accuracy_score    # To calculate the accuracy

In [None]:
# Classification task: predict the encoded ocean_proximity label
classification_target = "ocean_proximity_num"

# Every column except the raw string label
columns = housing_fillna.columns.drop('ocean_proximity')

# Model inputs: every column except the raw label and its encoded form (the target)
num_attributes = housing_fillna.columns.drop(['ocean_proximity', 'ocean_proximity_num'])

In [None]:
# Display the column names that remain after dropping the raw ocean_proximity label
columns

In [None]:
# Split data into train, valid, and test sets, stratified on the class label.
# 30% is held out for test, then 30% of the remainder for validation
# (roughly 49% train / 21% valid / 30% test).
X_data = housing_fillna[num_attributes]
y_data = housing_fillna[classification_target]
X, X_test, y, y_test = train_test_split(
    X_data, y_data, test_size=0.3, stratify=y_data, random_state=24
)
X_train, X_valid, y_train, y_valid = train_test_split(
    X, y, test_size=0.3, stratify=y, random_state=24
)

In [None]:
# Pipeline for numerical attributes: median imputation followed by standardisation
num_pipeline = Pipeline(
    steps=[
        ('imputer', SimpleImputer(strategy="median")),
        ('std_scaler', StandardScaler()),
    ]
)

In [None]:
# Preprocess the data
def preprocess_data_classification(df, num_attributes, num_pipeline, train=True):
    """Run the numeric pipeline on ``df[num_attributes]`` and return a DataFrame.

    With ``train`` truthy the pipeline is fitted on the data before transforming
    it (``fit_transform``); otherwise the pipeline is only applied (``transform``).
    The result uses ``num_attributes`` as column names and a fresh default index.
    """
    features = df[num_attributes]
    # Fit only on training data; validation and test data reuse the fitted pipeline
    run_pipeline = num_pipeline.fit_transform if train else num_pipeline.transform
    return pd.DataFrame(run_pipeline(features), columns=num_attributes)

# Fit the pipeline on the training split only (train defaults to True) ...
X_train_classification = preprocess_data_classification(X_train, num_attributes, num_pipeline)
# ... and reuse the fitted pipeline for the validation and test splits
X_valid_classification = preprocess_data_classification(
    X_valid, num_attributes, num_pipeline, train=False
)
X_test_classification = preprocess_data_classification(
    X_test, num_attributes, num_pipeline, train=False
)

# Machine Learning and Evaluation Part 1 (Classification)

In [None]:
# Baseline k-nearest-neighbours classifier, scored on the validation split
from sklearn.neighbors import KNeighborsClassifier

n = 5
knn = KNeighborsClassifier(n_neighbors=n).fit(X_train_classification, y_train)
prediction = knn.predict(X_valid_classification)
print(n, accuracy_score(y_valid, prediction))

In [None]:
# Hyperparameter tuning for KNN: try n_neighbors from 4 to 7 and print the validation accuracy of each
from sklearn.neighbors import KNeighborsClassifier

for n in range(4, 8):
    knn = KNeighborsClassifier(n_neighbors=n).fit(X_train_classification, y_train)
    prediction = knn.predict(X_valid_classification)
    print(n, accuracy_score(y_valid, prediction))

In [None]:
# Logistic regression (up to 150 solver iterations), scored on the validation split
from sklearn.linear_model import LogisticRegression

lr = LogisticRegression(max_iter=150).fit(X_train_classification, y_train)
prediction = lr.predict(X_valid_classification)
print(accuracy_score(y_valid, prediction))

In [None]:
# Decision tree with default settings (seeded for repeatability), scored on the validation split
from sklearn.tree import DecisionTreeClassifier

dt = DecisionTreeClassifier(random_state=24).fit(X_train_classification, y_train)
prediction = dt.predict(X_valid_classification)
print(accuracy_score(y_valid, prediction))

In [None]:
# Hyperparameter tuning for the decision tree: grid over max_depth x max_leaf_nodes,
# printing the validation accuracy of every combination
max_depth = range(1, 15)
max_leaf_nodes = range(10, 100, 5)
for depth in max_depth:
    for leaf in max_leaf_nodes:
        dt = DecisionTreeClassifier(
            max_depth=depth, max_leaf_nodes=leaf, random_state=24
        ).fit(X_train_classification, y_train)
        prediction = dt.predict(X_valid_classification)
        print("Depth: {}, Leaf: {}, Acc: {}".format(depth, leaf, accuracy_score(y_valid, prediction)))

In [None]:
# Evaluate a tuned decision tree on the held-out test split
# NOTE(review): depth 14 / 80 leaves is presumably the best pair from the grid search — confirm
dt = DecisionTreeClassifier(
    max_depth=14, max_leaf_nodes=80, random_state=24
).fit(X_train_classification, y_train)
prediction = dt.predict(X_test_classification)
accuracy_score(y_test, prediction)

In [None]:
# Test-set accuracy of the default (untuned) decision tree, for comparison
dt = DecisionTreeClassifier(random_state=24).fit(X_train_classification, y_train)
prediction = dt.predict(X_test_classification)
accuracy_score(y_test, prediction)

In [None]:
# Random forest with default settings (seeded for repeatability), scored on the validation split
from sklearn.ensemble import RandomForestClassifier

rf = RandomForestClassifier(random_state=24).fit(X_train_classification, y_train)
prediction = rf.predict(X_valid_classification)
print(accuracy_score(y_valid, prediction))

In [None]:
# Hyperparameter tuning for the random forest: grid over max_depth x max_leaf_nodes,
# printing the validation accuracy of every combination (one forest is trained per pair)
max_depth = range(1, 15)
max_leaf_nodes = range(10, 100, 5)
for depth in max_depth:
    for leaf in max_leaf_nodes:
        rf = RandomForestClassifier(
            max_depth=depth, max_leaf_nodes=leaf, random_state=24
        ).fit(X_train_classification, y_train)
        prediction = rf.predict(X_valid_classification)
        print("Depth: {}, Leaf: {}, Acc: {}".format(depth, leaf, accuracy_score(y_valid, prediction)))

In [None]:
# Evaluate a tuned random forest on the held-out test split
# NOTE(review): depth 14 / 95 leaves is presumably the best pair from the grid search — confirm
rf = RandomForestClassifier(
    max_depth=14, max_leaf_nodes=95, random_state=24
).fit(X_train_classification, y_train)
prediction = rf.predict(X_test_classification)
accuracy_score(y_test, prediction)

# Machine Learning and Evaluation Part 2 (Regression)

In [None]:
# Display the cleaned and feature-engineered frame that the regression task starts from
housing_fillna

In [None]:
from sklearn.preprocessing import OneHotEncoder     # To represent data in one hot encoding representation
from sklearn.compose import ColumnTransformer       # To combine numerical pipeline with one hot encoder
from sklearn.metrics import mean_absolute_error as mae     # To calculate the mean absolute error

In [None]:
# Regression task: predict median_house_value
regression_target = "median_house_value"

# Categorical input, one-hot encoded later
cat_attributes = ["ocean_proximity"]

# Numerical inputs: everything except the target, the raw label and its integer encoding
num_attributes = housing_fillna.columns.drop(
    ['median_house_value', 'ocean_proximity', 'ocean_proximity_num']
)

In [None]:
# Split data into train, valid, and test sets.
# 30% is held out for test, then 30% of the remainder for validation
# (roughly 49% train / 21% valid / 30% test).
X_data = housing_fillna[[*num_attributes, *cat_attributes]]
y_data = housing_fillna[regression_target]
X, X_test, y, y_test = train_test_split(
    X_data, y_data, test_size=0.3, random_state=24
)
X_train, X_valid, y_train, y_valid = train_test_split(
    X, y, test_size=0.3, random_state=24
)

In [None]:
# Demonstrate one hot encoding on ocean_proximity.
# The dense array built on the middle line is discarded (it is not the cell's last
# expression); only the categories learned by the encoder are displayed.
ohe = OneHotEncoder(handle_unknown='ignore')
ohe.fit_transform(housing_fillna[['ocean_proximity']]).toarray()
ohe.categories_

In [None]:
# Pipeline for the regression task: numeric pipeline on the numerical columns,
# one-hot encoding (unknown categories ignored) on the categorical ones
full_pipeline = ColumnTransformer(
    transformers=[
        ("num", num_pipeline, num_attributes),
        ("cat", OneHotEncoder(handle_unknown='ignore'), cat_attributes),
    ]
)

In [None]:
# Preprocess the data
def preprocess_data_regression(df, num_attributes, cat_attributes, full_pipeline, train=True):
    """Run ``full_pipeline`` on ``df`` and return the result as a labelled DataFrame.

    Parameters
    ----------
    df : pandas.DataFrame
        Rows to transform; must hold the columns the pipeline was built for.
    num_attributes : pandas.Index or list of str
        Names of the numerical columns, in the order the pipeline emits them.
    cat_attributes : list of str
        Names of the categorical columns (kept for interface compatibility).
    full_pipeline : transformer
        Object exposing ``fit_transform`` and ``transform``. When it exposes a
        fitted ``named_transformers_["cat"]`` encoder with ``categories_``,
        the one-hot column names are read from it.
    train : bool, default True
        Fit the pipeline on ``df`` before transforming; pass False for
        validation and test data so the fitted pipeline is reused.

    Returns
    -------
    pandas.DataFrame
        Transformed features: numerical columns first, then one column per
        one-hot category, with a fresh default index.
    """
    if train:
        transformed = full_pipeline.fit_transform(df)
    else:
        transformed = full_pipeline.transform(df)

    # A ColumnTransformer can return a scipy sparse matrix when the one-hot part
    # dominates; densify it so that DataFrame construction works either way
    if hasattr(transformed, "toarray"):
        transformed = transformed.toarray()

    # Prefer the categories the encoder actually learned over a hard-coded list,
    # which breaks as soon as the fitted data holds a different set of categories
    encoder = getattr(full_pipeline, "named_transformers_", {}).get("cat")
    if encoder is not None and hasattr(encoder, "categories_"):
        cat_attributes_ohe = [category for group in encoder.categories_ for category in group]
    else:
        # Fallback: the category names this notebook originally assumed
        cat_attributes_ohe = ['<1H OCEAN', 'INLAND', 'NEAR BAY', 'NEAR OCEAN']

    columns = list(num_attributes) + cat_attributes_ohe
    return pd.DataFrame(transformed, columns=columns)

# Fit the full pipeline on the training split only (train defaults to True) ...
X_train_regression = preprocess_data_regression(
    X_train, num_attributes, cat_attributes, full_pipeline
)
# ... and reuse the fitted pipeline for the validation and test splits
X_valid_regression = preprocess_data_regression(
    X_valid, num_attributes, cat_attributes, full_pipeline, train=False
)
X_test_regression = preprocess_data_regression(
    X_test, num_attributes, cat_attributes, full_pipeline, train=False
)

In [None]:
# k-nearest-neighbours regressor, scored by mean absolute error on the validation split
from sklearn.neighbors import KNeighborsRegressor

n = 5
knn = KNeighborsRegressor(n_neighbors=n).fit(X_train_regression, y_train)
prediction = knn.predict(X_valid_regression)
print("n: {}, MAE: {}".format(n, mae(y_valid, prediction)))

In [None]:
# Linear regression, scored by mean absolute error on the validation split
from sklearn.linear_model import LinearRegression

lr = LinearRegression().fit(X_train_regression, y_train)
prediction = lr.predict(X_valid_regression)
print("MAE: {}".format(mae(y_valid, prediction)))

In [None]:
# Decision tree regressor (seeded), scored by mean absolute error on the validation split
from sklearn.tree import DecisionTreeRegressor

dt = DecisionTreeRegressor(random_state=24).fit(X_train_regression, y_train)
prediction = dt.predict(X_valid_regression)
print("MAE: {}".format(mae(y_valid, prediction)))

In [None]:
# Random forest regressor (seeded), scored by mean absolute error on the validation split
from sklearn.ensemble import RandomForestRegressor

rf = RandomForestRegressor(random_state=24).fit(X_train_regression, y_train)
prediction = rf.predict(X_valid_regression)
print("MAE: {}".format(mae(y_valid, prediction)))

In [None]:
# Final check: mean absolute error on the held-out test split of the random forest regressor trained just before this cell
prediction = rf.predict(X_test_regression)
print("MAE: {}".format(mae(y_test, prediction)))