# Overview
This is my attempt to solve the first assignment of the Introduction to Machine Learning course, fall 2022.
## Note
Please make sure the dataset is saved in the same working directory as this notebook.

# Preliminary: imports and Loading data

In [None]:
# main imports needed for the rest of the notebook
import os
import math
import sklearn as sk
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import re
# Use seaborn's 'darkgrid' style for every plot in the notebook: the grid lines
# give a more detailed display of the plotted values.
STYLE = 'darkgrid'
sns.set_style(STYLE)

In [None]:
# The dataset is looked up in the current working directory, i.e. it must be
# saved next to this notebook.
wd = os.getcwd()
dataset_name = "a1_dataset.csv"
file_path = os.path.join(wd, dataset_name) # full location of the dataset file

In [None]:
df_org = pd.read_csv(file_path) # keep the original dataframe untouched
df = df_org.copy() # working copy used by the cells below

In [None]:
# Rename the columns: 'target' becomes 'y' and every 'var<i>' becomes 'f<i>'.
# The target is read out first so it stays available as the Series `y`.
y = df['target']
new_names = {"target": "y"}
new_names.update({f'var{i}': f"f{i}" for i in range(1, len(df.columns) + 1)})
df = df.rename(columns=new_names)
print(df.columns)
# The target column is removed from the features; `y` is re-attached only when needed.
df = df.drop(columns='y')

# Preprocessing
In this part I preprocess the data for the training phase:
* distinguish between numerical and categorical data
* clean certain columns
* encode categorical features
* impute missing values

In [None]:
def num_cat(df):
    """Split the column names of ``df`` by kind.

    Returns:
        tuple: (numerical column names, categorical column names), each as a
        numpy array. Categorical means dtype 'object' or 'category'.
    """
    numeric_part = df.select_dtypes(np.number)
    categorical_part = df.select_dtypes(['object', 'category'])
    return numeric_part.columns.values, categorical_part.columns.values

# numerical / categorical column names of the current dataframe
num_cols, cat_cols = num_cat(df)

## Cleaning data
Let's start cleaning by fixing the 'var7' column (renamed to 'f7' above).

In [None]:
# Try to convert the last column to datetime for further manipulation.
# Only parsing failures are handled: pandas reports unparsable or out-of-range
# dates with ValueError (or a subclass), so anything else is left to propagate
# instead of being hidden by a bare `except`.
try:
    df['f7'] = pd.to_datetime(df['f7'])
except ValueError:
    print("Certain dates are semantically invalid")
    
from dateutil import parser

# for further manipulation we need to determine the invalid dates
def validate_dates(row):
    """Add a 'valid_date' entry to ``row``.

    The entry holds the parsed value of row['f7'], or False when parsing
    raises ValueError.
    """
    try:
        parsed = parser.parse(row['f7'])
    except ValueError:
        # the date could not be parsed: flag the row
        parsed = False
    row['valid_date'] = parsed
    return row

df = df.apply(validate_dates, axis=1)
# raw 'f7' values of the rows flagged as invalid
invalid_dates = df.loc[df['valid_date'] == False, 'f7'].values
# the helper column is not needed any more
df = df.drop(columns='valid_date')

In [None]:
print(invalid_dates) # this is the list of invalid dates in the dataframes

def fix_dates(row):
    """Rewrite an invalid 'f7' value so that its day becomes the 28th.

    Rows whose 'f7' is not listed in ``invalid_dates`` are returned unchanged.
    """
    if row['f7'] not in invalid_dates:
        return row
    # 'f7' is "<date> <time>"; the last two characters of the date part are
    # replaced by "28" (i.e. the 29 becomes 28)
    date_part, time_part = row['f7'].split()
    row['f7'] = date_part[:-2] + "28" + " " + time_part
    return row

df = df.apply(fix_dates, axis=1)

# every value is expected to parse now
df['f7'] = pd.to_datetime(df['f7'])

In [None]:
# NOTE(review): the original comment here was cut off ("all the missing data has the sa") - TODO complete it
print(df.dtypes)
# now that the 7th column is converted to datetime, we can further break it down and tackle each component of the date: year, month, day, time
# names of the columns derived from the 'f7' timestamp
year = 'year'
month = 'month'
day = 'day'
time = 'time'
date_cols = [year, month, day, time]
def decompose_date(row):
    """Add the year, month, day and time-of-day of row['f7'] to ``row``.

    Args:
        row: a row whose 'f7' entry is a timestamp.

    Returns:
        The same row with the four entries named in ``date_cols`` added.
    """
    timestamp = row['f7']
    row[year] = timestamp.year
    row[month] = timestamp.month
    row[day] = timestamp.day
    # `time` is a method of the timestamp: it must be called, otherwise the
    # bound method itself would be stored instead of the time of day
    row[time] = timestamp.time()
    return row

# add the year / month / day / time columns to every row
df = df.apply(decompose_date, axis=1)

In [None]:
# Count plots of each date component, one panel per target class.
# The frame with the target attached is identical for every component, so it is built once.
df_c = df.copy()
df_c['y'] = y
for c in date_cols[:-1]: # the time column has a significantly large number of unique values.
    fig = sns.catplot(data=df_c, kind='count', x=c, col='y', col_order=[0, 1])
    fig.set(xlabel=c, ylabel='count')
    plt.xticks(rotation=45)
    plt.show()

These are my observations:
* there is only one year present in all dates: 2019. The year can be dropped then
* The dataset is clearly not a time-series dataset where each second, minute or even hour is important, so the time part can be dropped as well

There are two main options left:
1. keep the date as month + day.
2. reduce the date to the month value.

In [None]:
# 'f7' has been decomposed; following the observations above, the year and
# the time are not kept either
f7_drop = ['f7', 'time', 'year']
df = df.drop(columns=f7_drop)

In [None]:
# let's investigate the first option: keep the date as day and month
def set_date(row):
    """Add a 'date' entry built from the row's month and day, with the year fixed to 2019."""
    month_value, day_value = row[month], row[day]
    row['date'] = pd.Timestamp(year=2019, month=month_value, day=day_value)
    return row

# attach the date to every row, then order the dataframe chronologically
df = df.apply(set_date, axis=1).sort_values(by='date', ascending=True)
print(df)

In [None]:
# time to consider the different interactions between the date and the rest of the columns
num_cols, cat_cols = num_cat(df)

# the last two numerical columns are skipped (presumably 'month' and 'day',
# which are themselves derived from the date)
for col in num_cols[:-2]:
    col_by_date = pd.pivot_table(df, index='date', values=col, aggfunc=['count', 'mean', 'median'])
    # plot the per-date mean so that the plotted quantity matches the y-axis
    # label (the 'count' aggregate was plotted under a "mean of ..." label before)
    g = sns.relplot(kind='scatter', x=col_by_date.index.values, y=col_by_date[('mean', col)].values)
    g.fig.suptitle(f"variation of {col} by date")
    g.set(xlabel='date', ylabel=f'mean of {col}')
    plt.xticks(rotation = 90)
    plt.show()

In [None]:
# it might be more efficient to consider the day of the year
def set_day_of_year(row):
    """Add a 'day_of_year' entry: 1 for the 1st of January 2019, 2 for the 2nd, and so on."""
    start_of_year = pd.Timestamp(year=2019, month=1, day=1)
    current = pd.Timestamp(year=2019, month=row[month], day=row[day])
    # number of whole days since the 1st of January, shifted to start at 1
    row['day_of_year'] = (current - start_of_year).days + 1
    return row

df = df.apply(set_day_of_year, axis=1)

In [None]:
# visual check that 'day_of_year' matches 'date'
print(df[['date', 'day_of_year']])

In [None]:
# temporarily attach the target so that it is plotted like any other numerical column
df['y'] = y.copy()
num_cols, cat_cols = num_cat(df)

# scatter plot of every numerical column (target included) against the day of the year
for col in num_cols:
    g = sns.relplot(data=df, x='day_of_year', y=col)
    g.fig.suptitle(f"variation of {col} by day_of_year")
    # NOTE(review): the x axis is labelled 'date' although 'day_of_year' is plotted
    g.set(xlabel='date', ylabel=f'{col}')
    plt.xticks(rotation = 90)
    plt.show()
# detach the target again
df.drop('y', axis=1, inplace=True)

In [None]:
# the visualizations do not show any trend in the interaction between day_of_year and any other numerical feature or the target variable
# let's confirm that by calculating the correlation
df['y'] = y.copy()
# restrict to the numerical columns explicitly: recent pandas versions raise
# when corr() meets string / datetime columns instead of skipping them
print(df.select_dtypes(np.number).corr()['day_of_year'])
df.drop('y', axis=1, inplace=True)
# as we can see the correlation is below 0.02

Given the evidence above, we can claim that the first option might not be suitable for our data. Let's consider the second one.

In [None]:
# the first option is discarded: remove every day-level column
day_cols = ['day', 'day_of_year', 'date']
df = df.drop(columns=day_cols)

In [None]:
# correlation of the month with the numerical features and the target
df['y'] = y.copy()
# restrict to the numerical columns explicitly: recent pandas versions raise
# when corr() meets string columns instead of skipping them
print(df.select_dtypes(np.number).corr()['month'])
df.drop('y' ,axis=1, inplace=True)

As demonstrated here, the month does not provide much information about the final classification. This column might not be useful for predicting the final target variable. However, we can investigate its effect when imputing the missing continuous values of 'f4'.

## Encoding Categorical variables
It is imperative to convert the categorical variables to numerical representations before feeding them to machine learning models
### Encoding f6
Ordinal Encoder is a perfect encoding technique for f6 as the values "yes" and "no" are ordered. The order of the numerical representations does not matter as the machine learning algorithm can assign either positive or negative signs correcting the order proposed by the encoder.

In [None]:
# let's encode f6 as no:0 and yes:1
print(df['f6'].value_counts())
# the column has only 2 values "yes" and "no" as suggested above.
from sklearn.preprocessing import OrdinalEncoder
oe = OrdinalEncoder() # default parameters: as no customized order can be deduced from the data, the ordering is left up to the encoder
X = oe.fit_transform(df[['f6']]) # encoded values, in the current row order of df
# Create the final dataframe. The encoded column must carry df's own index:
# df was sorted by date, so its index is no longer 0..n-1 in row order, and
# pd.concat(axis=1) aligns on index labels. A default RangeIndex would silently
# attach the encoded values to the wrong rows.
df = pd.concat([df.drop('f6', axis=1), pd.DataFrame(X, columns=['f6'], index=df.index)], axis=1)
print(df.isna().sum())

In [None]:
# summary statistics of f4, the column that will be imputed
print(df['f4'].describe())

# Encoding f3: the area column:
In this section, we will experiment with two different encodings for the f3 column. As no order can be imposed on countries (the general context of the data is missing, and countries appear several times with different classes and numerical features), among the possible encodings I will consider:
1. OneHotEncoding
2. TargetEncoding

In [None]:
# before proceeding with the encoding, it is recommended to clean the data by normalizing the string representation and removing any possible unwanted characters
def clean_country(row):
    """Normalize the area name stored in row['f3'].

    The name is lower-cased, any text between parentheses or between square
    brackets is removed, and surrounding whitespace is stripped.

    Args:
        row: a row whose 'f3' entry is a string.

    Returns:
        The same row with the cleaned 'f3'.
    """
    area = row['f3'].lower()
    # raw strings: '\(' and '\[' are not valid escapes in a regular string literal
    area = re.sub(r'\(.*\)', "", area)
    area = re.sub(r'\[.*\]', "", area)
    # strip last: removing a trailing "(...)" leaves whitespace behind, which
    # would otherwise turn "x (y)" and "x" into two different categories
    row['f3'] = area.strip()
    return row

# normalize the area name of every row
df = df.apply(clean_country, axis=1)

In [None]:
## ONE HOT ENCODING
df_OHE = df.copy()
from sklearn.preprocessing import OneHotEncoder
# A dense output is wanted. The keyword is `sparse_output` in recent
# scikit-learn releases and `sparse` in older ones, so try the new name first.
try:
    ohe = OneHotEncoder(sparse_output=False)
except TypeError:
    ohe = OneHotEncoder(sparse=False)

X = ohe.fit_transform(df_OHE[['f3']])
# keep the original column naming where `get_feature_names` still exists,
# otherwise fall back to its replacement
if hasattr(ohe, 'get_feature_names'):
    ohe_columns = ohe.get_feature_names()
else:
    ohe_columns = ohe.get_feature_names_out()
# The one-hot columns must carry df_OHE's own index: pd.concat(axis=1) aligns on
# index labels, so a default RangeIndex would attach them to the wrong rows.
# NOTE: fillna(0) applies to the whole frame, so missing f4 values become 0 as well.
df_OHE = pd.concat([df_OHE.drop('f3', axis=1), pd.DataFrame(X, columns=ohe_columns, index=df_OHE.index)], axis=1).fillna(0)


In [None]:
# let's create a function that divides a dataframe into the complete and to-impute parts, and divides the complete part into training and test datasets
RANDOM_STATE = 11
from sklearn.model_selection import train_test_split
def divide_f4(df):
    """Split ``df`` for the f4 imputation task.

    Rows with a known 'f4' are split 80/20 into a training and a test set
    (seeded with RANDOM_STATE); rows with a missing 'f4' are returned apart.
    The summary statistics of the training target are printed.

    Args:
        df (pd.DataFrame): dataframe with an 'f4' column; it is not modified.

    Returns:
        tuple: (train features, test features, train f4, test f4, rows to impute).
        The feature frames do not contain 'f4'; the rows to impute are returned
        as filtered from ``df``, i.e. still with their (missing) 'f4' column.
    """
    missing = df['f4'].isna()
    df_imp = df[missing]
    complete = df[~missing]
    f4 = complete['f4'].copy()
    # build a new frame instead of dropping in place on a filtered slice,
    # which triggers pandas' SettingWithCopy warning
    features = complete.drop('f4', axis=1)

    df_train, df_test, y_train, y_test = train_test_split(features, f4, test_size=0.2, random_state=RANDOM_STATE)
    print(y_train.describe())
    return df_train, df_test, y_train, y_test, df_imp
        

### Encoding f3 for predicting f4

* One Hot Encoding: applying this encoding leads to a dataset with a significantly large number of features and a relatively small number of samples.
* Ordinal Encoding: there is no reason to believe that one area can, by any computational means, be ordered before or after another area.

The third option considered here is ***Target Encoding***. Each category is represented by a numerical value that embeds some knowledge of the target in question. For this imputation we consider ***f4*** as the target.
Each area is replaced by the ***mean*** of the f4 values associated with it. If an area is seen for the first time, it is replaced with the general mean of all training f4 values.

In [None]:
# work on a copy: this frame will receive the target encoding of f3
df_TE = df.copy()

# train features, test features, train f4, test f4, rows whose f4 must be imputed
df_TE, df_TE_t, y_TE, y_TE_t, df_TE_imp = divide_f4(df_TE)

In [None]:

# Per-area count and mean of f4, computed on the training split only.
# The target is attached just for the pivot and removed right after.
df_TE['f4'] = y_TE
f4_by_area = pd.pivot_table(df_TE, index='f3', values='f4', aggfunc=['count', 'mean'])
df_TE = df_TE.drop(columns='f4')
print(f4_by_area)

def f3_encode_f4(row):
    """Replace the area in row['f3'] by a number derived from f4.

    An area present in the training data is replaced by the mean of the f4
    values associated with it; any other area is replaced by the general mean
    of the training f4 values.
    """
    area = row['f3']
    seen_in_training = area in f4_by_area.index.values
    row['f3'] = f4_by_area[('mean', 'f4')][area] if seen_in_training else y_TE.mean()
    return row

# apply the same encoding to the three parts
df_TE = df_TE.apply(f3_encode_f4, axis=1) # training set where f3 is target encoded
df_TE_t = df_TE_t.apply(f3_encode_f4, axis=1) # test set where f3 is target encoded
df_TE_imp = df_TE_imp.apply(f3_encode_f4, axis=1) # rows whose f4 is to be imputed
for encoded_part in (df_TE, df_TE_t, df_TE_imp):
    print(encoded_part['f3'])


## Imputing f4
Now that the different encoding methods have been considered, it is time to impute the missing values using regression models.

In [None]:
# prepare cross validation
from sklearn.model_selection import KFold 
from sklearn.model_selection import cross_val_score
from sklearn.metrics import mean_squared_error
# 3-fold, shuffled and seeded splitter shared by the model searches below
CV = KFold(n_splits=3, shuffle=True, random_state=RANDOM_STATE)

In [None]:
from sklearn.preprocessing import StandardScaler

def scale_data(df_4, df_4_t):
    """Standardize the training and test features.

    The scaler is fitted on the training data only and then applied to both
    sets.

    Args:
        df_4 (pd.DataFrame): training features
        df_4_t (pd.DataFrame): test features

    Returns:
        tuple: (scaled training DataFrame, scaled test DataFrame). Both keep
        the column names of their input and are built with a fresh default index.
    """
    scaler = StandardScaler().fit(df_4)
    scaled_train = pd.DataFrame(scaler.transform(df_4), columns=df_4.columns)
    scaled_test = pd.DataFrame(scaler.transform(df_4_t), columns=df_4_t.columns)
    return scaled_train, scaled_test

In [None]:
from sklearn.linear_model import Ridge
from sklearn.linear_model import Lasso
from sklearn.model_selection import GridSearchCV
from sklearn.neighbors import KNeighborsRegressor as knr

# scoring used by every search below; the functions negate the resulting
# score to report a (positive) mean squared error
SCORING = "neg_mean_squared_error"


def best_ridge(df: pd.DataFrame, y: np.array):
    """Tune the ``alpha`` of a Ridge regressor by grid search with cross validation.

    Args:
        df (pd.DataFrame): the training data
        y (np.array): the training target values

    Returns:
        tuple: (best Ridge estimator found by the search, its cross-validation
        mean squared error as a positive number)
    """
    # alpha candidates: 10^-3 ... 10^2
    alpha_grid = {"alpha": [10 ** i for i in range(-3, 3)]}
    search = GridSearchCV(Ridge(), alpha_grid, cv=CV, scoring=SCORING, n_jobs=-1)
    search.fit(df.values, y)
    # the scoring is the negated error, so flip the sign back
    return search.best_estimator_, -search.best_score_

def best_lasso(df: pd.DataFrame, y: np.array):
    """Tune the ``alpha`` of a Lasso regressor by grid search with cross validation.

    Args:
        df (pd.DataFrame): the training data
        y (np.array): the training target values

    Returns:
        tuple: (best Lasso estimator found by the search, its cross-validation
        mean squared error as a positive number)
    """
    # alpha candidates: 10^-3 ... 10^2
    alpha_grid = {"alpha": [10 ** i for i in range(-3, 3)]}
    search = GridSearchCV(Lasso(), alpha_grid, cv=CV, scoring=SCORING, n_jobs=-1)
    search.fit(df.values, y)
    # the scoring is the negated error, so flip the sign back
    return search.best_estimator_, -search.best_score_

def best_knr(df, y):
    """Tune a K-nearest-neighbors regressor by grid search with cross validation.

    Args:
        df (pd.DataFrame): the training data
        y (np.array): the training target values

    Returns:
        tuple: (best KNN regressor found by the search, its cross-validation
        mean squared error as a positive number)
    """
    # two hyperparameters are tuned: the number of neighbors (1 to 14) and
    # how the neighbors are weighted
    search_space = {"n_neighbors": list(range(1, 15)), "weights": ['uniform', 'distance']}
    search = GridSearchCV(knr(), search_space, cv=CV, scoring=SCORING)
    search.fit(df.values, y)
    # the scoring is the negated error, so flip the sign back
    return search.best_estimator_, -search.best_score_

In [None]:
from sklearn.preprocessing import PolynomialFeatures

def best_poly_features(df, y):
    """Find the polynomial degree (2 to 5) for which plain linear regression on
    the expanded features has the lowest cross-validation mean squared error.

    The degree and score of every candidate are printed.

    Args:
        df (DataFrame): training data
        y (Series): training target values

    Returns:
        tuple: (best mean squared error, best degree, the fitted
        PolynomialFeatures transformer of that degree)
    """
    from sklearn.linear_model import LinearRegression

    X_t = df.values
    y_t = y.values

    polys = [PolynomialFeatures(degree=d) for d in range(2, 6)]
    lr = LinearRegression()

    # Start from infinity and from the first candidate. A finite sentinel such
    # as 10**9 can be lower than every real error (the error scales with the
    # square of the target), which used to leave the degree at 0 and return a
    # transformer that did not match it.
    best_score = float("inf")
    best_idx = 0
    for idx, poly in enumerate(polys):
        # expand one degree at a time instead of holding every expansion in memory
        X_poly = poly.fit_transform(X_t)
        score = -np.mean(cross_val_score(lr, X_poly, y_t, cv=CV, scoring=SCORING))

        print(f"degree: {str(poly.degree)}")
        print(f"score: {str(score)}")
        if score < best_score:
            best_score = score
            best_idx = idx

    return best_score, polys[best_idx].degree, polys[best_idx]


def best_poly_reg(df, y):
    """Build the best regularized polynomial regression for the training data.

    The polynomial degree is chosen by ``best_poly_features``; Ridge and Lasso
    are then tuned on the expanded features and the one with the lower
    cross-validation error is kept.

    Args:
        df (DataFrame): training data
        y (Series): training target values

    Returns:
        A pipeline (polynomial expansion followed by the selected regressor)
        exposing ``fit`` / ``predict`` on the *unexpanded* features. Returning
        the bare regressor would lose the expansion: a caller that refits it on
        the raw features would get an ordinary linear model.
    """
    from sklearn.pipeline import make_pipeline

    _, _, poly = best_poly_features(df, y)
    # apply the best polynomial features on the training data
    X_poly = poly.transform(df.values)

    # tune both regularized models on the expanded features
    ridge_est, ridge_score = best_ridge(pd.DataFrame(X_poly), y)
    lasso_est, lasso_score = best_lasso(pd.DataFrame(X_poly), y)

    # lower error wins; Ridge is kept on a tie
    best_est = lasso_est if ridge_score > lasso_score else ridge_est
    return make_pipeline(poly, best_est)
    
    


In [None]:

def best_imputation_model(df_4, df_4_t, y_f4, y_f4_t, use_poly=True):
    """Return the model that predicts f4 best on the test dataset.

    The features are standardized, then the best Ridge, Lasso, KNN regressor
    and (optionally) regularized polynomial regression are searched for. Each
    candidate is fitted on the scaled training data and scored on the scaled
    test data; the model and score of every candidate are printed.

    Args:
        df_4 (pd.DataFrame): training features
        df_4_t (pd.DataFrame): test features
        y_f4: training target values
        y_f4_t: test target values
        use_poly (bool): whether the polynomial candidate is considered

    Returns:
        tuple: (best fitted model, its mean squared error on the test set).

    NOTE(review): the winner is selected on the test set, so the returned
    score is an optimistic estimate. The scaler is not returned: new data must
    be scaled the same way before calling ``predict`` on the returned model.
    """
    df, df_t = scale_data(df_4, df_4_t)
    # we will find the best possible Ridge, Lasso, Polynomial Regularized Regression and the best KNN-R
    # each of them will be tested on the test dataset and the one with best score will be returned
    if use_poly:
        poly = best_poly_reg(df, y_f4)

    ridge, _ = best_ridge(df, y_f4)
    lasso, _ = best_lasso(df, y_f4)
    knn, _ = best_knr(df, y_f4)

    X_train = df.values
    X_test = df_t.values
    models = [ridge, lasso, knn]
    if use_poly:
        models.append(poly)

    # infinity rather than a finite sentinel: a large-valued target can make
    # every error exceed 10**9, in which case no model at all was returned
    best_score = float("inf")
    best_model = None

    for m in models:
        m.fit(X_train, y_f4)
        y_pred = m.predict(X_test)
        # documented argument order is (y_true, y_pred)
        score = mean_squared_error(y_f4_t, y_pred)
        print(f"model: {str(m)}")
        print(f"mean squared error {str(score)}")
        if score < best_score:
            best_model = m
            best_score = score

    return best_model, best_score


In [None]:
# let's divide the one-hot-encoded data the same way the target-encoded data is:
# the index labels of the target-encoded splits are reused to select the rows
y_HE = df_OHE['f4'][y_TE.index.values]
y_HE_t = df_OHE['f4'][y_TE_t.index.values]
# f4 is the target of the imputation: it must not stay among the features
df_OHE.drop('f4', axis=1, inplace=True)
df_HE = df_OHE.loc[df_TE.index.values, :]
df_HE_t = df_OHE.loc[df_TE_t.index.values, :]
# NOTE(review): this overwrites the target-encoded rows-to-impute (df_TE_imp)
# with their one-hot-encoded version; a separate name (e.g. df_HE_imp) was
# probably intended - confirm against the cells that use df_TE_imp later
df_TE_imp = df_OHE.loc[df_TE_imp.index.values, :]

# sanity check: both encodings must see exactly the same target values
print((y_HE == y_TE).all()) 
print((y_HE_t == y_TE_t).all()) 

# the train, test, impute splitting is the same for both OHE and Target encoded data

In [None]:
# let's find the best imputation models for both types of encoding:
# each result is a (best model, test mean squared error) tuple
impute_TE = best_imputation_model(df_TE, df_TE_t, y_TE, y_TE_t)
# the polynomial candidate is disabled for the one-hot-encoded data
impute_HE = best_imputation_model(df_HE, df_HE_t, y_HE, y_HE_t, use_poly=False)

# compare the test errors of the two encodings
print(impute_TE[1])
print(impute_HE[1])