## Overview
This is my attempt for the first assignment of the Machine Learning course Fall 2022.

### Note
Please make sure the dataset is saved in the ***same working directory*** as this jupyter notebook

## Preliminary: imports and loading data
In this section, I import the libraries needed for the rest of the notebook and load the data.

In [None]:
import os
import math
import sklearn as sk
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import re
# Use seaborn's 'darkgrid' style for every plot in the notebook:
# the grid lines make the plotted values easier to read.
STYLE = 'darkgrid'
sns.set_style(STYLE)

In [None]:
# The dataset is expected to sit in the current working directory,
# i.e. next to this notebook.
wd, dataset_name = os.getcwd(), "a1_dataset.csv"
file_path = os.path.join(wd, dataset_name)  # full location of the CSV file

In [None]:
df_org = pd.read_csv(file_path) # keep the original dataframe as loaded from disk
df = df_org.copy() # working copy; every later transformation is applied to this one

## Exploring the data 
In this section, I explore the dataset and prepare it (data cleaning and preprocessing), then analyse a number of aspects and interactions in the data before building models for our prediction purposes.
### Preprocessing the data

In [None]:
# First look at the data: dimensions, column names, then dtypes and non-null counts.
print(df.shape)
print(df.columns)
df.info() # the only column with missing values is 'var4'

In [None]:
# Shorten the column names: "target" becomes "y" and every "var<k>" becomes "f<k>".
# The target is captured first so it stays available after the rename.
y = df['target']
new_names = {"target": "y"}
new_names.update({f'var{k}': f"f{k}" for k in range(1, len(df.columns) + 1)})
df = df.rename(columns=new_names)
print(df.columns)
# The target column is removed from the features; `y` is added back when needed.
df.drop('y', inplace=True, axis=1)

### Target variable distribution

In [None]:
# Bar chart of the number of rows in each target class.
sns.countplot(x=y)
plt.show()

### Numerical and categorical columns
It is crucial to divide the data into categorical and numerical columns as each column should be treated differently. In this subsection, I perform this task in a systematic way:

In [None]:
# this method returns a tuple of the column names : numerical then categorical
def num_cat(df):
    """Split the column names of `df` by kind.

    Returns a tuple `(numerical, categorical)` of column-name arrays:
    numerical columns are those with a numeric dtype, categorical columns
    those with dtype 'object' or 'category'.
    """
    selectors = (np.number, ['object', 'category'])
    num, cat = (df.select_dtypes(kind).columns.values for kind in selectors)
    return num, cat


# Names of the numerical and of the categorical feature columns of the working dataframe.
num_cols, cat_cols = num_cat(df)

### Understanding the data: visualization, grouping, descriptive analysis

In [None]:
# "first impression, numerical": scatter one numerical column against the row
# index, with one facet per target class.
def first_imp_num(df, y, col_name):
    """Plot `col_name` against the row index, faceted by the target `y`.

    Args:
        df: dataframe holding the column to plot.
        y: target values, handed to seaborn as the facet (`col`) variable;
            the facets are drawn in the order 0, 1.
        col_name: name of the column of `df` shown on the y axis.
    """
    # The index of `df` is used directly: the former `dropna(axis=1).copy()`
    # only removed columns, so it yielded the same index at the cost of a
    # full copy of the dataframe on every call.
    f1_fig = sns.relplot(data=df, x=df.index.values, y=col_name, col=y, col_order=[0, 1])
    f1_fig.set(xlabel="index", ylabel=col_name)
    plt.show()


for col in num_cols:
    first_imp_num(df, y, col)


In [None]:
# analysing the numerical columns
from empiricaldist import Cdf
    
def cdf_num(df, y, col_name, target=False):
    """Plot the empirical CDF of the column `col_name`.

    Args:
        df: dataframe holding the column.
        y: target series, aligned with `df` on the index.
        col_name: name of the numerical column to analyse.
        target: when False, a single CDF of the whole column is drawn;
            when True, one CDF per target class (1 and 0) is drawn on the
            same axes, with a legend.
    """
    # Only the missing values of the analysed column are discarded. Dropping
    # every row with *any* missing value (as done previously) also removed,
    # from the CDF of the complete columns, all rows whose 'f4' is missing.
    values = df[col_name].dropna()
    if not target:
        Cdf.from_seq(values).plot()
    else:
        labels = y.reindex(values.index)  # target of the rows that were kept
        Cdf.from_seq(values[labels == 1]).plot(label='y == 1')
        Cdf.from_seq(values[labels == 0]).plot(label='y == 0')
        plt.legend()
    plt.xlabel(f'{col_name}')
    plt.ylabel(f'CDF for {col_name}')
    plt.show()

In [None]:
for col in num_cols:
    cdf_num(df, y, col, target=True)

#### Conclusion: 
Considering all numerical features except f4, negative examples tend to have higher values for these features. The column f4 is less informative, as no clear trend or correlation (linear or non-linear) can be seen. This is possibly the result of its large number of missing values.

### Analysing categorical features
The features f3, f6 and f7 are categorical features. Their interactions should be understood in depth.

#### Fixing f7
As the 'f7' includes invalid dates, more preprocessing steps should be taken.

In [None]:
# let's set the last column to datetime for further manipulation
try:
    df['f7'] = pd.to_datetime(df['f7'])
except (ValueError, TypeError, OverflowError):
    # Only conversion failures are reported; any other error (e.g. a missing
    # 'f7' column) is no longer hidden by a bare `except`.
    print("Certain dates are semantically invalid")
    
from dateutil import parser

# for further manipulation we need to determine the invalid dates
def validate_dates(row):
    """Add a 'valid_date' field to `row`.

    The field holds the parsed 'f7' value when it can be parsed, and False
    when parsing raises a ValueError.
    """
    try:
        parsed = parser.parse(row['f7'])
    except ValueError:
        parsed = False
    row['valid_date'] = parsed
    return row


df = df.apply(validate_dates, axis=1)
# raw 'f7' strings of the rows flagged as unparsable
is_invalid = df['valid_date'] == False
invalid_dates = df.loc[is_invalid, 'f7'].values
# the helper column is no longer needed
df.drop('valid_date', axis=1, inplace=True)

In [None]:
print(invalid_dates) # the invalid dates found in the dataframe
# let's move these dates one day back


def fix_dates(row):
    """Replace the last two characters of the date part of an invalid 'f7' by "28".

    Rows whose 'f7' is not listed in `invalid_dates` are returned unchanged.
    """
    raw = row['f7']
    if raw not in invalid_dates:
        return row
    date, time = raw.split()
    # the day is assumed to be the last two characters of the date part: 29 -> 28
    row['f7'] = f"{date[:-2]}28 {time}"
    return row


df = df.apply(fix_dates, axis=1)

df['f7'] = pd.to_datetime(df['f7'])

In [None]:
print(df.dtypes)
# Now that the 7th column is converted to datetime, we can further break it down
# and tackle each component of the date: year, month, day, time.
# Names of the columns derived from the 'f7' timestamp.
year = 'year'
month = 'month'
day = 'day'
time = 'time'
date_cols = [year, month, day, time]


def decompose_date(row):
    """Add the year, month, day and time-of-day of `row['f7']` to `row`.

    `row['f7']` must be a timestamp. The four values are stored under the
    column names held by `year`, `month`, `day` and `time`.
    """
    stamp = row['f7']
    row[year] = stamp.year
    row[month] = stamp.month
    row[day] = stamp.day
    # `time` is a method: it must be called, otherwise the bound method
    # itself (not the time of day) ends up in the column.
    row[time] = stamp.time()
    return row

# add the year / month / day / time columns to every row
df = df.apply(decompose_date, axis=1)
print()

In [None]:
# Count plots of the month and of the day, one facet per target class.
# The copy carrying the target does not depend on the plotted column,
# so it is built once instead of once per iteration.
df_c = df.copy()
df_c['y'] = y
for c in date_cols[1:-1]:
    fig = sns.catplot(data=df_c, kind='count', x=c, col='y', col_order=[0, 1])
    fig.set(xlabel=c, ylabel='count')
    # rotate the tick labels of every facet (plt.xticks only reaches the last one)
    fig.set_xticklabels(rotation=45)
    plt.show()

In [None]:
# The only year present is 2019, and since the data concerns countries the time
# within a single day cannot be significant: the year, day and time columns are
# dropped, together with the original 'f7' column. Only the month is kept.
df = df.drop([year, time, day, 'f7'], axis=1)

In [None]:
# Let's consider, for each month, the number of rows and the share of positive rows.
# Note that the target is attached to `df` as the column 'y' from this point on.
df['y'] = y.copy()
print(pd.pivot_table(df, index=month, values='y', aggfunc=['count', 'mean']))
# we can see that the month's value does not affect the class distribution

#### 'f3' Column
As the column 'f3' represents countries' (areas') names, it is recommended to normalize the string representation and remove any unnecessary characters.

In [None]:
def clean_country(row):
    """Normalise the area name stored in `row['f3']`.

    Every "(...)" and "[...]" annotation is removed, the surrounding and
    repeated whitespace is collapsed, and the name is lower-cased, so that
    e.g. "Korea (Republic of)" and " korea" map to the same value.
    """
    name = row['f3']
    # Each annotation is matched on its own (non-greedy), so the text between
    # two separate annotations is kept. Nested brackets are not handled.
    name = re.sub(r'\([^)]*\)|\[[^\]]*\]', ' ', name)
    # Whitespace is cleaned *after* the removal: stripping first left a
    # trailing space behind every removed annotation.
    row['f3'] = ' '.join(name.split()).lower()
    return row

# normalise the area name of every row
df = df.apply(clean_country, axis=1)

In [None]:
# set date with the rest of the columns
def set_date(row):
    """Store in `row['date']` the 2019 timestamp built from the row's month and day.

    NOTE(review): no call to this helper is visible in this part of the
    notebook, and the day column has already been dropped from `df` by the
    time it is defined -- confirm it is still needed.
    """
    month_value, day_value = row[month], row[day]
    row['date'] = pd.Timestamp(year=2019, month=month_value, day=day_value)
    return row

In [None]:
# let's encode f6 as no:0 and yes:1
print(df['f6'].value_counts())
# the column's integrity is verified
encode_dict = {"no":0, "yes":1}
df['f6'] = df['f6'].map(encode_dict)

# for f in num_cols:
#     print(pd.pivot_table(df, index='f3', columns=month, values=f, aggfunc=[np.mean]))
# print(pd.pivot_table(df, index='f3', columns=month, values='f6', aggfunc=[np.mean]))

# Mean of each feature per (area, month). The aggregation is named by the string
# 'mean' (same resulting column label as the np.mean callable, which recent
# pandas versions deprecate as an aggfunc), consistently with the other pivots.
f1_trend, f2_trend, f4_trend, f5_trend, f6_trend = (
    pd.pivot_table(df, index='f3', columns=month, values=feat, aggfunc=['mean'])
    for feat in ('f1', 'f2', 'f4', 'f5', 'f6')
)

In [None]:
# For every country: its number of rows and the ratio of them classified positively,
# listed from the highest ratio down (ties broken by the larger count).
country_stats = pd.pivot_table(df, index='f3', values='y', aggfunc=['count', 'mean'])
sort_keys = [('mean', 'y'), ('count', 'y')]
country_y_ratio = country_stats.sort_values(by=sort_keys, ascending=[False, False])
print(country_y_ratio.head(50))


#### f3 Encoding
The dataframe displayed by the cell above represents the basis for an adequate target encoding of the 'f3' feature. The proposed encoding is:
$$\text{count} \cdot \text{ratio}$$
* count: the country's number of occurrences
* ratio: the ratio of positive classes associated with these occurrences

In [None]:
# # let's investigate the effect of the months and days further.
# f1_trend = pd.pivot_table(df, index='f3', columns=y, values='f1', aggfunc=[np.mean, np.median, 'count'])
# f2_trend = pd.pivot_table(df, index='f3', columns=y, values='f2', aggfunc=[np.mean, np.median,'count'])
# f4_trend = pd.pivot_table(df, index='f3', columns=y, values='f4', aggfunc=[np.mean, np.median, 'count'])
# f5_trend = pd.pivot_table(df, index='f3', columns=y, values='f5', aggfunc=[np.mean, np.median, 'count'])

# print(f1_trend)
# print(f2_trend)
# print(f4_trend)
# print(f5_trend)


#### Month, area(f3) and target
In this section I will study the effect of the combination: month, area, and target class on the different numerical values (except 'f4').
The procedure can be described as follows:
* separate the positive and negative rows into two dataframes
* for each dataframe group the rows by 'f3' and aggregate a given numerical feature on the month column: calculate the mean value at each month by country (area/f3)
* visualize the evolution of the mean with respect to month column


The mean is a representative statistic in this case, as there are fewer than 3 values for each (country, month) combination.

In [None]:
import random
# Rows of the positive and of the negative class, used by feat_month_country.
df_1 = df[df['y'] == 1]
df_0 = df[df['y'] == 0]


def _plot_month_trends(frame, feat, aggs, polarity, num_display, num_samples):
    """Draw, for one class, `num_display` figures of per-area monthly trends of `feat`."""
    if frame.empty:
        return  # no row of this class: nothing to plot
    pivot = pd.pivot_table(frame, index='f3', columns=month, values=feat, aggfunc=aggs)
    areas = pivot.index.tolist()
    if not areas:
        return
    # `aggs` is a list, so the first column level holds the statistic names
    stats = pivot.columns.get_level_values(0).unique()
    for stat in stats:
        trend = pivot[stat]
        # x values are the months actually present, not a hard-coded range(1, 8)
        months = trend.columns.to_numpy()
        ylabel = feat if len(stats) == 1 else f"{stat} of {feat}"
        for i in range(1, num_display + 1):
            sampled = random.sample(areas, min(num_samples, len(areas)))
            _, ax = plt.subplots()
            for area in sampled:
                # the label is attached to the line itself so the legend can name it
                sns.lineplot(x=months, y=trend.loc[area].to_numpy(), ax=ax, label=area)
            ax.set(xlabel='months', ylabel=ylabel)
            ax.set_title(f"evolution of {feat} with respect to months for {polarity} areas: plot {str(i)}")
            ax.legend(title='area', bbox_to_anchor=(1.02, 1), loc='upper left', fontsize='small')
            plt.show()


def feat_month_country(feat, aggs=None, num_display=2, num_samples=20):
    """Visualise how `feat` evolves over the months, area by area and class by class.

    For the positive rows (`df_1`) and then the negative rows (`df_0`), the
    feature is aggregated per (area, month) and `num_display` figures are
    drawn, each showing a random sample of at most `num_samples` areas.

    Args:
        feat: name of the numerical feature to aggregate.
        aggs: aggregation(s) applied per (area, month); a single name/callable
            or a list of them. Defaults to the mean. One series of figures is
            drawn per aggregation.
        num_display: number of figures drawn per class (and per aggregation).
        num_samples: maximum number of areas shown on one figure.
    """
    if aggs is None:
        aggs = ['mean']
    elif isinstance(aggs, str) or callable(aggs):
        aggs = [aggs]
    _plot_month_trends(df_1, feat, aggs, 'positive', num_display, num_samples)
    _plot_month_trends(df_0, feat, aggs, 'negative', num_display, num_samples)
    

In [None]:
# the numerical features without missing values
cols = ['f1', 'f2', 'f5']
for c in cols:
    feat_month_country(c)

The visualizations display chaotic interactions between the different features when grouped by the country ('f3') column and the date ('f7') column.
Thus, we can safely assume that the month + country combination has little to no correlation with the numerical features.

In [None]:
# print(pd.pivot_table(df, index='f3', columns=month, values='f6', aggfunc=['count']))
# print(pd.pivot_table(df, index='f3', columns=month, values='f6', aggfunc=['mean']))
# number of rows and mean of the encoded f6 for each month
print(pd.pivot_table(df, index=month, values='f6', aggfunc=['count', 'mean'])) # the month does not correlate with f6.

In [None]:
# let's consider the evolution of different values with respect to the months regardless of the country column
# we can achieve that with one line plot per statistic
def feat_month(feat, aggs=None):
    """Plot monthly statistics of `feat`, one figure per statistic.

    Args:
        feat: name of the column of `df` to aggregate by month.
        aggs: aggregation(s) to compute; a single name/callable or a list of
            them. Defaults to the mean.
    """
    if aggs is None:
        aggs = ['mean']
    elif isinstance(aggs, str) or callable(aggs):
        aggs = [aggs]
    f_month = pd.pivot_table(df, index=month, values=feat, aggfunc=aggs)
    # The statistics are read back from the column labels of the pivot: a
    # callable such as np.mean is stored under its name, so using the callable
    # itself as the key (the former default) raised a KeyError.
    for stat in f_month.columns.get_level_values(0).unique():
        monthly = f_month[(stat, feat)]
        # one value per month: the points are drawn as they are (estimator=None),
        # which also avoids the deprecated `ci` argument
        fig = sns.relplot(kind="line", estimator=None, x=monthly.index.to_numpy(), y=monthly.to_numpy())
        fig.set(xlabel='month', ylabel=feat)
        fig.fig.suptitle(f"{stat}'s variation of {feat} with respect to the month")
    plt.show()

In [None]:
# monthly max / min / mean / median of each numerical feature without missing values
for f in ['f1', 'f2', 'f5']:
    feat_month(f, ['max', 'min', 'mean', 'median'])

#### Conclusion
As we can see from the last two subsections, the date reduced to its month value (as the year is the same across the dataset) does not correlate in any way with the country, the target or even the (non-missing) numerical features.
The only possible use of the 'f7' column is to impute the missing 'f4' values (if possible).

#### 'f6' column
It is time to consider the interaction between the categorical feature 'f6' and the target variable

In [None]:
# Count of each target class, one facet per value of the encoded f6.
# A count plot draws no error bars, so the deprecated `ci` argument is not passed.
g = sns.catplot(kind='count', data=df, x='y', col='f6', col_order=[0, 1])
g.set(xlabel='target')
plt.show()


In [None]:
# the plot above reveals that the value of 'f6' has little to no effect on the final classification
# let's calculate the correlation between these two columns to solidify this observation
print(df.loc[:, ['f6', 'y']].corr())

## Imputing the f4 column
As the 'f4' column misses 600 values, it is of absolute necessity to fill these missing values. In our case, the imputing method would be building a regression model predicting the missing values. 

In [None]:
# df.drop('y', axis=1, inplace=True)
# rows whose 'f4' value is known: the data the imputation model is built on
f4_known = df['f4'].notna()
df_4 = df[f4_known]
print(df_4.shape)
print(df.columns)

### Exploring the data in relation to f4
In this subsection, we tackle a sub-problem. It is nonetheless a different kind of problem (a regression rather than a classification), so it should be approached accordingly.


In [None]:
# Histogram (as a density) with a KDE curve: the replacement for the deprecated `sns.distplot`.
sns.histplot(df_4['f4'], kde=True, stat='density')
plt.show()
# we can see that the distribution of 'f4' is not too far from normal distribution.

In [None]:
# let's visualize the relation of 'f4' with the other numerical columns
# 'f4' is excluded by name: slicing `num_cols` by position does not guarantee
# that exactly the other numerical columns are plotted.
for col in num_cols:
    if col == 'f4':
        continue
    g = sns.relplot(kind='scatter', data=df_4, x=col, y='f4')
    g.fig.suptitle(f"variation of 'f4' with respect of {col}")
    g.set(xlabel=col, ylabel='f4')
    plt.show()

The visualizations demonstrate that the relations between f4 and the other numerical (continuous) features are definitely non-linear. Thus, further experimenting is needed to reveal the hidden interactions.

In [None]:
# let's investigate the effect of the categorical features on 'f4':
# one box per value of the encoded f6, then one box per month
sns.boxplot(data=df_4, y='f4', x='f6')
plt.show()
sns.boxplot(data=df_4, y='f4', x=month)
plt.show()

Further, the first plot demonstrates that f6 provides almost no information about f4 as the variation of f4 for both categories is roughly the same.

In [None]:
# Summaries of the known f4 values, per area and per month.
area_stats = ['count', 'max', 'min', 'mean', 'median']
f4_by_area = pd.pivot_table(df_4, index='f3', values='f4', aggfunc=area_stats)
# f3 is the most informative predictor so far.
f4_by_month = pd.pivot_table(df_4, index=month, values='f4', aggfunc=['count', 'median'])


def encode_area(row):
    """Replace the area name in `row['f3']` by a number derived from f4.

    A known area is encoded as the mean of f4 for that area; an area absent
    from `f4_by_area` is encoded as the median of f4 for the row's month.
    """
    area = row['f3']
    if area in f4_by_area.index:
        encoded = f4_by_area.loc[area, ('mean', 'f4')]
    else:
        # area seen for the first time: fall back on the month
        encoded = f4_by_month.loc[row[month], ('median', 'f4')]
    row['f3'] = encoded
    return row


print(f4_by_area)
print(f4_by_month)

In [None]:
# rows whose 'f4' value is missing: the ones to impute
df_imp = df[df['f4'].isna()]
print(df_imp.shape)

In [None]:
# NOTE(review): the encoding statistics are computed on all of df_4, i.e. before the
# train/test split made further down, so the held-out rows contribute to their own encoding.
df_4 = df_4.apply(encode_area, axis=1) # encode the f3 column for the complete data
df_imp = df_imp.apply(encode_area, axis=1) # encode the f3 column for the missing data

### Imputing the missing values: different models

In [None]:
# let's divide the training data into training and test data: to evaluate the performance
from sklearn.model_selection import train_test_split
# 'f4' is the regression target here; it is separated from the features.
yf4 = df_4['f4']
df_4.drop('f4', axis=1, inplace=True)

# NOTE(review): 'y' was attached to df earlier and the drop of 'y' is commented out,
# so the classification target appears to remain among the features -- confirm this is intended.
# 80% / 20% split with a fixed seed; df_4 and yf4 now hold the training part only.
df_4, df_test, yf4, y_test = train_test_split(df_4, yf4, test_size=0.2, random_state=11)

In [None]:
# copy the dataframe in question for experimenting
from sklearn.preprocessing import StandardScaler # used to scale the data
scaler = StandardScaler()

# Standardise the first four columns of the training data (scaler fitted here),
# then append f6 and the month unscaled.
# NOTE(review): df_t is reassigned by the next cell and the models below are fitted
# on df_4.values, so this scaled frame does not seem to be used -- confirm.
df_tmp = df_4.iloc[:, :4]
Xs = scaler.fit_transform(df_tmp)
df_t = pd.DataFrame(Xs, columns=df_tmp.columns)
df_t['f6'] = df_4['f6'].values
df_t[month] = df_4[month].values
df_t.head()

In [None]:
# standardise the test dataset as well, reusing the scaler fitted on the training data
# NOTE: this overwrites the df_t built from the training data in the previous cell.
df_tmp = df_test.iloc[:, :4]
Xs = scaler.transform(df_tmp)
df_t = pd.DataFrame(Xs, columns=df_tmp.columns)
df_t['f6'] = df_test['f6'].values
df_t[month] = df_test[month].values
df_t.head()

#### Polynomial Regression
The analysis conducted above displays the non-linear interaction between f4 and the rest of the features. Thus, a simple Linear Regression will clearly underfit.

In [None]:
from sklearn.model_selection import KFold 
from sklearn.model_selection import cross_val_score
# 3-fold, shuffled, seeded cross-validation splitter shared by all the model searches below
cv = KFold(n_splits=3, shuffle=True, random_state=11)

In [None]:
# list the scorer names available in scikit-learn (one of them is passed as `scoring` below)
sk.metrics.get_scorer_names()

In [None]:
from sklearn.preprocessing import PolynomialFeatures

# I will try polynomial features with degrees 2, 3, 4, 5
# the choice will be determined using cross validation
X_4_train = df_4.values
y_4_train = yf4.values

degrees = range(2, 6)
polys = [PolynomialFeatures(degree=d) for d in degrees]
X_trains = [p.fit_transform(X_4_train) for p in polys]
# initiate a Linear Regression model
from sklearn.linear_model import LinearRegression
lr = LinearRegression()

# Start from +inf rather than an arbitrary large number: with a finite sentinel,
# a search where every error exceeds it would leave best_deg at 0, and the
# later `X_trains[best_deg - 2]` would silently pick an entry from the end.
best_score = float('inf')
best_deg = 0
scoring = "neg_mean_squared_error"
for deg, X_poly in zip(degrees, X_trains):
    # the scorer returns the negated MSE; flip the sign to get the mean MSE over the folds
    score = -np.mean(cross_val_score(lr, X_poly, y_4_train, cv=cv, scoring=scoring))

    print(f"degree: {str(deg)}")
    print(f"score: {str(score)}")
    if best_score > score:
        best_score = score
        best_deg = deg

if best_deg == 0:
    # only reachable when no degree produced a comparable (non-NaN) score
    raise RuntimeError("no polynomial degree could be selected by cross validation")

print(best_deg)
print(best_score)

In [None]:
# as the size of the dataset is relatively small, our model is likely to overfit: Lasso and Ridge are more likely to be better options
from sklearn.model_selection import GridSearchCV
from sklearn.linear_model import Lasso
from sklearn.linear_model import Ridge
# Grid search of the Lasso regularisation strength (alpha from 1e-3 to 1e2)
# on the polynomial features of the degree selected above.
lasso = Lasso()
parameters = {"alpha": [10 ** i for i in range(-3, 3)]}

lasso_search = GridSearchCV(lasso, parameters, cv=cv, scoring=scoring, n_jobs=-1)

# polys / X_trains start at degree 2, hence the `- 2` offset
lasso_search.fit(X_trains[best_deg - 2], y_4_train)

lasso_est = lasso_search.best_estimator_

In [None]:
from sklearn.metrics import mean_squared_error

# expand the held-out rows with the polynomial transformer of the selected degree
X_test = polys[best_deg - 2].transform(df_test.values)

# mean squared error of the best Lasso model on the held-out f4 values
f4_pred = lasso_est.predict(X_test)
print(mean_squared_error(y_test, f4_pred))

In [None]:
# (true f4, Lasso prediction) pairs for the held-out rows. A list of pairs is used
# instead of a dict keyed by the true value, which silently dropped every test row
# sharing its f4 value with another one.
error_dicts = list(zip(y_test.tolist(), f4_pred.tolist()))
print(error_dicts)

In [None]:
# Same search as for the Lasso, with a Ridge penalty: alpha from 1e-3 to 1e2
# on the polynomial features of the selected degree.
ridge = Ridge()
parameters = {"alpha": [10 ** i for i in range(-3, 3)]}

ridge_search = GridSearchCV(ridge, parameters, cv=cv, scoring=scoring, n_jobs=-1)

ridge_search.fit(X_trains[best_deg - 2], y_4_train)

ridge_est = ridge_search.best_estimator_ 

# mean squared error of the best Ridge model on the held-out f4 values
ridge_f4_pred = ridge_est.predict(X_test)
print(mean_squared_error(y_test, ridge_f4_pred))

In [None]:
# (true f4, Ridge prediction) pairs for the held-out rows. A list of pairs is used
# instead of a dict keyed by the true value, which silently dropped every test row
# sharing its f4 value with another one.
error_dicts = list(zip(y_test.tolist(), ridge_f4_pred.tolist()))
print(error_dicts)

In [74]:
# Baseline: Lasso on the raw (non-polynomial) features, alpha from 1e-3 to 1e2.
lasso_linear = Lasso()
parameters = {"alpha": [10 ** i for i in range(-3, 3)]}

lasso_lin_search =  GridSearchCV(lasso_linear, parameters, cv=cv, scoring=scoring, n_jobs=-1)
lasso_lin_search.fit(X_4_train, y_4_train)

lasso_lin_est = lasso_lin_search.best_estimator_
# one coefficient per column of df_4
print(lasso_lin_est.coef_)

lasso_lin_pred = lasso_lin_est.predict(df_test.values)

# mean squared error on the held-out f4 values
print(mean_squared_error(y_test, lasso_lin_pred))

[ 3.36663468e-02 -4.06884124e-03  9.01891595e-01  3.93239861e-02
 -4.49997720e+00 -3.63241092e-01  2.27033379e+01]
247.90776513014242
