In [1]:
# This serves as a template which will guide you through the implementation of this task.  It is advised
# to first read the whole template and get a sense of the overall structure of the code before trying to fill in any of the TODO gaps
# First, we import necessary libraries:
import numpy as np
import pandas as pd

In [2]:
# Read the labelled training set and report its size and first two rows.
train_df = pd.read_csv("train.csv")
print("Training data:")
print(f"Shape: {train_df.shape}")
# The extra trailing newlines reproduce the blank separator lines before the test summary.
print(train_df.head(2), end="\n\n\n")

# Read the unlabelled test set and report its size and first two rows.
test_df = pd.read_csv("test.csv")
print("Test data:")
print(test_df.shape)
print(test_df.head(2))

# Zero-filled placeholders with the shapes of the raw frames (label column removed from X_train).
X_train = np.zeros_like(train_df.drop(columns=['price_CHF']))
y_train = np.zeros_like(train_df.loc[:, 'price_CHF'])
X_test = np.zeros_like(test_df)

Training data:
Shape: (900, 11)
   season  price_AUS  price_CHF  price_CZE  price_GER  price_ESP  price_FRA  \
0  spring        NaN   9.644028  -1.686248  -1.748076  -3.666005        NaN   
1  summer        NaN   7.246061  -2.132377  -2.054363  -3.295697  -4.104759   

   price_UK  price_ITA  price_POL  price_SVK  
0 -1.822720  -3.931031        NaN  -3.238197  
1 -1.826021        NaN        NaN  -3.212894  


Test data:
(100, 10)
   season  price_AUS  price_CZE  price_GER  price_ESP  price_FRA  price_UK  \
0  spring        NaN   0.472985   0.707957        NaN  -1.136441 -0.596703   
1  summer  -1.184837   0.358019        NaN  -3.199028  -1.069695       NaN   

   price_ITA  price_POL  price_SVK  
0        NaN   3.298693   1.921886  
1  -1.420091   3.238307        NaN  


In [12]:
# Pre-processing of the data
#
# Planned steps:
# 1. Filter out zero or near-zero variance features.
# 2. Perform imputation if required.
# 3. Normalize to resolve numeric feature skewness.
# 4. Standardize (center and scale) numeric features.
# 5. Perform dimension reduction (e.g., PCA) on numeric features.
# 6. One-hot or dummy encode categorical features.
# 7. VIF test
# 8. Feature selection

# 1. Feature variances
# Only numeric columns have a variance. `season` holds strings, so it must be
# excluded explicitly: pandas >= 2.0 raises a TypeError otherwise, and older
# versions emit a FutureWarning.
feature_variances = train_df.var(numeric_only=True)

# Tunable cut-off: a numeric feature is kept only if its variance exceeds this
# value, so raising it filters more aggressively.
threshold_variance = 0.3
selected_features = feature_variances[feature_variances > threshold_variance].index

# Re-attach the categorical `season` column, which the numeric variance filter skipped.
train_df_filtered = train_df[selected_features].join(train_df["season"])



  feature_variances = train_df.var()


In [20]:
# 2. Missingness per feature
# Build one boolean mask of the missing cells and group it by each row's season.
# Grouping the mask by the season Series (instead of groupby("season").apply(...))
# avoids a Python-level lambda per group and the deprecated behaviour of
# groupby.apply operating on the grouping column, while producing the same table
# (including the all-zero `season` column).
is_missing = train_df_filtered.isna()
season_of_row = train_df_filtered["season"]

missing_values_per_season = is_missing.groupby(season_of_row).sum()
print("Number of missing values:")
print(missing_values_per_season)
print("\n")

# The mean of a boolean mask is the fraction of missing entries within the group.
missing_values_per_season_perct = 100 * is_missing.groupby(season_of_row).mean()
print("Percentage of missingness:")
print(missing_values_per_season_perct)

# Conclusion > similar percentage of missing values per season and per country (feature), with very high values (~30%)

Number of missing values:
        price_AUS  price_CHF  price_CZE  price_GER  price_ESP  price_FRA  \
season                                                                     
autumn         74         64         72         65         70         65   
spring         70         60         67         65         61         68   
summer         57         74         65         65         68         63   
winter         61         71         64         74         70         68   

        price_UK  price_ITA  price_POL  price_SVK  season  
season                                                     
autumn        62         70         62         71       0  
spring        79         68         70         67       0  
summer        72         71         68         72       0  
winter        74         57         65         71       0  


Percentage of missingness:
        price_AUS  price_CHF  price_CZE  price_GER  price_ESP  price_FRA  \
season                                              

In [7]:
# Summary of the categorical `season` column: non-null count, number of distinct
# values, the most frequent value and how often it occurs.
train_df["season"].describe()

count        900
unique         4
top       spring
freq         225
Name: season, dtype: object

In [None]:
def data_loading():
    """
    This function loads the training and test data from "train.csv" and "test.csv"
    and converts them into purely numeric matrices without NaN values.

    Preprocessing steps:
    1. Training rows whose label `price_CHF` is missing are dropped (they cannot
       be used for supervised training).
    2. The categorical `season` column is one-hot encoded. Train and test are
       encoded together so both end up with exactly the same feature columns.
    3. Missing feature values are imputed with the column mean computed on the
       training rows only, so no test-set statistics leak into the features.

    Parameters
    ----------
    Returns
    ----------
    X_train: matrix of floats, dim = (n_labelled_train_rows, n_features), training input with features
    y_train: array of floats, dim = (n_labelled_train_rows,), training output with labels
    X_test: matrix of floats: dim = (100, n_features), test input with features
    """
    # Load training data
    train_df = pd.read_csv("train.csv")

    print("Training data:")
    print("Shape:", train_df.shape)
    print(train_df.head(2))
    print('\n')

    # Load test data
    test_df = pd.read_csv("test.csv")

    print("Test data:")
    print(test_df.shape)
    print(test_df.head(2))

    target_col = 'price_CHF'
    categorical_col = 'season'

    # 1. Keep only the labelled training rows and split off the label.
    train_labelled = train_df.dropna(subset=[target_col])
    y_train = train_labelled[target_col].to_numpy(dtype=float)
    train_features = train_labelled.drop(columns=[target_col])

    # 2. One-hot encode on the stacked frame so train and test share identical
    #    columns even if a season were absent from one of them.
    n_train = len(train_features)
    combined = pd.concat([train_features, test_df], ignore_index=True)
    encoded = pd.get_dummies(combined, columns=[categorical_col], dtype=float)
    train_encoded = encoded.iloc[:n_train]
    test_encoded = encoded.iloc[n_train:]

    # 3. Mean imputation with training statistics. A column without a single
    #    observed training value has no mean, so fall back to 0.0 for it.
    fill_values = train_encoded.mean().fillna(0.0)
    X_train = train_encoded.fillna(fill_values).to_numpy(dtype=float)
    X_test = test_encoded.fillna(fill_values).to_numpy(dtype=float)

    assert (X_train.shape[1] == X_test.shape[1]) and (X_train.shape[0] == y_train.shape[0]) and (X_test.shape[0] == 100), "Invalid data shape"
    return X_train, y_train, X_test


In [None]:
def modeling_and_prediction(X_train, y_train, X_test):
    """
    This function fits an ordinary least-squares linear regression (with an
    intercept term) on the training data and then predicts the target for the
    test data.

    Parameters
    ----------
    X_train: matrix of floats, dim = (n_train, n_features), training input
    y_train: array of floats, dim = (n_train,), training output
    X_test: matrix of floats: dim = (100, n_features), test input

    Returns
    ----------
    y_pred: array of floats: dim = (100,), predictions on test set

    Raises
    ----------
    ValueError: if any input contains NaN or infinite values
    """
    # Accept anything array-like (including object-dtype arrays of numbers).
    X_train = np.asarray(X_train, dtype=float)
    y_train = np.asarray(y_train, dtype=float)
    X_test = np.asarray(X_test, dtype=float)

    # Fail early with a clear message instead of an obscure SVD error below.
    if not (np.isfinite(X_train).all() and np.isfinite(y_train).all() and np.isfinite(X_test).all()):
        raise ValueError("Inputs must not contain NaN or infinite values; impute them first")

    # Append a constant column so the model also learns an intercept.
    design_train = np.column_stack([X_train, np.ones(X_train.shape[0])])
    design_test = np.column_stack([X_test, np.ones(X_test.shape[0])])

    # lstsq returns the minimum-norm solution, so collinear features (e.g. a
    # full set of one-hot columns next to the intercept) do not break the fit.
    weights, *_ = np.linalg.lstsq(design_train, y_train, rcond=None)
    y_pred = design_test @ weights

    assert y_pred.shape == (100,), "Invalid data shape"
    return y_pred


In [None]:
# Main function. You don't have to change this
if __name__ == "__main__":
    # Build the training and test matrices
    X_train, y_train, X_test = data_loading()
    # Fit the model on the training data and predict on the test data
    y_pred = modeling_and_prediction(X_train, y_train, X_test)
    # Write the predictions as a single `price_CHF` column, without the index
    dt = pd.DataFrame(y_pred, columns=['price_CHF'])
    dt.to_csv('results.csv', index=False)
    print("\nResults file successfully generated!")
