# Generating predictions using our test set

Our main objective is to apply our **parameter matrix** to each row of the test set and then invert the transformation applied to the target, so that we obtain the final predictions.

---

We need:
- X_test
- Parameter matrix
- Recall that we originally applied log() to SalePrice, so we need to apply exp() to the model output

In [7]:
import numpy as np
import pandas as pd
import pickle
from sklearn.preprocessing import MinMaxScaler

# Show every column when displaying wide DataFrames.
# Uses the public pd.set_option entry point rather than the undocumented pd.pandas self-reference.
pd.set_option('display.max_columns', None)

In [8]:
# Load the test-set features from X_test.csv (path is relative to the notebook's working directory)
# NOTE(review): the preview below only shows values in [0, 1], so the file presumably holds already-scaled features — confirm

X_test = pd.read_csv('X_test.csv')
X_test.head()

Unnamed: 0,MSSubClass,MSZoning,Neighborhood,OverallQual,YearRemodAdd,RoofStyle,BsmtQual,BsmtExposure,HeatingQC,CentralAir,1stFlrSF,GrLivArea,BsmtFullBath,KitchenQual,Fireplaces,FireplaceQu,GarageType,GarageFinish,GarageCars,PavedDrive,SaleCondition
0,0.0,0.0,0.363636,0.444444,0.819672,0.0,0.5,0.25,0.5,1.0,0.373438,0.349081,0.0,0.333333,0.0,0.2,0.8,0.333333,0.25,1.0,0.75
1,0.0,0.75,0.363636,0.555556,0.868852,1.0,0.5,0.25,0.5,1.0,0.522632,0.488544,0.0,0.666667,0.0,0.2,0.8,0.333333,0.25,1.0,0.75
2,0.235294,0.75,0.590909,0.444444,0.213115,0.0,0.75,0.25,0.75,1.0,0.386718,0.560546,0.0,0.333333,0.333333,0.6,0.8,1.0,0.5,1.0,0.75
3,0.235294,0.75,0.590909,0.555556,0.213115,0.0,0.5,0.25,1.0,1.0,0.385901,0.555075,0.0,0.666667,0.333333,0.8,0.8,1.0,0.5,1.0,0.75
4,0.588235,0.75,0.909091,0.777778,0.311475,0.0,0.75,0.25,1.0,1.0,0.508416,0.475254,0.0,0.666667,0.0,0.2,0.8,0.666667,0.5,1.0,0.75


In [9]:
# Load the fitted parameter matrix (bias + coefficients) from disk
# NOTE(review): pickle.load can execute arbitrary code — only load pickle files produced by a trusted source
with open('pickles/parameter_matrix.pickle', 'rb') as f:
    parameter_matrix = pickle.load(f)

In [10]:
# Compare shapes: the parameter matrix has one more entry than X_test has columns (the extra one is the bias)
print(f'The test set has shape:         {X_test.shape}')
print(f'The parameter matrix has shape: {parameter_matrix.shape}')

The training set has shape:     (1459, 21)
The parameter matrix has shape: (22, 1)


In [11]:
# The parameter matrix has one more entry than the test set has columns, because a
# column of ones was added earlier to account for the bias term.
# Prepend the same column of ones to the test set so that both shapes line up.
X_array = X_test.to_numpy()
X = np.hstack((np.ones((X_array.shape[0], 1)), X_array))
X

array([[1.        , 0.        , 0.        , ..., 0.25      , 1.        ,
        0.75      ],
       [1.        , 0.        , 0.75      , ..., 0.25      , 1.        ,
        0.75      ],
       [1.        , 0.23529412, 0.75      , ..., 0.5       , 1.        ,
        0.75      ],
       ...,
       [1.        , 0.        , 0.75      , ..., 0.5       , 1.        ,
        0.        ],
       [1.        , 0.38235294, 0.75      , ..., 0.        , 1.        ,
        0.75      ],
       [1.        , 0.23529412, 0.75      , ..., 0.75      , 1.        ,
        0.75      ]])

In [12]:
# Inspect the first augmented row: the leading 1.0 is the bias column added above, followed by the feature values
X[0]

array([1.        , 0.        , 0.        , 0.36363636, 0.44444444,
       0.81967213, 0.        , 0.5       , 0.25      , 0.5       ,
       1.        , 0.37343781, 0.34908057, 0.        , 0.33333333,
       0.        , 0.2       , 0.8       , 0.33333333, 0.25      ,
       1.        , 0.75      ])

In [26]:
# With this in mind we can go ahead and generate the predictions.
# Each prediction is the weighted sum of one test-set row (with a leading 1 for the bias)
# and the entries of the parameter matrix; we do this for every row in the test set.
# Finally the result is mapped back from log space with exp().

# Receives a DataFrame with the contents of the test set and a numpy array with the weights/coefficients.

def generate_predictions(test_set: pd.DataFrame, parameter_matrix: np.ndarray, method: str = 'w_sum') -> np.ndarray:
    """Compute linear-model predictions for every row of ``test_set``.

    A column of ones is prepended to the features so that the first entry of
    ``parameter_matrix`` acts as the bias term. The linear output is passed
    through ``np.exp`` before being returned (the target was log-transformed
    before fitting, so this brings predictions back to the original scale).

    Parameters
    ----------
    test_set : pd.DataFrame
        Numeric feature matrix of shape (m, k), one row per sample.
    parameter_matrix : np.ndarray
        Bias followed by k coefficients, with shape (k + 1, 1) or (k + 1,).
    method : str, default 'w_sum'
        Prediction strategy. Only 'w_sum' (weighted sum) is implemented.

    Returns
    -------
    np.ndarray
        1-D array of length m holding ``exp(X @ parameter_matrix)``.

    Raises
    ------
    ValueError
        If ``method`` is not supported, or if ``parameter_matrix`` is not a
        vector / single-column matrix whose length equals k + 1.
    """
    # Fail loudly instead of silently returning None for an unknown method
    if method != 'w_sum':
        raise ValueError(f"Unsupported method {method!r}; only 'w_sum' is implemented")

    # --- Prepare the test set so it matches the parameter matrix (prepend the bias column)
    features = test_set.to_numpy(dtype=float)
    design = np.column_stack((np.ones(features.shape[0]), features))

    # --- Normalise the weights to a flat vector and validate their length
    weights = np.asarray(parameter_matrix, dtype=float)
    if weights.ndim == 2 and weights.shape[1] == 1:
        weights = weights.ravel()
    if weights.ndim != 1 or weights.shape[0] != design.shape[1]:
        raise ValueError(
            f'parameter_matrix has shape {np.shape(parameter_matrix)}, expected '
            f'({design.shape[1]}, 1) or ({design.shape[1]},) to match the test set plus bias'
        )

    # One matrix-vector product replaces the explicit per-row / per-coefficient loops.
    # exp() undoes the log transform applied to the target; the result is NOT MinMax-scaled.
    return np.exp(design @ weights)

In [27]:
# Run the model over the whole test set; the returned array holds one prediction per test row
preds = generate_predictions(X_test, parameter_matrix)
preds

array([ 16674.70268624,  86510.67107569, 171145.16451702, ...,
        27357.68387693,  15467.80567233, 418584.64282849])

In [15]:
# Sanity check: the number of predictions should equal the number of rows in the test set
preds.size

1459

In [31]:
# Build the submission table: one consecutive Id per prediction
# NOTE(review): 1461 is presumably the first Id of the test set — confirm against the raw test file
FIRST_TEST_ID = 1461
ids = np.arange(FIRST_TEST_ID, FIRST_TEST_ID + preds.size, dtype=int)

# Build the frame from a dict so each column keeps its own dtype
# (stacking ids and preds into one array would cast the Ids to float first)
predictions = pd.DataFrame({'Id': ids, 'SalePrice': preds})
predictions.head()

Unnamed: 0,Id,SalePrice
0,1461,16674.702686
1,1462,86510.671076
2,1463,171145.164517
3,1464,289521.199791
4,1465,321570.673012


In [32]:
# Write the predictions to CSV without the DataFrame index
# NOTE(review): to_csv does not create missing directories — the 'predictions/' folder must already exist
predictions.to_csv('predictions/pred01.csv', index=False)