In [3]:
import pandas as pd
import numpy as np

In [2]:
df=pd.read_csv('train.csv')

In [3]:
X=df.drop(['yield', 'id', 'Row#'], axis=1)

In [4]:
y=df['yield']

In [5]:
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_absolute_error, r2_score

In [6]:
from sklearn.preprocessing import PolynomialFeatures

In [7]:
class HigherOrderRegressor():
    def __init__(self, degree):
        self.degree = degree
        self.X = None
        self.Y = None
        self.B = None
        self.num_features = None

    def fit(self, X, Y):
        if len(X.shape) == 1:
            X = X.reshape(-1, 1)
        if len(Y.shape) == 1:
            Y = Y.reshape(-1, 1)
        self.num_features=X.shape[1]
        indices=np.random.shuffle(np.arange(X.shape[0]))
        temp_X = X[indices][0]
        temp_Y = Y[indices][0]
        # np.hstack((np.ones((self.X.shape[0], 1)), self.X))
        self.orig_X = np.hstack((np.ones((X.shape[0], 1)), temp_X))
        self.Y = temp_Y
        self.X = self.recursive_generate_powers(0, self.degree, np.ones(X.shape[0])).T
        try:
            self.B = np.matmul(np.linalg.pinv(np.matmul(self.X.T, self.X)),
                                np.matmul(self.X.T, self.Y))
        except:
            self.B = np.matmul(np.linalg.pinv(np.matmul(self.X.T, self.X)),
                                np.matmul(self.X.T, self.Y))

    def recursive_generate_powers(self, max_feature_encountered, current_degree, current_product, use_orig=True):
        if current_degree == 0:
            return current_product
        result=[]
        for i in range(max_feature_encountered, self.num_features+1):
            if use_orig:
                result.append(self.recursive_generate_powers(i, current_degree-1,
                                                            current_product*self.orig_X[:,i]))
            else:
                result.append(self.recursive_generate_powers(i, current_degree-1,
                                                            current_product*self.test_X[:,i], False))
        result=np.vstack(result)
        return result
    
    def predict(self, X):
        if X.ndim == 1:
            X = X.reshape(-1,1)
        self.test_X=np.hstack((np.ones((X.shape[0], 1)), X))
        self.final_testX=self.recursive_generate_powers(0, self.degree, np.ones(X.shape[0]), False).T
        return np.matmul(self.final_testX, self.B)

    def sum_of_squares(self, Y_true, Y_pred):
        return np.sum((Y_true.reshape(-1) - Y_pred.reshape(-1))**2)
    
    def mean_absolute_error(self, Y_true, Y_pred):
        return np.mean(np.abs(Y_true.reshape(-1) - Y_pred.reshape(-1))) 
        
    def r2_score(self, Y_true, Y_pred):
        return 1 - np.sum((Y_true.reshape(-1) - Y_pred.reshape(-1))**2) / np.sum((Y_true.reshape(-1) - np.mean(Y_true.reshape(-1)))**2)

In [8]:
reg=HigherOrderRegressor(1)

In [20]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
X_train=X_train.values
X_test=X_test.values
y_train=y_train.values
y_test=y_test.values
mask = np.array(list(np.binary_repr(119, width=16)), dtype=int).astype(bool)
X_train=X_train[:,mask]
# new_mask=[0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 29, 30, 31, 32, 33, 34, 35, 36, 37, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 50, 51, 52, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63, 64, 65, 66, 67, 68, 69, 70, 72, 73, 74, 75, 76, 77, 78, 79, 80, 81, 82, 83, 84, 85, 86, 87, 88, 89, 90, 91, 92, 93, 95, 96, 97, 98, 99, 100, 101, 102, 103, 105, 106, 107, 108, 109, 110, 112, 113, 114, 115, 116, 117, 118, 119, 120, 121, 122, 123, 124, 125, 126, 127, 128, 129, 130, 131, 132, 133, 134, 135, 136, 137, 138, 139, 140, 141, 142, 143, 144, 145, 146, 147, 148, 149, 150, 151, 152, 153, 154, 155, 156, 157, 158, 159, 161, 162, 163, 164, 165, 166, 167, 168, 169, 170, 171, 172, 173, 174, 175, 176, 177, 178, 179, 180, 181, 182, 183, 184, 185, 186, 187, 188, 189, 190, 191, 192, 193, 194, 195, 196, 197, 198, 199, 200, 201, 202, 203, 204, 205, 206, 207, 208, 209]
X_train=PolynomialFeatures(degree=4).fit_transform(X_train)
new_mask=[i for i in range(len(X_train[0]))]
X_test=X_test[:,mask]
X_test=PolynomialFeatures(degree=4).fit_transform(X_test)
reg.fit(X_train[:,new_mask], y_train)
print(reg.mean_absolute_error(y_test, reg.predict(X_test[:,new_mask])))


261.28355604628484


In [19]:
test_data=pd.read_csv('test.csv')
X_test_final=test_data.drop(['id', 'Row#'], axis=1).values[:,mask]
X_train_final_2=PolynomialFeatures(degree=4).fit_transform(X.values[:,mask])
X_test_final_2=PolynomialFeatures(degree=4).fit_transform(X_test_final)
# new_mask=[0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 29, 30, 31, 32, 33, 34, 35, 36, 37, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 50, 51, 52, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63, 64, 65, 66, 67, 68, 69, 70, 72, 73, 74, 75, 76, 77, 78, 79, 80, 81, 82, 83, 84, 85, 86, 87, 88, 89, 90, 91, 92, 93, 95, 96, 97, 98, 99, 100, 101, 102, 103, 105, 106, 107, 108, 109, 110, 112, 113, 114, 115, 116, 117, 118, 119, 120, 121, 122, 123, 124, 125, 126, 127, 128, 129, 130, 131, 132, 133, 134, 135, 136, 137, 138, 139, 140, 141, 142, 143, 144, 145, 146, 147, 148, 149, 150, 151, 152, 153, 154, 155, 156, 157, 158, 159, 161, 162, 163, 164, 165, 166, 167, 168, 169, 170, 171, 172, 173, 174, 175, 176, 177, 178, 179, 180, 181, 182, 183, 184, 185, 186, 187, 188, 189, 190, 191, 192, 193, 194, 195, 196, 197, 198, 199, 200, 201, 202, 203, 204, 205, 206, 207, 208, 209]
new_mask=[i for i in range(len(X_train[0]))]
reg.fit(X_train_final_2[:,new_mask], y.values)
# print(X_train_final_2[:,new_mask].shape, X_test_final_2[:,new_mask].shape)    
X_test_final_2=X_test_final_2[:,new_mask]
y_pred=reg.predict(X_test_final_2)
print(mean_absolute_error(pd.read_csv('output.csv')['yield'], y_pred))
test_data['yield']=y_pred
test_data[['id', 'yield']].to_csv('hello.csv', index=False)

4.022240318590775e-14


In [1]:
class NadarayaKernel:

    def __init__(self, bandwidth = 1):
        self.data = None
        self.yi = None
        self.bandwidth = bandwidth

    def fit(self, data):
        self.data = np.array(data.drop(['yield'], axis=1))
        self.yi = np.array(data['yield'])

    def kernel(self, x, xi):
        d = x.shape[0]
        # print(x.shape, xi.shape)
        # print((x - xi).shape, self.bandwidth.shape)
        # print(np.matmul((x - xi), self.bandwidth))
        diff = x-xi
        norm = np.linalg.norm(diff)
        argument = norm / self.bandwidth
        return (1 / np.sqrt(2 * np.pi)) * np.exp(-0.5 * argument**2)

    def evaluate(self, x):
        sum1 = 0
        sum2 = 0
        for i, xi in enumerate(self.data):
            kernel = self.kernel(x, xi)
            sum1 += kernel * self.yi[i]
            sum2 += kernel
        return 0 if sum2 == 0 else sum1 / sum2

    def predict(self, test):
        test = np.array(test)
        test_expanded = test[:, np.newaxis, :]
        data_expanded = self.data[np.newaxis, :, :]

        # print(test_expanded.shape, data_expanded.shape)

        diff = (test_expanded - data_expanded)/self.bandwidth
        nor = np.sum(diff**2, axis=2)
        kernel_values = (1 / np.sqrt(2 * np.pi))**16 * np.exp(-0.5 * (nor / self.bandwidth)**2)
        # kernel_values = np.where(nor >= 1, 0, 3 * (1 - nor) / (4 * self.bandwidth))
        # nor = np.where(nor >= 1, 1, nor)
        # kernel_values = 3*40320*(1-nor)/(np.pi**8)
        kernel_values_sum = np.sum(kernel_values, axis=1)

        kernel_values_sum = np.where(kernel_values_sum == 0, 1, kernel_values_sum)

        # print(kernel_values.shape, kernel_values_sum.shape)
        # print(kernel_values_sum)
        return np.sum(kernel_values * self.yi, axis=1) / kernel_values_sum

In [4]:
data = pd.read_csv('train.csv')
data.drop(['id', 'Row#'], axis=1, inplace=True)
data

Unnamed: 0,clonesize,honeybee,bumbles,andrena,osmia,MaxOfUpperTRange,MinOfUpperTRange,AverageOfUpperTRange,MaxOfLowerTRange,MinOfLowerTRange,AverageOfLowerTRange,RainingDays,AverageRainingDays,fruitset,fruitmass,seeds,yield
0,25.0,0.50,0.25,0.75,0.63,94.6,57.2,79.0,68.2,33.0,55.9,34.0,0.56,0.402948,0.409261,31.274591,4418.44126
1,12.5,0.25,0.25,0.25,0.25,77.4,46.8,64.7,55.8,27.0,45.8,34.0,0.56,0.500438,0.445494,34.467567,5862.80545
2,25.0,0.50,0.25,0.38,0.75,94.6,57.2,79.0,68.2,33.0,55.9,24.0,0.39,0.509001,0.459421,36.624966,6079.08526
3,12.5,0.25,0.25,0.75,0.75,94.6,57.2,79.0,68.2,33.0,55.9,16.0,0.26,0.583379,0.498056,40.865478,7400.77538
4,12.5,0.25,0.25,0.50,0.63,77.4,46.8,64.7,55.8,27.0,45.8,24.0,0.39,0.447669,0.423764,33.298861,4858.24073
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
14995,12.5,0.25,0.25,0.50,0.63,69.7,42.1,58.2,50.2,24.3,41.2,34.0,0.56,0.335927,0.352186,28.793440,3182.69865
14996,12.5,0.25,0.25,0.38,0.63,77.4,46.8,64.7,55.8,27.0,45.8,34.0,0.56,0.545095,0.470719,38.042756,5862.80545
14997,12.5,0.25,0.25,0.75,0.50,77.4,46.8,64.7,55.8,27.0,45.8,16.0,0.26,0.481801,0.442535,35.414082,5569.13925
14998,25.0,0.50,0.38,0.38,0.75,86.0,52.0,71.9,62.0,30.0,50.8,1.0,0.10,0.552359,0.470847,38.625436,6795.88187


In [6]:
train_x = data.drop(['yield'], axis=1)
train_y = data['yield']
train = data[:int(0.8*len(data))]
test = data[int(0.8*len(data)):]
test_answers = test['yield']
test = test.drop(['yield'], axis=1)

In [7]:
kernel_now = NadarayaKernel(bandwidth=1)
kernel_now.fit(train)
predict_train = kernel_now.predict(test)
np.sum(np.abs(predict_train - test_answers)) / len(test_answers)

338.7243371050752

In [None]:
# cross = []
# x = []
# for i in np.linspace(1, 3, 10):
#     kernel_now = NadarayaKernel(bandwidth=i)
#     kernel_now.fit(train)
#     predict_train = kernel_now.predict(test)
#     alpha = np.sum(np.abs(predict_train - test_answers)) / len(test_answers)
#     print('------------', alpha)
#     cross.append(alpha)
#     x.append(i)

In [None]:
# import matplotlib.pyplot as plt
# plt.plot(x, cross)
# plt.show()