# Header...

In [1]:
import itertools as itt
import pandas as pd
import numpy as np
from tqdm import tqdm
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error,r2_score
from sklearn.neighbors import KNeighborsRegressor


## Load data

In [2]:
# Load dataset (a), used to predict Yield. The train/test split is already
# materialised on disk as four CSV files; paths are relative to the
# notebook's working directory.
a_X_train = pd.read_csv('a_X_train.csv')
a_y_train = pd.read_csv('a_y_train.csv')
a_X_test = pd.read_csv('a_X_test.csv')
a_y_test = pd.read_csv('a_y_test.csv')

## LinearRegression

In [31]:
# Every column of the training frame is a candidate predictor; the run()
# helpers below read this module-level list to build the feature subsets.
features = list(a_X_train.columns)

def get_combinations(l):
    """Return every non-empty combination of the items in ``l``.

    The result is a list of tuples ordered by size (1 up to ``len(l)``) and,
    within each size, in the order ``itertools.combinations`` yields them.
    An empty ``l`` gives an empty list.
    """
    sizes = range(1, len(l) + 1)
    return [combo for size in sizes for combo in itt.combinations(l, size)]
def get_best_runs(evaluations):
    """Select the runs with the highest R^2 and the runs with the lowest RMSE.

    Parameters
    ----------
    evaluations : dict
        Mapping of run identifier -> run record. Each record must contain an
        ``'evaluation'`` dict with numeric ``'r2'`` and ``'rmse'`` entries.
        Records are visited in the mapping's iteration order.

    Returns
    -------
    tuple[list, list]
        ``(best_runs_by_r2, best_runs_by_rmse)``. Each list holds every run
        record that ties (exact equality) for the best value of its metric,
        in the order encountered. Both lists are empty when ``evaluations``
        is empty.
    """
    best_runs_by_r2 = []
    best_runs_by_rmse = []
    for current_run in evaluations.values():
        if not best_runs_by_r2:
            # Seed both lists from the first record, whatever its key is.
            # Previously this relied on a key literally equal to 0 and raised
            # IndexError for any other first key.
            best_runs_by_r2.append(current_run)
            best_runs_by_rmse.append(current_run)
            continue

        scores = current_run['evaluation']

        # Higher R^2 is better: a strict improvement resets the tie list.
        best_r2 = best_runs_by_r2[-1]['evaluation']['r2']
        if scores['r2'] > best_r2:
            best_runs_by_r2 = [current_run]
        elif scores['r2'] == best_r2:
            best_runs_by_r2.append(current_run)

        # Lower RMSE is better.
        best_rmse = best_runs_by_rmse[-1]['evaluation']['rmse']
        if scores['rmse'] < best_rmse:
            best_runs_by_rmse = [current_run]
        elif scores['rmse'] == best_rmse:
            best_runs_by_rmse.append(current_run)
    return best_runs_by_r2, best_runs_by_rmse

In [39]:
def runLinearRegression(feature, a_X_train, a_X_test, a_y_train, a_y_test):
    """Fit a linear regression on the selected columns and score it on the test split.

    Parameters
    ----------
    feature : list
        Column labels used to select predictors from ``a_X_train`` and
        ``a_X_test``.
    a_X_train, a_X_test
        Feature tables indexable by ``feature``.
    a_y_train
        Training targets passed to ``LinearRegression.fit``.
    a_y_test
        Test targets; must expose a ``'Yield'`` column, which is what the
        MSE is computed against.

    Returns
    -------
    dict
        ``{'mse', 'rmse', 'r2', 'y_pred'}`` where ``y_pred`` is the squeezed
        prediction array converted to plain Python values.
    """
    X_train = a_X_train[feature]
    model = LinearRegression()
    model.fit(X_train, a_y_train)
    X_test = a_X_test[feature]
    y_pred = model.predict(X_test)
    y_pred_list = y_pred.squeeze().tolist()

    mse = mean_squared_error(list(a_y_test['Yield']), y_pred_list)
    # `mean_squared_error(..., squared=False)` is deprecated since
    # scikit-learn 1.4 and removed in later releases. Reproduce it
    # explicitly: square root of each target's MSE, then a uniform average.
    per_target_mse = mean_squared_error(a_y_test, y_pred, multioutput='raw_values')
    rmse = float(np.mean(np.sqrt(per_target_mse)))
    r2 = r2_score(a_y_test, y_pred)

    return {'mse':mse,'rmse':rmse,'r2':r2, 'y_pred':y_pred_list}

def run():
    """Score a linear regression on every non-empty subset of ``features``.

    Reads the module-level ``features`` list and the train/test frames.
    Returns a dict keyed by consecutive run numbers starting at 0; each value
    holds the subset under ``'features'`` and the metrics dict returned by
    ``runLinearRegression`` under ``'evaluation'``.
    """
    evaluations = {}
    subsets = get_combinations(features)
    for run_number, subset in enumerate(tqdm(subsets, desc='Running:')):
        scores = runLinearRegression(list(subset), a_X_train, a_X_test, a_y_train, a_y_test)
        evaluations[run_number] = {'features': list(subset), 'evaluation': scores}
    return evaluations

# Run the exhaustive linear-regression search over feature subsets, then keep
# only the runs that tie for the highest R^2 and for the lowest RMSE.
evaluations = run()
best_runs_by_r2,best_runs_by_rmse = get_best_runs(evaluations)
# Uncomment to display the winning runs:
# best_runs_by_r2,best_runs_by_rmse

Running:: 100%|██████████| 2047/2047 [00:07<00:00, 275.24it/s]


## KNN

In [47]:
def runKNeighborsRegressor(feature, a_X_train, a_X_test, a_y_train, a_y_test, k=5, weights='uniform',metric='euclidean'):
    """Fit a k-nearest-neighbours regressor on the selected columns and score it.

    Parameters
    ----------
    feature : list
        Column labels used to select predictors from ``a_X_train`` and
        ``a_X_test``.
    a_X_train, a_X_test
        Feature tables indexable by ``feature``.
    a_y_train, a_y_test
        Training and test targets.
    k : int
        Forwarded to ``KNeighborsRegressor`` as ``n_neighbors``.
    weights : str
        Forwarded to ``KNeighborsRegressor`` as ``weights``.
    metric : str
        Forwarded to ``KNeighborsRegressor`` as ``metric``.

    Returns
    -------
    dict
        ``{'mse', 'rmse', 'r2', 'y_pred'}`` where ``y_pred`` is the squeezed
        prediction array converted to plain Python values.
    """
    X_train = a_X_train[feature]
    model = KNeighborsRegressor(n_neighbors=k, weights=weights, metric=metric)
    model.fit(X_train, a_y_train)
    X_test = a_X_test[feature]

    y_pred = model.predict(X_test)
    # `mean_squared_error(..., squared=False)` is deprecated since
    # scikit-learn 1.4 and removed in later releases. Reproduce it
    # explicitly: square root of each target's MSE, then a uniform average.
    per_target_mse = mean_squared_error(a_y_test, y_pred, multioutput='raw_values')
    rmse = float(np.mean(np.sqrt(per_target_mse)))
    mse = mean_squared_error(a_y_test, y_pred)
    r2 = r2_score(a_y_test, y_pred)

    return {'mse':mse,'rmse':rmse,'r2':r2, 'y_pred':y_pred.squeeze().tolist()}

def run():
    """Grid-search a KNN regressor over metric, weighting, k and feature subset.

    Reads the module-level ``features`` list and the train/test frames.
    Iteration order is metric -> weights -> k -> feature subset, with the
    progress bar advancing once per metric.

    Note: this definition reuses the name ``run`` and therefore replaces the
    linear-regression ``run`` defined earlier in the notebook once this cell
    has executed.

    Returns
    -------
    dict
        Keyed by consecutive run numbers starting at 0. Each value records
        the ``'k'``, ``'metric'``, ``'weights'`` and ``'features'`` used plus
        the metrics dict from ``runKNeighborsRegressor`` under
        ``'evaluation'``.
    """
    feature_combinations = get_combinations(features)
    list_of_ks=range(1,11)
    list_of_weights = ['uniform','distance']
    list_of_metrics = ['euclidean','manhattan','chebyshev','cosine']
    run_number = 0
    evaluations = {}
    for metric in tqdm(list_of_metrics,desc='Running:'):
        for weights in list_of_weights:
            for k in list_of_ks:
                for feature in feature_combinations:
                    # `metric` must be forwarded explicitly. It was previously
                    # omitted, so every run used the default 'euclidean' while
                    # being recorded under a different metric name.
                    evaluation = runKNeighborsRegressor(
                        list(feature), a_X_train, a_X_test, a_y_train, a_y_test,
                        k=k, weights=weights, metric=metric,
                    )
                    evaluations[run_number] = {'k':k,'metric':metric,'weights':weights,'features':list(feature),'evaluation':evaluation}
                    run_number+=1
    return evaluations

# Run the KNN grid search (this calls the `run` defined in this cell, not the
# linear-regression one), then keep only the runs that tie for the highest
# R^2 and for the lowest RMSE.
evaluations_knn = run()
best_runs_by_r2_knn,best_runs_by_rmse_knn = get_best_runs(evaluations_knn)
# Uncomment to display the winning runs:
# best_runs_by_r2_knn,best_runs_by_rmse_knn

Running for k=1, weights = uniform, metric=euclidean


Running:: 100%|██████████| 2047/2047 [00:08<00:00, 243.10it/s]


Running for k=2, weights = uniform, metric=euclidean


Running:: 100%|██████████| 2047/2047 [00:08<00:00, 249.18it/s]


Running for k=3, weights = uniform, metric=euclidean


Running:: 100%|██████████| 2047/2047 [00:08<00:00, 248.92it/s]


Running for k=4, weights = uniform, metric=euclidean


Running:: 100%|██████████| 2047/2047 [00:08<00:00, 247.38it/s]


Running for k=5, weights = uniform, metric=euclidean


Running:: 100%|██████████| 2047/2047 [00:08<00:00, 249.31it/s]


Running for k=6, weights = uniform, metric=euclidean


Running:: 100%|██████████| 2047/2047 [00:08<00:00, 246.00it/s]


Running for k=7, weights = uniform, metric=euclidean


Running:: 100%|██████████| 2047/2047 [00:08<00:00, 246.74it/s]


Running for k=8, weights = uniform, metric=euclidean


Running:: 100%|██████████| 2047/2047 [00:08<00:00, 242.08it/s]


Running for k=9, weights = uniform, metric=euclidean


Running:: 100%|██████████| 2047/2047 [00:08<00:00, 248.47it/s]


Running for k=10, weights = uniform, metric=euclidean


Running:: 100%|██████████| 2047/2047 [00:08<00:00, 250.63it/s]


Running for k=1, weights = distance, metric=euclidean


Running:: 100%|██████████| 2047/2047 [00:08<00:00, 251.23it/s]


Running for k=2, weights = distance, metric=euclidean


Running:: 100%|██████████| 2047/2047 [00:08<00:00, 250.65it/s]


Running for k=3, weights = distance, metric=euclidean


Running:: 100%|██████████| 2047/2047 [00:08<00:00, 250.64it/s]


Running for k=4, weights = distance, metric=euclidean


Running:: 100%|██████████| 2047/2047 [00:08<00:00, 251.92it/s]


Running for k=5, weights = distance, metric=euclidean


Running:: 100%|██████████| 2047/2047 [00:08<00:00, 249.33it/s]


Running for k=6, weights = distance, metric=euclidean


Running:: 100%|██████████| 2047/2047 [00:08<00:00, 250.60it/s]


Running for k=7, weights = distance, metric=euclidean


Running:: 100%|██████████| 2047/2047 [00:08<00:00, 250.74it/s]


Running for k=8, weights = distance, metric=euclidean


Running:: 100%|██████████| 2047/2047 [00:08<00:00, 249.00it/s]


Running for k=9, weights = distance, metric=euclidean


Running:: 100%|██████████| 2047/2047 [00:08<00:00, 251.50it/s]


Running for k=10, weights = distance, metric=euclidean


Running:: 100%|██████████| 2047/2047 [00:08<00:00, 250.58it/s]


Running for k=1, weights = uniform, metric=manhattan


Running:: 100%|██████████| 2047/2047 [00:08<00:00, 249.64it/s]


Running for k=2, weights = uniform, metric=manhattan


Running:: 100%|██████████| 2047/2047 [00:08<00:00, 238.18it/s]


Running for k=3, weights = uniform, metric=manhattan


Running:: 100%|██████████| 2047/2047 [00:08<00:00, 242.58it/s]


Running for k=4, weights = uniform, metric=manhattan


Running:: 100%|██████████| 2047/2047 [00:08<00:00, 241.38it/s]


Running for k=5, weights = uniform, metric=manhattan


Running:: 100%|██████████| 2047/2047 [00:08<00:00, 246.45it/s]


Running for k=6, weights = uniform, metric=manhattan


Running:: 100%|██████████| 2047/2047 [00:08<00:00, 248.07it/s]


Running for k=7, weights = uniform, metric=manhattan


Running:: 100%|██████████| 2047/2047 [00:08<00:00, 243.38it/s]


Running for k=8, weights = uniform, metric=manhattan


Running:: 100%|██████████| 2047/2047 [00:08<00:00, 246.27it/s]


Running for k=9, weights = uniform, metric=manhattan


Running:: 100%|██████████| 2047/2047 [00:08<00:00, 244.93it/s]


Running for k=10, weights = uniform, metric=manhattan


Running:: 100%|██████████| 2047/2047 [00:08<00:00, 245.49it/s]


Running for k=1, weights = distance, metric=manhattan


Running:: 100%|██████████| 2047/2047 [00:08<00:00, 248.91it/s]


Running for k=2, weights = distance, metric=manhattan


Running:: 100%|██████████| 2047/2047 [00:08<00:00, 247.22it/s]


Running for k=3, weights = distance, metric=manhattan


Running:: 100%|██████████| 2047/2047 [00:08<00:00, 247.10it/s]


Running for k=4, weights = distance, metric=manhattan


Running:: 100%|██████████| 2047/2047 [00:08<00:00, 247.00it/s]


Running for k=5, weights = distance, metric=manhattan


Running:: 100%|██████████| 2047/2047 [00:08<00:00, 246.53it/s]


Running for k=6, weights = distance, metric=manhattan


Running:: 100%|██████████| 2047/2047 [00:08<00:00, 245.83it/s]


Running for k=7, weights = distance, metric=manhattan


Running:: 100%|██████████| 2047/2047 [00:08<00:00, 244.88it/s]


Running for k=8, weights = distance, metric=manhattan


Running:: 100%|██████████| 2047/2047 [00:08<00:00, 248.63it/s]


Running for k=9, weights = distance, metric=manhattan


Running:: 100%|██████████| 2047/2047 [00:08<00:00, 243.77it/s]


Running for k=10, weights = distance, metric=manhattan


Running:: 100%|██████████| 2047/2047 [00:08<00:00, 249.47it/s]


Running for k=1, weights = uniform, metric=chebyshev


Running:: 100%|██████████| 2047/2047 [00:08<00:00, 251.16it/s]


Running for k=2, weights = uniform, metric=chebyshev


Running:: 100%|██████████| 2047/2047 [00:08<00:00, 250.05it/s]


Running for k=3, weights = uniform, metric=chebyshev


Running:: 100%|██████████| 2047/2047 [00:08<00:00, 249.54it/s]


Running for k=4, weights = uniform, metric=chebyshev


Running:: 100%|██████████| 2047/2047 [00:08<00:00, 251.20it/s]


Running for k=5, weights = uniform, metric=chebyshev


Running:: 100%|██████████| 2047/2047 [00:07<00:00, 256.51it/s]


Running for k=6, weights = uniform, metric=chebyshev


Running:: 100%|██████████| 2047/2047 [00:08<00:00, 251.62it/s]


Running for k=7, weights = uniform, metric=chebyshev


Running:: 100%|██████████| 2047/2047 [00:08<00:00, 243.34it/s]


Running for k=8, weights = uniform, metric=chebyshev


Running:: 100%|██████████| 2047/2047 [00:08<00:00, 247.46it/s]


Running for k=9, weights = uniform, metric=chebyshev


Running:: 100%|██████████| 2047/2047 [00:08<00:00, 247.19it/s]


Running for k=10, weights = uniform, metric=chebyshev


Running:: 100%|██████████| 2047/2047 [00:08<00:00, 238.84it/s]


Running for k=1, weights = distance, metric=chebyshev


Running:: 100%|██████████| 2047/2047 [00:08<00:00, 239.51it/s]


Running for k=2, weights = distance, metric=chebyshev


Running:: 100%|██████████| 2047/2047 [00:08<00:00, 233.77it/s]


Running for k=3, weights = distance, metric=chebyshev


Running:: 100%|██████████| 2047/2047 [00:08<00:00, 230.80it/s]


Running for k=4, weights = distance, metric=chebyshev


Running:: 100%|██████████| 2047/2047 [00:08<00:00, 229.28it/s]


Running for k=5, weights = distance, metric=chebyshev


Running:: 100%|██████████| 2047/2047 [00:08<00:00, 242.41it/s]


Running for k=6, weights = distance, metric=chebyshev


Running:: 100%|██████████| 2047/2047 [00:08<00:00, 238.07it/s]


Running for k=7, weights = distance, metric=chebyshev


Running:: 100%|██████████| 2047/2047 [00:09<00:00, 226.74it/s]


Running for k=8, weights = distance, metric=chebyshev


Running:: 100%|██████████| 2047/2047 [00:08<00:00, 234.94it/s]


Running for k=9, weights = distance, metric=chebyshev


Running:: 100%|██████████| 2047/2047 [00:08<00:00, 239.37it/s]


Running for k=10, weights = distance, metric=chebyshev


Running:: 100%|██████████| 2047/2047 [00:08<00:00, 233.17it/s]


Running for k=1, weights = uniform, metric=cosine


Running:: 100%|██████████| 2047/2047 [00:08<00:00, 243.42it/s]


Running for k=2, weights = uniform, metric=cosine


Running:: 100%|██████████| 2047/2047 [00:08<00:00, 242.45it/s]


Running for k=3, weights = uniform, metric=cosine


Running:: 100%|██████████| 2047/2047 [00:08<00:00, 243.00it/s]


Running for k=4, weights = uniform, metric=cosine


Running:: 100%|██████████| 2047/2047 [00:08<00:00, 243.89it/s]


Running for k=5, weights = uniform, metric=cosine


Running:: 100%|██████████| 2047/2047 [00:08<00:00, 242.51it/s]


Running for k=6, weights = uniform, metric=cosine


Running:: 100%|██████████| 2047/2047 [00:08<00:00, 244.59it/s]


Running for k=7, weights = uniform, metric=cosine


Running:: 100%|██████████| 2047/2047 [00:08<00:00, 245.05it/s]


Running for k=8, weights = uniform, metric=cosine


Running:: 100%|██████████| 2047/2047 [00:08<00:00, 246.02it/s]


Running for k=9, weights = uniform, metric=cosine


Running:: 100%|██████████| 2047/2047 [00:08<00:00, 246.65it/s]


Running for k=10, weights = uniform, metric=cosine


Running:: 100%|██████████| 2047/2047 [00:08<00:00, 242.35it/s]


Running for k=1, weights = distance, metric=cosine


Running:: 100%|██████████| 2047/2047 [00:08<00:00, 246.30it/s]


Running for k=2, weights = distance, metric=cosine


Running:: 100%|██████████| 2047/2047 [00:08<00:00, 243.86it/s]


Running for k=3, weights = distance, metric=cosine


Running:: 100%|██████████| 2047/2047 [00:08<00:00, 241.21it/s]


Running for k=4, weights = distance, metric=cosine


Running:: 100%|██████████| 2047/2047 [00:08<00:00, 243.87it/s]


Running for k=5, weights = distance, metric=cosine


Running:: 100%|██████████| 2047/2047 [00:08<00:00, 245.30it/s]


Running for k=6, weights = distance, metric=cosine


Running:: 100%|██████████| 2047/2047 [00:08<00:00, 242.97it/s]


Running for k=7, weights = distance, metric=cosine


Running:: 100%|██████████| 2047/2047 [00:08<00:00, 242.79it/s]


Running for k=8, weights = distance, metric=cosine


Running:: 100%|██████████| 2047/2047 [00:08<00:00, 241.92it/s]


Running for k=9, weights = distance, metric=cosine


Running:: 100%|██████████| 2047/2047 [00:08<00:00, 244.25it/s]


Running for k=10, weights = distance, metric=cosine


Running:: 100%|██████████| 2047/2047 [00:08<00:00, 239.20it/s]


In [46]:
# Display every KNN run that ties for the highest R^2.
best_runs_by_r2_knn

[{'k': 2,
  'metric': 'euclidean',
  'weights': 'distance',
  'features': ['Location', 'Avg Min Temp *C', 'Cold Wave', 'Crop Type'],
  'evaluation': {'mse': 9.3227810358089,
   'rmse': 3.0533229498054903,
   'r2': 0.999893349725989,
   'y_pred': [974.6466666666666,
    973.4700000000001,
    1652.3933333333332,
    1474.935,
    1460.7150000000001,
    787.73,
    1040.311666666667,
    1697.1333333333334,
    1338.7779999999998,
    1182.8714285714286,
    1466.695,
    781.64,
    1196.1433333333332,
    1388.8314285714284,
    1066.8433333333332,
    1188.94,
    1474.9350000000002,
    782.508,
    1488.98,
    1657.31,
    1474.935,
    1617.6449999999998,
    1761.2333333333333,
    1403.78]}},
 {'k': 2,
  'metric': 'euclidean',
  'weights': 'distance',
  'features': ['Avg Min Temp *C', 'Cold Wave', 'Irrigation', 'Crop Type'],
  'evaluation': {'mse': 9.3227810358089,
   'rmse': 3.0533229498054903,
   'r2': 0.999893349725989,
   'y_pred': [974.6466666666666,
    973.4700000000001,