In [2]:
import pandas as pd
import numpy as np
from scipy import stats
from tqdm import tqdm

from sklearn.model_selection import train_test_split
from sklearn.neighbors import KDTree
from sklearn.preprocessing import StandardScaler
from sklearn import linear_model
from sklearn.metrics import mean_squared_error, r2_score

import statsmodels.api as sm
import statsmodels.formula.api as smf 

rng = np.random.RandomState(0)

In [3]:
# Initialisierung von DataFrames und Data Cleaning
df = pd.read_csv('Laptop-Preise.csv', sep=';', decimal=',')
df = df[df.extern_Schnittstellen != 2300] # Ausreißer löschen
df = df.reset_index(drop=True) # Reset Index
df.drop(['Betriebssystem_OHNE', 'Betriebssystem_Mac', 'Marke_Dell'], axis=1) # Mac und Marke_Apple sind identische Merkmale

df_noPrice = df.drop('Preis', axis=1)
imputed_simul_knn = pd.DataFrame(columns=['MSE KNN_1', 'MSE KNN_3', 'MSE KNN_5', 'SE KNN_1', 'SE KNN_3', 'SE KNN_5'])
imputed_stats_knn = pd.DataFrame(columns=['MSE KNN_1', 'MSE KNN_3', 'MSE KNN_5', 'SE KNN_1', 'SE KNN_3', 'SE KNN_5'], index = np.arange(0.1, 1, 0.1))

imputed_simul_ols = pd.DataFrame(columns=['MSE OLS', 'SE OLS'])
imputed_stats_ols = pd.DataFrame(columns=['MSE OLS', 'SE OLS'], index = np.arange(0.1, 1, 0.1))


# Skalierung (Standardisierung) von df_noPrice
col_names = df_noPrice.columns
scaler = StandardScaler().fit(df_noPrice.values)
df_noPrice = scaler.transform(df_noPrice.values)
df_noPrice = pd.DataFrame(df_noPrice, columns=col_names)


# Skalierung (Standardisierung) von df mit Preis (Preis ist unverändert)
df_std = df_noPrice.copy()
df_std.insert(0, 'Preis', df['Preis'])

In [4]:
# OLS Model 
# fitting the model 
model = smf.ols(formula='Preis ~ '+' + '.join(df_noPrice.columns), data = df_std).fit() 
print(model.summary()) 

# OLS Model 
# fitting the model 
model = sm.OLS(df['Preis'], sm.add_constant(df_noPrice)).fit()
print(model.summary())

                            OLS Regression Results                            
Dep. Variable:                  Preis   R-squared:                       0.871
Model:                            OLS   Adj. R-squared:                  0.867
Method:                 Least Squares   F-statistic:                     234.5
Date:                Sat, 13 Jan 2024   Prob (F-statistic):               0.00
Time:                        14:35:50   Log-Likelihood:                -7279.4
No. Observations:                1038   AIC:                         1.462e+04
Df Residuals:                    1008   BIC:                         1.477e+04
Df Model:                          29                                         
Covariance Type:            nonrobust                                         
                                     coef    std err          t      P>|t|      [0.025      0.975]
--------------------------------------------------------------------------------------------------
Intercept   

In [5]:
train_values, test_values, train_labels, test_labels = train_test_split(df_noPrice, df['Preis'], test_size=0.01)

def del_ran(df_exog, labels, chance):
    rand_array = np.random.rand(df_exog.shape[0])
    delete_entries = rand_array < chance
    keep_entries = rand_array >= chance
    
    return [df_exog[delete_entries], labels[delete_entries], df_exog[keep_entries], labels[keep_entries]]



temp = del_ran(df_exog = df_noPrice, labels = df['Preis'], chance = 0.1)

In [6]:
def impute_ols(test_values, test_labels, train_values, train_labels):
    
    # OLS Model
    # fitting the model 
    model = sm.OLS(train_labels, sm.add_constant(train_values)).fit() 

    imputed_values = model.predict(exog = sm.add_constant(test_values, has_constant='add')).tolist()
    return [np.mean((imputed_values-test_labels)**2), stats.sem(list(train_labels) + imputed_values)]

In [7]:
def impute_knn(test_values, test_labels, train_values, train_labels):
    
    tree = KDTree(train_values.values, leaf_size=5)

    imputed_values_knn_1 = []
    imputed_values_knn_3 = []
    imputed_values_knn_5 = []

    for index, entry in enumerate(test_values.values):
 
        dist, ind = tree.query([entry], k=5)
        ind = ind[0]

        current_impute_knn_1 = np.mean(train_labels.values[ind][0])
        current_impute_knn_3 = np.mean(train_labels.values[ind][:3])
        current_impute_knn_5 = np.mean(train_labels.values[ind])

        imputed_values_knn_1.append(current_impute_knn_1)
        imputed_values_knn_3.append(current_impute_knn_3)
        imputed_values_knn_5.append(current_impute_knn_5)
       
        # print(train_labels.values[ind])
        # print(current_impute_knn_1//1, current_impute_knn_3//1, current_impute_knn_5//1)
        # print(test_labels.values[index])
        # print(train_values.values[ind])

    mse_knn_1 = np.mean((test_labels.values - imputed_values_knn_1)**2)
    mse_knn_3 = np.mean((test_labels.values - imputed_values_knn_3)**2)
    mse_knn_5 = np.mean((test_labels.values - imputed_values_knn_5)**2)

    sem_knn_1 = stats.sem(list(train_labels.values)+imputed_values_knn_1)
    sem_knn_3 = stats.sem(list(train_labels.values)+imputed_values_knn_3)
    sem_knn_5 = stats.sem(list(train_labels.values)+imputed_values_knn_5)


    return [mse_knn_1, mse_knn_3, mse_knn_5, sem_knn_1, sem_knn_3, sem_knn_5]

In [8]:
def simul_knn():
    for c in np.arange(0.1, 1, 0.1):

        for i in tqdm(range(100)):
            temp = del_ran(df_exog = df_noPrice, labels = df['Preis'], chance = c)
            imputed_simul_knn.at[i] = impute_knn(temp[0], temp[1], temp[2], temp[3])

        imputed_stats_knn.loc[c] = imputed_simul_knn.mean()
    imputed_stats_knn

simul_knn()
    

100%|██████████| 100/100 [00:02<00:00, 35.57it/s]
100%|██████████| 100/100 [00:04<00:00, 21.98it/s]
100%|██████████| 100/100 [00:05<00:00, 16.99it/s]
100%|██████████| 100/100 [00:06<00:00, 15.97it/s]
100%|██████████| 100/100 [00:07<00:00, 13.81it/s]
100%|██████████| 100/100 [00:08<00:00, 11.80it/s]
100%|██████████| 100/100 [00:08<00:00, 11.72it/s]
100%|██████████| 100/100 [00:09<00:00, 10.96it/s]
100%|██████████| 100/100 [00:08<00:00, 11.33it/s]


In [9]:
def simul_ols():
    for c in np.arange(0.1, 1, 0.1):

        for i in tqdm(range(100)):
            temp = del_ran(df_exog = df_noPrice, labels = df['Preis'], chance = c)
            imputed_simul_ols.at[i] = impute_ols(temp[0], temp[1], temp[2], temp[3])
        imputed_stats_ols.loc[c] = imputed_simul_ols.mean()
    imputed_stats_ols

simul_ols()

100%|██████████| 100/100 [00:02<00:00, 33.86it/s]
100%|██████████| 100/100 [00:02<00:00, 34.74it/s]
100%|██████████| 100/100 [00:02<00:00, 35.26it/s]
100%|██████████| 100/100 [00:02<00:00, 37.87it/s]
100%|██████████| 100/100 [00:02<00:00, 42.23it/s]
 26%|██▌       | 26/100 [00:00<00:01, 45.55it/s]


ValueError: shapes (639,33) and (32,) not aligned: 33 (dim 1) != 32 (dim 0)

In [None]:
imputed_stats_ols

Unnamed: 0,MSE OLS,SE OLS
0.1,76687.793571,23.084932
0.2,77768.633792,22.984949
0.3,76696.995701,22.919625
0.4,77520.357061,22.821924
0.5,77959.134433,22.706189
0.6,78600.045349,22.663519
0.7,,
0.8,,
0.9,,
0.7,79991.280381,22.619956


In [None]:
pd.concat([imputed_stats_ols, imputed_stats_knn], axis=1)

Unnamed: 0,MSE OLS,SE OLS,MSE KNN_1,MSE KNN_3,MSE KNN_5,SE KNN_1,SE KNN_3,SE KNN_5
0.1,76855.816434,23.081848,91625.104004,75305.078904,81698.579784,23.185788,23.043729,22.973887
0.2,76862.694693,22.990068,96325.017709,80894.14527,87635.252671,23.125654,22.830628,22.690492
0.3,77151.889534,22.89583,103429.261572,87221.621873,93568.368983,23.060363,22.564618,22.353204
0.4,77675.457991,22.792017,109340.039576,92721.295126,97999.602168,22.967241,22.306101,22.038368
0.5,77978.753726,22.731181,117764.242608,98197.082246,102007.148828,22.884522,22.008286,21.662113
0.6,78968.11608,22.640383,128217.373116,107150.425727,109193.344491,22.736263,21.630915,21.224599
0.7,,,143108.482416,119733.303047,119740.797362,22.49601,21.169357,20.644422
0.8,,,165767.734261,135032.548972,137625.072864,22.185904,20.549637,19.776381
0.9,,,215232.05296,176953.371162,183896.674107,21.889872,19.176736,17.563163
