In [1]:
import os
import random

import numpy as np
import pandas as pd
# enable_iterative_imputer must be imported before IterativeImputer can be imported.
from sklearn.experimental import enable_iterative_imputer
from sklearn.impute import SimpleImputer, IterativeImputer
from sklearn.linear_model import LinearRegression
# Seed Python's built-in `random` module with a fixed value.
# NOTE(review): this does not seed NumPy's global RNG, and no visible cell
# calls `random` directly — confirm whether this seed is still needed.
random.seed(0)

In [2]:
# Levels embedded in the input file names (presumably missingness percentages).
probs = [10, 30, 50]

# One dict per missingness mechanism; keys have the form '<mechanism>_df_<level>'.
mcar_dfs, mar_dfs, mnar_dfs = {}, {}, {}

for p in probs:
    # Resolve the three input files for this level first ...
    mcar_path = f'./missing_data/fifa23_MCAR_{p}.csv'
    mar_path = f'./missing_data/fifa23_MAR_{p}.csv'
    mnar_path = f'./missing_data/fifa23_MNAR_{p}.csv'

    # ... then load them in the order MCAR, MAR, MNAR.
    mcar_dfs[f'mcar_df_{p}'] = pd.read_csv(mcar_path)
    mar_dfs[f'mar_df_{p}'] = pd.read_csv(mar_path)
    mnar_dfs[f'mnar_df_{p}'] = pd.read_csv(mnar_path)

## **MEAN IMPUTATION**

In [3]:
# Column-wise mean imputer; it is refit on every dataset inside the loop.
mean = SimpleImputer(missing_values=np.nan, strategy='mean')

for p in probs:
    for t, dfs in [("MCAR", mcar_dfs), ("MAR", mar_dfs), ("MNAR", mnar_dfs)]:
        key = f'{t.lower()}_df_{p}'
        source = dfs[key]

        # fit() returns the imputer itself, so fitting and transforming can be chained.
        filled = mean.fit(source).transform(source)

        # transform() yields a plain array; restore the original column labels.
        # NOTE(review): assumes no column is entirely missing — SimpleImputer
        # drops such columns by default, which would break this relabeling.
        # NOTE(review): every column is mean-filled, so 0/1 indicator columns
        # receive fractional values (see the displayed output) — confirm intended.
        dfs[f'{key}_mean_imputed'] = pd.DataFrame(filled, columns=source.columns)

In [4]:
# Spot-check one result: the mean-imputed frame derived from 'mnar_df_50'.
mnar_dfs['mnar_df_50_mean_imputed']

Unnamed: 0,Overall,Potential,Age,Height(in cm),Weight(in kg),TotalStats,BaseStats,Preferred Foot,Weak Foot Rating,Skill Moves,...,LB Rating,CB Rating,RB Rating,GK Rating,Attacking Work Rate_High,Attacking Work Rate_Low,Attacking Work Rate_Medium,Defensive Work Rate_High,Defensive Work Rate_Low,Defensive Work Rate_Medium
0,91.0,91.000000,25.183371,169.00000,75.228823,1598.477455,452.000000,0.000000,2.936927,4.000000,...,55.686812,54.62239,62.000000,23.229999,0.29055,0.045858,0.000000,0.176981,0.083144,0.000000
1,91.0,91.000000,25.183371,185.00000,81.000000,2147.000000,357.200366,1.000000,2.936927,2.364132,...,63.000000,54.62239,63.000000,23.229999,0.29055,0.045858,1.000000,0.000000,0.083144,0.731953
2,91.0,70.869278,25.183371,185.00000,81.000000,2205.000000,357.200366,0.754143,4.000000,2.364132,...,55.686812,54.62239,55.610816,23.229999,0.29055,0.000000,0.667349,0.000000,0.083144,1.000000
3,91.0,91.000000,31.000000,181.00000,75.228823,1598.477455,483.000000,0.754143,2.936927,4.000000,...,78.000000,54.62239,78.000000,24.000000,0.29055,0.045858,0.667349,0.176981,0.083144,0.000000
4,91.0,70.869278,23.000000,181.57522,75.228823,1598.477455,357.200366,0.754143,2.936927,5.000000,...,55.686812,54.62239,55.610816,21.000000,1.00000,0.000000,0.000000,0.000000,0.083144,0.000000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
18415,47.0,56.000000,21.000000,174.00000,75.228823,1287.000000,357.200366,0.754143,3.000000,2.364132,...,55.686812,54.62239,55.610816,23.229999,0.00000,0.000000,0.667349,0.000000,0.083144,1.000000
18416,47.0,70.869278,17.000000,181.57522,75.228823,1598.477455,267.000000,0.754143,2.936927,2.000000,...,55.686812,49.00000,49.000000,15.000000,0.00000,0.000000,1.000000,0.176981,0.083144,0.731953
18417,47.0,70.869278,18.000000,170.00000,65.000000,1598.477455,357.200366,0.754143,3.000000,2.364132,...,46.000000,54.62239,46.000000,17.000000,1.00000,0.000000,0.000000,0.176981,0.083144,1.000000
18418,47.0,70.869278,17.000000,178.00000,75.228823,1113.000000,357.200366,1.000000,2.936927,2.364132,...,55.686812,49.00000,55.610816,15.000000,0.29055,0.000000,1.000000,0.176981,0.000000,1.000000


## **MOST FREQUENT IMPUTATION**

In [5]:
# Column-wise most-frequent-value imputer; it is refit on every dataset inside the loop.
mode = SimpleImputer(missing_values=np.nan, strategy='most_frequent')

for p in probs:
    for t, dfs in [("MCAR", mcar_dfs), ("MAR", mar_dfs), ("MNAR", mnar_dfs)]:
        key = f'{t.lower()}_df_{p}'
        source = dfs[key]

        # fit() returns the imputer itself, so fitting and transforming can be chained.
        filled = mode.fit(source).transform(source)

        # transform() yields a plain array; restore the original column labels.
        # NOTE(review): assumes no column is entirely missing — SimpleImputer
        # drops such columns by default, which would break this relabeling.
        dfs[f'{key}_mode_imputed'] = pd.DataFrame(filled, columns=source.columns)

In [6]:
# Spot-check one result: the most-frequent-imputed frame derived from 'mnar_df_30'.
mnar_dfs['mnar_df_30_mode_imputed']

Unnamed: 0,Overall,Potential,Age,Height(in cm),Weight(in kg),TotalStats,BaseStats,Preferred Foot,Weak Foot Rating,Skill Moves,...,LB Rating,CB Rating,RB Rating,GK Rating,Attacking Work Rate_High,Attacking Work Rate_Low,Attacking Work Rate_Medium,Defensive Work Rate_High,Defensive Work Rate_Low,Defensive Work Rate_Medium
0,91.0,70.0,35.0,180.0,67.0,1688.0,452.0,0.0,4.0,4.0,...,64.0,53.0,64.0,22.0,0.0,0.0,1.0,0.0,1.0,0.0
1,91.0,91.0,34.0,185.0,81.0,2147.0,359.0,1.0,4.0,4.0,...,64.0,65.0,63.0,21.0,0.0,0.0,1.0,0.0,0.0,1.0
2,91.0,91.0,33.0,185.0,81.0,1688.0,458.0,1.0,4.0,4.0,...,64.0,65.0,64.0,18.0,1.0,0.0,0.0,0.0,0.0,1.0
3,91.0,91.0,22.0,181.0,70.0,2303.0,483.0,1.0,5.0,4.0,...,64.0,65.0,78.0,18.0,1.0,0.0,0.0,1.0,0.0,1.0
4,91.0,95.0,23.0,182.0,73.0,2177.0,470.0,1.0,4.0,5.0,...,64.0,57.0,66.0,18.0,1.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
18415,47.0,56.0,21.0,174.0,68.0,1287.0,359.0,1.0,3.0,2.0,...,64.0,36.0,64.0,15.0,0.0,0.0,1.0,0.0,0.0,1.0
18416,47.0,57.0,17.0,175.0,75.0,1289.0,267.0,1.0,3.0,2.0,...,49.0,49.0,49.0,18.0,0.0,0.0,1.0,0.0,0.0,1.0
18417,47.0,67.0,18.0,170.0,75.0,1333.0,277.0,1.0,3.0,2.0,...,46.0,42.0,46.0,17.0,1.0,0.0,0.0,0.0,0.0,1.0
18418,47.0,61.0,22.0,178.0,65.0,1113.0,226.0,1.0,3.0,2.0,...,47.0,65.0,47.0,15.0,0.0,0.0,1.0,0.0,0.0,1.0


## **KNN IMPUTATION**

In [7]:
import sys
from impyute.imputation.cs import fast_knn

# Raise the interpreter recursion limit before running fast_knn.
# NOTE(review): presumably required by impyute's neighbour search on a
# dataset of this size — confirm, since this setting is process-wide.
sys.setrecursionlimit(100000)

for p in probs:
    for t, dfs in [("MCAR", mcar_dfs), ("MAR", mar_dfs), ("MNAR", mnar_dfs)]:
        key = f'{t.lower()}_df_{p}'
        source = dfs[key]

        # fast_knn is given the raw value array with k=5 neighbours.
        knn = fast_knn(source.values, k=5)

        # Wrap the imputed array back into a frame with the original column labels.
        dfs[f'{key}_knn_imputed'] = pd.DataFrame(knn, columns=source.columns)

In [8]:
# Spot-check one result: the KNN-imputed frame derived from 'mar_df_30'.
mar_dfs['mar_df_30_knn_imputed']

Unnamed: 0,Overall,Potential,Age,Height(in cm),Weight(in kg),TotalStats,BaseStats,Preferred Foot,Weak Foot Rating,Skill Moves,...,LB Rating,CB Rating,RB Rating,GK Rating,Attacking Work Rate_High,Attacking Work Rate_Low,Attacking Work Rate_Medium,Defensive Work Rate_High,Defensive Work Rate_Low,Defensive Work Rate_Medium
0,91.0,77.605307,35.000000,179.488727,67.000000,1582.839051,452.000000,0.000000,2.982798,4.000000,...,62.000000,54.315016,66.064036,22.000000,0.0,0.0,0.000000,0.0,0.0,0.000000
1,91.0,91.000000,34.000000,185.000000,76.212815,1582.839051,455.000000,1.000000,4.000000,4.000000,...,58.822733,59.710642,63.000000,21.000000,0.0,0.0,0.392530,0.0,0.0,0.449971
2,91.0,91.000000,33.000000,185.000000,81.000000,2205.000000,458.000000,1.000000,4.000000,4.000000,...,64.023584,70.772079,73.473065,22.715938,0.0,0.0,0.000000,0.0,0.0,0.722419
3,91.0,73.109959,24.987346,181.000000,71.567515,1582.839051,356.151402,0.660215,3.001626,2.653877,...,78.000000,58.900753,78.000000,24.000000,0.0,0.0,0.000000,0.0,0.0,0.000000
4,91.0,95.000000,23.000000,182.000000,73.000000,2177.000000,470.000000,0.741373,4.000000,5.000000,...,66.000000,57.000000,70.578161,21.000000,0.0,0.0,0.000000,0.0,0.0,0.000000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
18415,47.0,56.000000,22.464425,174.000000,68.000000,1287.000000,274.000000,1.000000,3.000000,2.000000,...,40.000000,36.000000,40.000000,15.000000,0.0,0.0,0.898508,0.0,0.0,1.000000
18416,47.0,57.000000,17.000000,175.000000,60.000000,1289.000000,356.151402,1.000000,2.772474,2.000000,...,49.000000,45.809831,49.000000,15.000000,0.0,0.0,0.601510,0.0,0.0,0.767626
18417,47.0,67.000000,18.000000,170.000000,69.917752,1333.000000,277.000000,1.000000,3.000000,2.000000,...,46.000000,42.000000,46.000000,17.000000,0.0,0.0,0.000000,0.0,0.0,1.000000
18418,47.0,61.000000,17.000000,178.000000,65.000000,1113.000000,356.151402,1.000000,3.000000,2.000000,...,50.486097,53.503921,47.000000,15.000000,0.0,0.0,1.000000,0.0,0.0,1.000000


## **MICE IMPUTATION**

In [9]:
# Iterative (MICE-style) imputer with a linear-regression estimator and a
# fixed random_state; the same instance is refit on every dataset.
mice = IterativeImputer(estimator=LinearRegression(), random_state=0)

for p in probs:
    for t, dfs in [("MCAR", mcar_dfs), ("MAR", mar_dfs), ("MNAR", mnar_dfs)]:
        key = f'{t.lower()}_df_{p}'
        source = dfs[key]

        # NOTE(review): no min_value/max_value is set, so predictions are
        # unbounded — the displayed output contains values such as 2.64 and
        # -0.03 in 0/1 indicator columns. Confirm this is acceptable downstream.
        completed = mice.fit_transform(source)

        # fit_transform() yields a plain array; restore the original column labels.
        dfs[f'{key}_mice_imputed'] = pd.DataFrame(completed, columns=source.columns)



In [10]:
# Spot-check one result: the MICE-imputed frame derived from 'mar_df_30'.
mar_dfs['mar_df_30_mice_imputed']

Unnamed: 0,Overall,Potential,Age,Height(in cm),Weight(in kg),TotalStats,BaseStats,Preferred Foot,Weak Foot Rating,Skill Moves,...,LB Rating,CB Rating,RB Rating,GK Rating,Attacking Work Rate_High,Attacking Work Rate_Low,Attacking Work Rate_Medium,Defensive Work Rate_High,Defensive Work Rate_Low,Defensive Work Rate_Medium
0,91.0,84.560553,35.000000,170.01185,67.000000,2161.235027,452.000000,0.000000,3.467898,4.000000,...,62.000000,51.149516,62.090909,22.000000,0.0,0.0,0.000000,0.0,0.0,0.000000
1,91.0,91.000000,34.000000,185.00000,79.931219,2107.202229,455.000000,1.000000,4.000000,4.000000,...,63.033774,56.193980,63.000000,21.000000,0.0,0.0,0.238191,0.0,0.0,0.214454
2,91.0,91.000000,33.000000,185.00000,81.000000,2205.000000,458.000000,1.000000,4.000000,4.000000,...,65.014722,63.465027,65.083250,20.963077,0.0,0.0,0.000000,0.0,0.0,-0.025646
3,91.0,83.785956,34.161952,181.00000,75.476546,2272.287895,459.648341,0.448708,3.314558,4.046046,...,78.000000,71.518041,78.000000,24.000000,0.0,0.0,0.000000,0.0,0.0,0.000000
4,91.0,95.000000,23.000000,182.00000,73.000000,2177.000000,470.000000,0.781370,4.000000,5.000000,...,66.000000,57.000000,66.019476,21.000000,0.0,0.0,0.000000,0.0,0.0,0.000000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
18415,47.0,56.000000,22.480610,174.00000,68.000000,1287.000000,274.000000,1.000000,3.000000,2.000000,...,40.000000,36.000000,40.000000,15.000000,0.0,0.0,1.399641,0.0,0.0,1.000000
18416,47.0,57.000000,17.000000,175.00000,60.000000,1289.000000,272.304132,1.000000,2.735874,2.000000,...,49.000000,48.655365,49.000000,15.000000,0.0,0.0,2.640775,0.0,0.0,1.667369
18417,47.0,67.000000,18.000000,170.00000,63.146468,1333.000000,277.000000,1.000000,3.000000,2.000000,...,46.000000,42.000000,46.000000,17.000000,0.0,0.0,0.000000,0.0,0.0,1.000000
18418,47.0,61.000000,17.000000,178.00000,65.000000,1113.000000,225.942938,1.000000,3.000000,1.628411,...,47.009601,48.553336,47.000000,15.000000,0.0,0.0,1.000000,0.0,0.0,1.000000


## **GENERATE CSV**

In [11]:
# Imputation method tags; lower-cased, they form the suffix of each result key.
imputations = ["MEAN", "MODE", "KNN", "MICE"]

# DataFrame.to_csv does not create missing parent directories, so make sure
# the output folder exists; otherwise a run on a fresh checkout fails here.
os.makedirs('./imputed_data', exist_ok=True)

# Write every imputed frame (level x mechanism x method) to its own CSV file.
for p in probs:
    for t, dfs in [("MCAR", mcar_dfs), ("MAR", mar_dfs), ("MNAR", mnar_dfs)]:
        for i in imputations:
            key = f'{t.lower()}_df_{p}_{i.lower()}_imputed'
            # index=False: only the data columns are written, not the row index.
            dfs[key].to_csv(f'./imputed_data/{key}.csv', index=False)
            # Echo the key so the cell output lists each file written.
            print(key)

mcar_df_10_mean_imputed
mcar_df_10_mode_imputed
mcar_df_10_knn_imputed
mcar_df_10_mice_imputed
mar_df_10_mean_imputed
mar_df_10_mode_imputed
mar_df_10_knn_imputed
mar_df_10_mice_imputed
mnar_df_10_mean_imputed
mnar_df_10_mode_imputed
mnar_df_10_knn_imputed
mnar_df_10_mice_imputed
mcar_df_30_mean_imputed
mcar_df_30_mode_imputed
mcar_df_30_knn_imputed
mcar_df_30_mice_imputed
mar_df_30_mean_imputed
mar_df_30_mode_imputed
mar_df_30_knn_imputed
mar_df_30_mice_imputed
mnar_df_30_mean_imputed
mnar_df_30_mode_imputed
mnar_df_30_knn_imputed
mnar_df_30_mice_imputed
mcar_df_50_mean_imputed
mcar_df_50_mode_imputed
mcar_df_50_knn_imputed
mcar_df_50_mice_imputed
mar_df_50_mean_imputed
mar_df_50_mode_imputed
mar_df_50_knn_imputed
mar_df_50_mice_imputed
mnar_df_50_mean_imputed
mnar_df_50_mode_imputed
mnar_df_50_knn_imputed
mnar_df_50_mice_imputed
