In [None]:
import pandas as pd
import numpy as np
from sklearn.linear_model import LinearRegression
from sklearn.preprocessing import OneHotEncoder

# Start again from the raw CSV so this cell does not depend on earlier edits to `df`
df = pd.read_csv('50_Startups.csv')

# Encode 'State' as 0/1 indicator columns (drop='first' omits the first category)
ohe = OneHotEncoder(drop='first', sparse_output=False, dtype=np.int32)
transformed_state = ohe.fit_transform(df[['State']])
transformed_state_df = pd.DataFrame(
    transformed_state,
    index=df.index,
    columns=ohe.get_feature_names_out(['State']),
)

# Swap the raw 'State' column for its encoded counterparts in a single step
df = pd.concat([df.drop(columns='State'), transformed_state_df], axis=1)

print("DataFrame after re-loading and One-Hot Encoding:")
display(df.head())
print("Columns in DataFrame:", df.columns.tolist())

DataFrame after re-loading and One-Hot Encoding:


Unnamed: 0,R&D Spend,Administration,Marketing Spend,Profit,State_Florida,State_New York
0,165349.2,136897.8,471784.1,192261.83,0,1
1,162597.7,151377.59,443898.53,191792.06,0,0
2,153441.51,101145.55,407934.54,191050.39,1,0
3,144372.41,118671.85,383199.62,182901.99,0,1
4,142107.34,91391.77,366168.42,166187.94,1,0


Columns in DataFrame: ['R&D Spend', 'Administration', 'Marketing Spend', 'Profit', 'State_Florida', 'State_New York']


In [None]:
# Remove the 'Profit' column by name rather than by position: the positional
# form df.columns[[0, 1, 2, 4, 5]] raises IndexError when this cell is re-run
# on the already-reduced 5-column frame. errors='ignore' makes a re-run a no-op.
cols = df.columns.drop('Profit', errors='ignore')
df = df.loc[:, cols]


In [None]:
# Preview the first rows instead of dumping the whole frame into the output.
df.head(10)

Unnamed: 0,R&D Spend,Administration,Marketing Spend,State_Florida,State_New York
0,165349.2,136897.8,471784.1,0,1
1,162597.7,151377.59,443898.53,0,0
2,153441.51,101145.55,407934.54,1,0
3,144372.41,118671.85,383199.62,0,1
4,142107.34,91391.77,366168.42,1,0
5,131876.9,99814.71,362861.36,0,1
6,134615.46,147198.87,127716.82,0,0
7,130298.13,145530.06,323876.68,1,0
8,120542.52,148718.95,311613.29,0,1
9,123334.88,108679.17,304981.62,0,0


In [None]:
import numpy as np
import pandas as pd


In [None]:
# Print column dtypes and non-null counts for df.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 50 entries, 0 to 49
Data columns (total 5 columns):
 #   Column           Non-Null Count  Dtype  
---  ------           --------------  -----  
 0   R&D Spend        50 non-null     float64
 1   Administration   50 non-null     float64
 2   Marketing Spend  50 non-null     float64
 3   State_Florida    50 non-null     int32  
 4   State_New York   50 non-null     int32  
dtypes: float64(3), int32(2)
memory usage: 1.7 KB


In [None]:
# Fraction of missing values per column of df (mean of the boolean null mask).
df.isnull().mean()

Unnamed: 0,0
R&D Spend,0.0
Administration,0.0
Marketing Spend,0.0
State_Florida,0.0
State_New York,0.0


In [None]:
def insert_nan_by_percentage(df, col_percent_map, random_state=2, zeros_as_nan=True):
    """Return a copy of ``df`` with NaNs injected into selected columns.

    For every ``column -> fraction`` pair in ``col_percent_map`` the function
    optionally turns existing zeros into NaN and then blanks out
    ``int(len(df) * fraction)`` randomly chosen rows (at least one row for a
    non-empty frame). The random rows are drawn from *all* rows, so they may
    coincide with rows that are already NaN; the final missing share can
    therefore differ from ``fraction``.

    Parameters
    ----------
    df : pandas.DataFrame
        Input frame. It is not modified.
    col_percent_map : dict
        Maps column name to the fraction of rows (between 0 and 1) to blank.
    random_state : int, default 2
        Seed passed to ``np.random.default_rng``.
    zeros_as_nan : bool, default True
        When True (the historical behaviour) values equal to 0 in the listed
        columns are replaced by NaN before the random rows are drawn.

    Returns
    -------
    pandas.DataFrame
        Copy of ``df`` with NaNs inserted.

    Raises
    ------
    ValueError
        If a fraction is outside ``[0, 1]``.
    """
    df = df.copy()
    rng = np.random.default_rng(random_state)
    n = len(df)

    for col, pct in col_percent_map.items():
        if not 0 <= pct <= 1:
            raise ValueError(
                f"fraction for column {col!r} must be between 0 and 1, got {pct!r}"
            )

        # NumPy integer columns cannot hold NaN; cast up front instead of
        # relying on an implicit upcast during the .loc assignment.
        dtype = df[col].dtype
        if isinstance(dtype, np.dtype) and dtype.kind in "iu":
            df[col] = df[col].astype("float64")

        if zeros_as_nan:
            # convert existing 0s to NaN
            df.loc[df[col] == 0, col] = np.nan

        # At least one NaN, but never more rows than the frame has
        # (an empty frame therefore yields 0 and is skipped).
        n_nan = min(n, max(1, int(n * pct)))
        if n_nan == 0:
            continue

        indices = rng.choice(df.index, size=n_nan, replace=False)
        df.loc[indices, col] = np.nan

    return df


In [None]:
# Fraction of rows to blank out per column. insert_nan_by_percentage also turns
# existing zeros in these columns into NaN, so the resulting missing share can
# be higher than the fraction requested here.
col_nan_percent = {
    'Administration': 0.04,  # 4%
    'Marketing Spend': 0.05,  # 5%
    'R&D Spend': 0.06  # 6%
}

df1 = insert_nan_by_percentage(df, col_nan_percent)


In [None]:
def insert_nan(df, col_percent_map, random_state=42):
    """Return a copy of ``df`` with randomly placed NaNs in selected columns.

    For every ``column -> fraction`` pair in ``col_percent_map``,
    ``int(len(df) * fraction)`` distinct rows are drawn without replacement and
    set to NaN in that column. Existing values (including zeros) are otherwise
    left untouched.

    Parameters
    ----------
    df : pandas.DataFrame
        Input frame. It is not modified.
    col_percent_map : dict
        Maps column name to the fraction of rows (between 0 and 1) to blank.
    random_state : int, default 42
        Seed passed to ``np.random.default_rng``.

    Returns
    -------
    pandas.DataFrame
        Copy of ``df`` with NaNs inserted. NumPy integer columns that receive
        NaNs are returned as float64.

    Raises
    ------
    ValueError
        If a fraction is outside ``[0, 1]``.
    """
    df = df.copy()
    rng = np.random.default_rng(random_state)
    n = len(df)

    for col, pct in col_percent_map.items():
        if not 0 <= pct <= 1:
            raise ValueError(
                f"fraction for column {col!r} must be between 0 and 1, got {pct!r}"
            )

        n_nan = int(n * pct)
        if n_nan == 0:
            # Nothing to blank out; leave the column (and its dtype) as is.
            continue

        # NumPy integer columns (e.g. one-hot indicators) cannot hold NaN;
        # cast explicitly instead of relying on an implicit upcast in .loc.
        dtype = df[col].dtype
        if isinstance(dtype, np.dtype) and dtype.kind in "iu":
            df[col] = df[col].astype("float64")

        indices = rng.choice(df.index, size=n_nan, replace=False)
        df.loc[indices, col] = np.nan

    return df


In [None]:
# Fraction of rows to blank out in each encoded state column.
# insert_nan truncates with int(n * pct), so on the 50-row frame shown by
# df.info() the 5% request yields 2 NaNs, i.e. 4% of the rows.
col_na = {
    'State_New York': 0.04,  # 4%
    'State_Florida': 0.05,  # 5%
}

df2 = insert_nan(df, col_na)

In [None]:
# Fraction of missing values per column of df1 (NaNs injected into the spend columns).
df1.isnull().mean()

Unnamed: 0,0
R&D Spend,0.1
Administration,0.04
Marketing Spend,0.1
State_Florida,0.0
State_New York,0.0


In [None]:
# Preview the first rows of df1 instead of dumping the whole frame.
df1.head(10)

Unnamed: 0,R&D Spend,Administration,Marketing Spend,State_Florida,State_New York
0,165349.2,136897.8,471784.1,0,1
1,162597.7,151377.59,443898.53,0,0
2,153441.51,101145.55,407934.54,1,0
3,144372.41,118671.85,383199.62,0,1
4,,91391.77,366168.42,1,0
5,131876.9,99814.71,362861.36,0,1
6,134615.46,147198.87,127716.82,0,0
7,130298.13,145530.06,323876.68,1,0
8,120542.52,148718.95,311613.29,0,1
9,123334.88,108679.17,304981.62,0,0


In [None]:
# Fraction of missing values per column of df2 (NaNs injected into the state columns).
df2.isnull().mean()

Unnamed: 0,0
R&D Spend,0.0
Administration,0.0
Marketing Spend,0.0
State_Florida,0.04
State_New York,0.04


In [None]:
# Overlay df2's missing pattern on df1 so that df3 carries the NaNs of both.
df3 = df1.copy()

for col in df1.columns:
    mask = df2[col].isna()
    # Series.mask returns a NaN-capable result, so integer columns are upcast
    # cleanly instead of having NaN written into an int dtype via .loc.
    df3[col] = df3[col].mask(mask)


In [None]:
# Fraction of missing values per column of df3 (missing patterns of df1 and df2 combined).
df3.isnull().mean()

Unnamed: 0,0
R&D Spend,0.1
Administration,0.04
Marketing Spend,0.1
State_Florida,0.04
State_New York,0.04


In [None]:
# Drop the two encoded state columns by name. The positional form
# df3.iloc[:, :-2] would strip two more columns on every re-run of this cell;
# errors='ignore' makes a re-run a no-op.
df3 = df3.drop(columns=['State_Florida', 'State_New York'], errors='ignore')

In [None]:
# Step 1 - seed every gap with the mean of its own column

# Each column is filled independently, using the mean of its observed values.
df0 = pd.DataFrame({
    name: df3[name].fillna(df3[name].mean())
    for name in ('R&D Spend', 'Administration', 'Marketing Spend')
})
# 0th Iteration
df0

Unnamed: 0,R&D Spend,Administration,Marketing Spend
0,165349.2,136897.8,471784.1
1,162597.7,151377.59,443898.53
2,153441.51,101145.55,407934.54
3,144372.41,118671.85,383199.62
4,75279.352444,91391.77,366168.42
5,131876.9,99814.71,362861.36
6,134615.46,147198.87,127716.82
7,130298.13,145530.06,323876.68
8,120542.52,148718.95,311613.29
9,123334.88,108679.17,304981.62


In [None]:
import numpy as np
import pandas as pd
from sklearn.linear_model import LinearRegression

# ---------------------------------------------------------
# df3 : data with NaNs in the columns to impute
# df0 : same data, NaNs already filled with the column mean (initial guess)
#
# Each pass re-estimates every column's missing entries from the other
# columns with a linear regression fitted on the rows where that column is
# observed. Passes stop early once the imputed values no longer move.
# ---------------------------------------------------------

cols_to_impute = df3.columns.tolist()

# working copy
df_imp = df0.copy()

n_iter = 500      # upper bound on full passes over the columns
tol = 1e-9        # stop once no imputed value changes by more than this
log_every = 50    # print progress every `log_every` passes instead of every pass

# The missing pattern comes from df3 and never changes, so compute it once.
missing_masks = {name: df3[name].isna() for name in cols_to_impute}

for it in range(n_iter):
    if it == 0 or (it + 1) % log_every == 0:
        print(f"Iteration {it+1}/{n_iter}")

    max_change = 0.0

    for col in cols_to_impute:
        # mask based on ORIGINAL missing pattern
        missing_mask = missing_masks[col]
        if not missing_mask.any():
            continue

        observed_mask = ~missing_mask

        # use other columns as features
        other_cols = [c for c in cols_to_impute if c != col]

        X_train = df_imp.loc[observed_mask, other_cols]
        y_train = df3.loc[observed_mask, col]

        X_pred = df_imp.loc[missing_mask, other_cols]

        # Linear Regression model
        model = LinearRegression()
        model.fit(X_train, y_train)

        y_pred = model.predict(X_pred)

        # track the largest update of this pass for the convergence check
        previous = df_imp.loc[missing_mask, col].to_numpy()
        max_change = max(max_change, float(np.abs(y_pred - previous).max()))

        # update imputed values
        df_imp.loc[missing_mask, col] = y_pred

    if max_change < tol:
        print(f"Converged after {it+1} iterations")
        break

# final result
df_final_linear = df_imp.copy()

Iteration 1/500
Iteration 2/500
Iteration 3/500
Iteration 4/500
Iteration 5/500
Iteration 6/500
Iteration 7/500
Iteration 8/500
Iteration 9/500
Iteration 10/500
Iteration 11/500
Iteration 12/500
Iteration 13/500
Iteration 14/500
Iteration 15/500
Iteration 16/500
Iteration 17/500
Iteration 18/500
Iteration 19/500
Iteration 20/500
Iteration 21/500
Iteration 22/500
Iteration 23/500
Iteration 24/500
Iteration 25/500
Iteration 26/500
Iteration 27/500
Iteration 28/500
Iteration 29/500
Iteration 30/500
Iteration 31/500
Iteration 32/500
Iteration 33/500
Iteration 34/500
Iteration 35/500
Iteration 36/500
Iteration 37/500
Iteration 38/500
Iteration 39/500
Iteration 40/500
Iteration 41/500
Iteration 42/500
Iteration 43/500
Iteration 44/500
Iteration 45/500
Iteration 46/500
Iteration 47/500
Iteration 48/500
Iteration 49/500
Iteration 50/500
Iteration 51/500
Iteration 52/500
Iteration 53/500
Iteration 54/500
Iteration 55/500
Iteration 56/500
Iteration 57/500
Iteration 58/500
Iteration 59/500
Iterat

In [None]:
# Give all three frames a fresh 0..n-1 index (the old index is discarded) so
# the label-aligned element-wise comparisons that follow line up row by row.
df_imp = df_imp.reset_index(drop=True)
df3    = df3.reset_index(drop=True)
df     = df.reset_index(drop=True)


In [None]:
common_cols = ["R&D Spend", "Administration", "Marketing Spend"]

# A cell counts as equal only when the imputed value matches BOTH the
# NaN-injected frame (df3) and the untouched frame (df). NaN never compares
# equal, so every position that is NaN in df3 ends up False.
equal_mask = df_imp[common_cols].eq(df3[common_cols])
equal_mask = equal_mask & df_imp[common_cols].eq(df[common_cols])


In [None]:
# True for rows in which every column of equal_mask is True.
all_equal_rows = equal_mask.all(axis=1)


In [None]:
# Rows with at least one cell that did not match across the compared frames.
mismatch_rows = ~all_equal_rows


In [None]:
# Side-by-side view of the mismatching rows: the dict keys become the outer
# column level, so the imputed values (df_imp) sit next to the values in df.
diff = pd.concat(
    {
        "df_imp": df_imp.loc[mismatch_rows, common_cols],
        "df":     df.loc[mismatch_rows, common_cols],
    },
    axis=1
)


In [None]:
# Display imputed vs. original values for the rows that differ.
diff

Unnamed: 0_level_0,df_imp,df_imp,df_imp,df,df,df
Unnamed: 0_level_1,R&D Spend,Administration,Marketing Spend,R&D Spend,Administration,Marketing Spend
4,102377.028524,91391.77,366168.42,142107.34,91391.77,366168.42
13,91992.39,124309.042316,252664.93,91992.39,135495.07,252664.93
14,119943.24,156547.42,271181.029348,119943.24,156547.42,256512.92
16,88466.513931,121597.55,264346.06,78013.11,121597.55,264346.06
19,86419.7,153514.11,206583.605249,86419.7,153514.11,0.0
20,76253.86,113867.3,230708.744614,76253.86,113867.3,298664.47
21,115509.862231,153773.43,299737.29,78389.47,153773.43,299737.29
41,27892.92,108521.137017,164470.71,27892.92,84710.77,164470.71
47,82454.829405,135426.92,218954.973472,0.0,135426.92,0.0
48,542.05,51743.15,147208.863351,542.05,51743.15,0.0
