In [1]:
import pandas as pd 
import numpy as np

In [2]:
df = pd.read_pickle('data_cleaned.pkl')

In [3]:
from sklearn.experimental import enable_iterative_imputer
from sklearn.impute import IterativeImputer

# Setting up the iterative imputer
imputer = IterativeImputer(max_iter=10, random_state=0)

# Columns to impute
columns_to_impute = ['Age', 'Annual_Income', 'Investment_Amount', 'Number_of_Active_Investments', 'Potential_Return_Rate', 'Marital_Status', 'Has_Investment_Advisor']

# Performing the imputation
imputed_data = imputer.fit_transform(df[columns_to_impute])

# Filling the original dataframe with the imputed data
df[columns_to_impute] = imputed_data

print('Iterative imputation completed.')

Iterative imputation completed.


In [4]:
df.head()

Unnamed: 0,Age,Annual_Income,Investment_Amount,Risk_Tolerance_Score,Months_of_Investing_Experience,Number_of_Active_Investments,Potential_Return_Rate,Investment_Duration,Investment_to_Income_Ratio,Education,Employment_Status,Marital_Status,Owns_Property,Has_Dependents,Investment_Sector,Has_Investment_Advisor,Investment_Failed
0,51.0,82664.208539,87300.0,358,13,3.0,18.1,12,0.85,Bachelor's,Part-time,0.996711,No,No,Other,1.0,0
1,37.0,81658.0,152598.0,792,23,4.0,14.06,24,0.86,PhD,Self-employed,0.999955,Yes,No,Auto,0.0,1
2,45.984723,82633.045587,77767.0,359,57,4.0,4.31,60,0.6,Master's,Unemployed,1.0,No,No,Education,1.0,0
3,54.0,42344.0,13261.0,366,115,1.0,13.474238,60,0.21,PhD,Part-time,0.0,No,No,Education,1.0,0
4,63.0,48898.0,193990.0,792,53,4.0,5.65,48,0.82,Master's,Self-employed,1.0,Yes,No,Business,1.0,0


In [5]:
df.to_pickle('data_imputed.pkl')

In [6]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 225347 entries, 0 to 225346
Data columns (total 17 columns):
 #   Column                          Non-Null Count   Dtype  
---  ------                          --------------   -----  
 0   Age                             225347 non-null  float64
 1   Annual_Income                   225347 non-null  float64
 2   Investment_Amount               225347 non-null  float64
 3   Risk_Tolerance_Score            225347 non-null  int64  
 4   Months_of_Investing_Experience  225347 non-null  int64  
 5   Number_of_Active_Investments    225347 non-null  float64
 6   Potential_Return_Rate           225347 non-null  float64
 7   Investment_Duration             225347 non-null  int64  
 8   Investment_to_Income_Ratio      225347 non-null  float64
 9   Education                       225347 non-null  object 
 10  Employment_Status               225347 non-null  object 
 11  Marital_Status                  225347 non-null  float64
 12  Owns_Property   

In [7]:
df.describe()

Unnamed: 0,Age,Annual_Income,Investment_Amount,Risk_Tolerance_Score,Months_of_Investing_Experience,Number_of_Active_Investments,Potential_Return_Rate,Investment_Duration,Investment_to_Income_Ratio,Marital_Status,Has_Investment_Advisor,Investment_Failed
count,225347.0,225347.0,225347.0,225347.0,225347.0,225347.0,225347.0,225347.0,225347.0,225347.0,225347.0,225347.0
mean,46.025817,82793.977224,126997.232909,598.569708,59.832942,2.498413,13.41717,36.028756,0.499736,0.999885,0.501021,0.087212
std,17.622647,36854.489446,67876.426277,186.348988,34.617474,1.105492,6.461491,16.968128,0.230821,0.734671,0.495712,0.282146
min,18.0,15000.0,5000.0,300.0,0.0,1.0,2.0,12.0,0.1,0.0,0.0,0.0
25%,34.0,53035.5,70919.5,445.0,30.0,2.0,7.99,24.0,0.3,0.0,0.0,0.0
50%,45.997791,82793.977289,126997.232831,589.0,60.0,2.498273,13.41018,36.0,0.5,1.0,0.501553,0.0
75%,56.0,112701.0,182757.0,735.0,90.0,3.0,18.85,48.0,0.7,2.0,1.0,0.0
max,119.0,149999.0,249999.0,1099.0,119.0,4.0,25.0,60.0,0.9,2.0,1.0,1.0


In [8]:
#print how many outliers in each numeric values:
for col in df.select_dtypes(include=[np.number]).columns:
    q1 = df[col].quantile(0.25)
    q3 = df[col].quantile(0.75)
    iqr = q3 - q1
    lower_bound = q1 - 1.5 * iqr
    upper_bound = q3 + 1.5 * iqr
    print(col, 'Number of outliers:', df.loc[(df[col] < lower_bound) | (df[col] > upper_bound)].shape[0])

    

Age Number of outliers: 6721
Annual_Income Number of outliers: 0
Investment_Amount Number of outliers: 0
Risk_Tolerance_Score Number of outliers: 0
Months_of_Investing_Experience Number of outliers: 0
Number_of_Active_Investments Number of outliers: 0
Potential_Return_Rate Number of outliers: 0
Investment_Duration Number of outliers: 0
Investment_to_Income_Ratio Number of outliers: 0
Marital_Status Number of outliers: 0
Has_Investment_Advisor Number of outliers: 0
Investment_Failed Number of outliers: 19653


In [9]:
import numpy as np
from scipy.stats import zscore

# Selecting numeric columns except 'Investment_Failed'
numeric_columns = df.select_dtypes(include=[np.number]).columns.drop('Investment_Failed')

# Calculating the Z-scores of each numeric column in the dataframe except 'Investment_Failed'
z_scores = zscore(df[numeric_columns])

# Taking the absolute values of the Z-scores
abs_z_scores = np.abs(z_scores)

# Removing the rows where the Z-scores are greater than 3, excluding 'Investment_Failed'
filtered_entries = (abs_z_scores < 3).all(axis=1)
df = df[filtered_entries]

print('Outliers removed.')


Outliers removed.


In [10]:
# #apply unit vector scaling to the Age and Investment_Failure columns
# from sklearn.preprocessing import Normalizer

# # Setting up the normalizer
# normalizer = Normalizer()

# # Columns to normalize
# columns_to_normalize = ['Age', 'Investment_Failed']

# # Normalizing the data
# normalized_data = normalizer.fit_transform(df[columns_to_normalize])

# # Filling the original dataframe with the normalized data
# df[columns_to_normalize] = normalized_data

# print('Normalization completed.')

# df.head()


In [11]:
#print how many outliers in each numeric values:
for col in df.select_dtypes(include=[np.number]).columns:
    q1 = df[col].quantile(0.25)
    q3 = df[col].quantile(0.75)
    iqr = q3 - q1
    lower_bound = q1 - 1.5 * iqr
    upper_bound = q3 + 1.5 * iqr
    print(col, 'Number of outliers:', df.loc[(df[col] < lower_bound) | (df[col] > upper_bound)].shape[0])

    

Age Number of outliers: 0
Annual_Income Number of outliers: 0
Investment_Amount Number of outliers: 0
Risk_Tolerance_Score Number of outliers: 0
Months_of_Investing_Experience Number of outliers: 0
Number_of_Active_Investments Number of outliers: 0
Potential_Return_Rate Number of outliers: 0
Investment_Duration Number of outliers: 0
Investment_to_Income_Ratio Number of outliers: 0
Marital_Status Number of outliers: 0
Has_Investment_Advisor Number of outliers: 0
Investment_Failed Number of outliers: 19091


In [12]:
df.head()

Unnamed: 0,Age,Annual_Income,Investment_Amount,Risk_Tolerance_Score,Months_of_Investing_Experience,Number_of_Active_Investments,Potential_Return_Rate,Investment_Duration,Investment_to_Income_Ratio,Education,Employment_Status,Marital_Status,Owns_Property,Has_Dependents,Investment_Sector,Has_Investment_Advisor,Investment_Failed
0,51.0,82664.208539,87300.0,358,13,3.0,18.1,12,0.85,Bachelor's,Part-time,0.996711,No,No,Other,1.0,0
1,37.0,81658.0,152598.0,792,23,4.0,14.06,24,0.86,PhD,Self-employed,0.999955,Yes,No,Auto,0.0,1
2,45.984723,82633.045587,77767.0,359,57,4.0,4.31,60,0.6,Master's,Unemployed,1.0,No,No,Education,1.0,0
3,54.0,42344.0,13261.0,366,115,1.0,13.474238,60,0.21,PhD,Part-time,0.0,No,No,Education,1.0,0
4,63.0,48898.0,193990.0,792,53,4.0,5.65,48,0.82,Master's,Self-employed,1.0,Yes,No,Business,1.0,0


In [13]:
# #plot correlation matrix 
# import seaborn as sns
# import matplotlib.pyplot as plt

# corr = df.corr()
# sns.heatmap(corr, annot=True)
# plt.show()


In [14]:
#print all non numerical columns
non_numerical_columns = df.select_dtypes(include=['object']).columns
print(non_numerical_columns)


Index(['Education', 'Employment_Status', 'Owns_Property', 'Has_Dependents',
       'Investment_Sector'],
      dtype='object')


In [15]:
#label encoding
from sklearn.preprocessing import LabelEncoder
le = LabelEncoder()

# Encoding the non-numerical columns
for column in non_numerical_columns:
    df[column] = le.fit_transform(df[column])

df.head()


Unnamed: 0,Age,Annual_Income,Investment_Amount,Risk_Tolerance_Score,Months_of_Investing_Experience,Number_of_Active_Investments,Potential_Return_Rate,Investment_Duration,Investment_to_Income_Ratio,Education,Employment_Status,Marital_Status,Owns_Property,Has_Dependents,Investment_Sector,Has_Investment_Advisor,Investment_Failed
0,51.0,82664.208539,87300.0,358,13,3.0,18.1,12,0.85,0,1,0.996711,0,0,4,1.0,0
1,37.0,81658.0,152598.0,792,23,4.0,14.06,24,0.86,3,3,0.999955,2,0,0,0.0,1
2,45.984723,82633.045587,77767.0,359,57,4.0,4.31,60,0.6,2,4,1.0,0,0,2,1.0,0
3,54.0,42344.0,13261.0,366,115,1.0,13.474238,60,0.21,3,1,0.0,0,0,2,1.0,0
4,63.0,48898.0,193990.0,792,53,4.0,5.65,48,0.82,2,3,1.0,2,0,1,1.0,0


In [16]:
#implement KNN model to predict the Investment_Failed column
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsClassifier

# Splitting the data into the features and the target
X = df.drop('Investment_Failed', axis=1)
y = df['Investment_Failed']

# Splitting the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=0)

# Setting up the KNN model
knn = KNeighborsClassifier(n_neighbors=5)

# Fitting the model to the training data
knn.fit(X_train, y_train)

# Predicting the target values
y_pred = knn.predict(X_test)

# Calculating the accuracy of the model
accuracy = (y_pred == y_test).mean()

print('Accuracy:', accuracy)


Accuracy: 0.9071719343182546
