In [None]:
# Required Libraries
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
!pip install pandas numpy matplotlib seaborn scikit-learn xgboost diffprivlib cryptography
from diffprivlib.models import LogisticRegression as DPLogisticRegression
from sklearn.model_selection import train_test_split
from google.colab import files
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import accuracy_score, roc_auc_score, classification_report, roc_curve
from sklearn.linear_model import LogisticRegression, SGDClassifier, Perceptron, RidgeClassifier, PassiveAggressiveClassifier
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier, AdaBoostClassifier, ExtraTreesClassifier, BaggingClassifier, HistGradientBoostingClassifier, VotingClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.naive_bayes import GaussianNB
from xgboost import XGBClassifier
import warnings
from sklearn.exceptions import ConvergenceWarning

# Suppress ConvergenceWarnings
warnings.filterwarnings("ignore", category=ConvergenceWarning)

from google.colab import files
# Colab upload prompt; the CSV read below is presumably the file chosen here.
uploaded = files.upload()

# Load the raw churn table from the working directory.
df = pd.read_csv('Churn_Modelling.csv', delimiter=',')

# The bare expressions in this cell are notebook-style inspections: they only
# display something when they are the last statement of a notebook cell.
df.shape

# Missing values per column
df.isna().sum()

# Number of distinct values per column
df.nunique()

# Remove the row counter, the customer id and the surname columns
df = df.drop(columns=["RowNumber", "CustomerId", "Surname"])

# Peek at the remaining columns and their dtypes
df.head()


df.dtypes

# Pie chart: share of customers that left (Exited == 1) vs. stayed (Exited == 0)
labels = ('Exited', 'Retained')
sizes = [(df['Exited'] == 1).sum(), (df['Exited'] == 0).sum()]
explode = (0, 0.1)  # pull the second wedge slightly out of the pie

fig1, ax1 = plt.subplots(figsize=(10, 8))
ax1.pie(
    sizes,
    labels=labels,
    explode=explode,
    autopct='%1.1f%%',
    startangle=90,
    shadow=True,
)
ax1.axis('equal')  # equal aspect ratio so the pie is drawn as a circle
plt.title("Proportion of customer churned and retained", size = 20)
plt.show()

# Categorical variables analysis: churn counts per level, one panel per
# feature, filled row by row on a 2x2 grid.
fig, axarr = plt.subplots(2, 2, figsize=(20, 12))
for column, axis in zip(['Geography', 'Gender', 'HasCrCard', 'IsActiveMember'], axarr.flat):
    sns.countplot(x=column, hue='Exited', data=df, ax=axis)

# Continuous variables analysis: distribution per churn class, one panel per
# feature, filled row by row on a 3x2 grid.
fig, axarr = plt.subplots(3, 2, figsize=(20, 12))
for column, axis in zip(
    ['CreditScore', 'Age', 'Tenure', 'Balance', 'NumOfProducts', 'EstimatedSalary'],
    axarr.flat,
):
    sns.boxplot(y=column, x='Exited', hue='Exited', data=df, ax=axis)


# Split Train, test data
# 80% of the rows are sampled for training (fixed seed, so the split is
# repeatable); the rows whose index was not sampled form the test set.
df_train = df.sample(frac=0.8,random_state=200)
df_test = df.drop(df_train.index)
print(len(df_train))
print(len(df_test))

# Exploratory feature: account balance relative to estimated salary.
df_train['BalanceSalaryRatio'] = df_train.Balance/df_train.EstimatedSalary
# NOTE(review): no ax= is passed, so this presumably draws on matplotlib's
# current axes; outside a notebook that may be the last panel created above.
sns.boxplot(y='BalanceSalaryRatio',x = 'Exited', hue = 'Exited',data = df_train)
plt.ylim(-1, 5)  # restrict the y-axis to the -1..5 band


# Introduce new features
# Tenure relative to age.
df_train['TenureByAge'] = df_train.Tenure/(df_train.Age)
sns.boxplot(y='TenureByAge',x = 'Exited', hue = 'Exited',data = df_train)
plt.ylim(-1, 1)
plt.show()

# Credit score relative to age.
df_train['CreditScoreGivenAge'] = df_train.CreditScore/(df_train.Age)


# Resulting Data Frame
df_train.head()


# Hold out 20% of the rows as a test set; the fixed seed keeps the split
# reproducible.
df_train = df.sample(frac=0.8, random_state=200)
df_test = df.drop(df_train.index)


# Arrange columns by data type. The engineered ratios are added after the
# column selection below, so they are not listed here.
continuous_vars = ['CreditScore', 'Age', 'Tenure', 'Balance', 'NumOfProducts', 'EstimatedSalary']
cat_vars = ['HasCrCard', 'IsActiveMember', 'Geography', 'Gender']

# Keep only the modelling columns. The rows must come from the training
# sample: selecting from the full `df` would put every df_test row back into
# the training frame and leak the held-out data into model fitting.
df_train = df_train[['Exited'] + continuous_vars + cat_vars].copy()

# Feature engineering: ratio features appended after the selected columns.
df_train['BalanceSalaryRatio'] = df_train['Balance'] / df_train['EstimatedSalary']
df_train['TenureByAge'] = df_train['Tenure'] / df_train['Age']
df_train['CreditScoreGivenAge'] = df_train['CreditScore'] / df_train['Age']

df_train.head()


'''For the one hot variables, we change 0 to -1 so that the models can capture a negative relation
where the attribute in inapplicable instead of 0'''
# Recode the two binary flags from {0, 1} to {-1, 1}.
for flag_column in ('HasCrCard', 'IsActiveMember'):
    df_train.loc[df_train[flag_column] == 0, flag_column] = -1
df_train.head()


# One hot encode the categorical variables: each object-typed column is
# replaced by one +1/-1 indicator column per distinct value.
lst = ['Geography', 'Gender']
remove = [name for name in lst if df_train[name].dtype == object]
for name in remove:
    for level in df_train[name].unique():
        df_train[name + '_' + level] = np.where(df_train[name] == level, 1, -1)
df_train = df_train.drop(columns=remove)
df_train.head()


# MinMax scaling the continuous variables; the per-column min and max are
# kept in minVec / maxVec.
minVec = df_train[continuous_vars].min().copy()
maxVec = df_train[continuous_vars].max().copy()
df_train[continuous_vars] = df_train[continuous_vars].sub(minVec).div(maxVec - minVec)
df_train.head()


# data prep pipeline for test data
def DfPrepPipeline(df_predict,df_train_Cols,minVec,maxVec):
    """Add the engineered ratio features to ``df_predict`` and return it.

    The ratios use the same formulas as the training frame (plain ``Age`` as
    the denominator). Dividing by ``Age - 18`` instead would put the
    prediction features on a different footing from the training features and
    yield infinite values for 18-year-old customers.

    Parameters
    ----------
    df_predict : pandas.DataFrame
        Frame with ``Balance``, ``EstimatedSalary``, ``Tenure``, ``Age`` and
        ``CreditScore`` columns. It is modified in place.
    df_train_Cols, minVec, maxVec
        Accepted for interface compatibility; not used by this function.

    Returns
    -------
    pandas.DataFrame
        The same ``df_predict`` object, with three columns added.
    """
    # Add new features
    df_predict['BalanceSalaryRatio'] = df_predict.Balance/df_predict.EstimatedSalary
    df_predict['TenureByAge'] = df_predict.Tenure/df_predict.Age
    df_predict['CreditScoreGivenAge'] = df_predict.CreditScore/df_predict.Age
    return df_predict

# Test-set preparation. Every step mirrors what was done to df_train so that
# both frames end up with the same columns on the same scale.

# From here on the engineered ratios count as continuous features too.
continuous_vars = ['CreditScore', 'Age', 'Tenure', 'Balance', 'NumOfProducts', 'EstimatedSalary', 'BalanceSalaryRatio',
                   'TenureByAge', 'CreditScoreGivenAge']
cat_vars = ['HasCrCard', 'IsActiveMember', "Geography", "Gender"]


# Calculate new features for df_test with the same formulas as for df_train
# (plain Age as denominator; Age - 18 is zero for 18-year-olds and would
# produce infinite ratios that the training data never contains).
df_test['BalanceSalaryRatio'] = df_test.Balance / df_test.EstimatedSalary
df_test['TenureByAge'] = df_test.Tenure / df_test.Age
df_test['CreditScoreGivenAge'] = df_test.CreditScore / df_test.Age

# Create the test DataFrame as an independent copy, so the edits below do not
# write into a slice of df_test.
df_predict = df_test[['Exited'] + continuous_vars + cat_vars].copy()

df_predict.head()


# Same 0 -> -1 recoding of the binary flags as on the training frame.
df_predict.loc[df_predict.HasCrCard == 0, 'HasCrCard'] = -1
df_predict.loc[df_predict.IsActiveMember == 0, 'IsActiveMember'] = -1


# One hot encode the categorical variables of the TEST frame. The indicator
# columns are derived from the ones already present in df_train, so a level
# that was not seen in training simply gets -1 in every indicator.
lst = ["Gender", "Geography"]
remove = list()
for i in lst:
    if i not in df_predict.columns:
        continue
    prefix = i + '_'
    encoded_cols = [c for c in df_train.columns if str(c).startswith(prefix)]
    if not encoded_cols:
        # df_train still carries the raw column; leave df_predict alike.
        continue
    for col in encoded_cols:
        df_predict[col] = np.where(df_predict[i] == col[len(prefix):], 1, -1)
    remove.append(i)
df_predict = df_predict.drop(remove, axis=1)

# MinMax scaling. minVec / maxVec still describe the raw range of the columns
# scaled earlier; recomputing them from the already-scaled df_train would give
# 0 and 1 and leave the raw test columns unscaled. Only columns without a
# stored range (the engineered ratios) are measured and scaled here.
unscaled_vars = [c for c in continuous_vars if c not in minVec.index]
if unscaled_vars:
    extra_min = df_train[unscaled_vars].min()
    extra_max = df_train[unscaled_vars].max()
    df_train[unscaled_vars] = (df_train[unscaled_vars] - extra_min) / (extra_max - extra_min)
    minVec = pd.concat([minVec, extra_min])
    maxVec = pd.concat([maxVec, extra_max])
minVec = minVec[continuous_vars].copy()
maxVec = maxVec[continuous_vars].copy()
print(df_train.head())


# Ensure that all one hot encoded variables that appear in the train data appear in the test data
L = list(set(df_train.columns) - set(df_predict.columns))
for l in L:
    df_predict[str(l)] = -1

# MinMax scaling the continuous variables based on min and max from the train data
df_predict[continuous_vars] = (df_predict[continuous_vars] - minVec) / (maxVec - minVec)

# Ensure that the variables are ordered in the same way as was ordered in the train set
df_predict = df_predict[df_train.columns]


# Support functions
from sklearn.preprocessing import PolynomialFeatures
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import GridSearchCV
from scipy.stats import uniform

# Fit models
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier

# Scoring functions
from sklearn.metrics import accuracy_score
from sklearn.metrics import classification_report
from sklearn.metrics import roc_auc_score
from sklearn.metrics import roc_curve


def evaluate_model(model, X_train, y_train, X_val, y_val):
    """Fit ``model``, print its validation accuracy and ROC AUC, and return both.

    The AUC is computed from continuous scores whenever the estimator exposes
    them (``predict_proba`` first, then ``decision_function``). Feeding hard
    class labels to ``roc_auc_score`` collapses the ROC curve to a single
    operating point and understates the ranking quality of the model.

    Parameters
    ----------
    model : estimator with ``fit`` and ``predict``
    X_train, y_train : training features and labels
    X_val, y_val : validation features and labels

    Returns
    -------
    tuple
        ``(accuracy, auc)`` as returned by the metric functions.
    """
    model.fit(X_train, y_train)
    y_pred = model.predict(X_val)

    if hasattr(model, "predict_proba"):
        # Column 1 is taken as the positive class (binary 0/1 target assumed).
        y_score = model.predict_proba(X_val)[:, 1]
    elif hasattr(model, "decision_function"):
        y_score = model.decision_function(X_val)
    else:
        # No score available: fall back to the hard labels.
        y_score = y_pred

    accuracy = accuracy_score(y_val, y_pred)
    auc = roc_auc_score(y_val, y_score)
    print(f"{model.__class__.__name__}")
    print(f"Accuracy: {accuracy}")
    print(f"AUC Score: {auc}")
    print()
    return accuracy, auc

# Split the training data for validation
X_train, X_val, y_train, y_val = train_test_split(df_train.loc[:, df_train.columns != 'Exited'], df_train.Exited, test_size = 0.2, random_state = 0)


# Implementing Differential Privacy in Logistic Regression
# NOTE(review): no data_norm is passed here, while the DPLogisticRegression in
# the model-comparison dictionary further down uses data_norm=1.0 - confirm
# which setting is intended.
dp_lr = DPLogisticRegression(epsilon=1.0)
dp_lr.fit(X_train, y_train)
dp_y_pred = dp_lr.predict(X_val)
# Positive-class probability; used for the AUC so that the metric reflects the
# ranking of the scores rather than a single thresholded operating point.
dp_y_score = dp_lr.predict_proba(X_val)[:, 1]


# Model Evaluation
dp_acc = accuracy_score(y_val, dp_y_pred)
dp_auc = roc_auc_score(y_val, dp_y_score)
print("Differentially Private Logistic Regression")
print(f"Accuracy: {dp_acc}")
print(f"AUC Score: {dp_auc}")
print()


#Function for Differential Privacy
def dp_noise_addition(data, epsilon=1.0, sensitivity=1.0):
    """Return ``data`` plus independent Laplace noise of scale ``sensitivity / epsilon``.

    Parameters
    ----------
    data : array-like with a ``shape`` attribute (ndarray, DataFrame, ...)
        Values to perturb; one noise value is drawn per element.
    epsilon : float, default 1.0
        Privacy budget. Must be strictly positive; smaller means more noise.
    sensitivity : float, default 1.0
        Scale factor of the noise. The default reproduces the previous
        behaviour (scale ``1 / epsilon``).

    Returns
    -------
    Same type as ``data + ndarray``: the perturbed values.

    Raises
    ------
    ValueError
        If ``epsilon`` is not strictly positive or ``sensitivity`` is negative.
    """
    # `not epsilon > 0` also rejects NaN, which `epsilon <= 0` would let through.
    if not epsilon > 0:
        raise ValueError(f"epsilon must be strictly positive, got {epsilon!r}")
    if not sensitivity >= 0:
        raise ValueError(f"sensitivity must be non-negative, got {sensitivity!r}")
    noise = np.random.laplace(loc=0.0, scale=sensitivity / epsilon, size=data.shape)
    return data + noise



# Adding noise to sensitive columns
# Only the rows of the training split are used: df_train also contains the
# validation rows, and fitting on them would make the validation scores below
# an evaluation on data the model has already seen. The 'Exited' column is
# kept so the frame can still be split into features and label.
df_train_noised = df_train.loc[X_train.index].copy()
sensitive_columns = ['CreditScore', 'Balance', 'EstimatedSalary']
df_train_noised[sensitive_columns] = dp_noise_addition(df_train_noised[sensitive_columns], epsilon=1.0)


from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression

# Scale the data: the scaler is fitted on the noised training features and
# then applied unchanged to the validation features.
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(df_train_noised.loc[:, df_train_noised.columns != 'Exited'])
X_val_scaled = scaler.transform(X_val)


# Train the model
lr = LogisticRegression(max_iter=1000)  # Increase max_iter if needed
lr.fit(X_train_scaled, df_train_noised['Exited'])
lr_y_pred = lr.predict(X_val_scaled)
# Positive-class probability, used for the AUC instead of the hard labels.
lr_y_score = lr.predict_proba(X_val_scaled)[:, 1]


# Model Evaluation
acc = accuracy_score(y_val, lr_y_pred)
auc = roc_auc_score(y_val, lr_y_score)
print("Standard Logistic Regression with Noised Data")
print(f"Accuracy: {acc}")
print(f"AUC Score: {auc}")
print()


# Train and evaluate different models
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier, AdaBoostClassifier, ExtraTreesClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.naive_bayes import GaussianNB
from xgboost import XGBClassifier
from sklearn.neighbors import KNeighborsClassifier # Import KNeighborsClassifier

# Candidate classifiers, all with library-default settings except SVC, which
# is built with probability=True and sits in second position.
default_model_classes = (
    LogisticRegression,
    RandomForestClassifier,
    XGBClassifier,
    KNeighborsClassifier,
    GradientBoostingClassifier,
    AdaBoostClassifier,
    DecisionTreeClassifier,
    GaussianNB,
    ExtraTreesClassifier,
)
models = [model_class() for model_class in default_model_classes]
models.insert(1, SVC(probability=True))


# Fit every candidate on the training split and report its validation metrics.
for candidate in models:
    evaluate_model(candidate, X_train, y_train, X_val, y_val)

# Function for Differential Privacy
def dp_noise_addition(data, epsilon=1.0, sensitivity=1.0):
    """Return ``data`` plus independent Laplace noise of scale ``sensitivity / epsilon``.

    Parameters
    ----------
    data : array-like with a ``shape`` attribute (ndarray, DataFrame, ...)
        Values to perturb; one noise value is drawn per element.
    epsilon : float, default 1.0
        Privacy budget. Must be strictly positive; smaller means more noise.
    sensitivity : float, default 1.0
        Scale factor of the noise. The default reproduces the previous
        behaviour (scale ``1 / epsilon``).

    Returns
    -------
    Same type as ``data + ndarray``: the perturbed values.

    Raises
    ------
    ValueError
        If ``epsilon`` is not strictly positive or ``sensitivity`` is negative.
    """
    # `not epsilon > 0` also rejects NaN, which `epsilon <= 0` would let through.
    if not epsilon > 0:
        raise ValueError(f"epsilon must be strictly positive, got {epsilon!r}")
    if not sensitivity >= 0:
        raise ValueError(f"sensitivity must be non-negative, got {sensitivity!r}")
    noise = np.random.laplace(loc=0.0, scale=sensitivity / epsilon, size=data.shape)
    return data + noise

# Sample usage of Differential Privacy: perturb every feature column and
# leave the 'Exited' label untouched.
df_train_private = df_train.copy()
feature_columns = [col for col in df_train_private.columns if col != 'Exited']
# Whole-column assignment replaces the columns, so the integer-coded columns
# become float cleanly. Writing the float noise through .loc[:, mask] into
# integer columns can trigger pandas' incompatible-dtype setitem deprecation.
df_train_private[feature_columns] = dp_noise_addition(df_train_private[feature_columns])


from sklearn.ensemble import BaggingClassifier, HistGradientBoostingClassifier, VotingClassifier
from sklearn.linear_model import SGDClassifier, Perceptron, RidgeClassifier, PassiveAggressiveClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import GradientBoostingClassifier, AdaBoostClassifier, ExtraTreesClassifier, RandomForestClassifier
from sklearn.svm import SVC
from xgboost import XGBClassifier
from sklearn.naive_bayes import GaussianNB
from diffprivlib.models import LogisticRegression as DPLogisticRegression
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import accuracy_score, roc_auc_score
from sklearn.exceptions import ConvergenceWarning # Import ConvergenceWarning
import warnings

# Suppress ConvergenceWarnings
warnings.filterwarnings("ignore", category=ConvergenceWarning)


# Initialize models
models = {
    'Differentially Private Logistic Regression': DPLogisticRegression(epsilon=1.0, data_norm=1.0),
    'Logistic Regression with Noised Data': LogisticRegression(max_iter=1000),
    'K-Nearest Neighbors': KNeighborsClassifier(),
    'Decision Tree': DecisionTreeClassifier(),
    'Gradient Boosting': GradientBoostingClassifier(),
    'AdaBoost': AdaBoostClassifier(),
    'Naive Bayes': GaussianNB(),
    'Extra Trees': ExtraTreesClassifier(),
    'Random Forest': RandomForestClassifier(),
    'SVC': SVC(probability=True),
    'XGBoost': XGBClassifier(),
    'Bagging Classifier': BaggingClassifier(),
    'HistGradientBoostingClassifier': HistGradientBoostingClassifier(),
    'SGDClassifier': SGDClassifier(),
    'Perceptron': Perceptron(),
    'RidgeClassifier': RidgeClassifier(),
    'PassiveAggressiveClassifier': PassiveAggressiveClassifier()
}


# Scale the data
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_val_scaled = scaler.transform(X_val)

# Noised training data for the 'Noised Data' entry. It is restricted to the
# rows of the training split (so no validation row is seen during fitting) and
# passed through the same scaler as X_val_scaled, so the model is fitted and
# scored on features with the same scale.
noised_train_rows = df_train_noised.loc[df_train_noised.index.intersection(X_train.index)]
X_noised_scaled = scaler.transform(noised_train_rows.loc[:, noised_train_rows.columns != 'Exited'])
y_noised = noised_train_rows['Exited']


# Train and evaluate each model
for name, model in models.items():
    if 'Noised Data' in name:
        model.fit(X_noised_scaled, y_noised)
    else:
        model.fit(X_train_scaled, y_train)
    y_pred = model.predict(X_val_scaled)

    # AUC needs a continuous score; fall back to the hard labels only for
    # estimators that expose neither probabilities nor decision values.
    if hasattr(model, 'predict_proba'):
        y_score = model.predict_proba(X_val_scaled)[:, 1]
    elif hasattr(model, 'decision_function'):
        y_score = model.decision_function(X_val_scaled)
    else:
        y_score = y_pred

    acc = accuracy_score(y_val, y_pred)
    auc = roc_auc_score(y_val, y_score)
    print(f"{name} - Accuracy: {acc}, AUC: {auc}")


# Adding Voting Classifier for ensemble
# Soft voting averages the class probabilities of the three base estimators.
voting_clf = VotingClassifier(estimators=[
    ('lr', LogisticRegression(max_iter=1000)),
    ('rf', RandomForestClassifier()),
    ('gnb', GaussianNB())],
    voting='soft')
voting_clf.fit(X_train_scaled, y_train)
y_pred = voting_clf.predict(X_val_scaled)
# Averaged positive-class probability; used for the AUC so the metric reflects
# the ranking of the scores rather than one thresholded operating point.
y_score = voting_clf.predict_proba(X_val_scaled)[:, 1]
acc = accuracy_score(y_val, y_pred)
auc = roc_auc_score(y_val, y_score)
print(f"Voting Classifier - Accuracy: {acc}, AUC: {auc}")



import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, roc_auc_score
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier, VotingClassifier
from xgboost import XGBClassifier
from sklearn.exceptions import ConvergenceWarning
import warnings
from cryptography.fernet import Fernet
import hashlib
import requests

# Suppress ConvergenceWarnings
warnings.filterwarnings("ignore", category=ConvergenceWarning)

# Encryption setup
def generate_key():
    """Create and return a new key via ``Fernet.generate_key()``."""
    fresh_key = Fernet.generate_key()
    return fresh_key

def encrypt_data(data, key):
    """Encrypt the string ``data`` with ``key`` and return the Fernet token.

    ``data`` is encoded to bytes with ``str.encode()`` before encryption.
    """
    return Fernet(key).encrypt(data.encode())

def decrypt_data(encrypted_data, key):
    """Decrypt a Fernet token with ``key`` and return the result as a string."""
    plaintext_bytes = Fernet(key).decrypt(encrypted_data)
    return plaintext_bytes.decode()

# Placeholder for loading and decrypting data
def load_data(file_path, key):
    """Read the CSV at ``file_path`` into a DataFrame and return it.

    ``key`` is accepted but not used: no column is decrypted here yet.
    """
    # Decrypt sensitive columns if necessary (placeholder, not implemented)
    return pd.read_csv(file_path)

# Define roles and permissions
# Maps a role name to the list of permission strings granted to that role.
# check_access() looks permissions up here.
roles_permissions = {
    'Admin': ['view_data', 'modify_data', 'encrypt_data', 'decrypt_data'],
    'User': ['view_data']
}

# Log list to store access attempts
# check_access() appends one dict per call, with the keys 'user',
# 'permission' and 'authorized'; review_access_logs() reads it.
access_log = []

# Function to check if a user has the required permission and log the attempt
def check_access(user, permission):
    """Return whether ``user`` holds ``permission`` and log the attempt.

    The user's role is looked up in ``user_roles`` and the role's permissions
    in ``roles_permissions``. Every call appends one entry to ``access_log``.

    Parameters
    ----------
    user : hashable
        User identifier; unknown users are never authorized.
    permission : str
        Permission name to check.

    Returns
    -------
    bool
        ``True`` if the user's role grants the permission, otherwise ``False``.
    """
    role = user_roles.get(user)
    # bool(...) so that an unknown user yields False rather than the None (or
    # other falsy role value) that a bare `role and ...` would return and log.
    authorized = bool(role and permission in roles_permissions.get(role, []))
    access_log.append({'user': user, 'permission': permission, 'authorized': authorized})
    return authorized

# Function to review access logs and identify unauthorized access attempts
def review_access_logs():
    """Print every entry of ``access_log`` whose 'authorized' value is falsy.

    Prints a single "nothing found" line when there is no such entry.
    """
    print("\nReviewing Access Logs:")
    unauthorized_attempts = list(filter(lambda entry: not entry['authorized'], access_log))

    # Guard clause: nothing to report.
    if not unauthorized_attempts:
        print("No unauthorized access attempts found.")
        return

    print("Unauthorized Access Attempts Found:")
    for attempt in unauthorized_attempts:
        print(f"User: {attempt['user']}, Permission: {attempt['permission']}, Authorized: {attempt['authorized']}")

# Analyze data integrity
def analyze_data_integrity(df, user):
    """Print a duplicate-row check and a SHA-256 digest of ``df``.

    Requires the 'view_data' permission for ``user``; otherwise only an
    access-denied line is printed. Returns ``None`` in both cases.
    """
    if check_access(user, 'view_data'):
        print("Data Integrity Analysis:")

        # Example: Checking for duplicates
        has_duplicates = df.duplicated().any()
        if has_duplicates:
            print("Warning: Duplicate records found in the data.")
        else:
            print("No duplicate records found.")

        # Example: Hashing - per-row pandas hashes are fed to SHA-256
        row_hashes = pd.util.hash_pandas_object(df).values
        original_hash = hashlib.sha256(row_hashes).hexdigest()
        print(f"Data Hash: {original_hash}")
    else:
        print(f"Access denied for user {user} to perform data integrity analysis.")

# Security risk analysis
def analyze_security_risks(df, user):
    """Print a simple variability check, a signature scan and an access note.

    Requires the 'view_data' permission for ``user``; otherwise only an
    access-denied line is printed. Returns ``None`` in both cases.
    """
    if check_access(user, 'view_data'):
        print("Security Risk Analysis:")

        # Simplified anomaly check: any described column whose standard
        # deviation exceeds 100 raises the warning.
        print("Checking for anomalous behavior or malicious patterns:")
        column_std = df.describe().loc['std']
        if (column_std > 100).any():  # This is a simplified example
            print("Warning: High variability detected in some columns, which may indicate anomalies.")

        # Malware detection: look for each known signature string in the data
        print("Malware Detection:")
        known_malware_signatures = ["ransomware_signature", "trojan_signature", "spyware_signature", "adware_signature"]
        detected_malware = []
        for sig in known_malware_signatures:
            if check_for_malware_signature(df, sig):
                detected_malware.append(sig)
        if not detected_malware:
            print("No known malware signatures detected.")
        else:
            print(f"Detected Malware Signatures: {detected_malware}")

        # Access control note (placeholder: nothing is actually verified here)
        print("Access Control Analysis:")
        print("Access controls are in place to limit access to sensitive data.")
    else:
        print(f"Access denied for user {user} to perform security risk analysis.")

# Example function to check for malware signature (simplified)
def check_for_malware_signature(df, signature):
    """Return True if ``signature`` occurs as a substring in any object-typed column.

    The signature is matched literally (``regex=False``). With the default
    regex matching, characters such as ``.``, ``+`` or ``(`` in a signature
    would either match unrelated text or raise a pattern error.

    Parameters
    ----------
    df : pandas.DataFrame
        Data to scan; only columns selected by ``select_dtypes(include=[object])``
        are inspected, each converted to ``str`` first.
    signature : str
        Literal text to look for.

    Returns
    -------
    bool
    """
    text_columns = df.select_dtypes(include=[object]).columns
    return any(
        df[col].astype(str).str.contains(signature, regex=False).any()
        for col in text_columns
    )

# Mitigation strategies
def apply_mitigation(df_train, key, user):
    """Encrypt the 'Balance' column for an authorized user and return the result.

    Parameters
    ----------
    df_train : pandas.DataFrame
        Frame with a 'Balance' column. It is not modified; a copy is returned.
    key : bytes or None
        Fernet key used for encryption. The caller's key is used as given, so
        the caller can decrypt later. If ``None``, a key is generated for this
        call only and the ciphertext cannot be recovered afterwards.
    user : hashable
        User identifier; the 'encrypt_data' permission is required.

    Returns
    -------
    pandas.DataFrame or None
        A copy of ``df_train`` with an added 'EncryptedBalance' column, or
        ``None`` when access is denied.
    """
    if not check_access(user, 'encrypt_data'):
        print(f"Access denied for user {user} to apply mitigation strategies.")
        return None

    print("Applying Mitigation Strategies:")
    # Use the key supplied by the caller instead of overwriting it, and never
    # print key material: anything written to stdout ends up in notebook
    # output and logs.
    if key is None:
        key = generate_key()
        print("No encryption key supplied; using a newly generated key for this call only.")

    # Encrypt sensitive columns: each balance is converted to text, then encrypted.
    df_train_noised = df_train.copy()
    df_train_noised['EncryptedBalance'] = df_train_noised['Balance'].apply(lambda x: encrypt_data(str(x), key))

    # Implementing access controls
    print("1. Implementing Role-Based Access Controls.")
    print("2. Encrypting sensitive data.")
    print("3. Conducting regular security audits.")
    print("4. Adversarial training for model robustness.")
    return df_train_noised

# Define test users and actions
# Each entry pairs a user name with the role that user is given below.
test_users = [
    {'user': 'alice', 'role': 'Admin'},  # Admin user
    {'user': 'bob', 'role': 'User'},     # Regular user
    {'user': 'eve', 'role': 'User'},     # Regular user trying unauthorized actions
]

# Assign roles to test users
# user name -> role name; this is the mapping check_access() consults.
user_roles = {user['user']: user['role'] for user in test_users}

# Function to perform test actions
def perform_test_actions():
    """Run the demo scenario for the three test users.

    Uses the module-level ``df`` and ``key``. 'alice' and 'bob' go through the
    integrity analysis, the risk analysis and the mitigation step; 'eve' skips
    the risk analysis.
    """
    def run_actions(user, with_risk_analysis=True):
        # Shared call sequence; the permission checks happen inside each call.
        analyze_data_integrity(df, user)
        if with_risk_analysis:
            analyze_security_risks(df, user)
        apply_mitigation(df, key, user)

    print("\nPerforming Test Actions:")

    # Admin user performs all actions
    current_user = 'alice'
    print(f"\nTesting actions for {current_user} (Admin):")
    run_actions(current_user)

    # Regular user: the mitigation step should be refused
    current_user = 'bob'
    print(f"\nTesting actions for {current_user} (User):")
    run_actions(current_user)

    # Another regular user: the mitigation step should be refused
    current_user = 'eve'
    print(f"\nTesting actions for {current_user} (User):")
    run_actions(current_user, with_risk_analysis=False)

from google.colab import files
# Colab upload prompt; the CSV loaded below is presumably the file chosen here.
uploaded = files.upload()


# Example usage
# `key` and `df` are module-level names that perform_test_actions() reads.
# Note that this rebinds `df` to a fresh load of the CSV.
key = generate_key()
df = load_data('Churn_Modelling.csv', key)  # Adjust as needed for encrypted data

# Execute test actions
perform_test_actions()

# Review and report unauthorized access attempts
# Prints the access_log entries recorded during the test actions above that
# were not authorized.
review_access_logs()
