In [None]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error, r2_score, mean_absolute_error
import re
import numpy as np

In [None]:
# Load the dataset from 'new.csv' (a path relative to the working directory) and
# print its last five rows as a quick look at the available columns.
df = pd.read_csv('new.csv')
print(df.tail())

def convert_filesize_to_kb(filesize_str):
    """
    Convert a human-readable file size such as "900KB" or "1.2MB" to kilobytes.

    The first number found in the text is used as the magnitude. The unit is
    detected case-insensitively by substring ("KB", "MB", "GB", "TB"); a value
    that carries none of these units is interpreted as a byte count.

    Parameters
    ----------
    filesize_str : str
        Size text, e.g. "250KB", "6 mb" or "512". Missing values (None / NaN)
        are passed through as NaN; other non-string values are converted with
        ``str()`` first and therefore treated as a byte count.

    Returns
    -------
    float
        Size in kilobytes (1 KB = 1024 bytes), or NaN for a missing input.

    Raises
    ------
    ValueError
        If the text contains no number at all.
    """
    if not isinstance(filesize_str, str):
        # Let missing cells flow through instead of crashing on .upper().
        if pd.isna(filesize_str):
            return float('nan')
        filesize_str = str(filesize_str)

    normalized = filesize_str.upper()

    # Require at least one digit so that stray dots are never parsed as a number.
    number_match = re.search(r'\d+\.?\d*|\.\d+', normalized)
    if number_match is None:
        raise ValueError(f"No numeric size found in {filesize_str!r}")
    size_value = float(number_match.group())

    # (unit, kilobytes per unit), tested in this order.
    unit_to_kb = (('KB', 1), ('MB', 1024), ('GB', 1024 ** 2), ('TB', 1024 ** 3))
    for unit, kb_per_unit in unit_to_kb:
        if unit in normalized:
            return size_value * kb_per_unit

    # No recognised unit: the number is a byte count.
    return size_value / 1024
# Add a numeric size column (kilobytes) parsed from the textual 'filesize' column.
df['filesize_kb'] = df['filesize'].apply(convert_filesize_to_kb)

# Regression target.
y = df['confidentiality_score']

# Model inputs: the three raw text columns plus the numeric size.
categorical_columns = ['filename', 'completefilepath', 'fileextension']
features = categorical_columns + ['filesize_kb']

# One-hot encode the text columns; 'filesize_kb' is carried over as-is.
X = pd.get_dummies(df[features], columns=categorical_columns)

# Hold out 20% of the rows for evaluation, with a fixed seed for a repeatable split.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# NOTE: 'logistic_model' holds a LinearRegression despite its name; the name is
# kept because later cells refer to it.
logistic_model = LinearRegression().fit(X_train, y_train)
rf_model = RandomForestRegressor(random_state=42).fit(X_train, y_train)


def _report_regression_metrics(header, model, X_eval, y_eval):
    """Print `header`, then the MSE, R2 and MAE of `model` on the given split.

    Returns the tuple (predictions, mse, r2, mae).
    """
    print(header)
    predictions = model.predict(X_eval)
    mse = mean_squared_error(y_eval, predictions)
    r2 = r2_score(y_eval, predictions)
    mae = mean_absolute_error(y_eval, predictions)
    print(f"Mean Squared Error (MSE): {mse:.4f}")
    print(f"R-squared (R2): {r2:.4f}")
    print(f"Mean Absolute Error (MAE): {mae:.4f}")
    return predictions, mse, r2, mae


y_pred_logistic, mse_logistic, r2_logistic, mae_logistic = _report_regression_metrics(
    "--- Linear Regression Model ---", logistic_model, X_test, y_test
)
y_pred_rf, mse_rf, r2_rf, mae_rf = _report_regression_metrics(
    "\n--- Random Forest Regressor Model ---", rf_model, X_test, y_test
)

# Column layout of the encoded training matrix; prediction inputs are aligned to it.
training_columns = X_train.columns

if isinstance(rf_model, RandomForestRegressor):
    # Rank the encoded columns by the forest's importance scores, largest first.
    feature_importances = pd.DataFrame(
        {'feature': X_train.columns, 'importance': rf_model.feature_importances_}
    ).sort_values('importance', ascending=False)
    print("\n--- Random Forest Feature Importances ---")
    print(feature_importances.head(10))

                          filename  \
773         temp_data_sheet_v2.csv   
774           random_photo_v3.jpeg   
775     temp_word_document_v2.docx   
776            random_notes_v3.txt   
777  temp_backup_project_final.zip   

                                      completefilepath fileextension  \
773                    /var/tmp/temp_data_sheet_v2.csv           csv   
774  C:\Users\user\Pictures\RandomPhotosV3\random_p...          jpeg   
775  /Users/user/Documents/TempWordDocumentV2/temp_...          docx   
776  /home/user/Documents/TempNotesV3/random_notes_...           txt   
777  C:\Users\user\Desktop\TempProjectBackupFinal\t...           zip   

        created_date_utc    lastmodified_date filesize  confidentiality_score  
773  2024-06-05 02:00:00  2024-06-05 02:00:00      6MB                   0.25  
774  2024-04-20 19:00:00  2024-04-20 19:00:00    900KB                   0.15  
775  2024-03-30 15:00:00  2024-03-30 15:00:00    700KB                   0.35  
776  2024-05-30 23

In [None]:
def predict_file_sensitivity(filename, completefilepath, fileextension, filesize_str, logistic_model, rf_model, features_list, training_columns):
    """
    Score a single file with both trained regressors.

    The raw attributes are placed in a one-row frame, the size text is converted
    to kilobytes, the text columns are one-hot encoded, and the result is aligned
    to `training_columns` (columns absent from the input become 0, columns not in
    `training_columns` are dropped).

    Args:
        filename, completefilepath, fileextension: raw text attributes of the file.
        filesize_str: size text understood by convert_filesize_to_kb (e.g. "250KB").
        logistic_model: fitted model exposing predict(); produces the
            "linear_regression_prediction" entry.
        rf_model: fitted model exposing predict(); produces the
            "random_forest_prediction" entry.
        features_list: accepted for call compatibility; not read by this function.
        training_columns: column labels of the matrix the models were fitted on.

    Returns:
        dict with the keys "linear_regression_prediction" and
        "random_forest_prediction" (first element of each model's output) and
        "raw_input_features" (the un-encoded input row, including the derived
        'filesize_kb' value).
    """
    raw_row = pd.DataFrame(
        {
            'filename': [filename],
            'completefilepath': [completefilepath],
            'fileextension': [fileextension],
            'filesize': [filesize_str],
        }
    )
    raw_row['filesize_kb'] = raw_row['filesize'].apply(convert_filesize_to_kb)

    # Encode the same columns as at training time, then force the training layout.
    encoded_row = pd.get_dummies(
        raw_row, columns=['filename', 'completefilepath', 'fileextension']
    ).reindex(columns=training_columns, fill_value=0)

    linear_output = logistic_model.predict(encoded_row)
    forest_output = rf_model.predict(encoded_row)

    result = {
        "linear_regression_prediction": linear_output[0],
        "random_forest_prediction": forest_output[0],
        "raw_input_features": raw_row.to_dict('records')[0],
    }
    return result

In [None]:
if __name__ == '__main__':
  # Sample records to score: [file name, path, extension, size text].
  file_data = [
    ["john_doe_aadhar.pdf", "C:\\Users\\John\\Documents\\ID_Cards\\", "pdf", "250KB"],
    ["pancard_scan.png", "/home/user/private/docs/", "png", "350KB"],
    ["bank_statement_2024.pdf", "D:\\Financial\\Statements\\", "pdf", "1.2MB"],
    ["passwords_backup.txt", "C:\\Users\\Admin\\Desktop\\", "txt", "5KB"],
    ["secrets_env.key", "/etc/ssl/private/", "key", "2KB"],
    ["medical_record_jane.pdf", "E:\\Health\\Records\\", "pdf", "500KB"],
    ["salary_slip_march.docx", "D:\\HR\\Payroll\\", "docx", "300KB"],
    ["company_secrets.docx", "/home/user/confidential/", "docx", "100KB"],
    ["private_notes.csv", "C:\\Users\\Admin\\Downloads\\", "csv", "120KB"],
    ["id_verification_passport.jpg", "F:\\Documents\\Government_IDs\\", "jpg", "450KB"],
    ["server_logs.log", "/var/logs/", "log", "2MB"],
    ["user_manual.pdf", "C:\\Program Files\\App\\", "pdf", "1MB"],
    ["meeting_notes.docx", "C:\\Users\\Employee\\Documents\\", "docx", "600KB"],
    ["todo_list.txt", "/home/user/", "txt", "2KB"],
    ["grocery_list.csv", "D:\\Shopping\\", "csv", "5KB"],
    ["holiday_pictures.png", "E:\\Photos\\", "png", "700KB"],
    ["resume_john_doe.pdf", "/home/user/Documents/", "pdf", "400KB"],
    ["project_plan.xlsx", "C:\\Users\\ProjectManager\\Work\\", "xlsx", "800KB"],
    ["bank_details.docx", "D:\\Finance\\", "docx", "250KB"],
    ["internal_memo.pdf", "C:\\Company\\HR\\", "pdf", "300KB"],
    ["id_card_scan.jpg", "F:\\Security\\ID\\", "jpg", "500KB"],
    ["server_backup.key", "/etc/security/", "key", "4KB"],
    ["client_contract.pdf", "D:\\Legal\\", "pdf", "900KB"],
    ["biometric_data.csv", "C:\\Users\\Admin\\Bio\\", "csv", "350KB"],
    ["application_form.docx", "C:\\Users\\Applications\\", "docx", "200KB"],
    ["web_cache.log", "/var/www/cache/", "log", "5MB"],
    ["license_key.pem", "/home/user/licenses/", "pem", "3KB"],
    ["expense_report.xlsx", "D:\\Finance\\Reports\\", "xlsx", "1MB"],
    ["daily_journal.txt", "/home/user/docs/", "txt", "2KB"],
    ["employee_salaries.csv", "C:\\HR\\Payroll\\", "csv", "1.5MB"],
    ["tax_filing_2024.pdf", "D:\\Documents\\Tax\\", "pdf", "800KB"],
    ["customer_feedback.docx", "E:\\Customer_Service\\", "docx", "450KB"],
    ["medical_prescription.png", "F:\\Health\\", "png", "600KB"],
    ["passport_scan.pdf", "C:\\Users\\John\\Travel\\", "pdf", "750KB"],
    ["meeting_agenda.docx", "/home/office/meetings/", "docx", "200KB"],
    ["software_update.log", "/var/system/logs/", "log", "3MB"],
    ["driver_license.jpg", "D:\\Identity\\", "jpg", "300KB"],
    ["employee_reviews.docx", "C:\\HR\\Performance\\", "docx", "500KB"],
    ["confidential_budget.xlsx", "D:\\Finance\\", "xlsx", "2MB"],
    ["insurance_policy.pdf", "/home/user/insurance/", "pdf", "850KB"],
    ["server_config.key", "/etc/server/security/", "key", "1KB"],
    ["home_loan_agreement.docx", "D:\\Loans\\", "docx", "950KB"],
    ["personal_notes.txt", "C:\\Users\\John\\Desktop\\", "txt", "10KB"],
    ["business_proposal.pdf", "D:\\Work\\Proposals\\", "pdf", "1.1MB"],
    ["academic_records.csv", "/home/user/education/", "csv", "400KB"],
    ["photo_album.png", "E:\\Gallery\\", "png", "1MB"],
    ["script_backup.pem", "/home/user/scripts/", "pem", "2KB"],
    ["shipment_tracking.log", "/var/logs/shipping/", "log", "6MB"],
    ["network_diagram.jpg", "D:\\IT\\Networking\\", "jpg", "750KB"]
  ]

  # Score every sample with both models and print a three-part report per file.
  for filename, completefilepath, fileextension, filesize_str in file_data:
    predictions = predict_file_sensitivity(
        filename, completefilepath, fileextension, filesize_str,
        logistic_model, rf_model, features, training_columns,
    )
    linear_score = predictions['linear_regression_prediction']
    forest_score = predictions['random_forest_prediction']

    print("\n--- Prediction for Input File ---")
    print(f"Filename: {filename}")
    print(f"Filepath: {completefilepath}")
    print(f"File Extension: {fileextension}")
    print(f"Filesize: {filesize_str}")

    print("\n--- Model Predictions ---")
    print(f"Linear Regression Prediction (Confidentiality Score): {linear_score:.4f}")
    print(f"Random Forest Prediction (Confidentiality Score): {forest_score:.4f}")

    print("\n--- Raw Input Features Used for Prediction ---")
    for feature, value in predictions['raw_input_features'].items():
        print(f"  {feature}: {value}")


    # print("\n--- Test Case 2 ---")
    # test_filename_2 = "bhavyam_adhaar.jpg"
    # test_filepath_2 = "/home/user/Documents/personal_info/bhavyam_adhaar.jpg"
    # test_fileextension_2 = "jpg"
    # test_filesize_2 = "145KB"
    # predictions_2 = predict_file_sensitivity(test_filename_2, test_filepath_2, test_fileextension_2, test_filesize_2, logistic_model, rf_model, features, training_columns) # Pass training_columns here

    # print(f"Filename: {test_filename_2}")
    # print(f"Filepath: {test_filepath_2}")
    # print(f"File Extension: {test_fileextension_2}")
    # print(f"Filesize: {test_filesize_2}")
    # print("\n--- Model Predictions ---")
    # print(f"Linear Regression Prediction (Confidentiality Score): {predictions_2['linear_regression_prediction']:.4f}")
    # print(f"Random Forest Prediction (Confidentiality Score): {predictions_2['random_forest_prediction']:.4f}")
    # print("\n--- Raw Input Features Used for Prediction ---")
    # for feature, value in predictions_2['raw_input_features'].items():
    #     print(f"  {feature}: {value}")


--- Prediction for Input File ---
Filename: john_doe_aadhar.pdf
Filepath: C:\Users\John\Documents\ID_Cards\
File Extension: pdf
Filesize: 250KB

--- Model Predictions ---
Linear Regression Prediction (Confidentiality Score): 0.6149
Random Forest Prediction (Confidentiality Score): 0.7689

--- Raw Input Features Used for Prediction ---
  filename: john_doe_aadhar.pdf
  completefilepath: C:\Users\John\Documents\ID_Cards\
  fileextension: pdf
  filesize: 250KB
  filesize_kb: 250.0

--- Prediction for Input File ---
Filename: pancard_scan.png
Filepath: /home/user/private/docs/
File Extension: png
Filesize: 350KB

--- Model Predictions ---
Linear Regression Prediction (Confidentiality Score): 0.3383
Random Forest Prediction (Confidentiality Score): 0.4366

--- Raw Input Features Used for Prediction ---
  filename: pancard_scan.png
  completefilepath: /home/user/private/docs/
  fileextension: png
  filesize: 350KB
  filesize_kb: 350.0

--- Prediction for Input File ---
Filename: bank_statem

In [None]:
!pip install fuzzywuzzy

Collecting fuzzywuzzy
  Downloading fuzzywuzzy-0.18.0-py2.py3-none-any.whl.metadata (4.9 kB)
Downloading fuzzywuzzy-0.18.0-py2.py3-none-any.whl (18 kB)
Installing collected packages: fuzzywuzzy
Successfully installed fuzzywuzzy-0.18.0


In [None]:
from fuzzywuzzy import fuzz
import re

# File names regarded as sensitive; is_sensitive_fuzzy() compares candidate names
# against every entry with fuzzy string matching. Some names are listed more than once.
SENSITIVE_FILES = [
    # Credentials, secrets and system configuration files
    "passwords.txt", "credentials.json", "secrets.env", "config.ini", "apikeys.txt",
    "ssh_keys.pem", "id_rsa", ".htpasswd", ".bash_history", "shadow", "passwd",

    # Banking, tax and payroll documents
    "bank_statement.pdf", "credit_card_details.csv", "account_details.xlsx", "transaction_logs.db",
    "loan_agreement.docx", "rbi_compliance_report.pdf", "balance_sheet.xls", "income_tax_return.pdf",
    "gst_filing.docx", "audit_report.pdf", "financial_statements.csv", "pan_card.pdf",
    "aadhaar_details.pdf", "tax_payers_info.xlsx", "investment_portfolio.docx", "salary_slip.pdf",
    "payroll_data.xlsx", "form_16.pdf", "cheque_scan.jpeg", "bank_account_info.txt",

    # Medical and health-care records
    "patient_records.db", "medical_history.pdf", "health_insurance_policy.docx", "prescriptions.txt",
    "hospital_bills.csv", "covid_vaccine_certificate.pdf", "clinical_test_results.xls", "doctor_notes.docx",
    "hipaa_compliance_report.pdf", "pharmaceutical_research_data.docx",

    # Government, legal and identity documents, followed by education
    # certificates and further medical documents
    "govt_confidential.docx", "legal_agreement.pdf", "court_case_files.db", "passport_scan.jpg",
    "visa_application.pdf", "national_id_card.pdf", "election_voter_list.csv", "law_enforcement_data.docx",
    "classified_documents.pdf", "criminal_records.db", "army_secret_memo.docx", "diplomatic_correspondence.txt",
    "rti_responses.pdf","aadhar_card.pdf", "aadhaar_details.pdf", "pan_card.pdf", "passport_scan.jpg",
    "voter_id_card.pdf", "driving_license.jpg", "ration_card.pdf", "election_card.pdf",
    "national_identity_card.pdf", "jan_aadhaar_card.pdf",
    "ssc_marksheet.pdf", "hsc_marksheet.pdf", "graduation_degree.pdf",
    "post_graduation_certificate.pdf", "professional_certification.pdf",
    "school_leaving_certificate.pdf", "college_id_card.pdf",
    "medical_history.pdf", "health_insurance_policy.pdf", "vaccination_certificate.pdf",
    "covid_test_report.pdf", "doctor_prescription.pdf", "blood_donor_card.pdf",

    # Civil and family certificates
    "marriage_certificate.pdf", "birth_certificate.pdf", "death_certificate.pdf",
    "adoption_papers.pdf", "family_tree_document.pdf",

    # Corporate documents and personal information
    "company_strategy.pptx", "m&a_documents.pdf", "ipo_prospectus.docx", "nda_signed_agreement.pdf",
    "business_partners_list.csv", "trade_secrets.docx", "financial_forecast.xlsx", "market_research_reports.pdf",
    "client_contracts.docx", "rfp_response.docx", "competitive_analysis.xlsx", "employee_database.db",
    "company_policies.pdf", "intellectual_property.docx", "business_plans.ppt", "board_meeting_minutes.pdf",
    "social_security_numbers.csv", "driving_license_scan.jpg", "adhar_card_scan.pdf",
    "passport_details.txt", "personal_diary.docx", "home_address_list.txt", "email_contacts.csv",
    "phone_numbers.xlsx", "marriage_certificate.pdf", "birth_certificate.pdf", "voter_id_card.pdf",
    "personal_budget.xlsx",

    # Server, cloud and infrastructure configuration
    "server_config.yaml", "database_backup.sql", "log4j_logs.log", "aws_secrets.json",
    "azure_key_vault.txt", "google_cloud_keys.env", "system_logs.log", "firewall_rules.conf",
    "vpn_config.ovpn", "docker_compose.yml", ".env", "tls_certificate.crt", "private_key.pem"
]

# Keyword fragments that mark a file name as sensitive; looked up by
# is_sensitive_fuzzy(). Multi-word entries use '_' as the word separator.
SENSITIVE_KEYWORDS = [
    # Authentication, keys and security settings
    "password", "passcode", "passwd", "secret", "credential", "auth", "apikey",
    "ssh_key", "jwt", "encryption_key", "decryption_key", "private_key", "id_rsa",
    "access_token", "session_token", "security_question", "otp", "2fa", "mfa",
    "certificate", "ssl_certificate", "tls_key", "keystore", "firewall_rule",

    # Banking, tax and finance
    "bank_statement", "credit_card", "debit_card", "loan_agreement", "transaction_history",
    "tax_return", "income_tax", "irs_filing", "gst_filing", "audit_report", "account_number",
    "routing_number", "cheque_scan", "ifsc_code", "swift_code", "iban_number",
    "balance_sheet", "financial_statement", "investment_portfolio", "salary_slip",
    "form_16", "payroll", "pension_statement", "insurance_policy", "mutual_fund",
    "mortgage", "financial_forecast", "credit_report", "debt_repayment", "wire_transfer",

    # Medical and health insurance
    "medical_record", "health_insurance", "insurance_claim", "covid_certificate",
    "doctor_prescription", "patient_history", "clinical_test", "hipaa_compliance",
    "hospital_bill", "diagnostic_report", "lab_results", "blood_test", "vaccination_card",

    # Government identity and legal matters
    "aadhar", "aadhaar", "pan_card", "passport", "voter_id", "driving_license",
    "ssn", "social_security", "itin", "national_id", "govt_confidential",
    "law_enforcement", "court_case", "visa_application", "election_list",
    "citizenship_certificate", "diplomatic_document", "classified_data", "rti_response",
    "police_verification", "criminal_record", "immigration_papers", "residency_proof",

    # Corporate and business documents
    "company_policy", "business_plan", "nda_signed", "ipo_prospectus",
    "board_meeting_minutes", "client_contract", "rfp_response", "trade_secret",
    "market_research", "competitive_analysis", "m&a_document", "financial_projections",
    "business_strategy", "supplier_agreement", "confidential_memo",

    # Personal records and contact data
    "birth_certificate", "marriage_certificate", "death_certificate", "personal_diary",
    "home_address", "contact_list", "phone_number", "email_list", "customer_data",
    "ssn_scan", "adhar_scan", "passport_details", "personal_budget",

    # Infrastructure, backups and logs
    "server_config", "database_backup", "log4j_log", "aws_secret", "azure_key",
    "gcp_key", "vpn_config", "firewall_rules", "docker_compose", ".env",
    "tls_certificate", "system_logs", "system_config", "dns_zone", "domain_credentials",
    "database_dump", "system_backup", "backup_file", "error_log", "debug_log",
    "syslog", "server_logs", "system_state", "snapshot", "archive",

    # Whistleblowing, espionage, breaches and intellectual property
    "whistleblower", "witness_protection", "espionage", "government_sanctioned",
    "data_breach", "cybersecurity_report", "intellectual_property"
]

# File-name suffixes that make is_sensitive_fuzzy() report a file as sensitive.
SENSITIVE_EXTENSIONS = [
    ".pem",
    ".key",
    ".env",
    ".db",
    ".sql",
    ".bak",
]

# Path prefixes that make is_sensitive_fuzzy() report a file as sensitive.
SENSITIVE_DIRECTORIES = [
    "/etc/",
    "/var/log/",
    "/home/user/private/",
]

# Minimum fuzzy-match score a name must reach against a SENSITIVE_FILES entry.
THRESHOLD = 80

def normalize_filename(filename):
    """Return `filename` lower-cased, with every run of non-word characters or
    underscores collapsed to one space and outer whitespace removed."""
    spaced = re.sub(r'[\W_]+', ' ', filename)
    trimmed = spaced.strip()
    return trimmed.lower()

def is_sensitive_fuzzy(filename, filepath):
    """
    Decide whether a file looks sensitive from its name and location.

    The checks run in this order and the function returns True at the first hit:
      1. a SENSITIVE_KEYWORDS entry occurs in the file name,
      2. the normalized name fuzzy-matches a SENSITIVE_FILES entry (any of
         fuzz.ratio / fuzz.partial_ratio / fuzz.token_set_ratio >= THRESHOLD),
      3. the name ends with a SENSITIVE_EXTENSIONS suffix (case-insensitive),
      4. the path starts with a SENSITIVE_DIRECTORIES prefix.

    Parameters
    ----------
    filename : str
        Name of the file, including its extension.
    filepath : str
        Location of the file; only its leading part is inspected.

    Returns
    -------
    bool
        True if any check matches, otherwise False.
    """
    lowered_filename = filename.lower()
    clean_filename = normalize_filename(filename)

    for keyword in SENSITIVE_KEYWORDS:
        lowered_keyword = keyword.lower()
        # Literal lookup keeps separators significant ("pan_card", ".env").
        if lowered_keyword in lowered_filename:
            return True
        # normalize_filename() turns '_' and punctuation into spaces, so a
        # multi-word keyword has to be normalized the same way before it can be
        # found in the normalized name (e.g. "medical-record" vs "medical_record").
        # Single-word keywords are already covered by the literal lookup above.
        clean_keyword = normalize_filename(keyword)
        if ' ' in clean_keyword and clean_keyword in clean_filename:
            return True

    for sensitive_file in SENSITIVE_FILES:
        clean_sensitive_file = normalize_filename(sensitive_file)
        scores = (
            fuzz.ratio(clean_filename, clean_sensitive_file),
            fuzz.partial_ratio(clean_filename, clean_sensitive_file),
            fuzz.token_set_ratio(clean_filename, clean_sensitive_file),
        )
        if max(scores) >= THRESHOLD:
            return True

    # Compare suffixes in lower case so that "KEY.PEM" is treated like "key.pem".
    if lowered_filename.endswith(tuple(ext.lower() for ext in SENSITIVE_EXTENSIONS)):
        return True

    if filepath.startswith(tuple(SENSITIVE_DIRECTORIES)):
        return True

    return False

# Sample records used to exercise is_sensitive_fuzzy(): [file name, path, extension, size text].
file_data = [
    ["john_doe_aadhar.pdf", "C:\\Users\\John\\Documents\\ID_Cards\\", "pdf", "250KB"],
    ["pancard_scan.png", "/home/user/private/docs/", "png", "350KB"],
    ["bank_statement_2024.pdf", "D:\\Financial\\Statements\\", "pdf", "1.2MB"],
    ["passwords_backup.txt", "C:\\Users\\Admin\\Desktop\\", "txt", "5KB"],
    ["secrets_env.key", "/etc/ssl/private/", "key", "2KB"],
    ["medical_record_jane.pdf", "E:\\Health\\Records\\", "pdf", "500KB"],
    ["salary_slip_march.docx", "D:\\HR\\Payroll\\", "docx", "300KB"],
    ["company_secrets.docx", "/home/user/confidential/", "docx", "100KB"],
    ["private_notes.csv", "C:\\Users\\Admin\\Downloads\\", "csv", "120KB"],
    ["id_verification_passport.jpg", "F:\\Documents\\Government_IDs\\", "jpg", "450KB"],
    ["server_logs.log", "/var/logs/", "log", "2MB"],
    ["user_manual.pdf", "C:\\Program Files\\App\\", "pdf", "1MB"],
    ["meeting_notes.docx", "C:\\Users\\Employee\\Documents\\", "docx", "600KB"],
    ["todo_list.txt", "/home/user/", "txt", "2KB"],
    ["grocery_list.csv", "D:\\Shopping\\", "csv", "5KB"],
    ["holiday_pictures.png", "E:\\Photos\\", "png", "700KB"],
    ["resume_john_doe.pdf", "/home/user/Documents/", "pdf", "400KB"],
    ["project_plan.xlsx", "C:\\Users\\ProjectManager\\Work\\", "xlsx", "800KB"],
    ["bank_details.docx", "D:\\Finance\\", "docx", "250KB"],
    ["internal_memo.pdf", "C:\\Company\\HR\\", "pdf", "300KB"],
    ["id_card_scan.jpg", "F:\\Security\\ID\\", "jpg", "500KB"],
    ["server_backup.key", "/etc/security/", "key", "4KB"],
    ["client_contract.pdf", "D:\\Legal\\", "pdf", "900KB"],
    ["biometric_data.csv", "C:\\Users\\Admin\\Bio\\", "csv", "350KB"],
    ["application_form.docx", "C:\\Users\\Applications\\", "docx", "200KB"],
    ["web_cache.log", "/var/www/cache/", "log", "5MB"],
    ["license_key.pem", "/home/user/licenses/", "pem", "3KB"],
    ["expense_report.xlsx", "D:\\Finance\\Reports\\", "xlsx", "1MB"],
    ["daily_journal.txt", "/home/user/docs/", "txt", "2KB"],
    ["employee_salaries.csv", "C:\\HR\\Payroll\\", "csv", "1.5MB"],
    ["tax_filing_2024.pdf", "D:\\Documents\\Tax\\", "pdf", "800KB"],
    ["customer_feedback.docx", "E:\\Customer_Service\\", "docx", "450KB"],
    ["medical_prescription.png", "F:\\Health\\", "png", "600KB"],
    ["passport_scan.pdf", "C:\\Users\\John\\Travel\\", "pdf", "750KB"],
    ["meeting_agenda.docx", "/home/office/meetings/", "docx", "200KB"],
    ["software_update.log", "/var/system/logs/", "log", "3MB"],
    ["driver_license.jpg", "D:\\Identity\\", "jpg", "300KB"],
    ["employee_reviews.docx", "C:\\HR\\Performance\\", "docx", "500KB"],
    ["confidential_budget.xlsx", "D:\\Finance\\", "xlsx", "2MB"],
    ["insurance_policy.pdf", "/home/user/insurance/", "pdf", "850KB"],
    ["server_config.key", "/etc/server/security/", "key", "1KB"],
    ["home_loan_agreement.docx", "D:\\Loans\\", "docx", "950KB"],
    ["personal_notes.txt", "C:\\Users\\John\\Desktop\\", "txt", "10KB"],
    ["business_proposal.pdf", "D:\\Work\\Proposals\\", "pdf", "1.1MB"],
    ["academic_records.csv", "/home/user/education/", "csv", "400KB"],
    ["photo_album.png", "E:\\Gallery\\", "png", "1MB"],
    ["script_backup.pem", "/home/user/scripts/", "pem", "2KB"],
    ["shipment_tracking.log", "/var/logs/shipping/", "log", "6MB"],
    ["network_diagram.jpg", "D:\\IT\\Networking\\", "jpg", "750KB"]
]

# Print the rule-based verdict for every sample; extension and size are unpacked
# but not used by this check.
for filename, completefilepath, fileextension, filesize_str in file_data:
    is_sensitive = is_sensitive_fuzzy(filename, completefilepath)
    print("File Name:", filename, "Path:", completefilepath, is_sensitive)



File Name: john_doe_aadhar.pdf Path: C:\Users\John\Documents\ID_Cards\ True
File Name: pancard_scan.png Path: /home/user/private/docs/ True
File Name: bank_statement_2024.pdf Path: D:\Financial\Statements\ True
File Name: passwords_backup.txt Path: C:\Users\Admin\Desktop\ True
File Name: secrets_env.key Path: /etc/ssl/private/ True
File Name: medical_record_jane.pdf Path: E:\Health\Records\ False
File Name: salary_slip_march.docx Path: D:\HR\Payroll\ True
File Name: company_secrets.docx Path: /home/user/confidential/ True
File Name: private_notes.csv Path: C:\Users\Admin\Downloads\ False
File Name: id_verification_passport.jpg Path: F:\Documents\Government_IDs\ True
File Name: server_logs.log Path: /var/logs/ True
File Name: user_manual.pdf Path: C:\Program Files\App\ False
File Name: meeting_notes.docx Path: C:\Users\Employee\Documents\ False
File Name: todo_list.txt Path: /home/user/ False
File Name: grocery_list.csv Path: D:\Shopping\ False
File Name: holiday_pictures.png Path: E:\P

In [None]:
import os

import joblib

# joblib.dump() writes to the given path but does not create missing parent
# directories, so make sure the target folder exists on a fresh checkout.
os.makedirs("models", exist_ok=True)

# Save the models
joblib.dump(logistic_model, "models/linear_model.pkl")
joblib.dump(rf_model, "models/rf_model.pkl")

# Save the encoded column layout used for training (as a plain list) so that
# prediction inputs can be aligned to it after the models are reloaded.
joblib.dump(list(training_columns), "models/training_columns.pkl")