# Introduction

In this notebook, we compare the success rates of different ML methods at predicting performance in Airport Scanner.

In [None]:
# Install the libraries required for Google Sheets access (uncomment the line below if they are not already installed)
# !pip install gspread google-auth google-auth-oauthlib google-auth-httplib2



In [58]:
# Set Up
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

In [20]:
# Authenticate the Colab user, build a gspread client, and mount Google Drive.
import google.auth
import gspread
from google.auth.transport.requests import Request
from google.colab import auth, drive
from google.oauth2 import service_account

# Sign the current Colab user in
auth.authenticate_user()

# Fetch the default credentials, scope them for Sheets and Drive,
# and hand them to gspread
creds, project = google.auth.default()
creds = creds.with_scopes(
    [
        'https://spreadsheets.google.com/feeds',
        'https://www.googleapis.com/auth/drive',
    ]
)
gc = gspread.authorize(creds)

# Mount Google Drive under /content/drive (the data is read from there)
drive.mount('/content/drive')


# Preprocessing and Feature Engineering

In [54]:
# Read the first worksheet of the 'feature_info' Google Sheet
# (feature inclusion flags and other per-column info) into a DataFrame
worksheet = gc.open('feature_info').sheet1
feature_info = pd.DataFrame(worksheet.get_all_records())

# Read the dataset from the mounted Google Drive
raw_df = pd.read_csv('/content/drive/My Drive/ASDB/fixed_ASDB_1%_Sample.csv')

# Cross-check the column names listed in the sheet against the dataset's columns
feature_columns = feature_info['ColumnName'].tolist()
columns_dataset = raw_df.columns.tolist()

# missing_columns: dataset columns the sheet does not list
# extra_columns:   sheet entries that are not columns of the dataset
missing_columns = set(columns_dataset).difference(feature_columns)
extra_columns = set(feature_columns).difference(columns_dataset)

print("Missing columns in feature_info: ", missing_columns)
print("Extra columns in feature_info: ", extra_columns)

Missing columns in feature_info:  set()
Extra columns in feature_info:  set()


In [55]:
# Keep only the rows whose 'include' flag equals 1
feature_info = feature_info.loc[feature_info['include'].eq(1)]

# Map each feature type to the list of column names carrying that type,
# in order of first appearance in the sheet
feature_groups = {}
for feature_type in feature_info['feature type'].unique():
    feature_groups[feature_type] = (
        feature_info
        .loc[feature_info['feature type'].eq(feature_type), 'ColumnName']
        .tolist()
    )

# Display the filtered feature info
feature_info

Unnamed: 0,ColumnName,include,feature type
1,UserId,1,identity
2,BagId,1,identity
9,RankId,1,level
11,AirportId,1,level
12,Passengers,1,sequence
...,...,...,...
203,Legal20Marked,1,performance
204,Legal20X,1,trial feature
205,Legal20Y,1,trial feature
206,Legal20Z,1,trial feature


In [59]:
# Exploratory overview of the included columns: summary statistics,
# per-column distributions, correlation matrix, and missing-value counts.

# Subset the data to the included columns, flattening the per-type groups in order
included_columns = [
    column_name
    for group_columns in feature_groups.values()
    for column_name in group_columns
]
df = raw_df[included_columns]

# 1. Summary statistics
print(df.describe())

# 2. Distribution of every numeric column.
# Selecting on 'number' (rather than only float64/int64) also covers other
# numeric widths such as int32/float32.
numeric_df = df.select_dtypes(include='number')
for column in numeric_df.columns:
    fig, ax = plt.subplots(figsize=(10, 6))
    sns.histplot(numeric_df[column], bins=50, ax=ax)
    ax.set_title(f'Distribution of {column}')
    plt.show()
    plt.close(fig)  # release the figure so a long column list does not pile up open figures

# 3. Correlation matrix.
# Restrict to numeric/bool columns first: on pandas >= 2.0, DataFrame.corr()
# defaults to numeric_only=False and raises on columns that cannot be
# converted to float (e.g. string identifiers).
correlation_matrix = df.select_dtypes(include=['number', 'bool']).corr()
if correlation_matrix.empty:
    # Nothing to draw; a heatmap of an empty matrix would only raise
    print("No numeric columns available for a correlation matrix.")
else:
    fig, ax = plt.subplots(figsize=(14, 10))
    sns.heatmap(correlation_matrix, annot=True, cmap='coolwarm', ax=ax)
    ax.set_title('Correlation Matrix')
    plt.show()
    plt.close(fig)

# 4. Missing values per column
missing_data = df.isnull().sum()
print("Missing values for each column:")
print(missing_data)

Output hidden; open in https://colab.research.google.com to view.