In [1]:
from pymongo import MongoClient
import pandas as pd

# Connect to MongoDB
client = MongoClient('mongodb://localhost:27017/')
db = client['your_database']  # Replace with your database name
collection = db['your_collection']  # Replace with your collection name


In [2]:
# Retrieve all documents from the collection
documents = collection.find()

# Convert the documents into a pandas DataFrame
test = pd.DataFrame(list(documents))

# Optionally, drop the '_id' column if you don't need it
if '_id' in test.columns:
    test.drop('_id', axis=1, inplace=True)


In [3]:
# Define the new column names
new_column_names = [
    'id', 'Gender', 'Customer Type', 'Age', 'Type of Travel',
    'Class', 'Flight Distance', 'Inflight wifi service',
    'Departure/Arrival time convenient', 'Ease of Online booking',
    'Gate location', 'Food and drink', 'Online boarding', 'Seat comfort',
    'Inflight entertainment', 'On-board service', 'Leg room service',
    'Baggage handling', 'Checkin service', 'Inflight service',
    'Cleanliness', 'Departure Delay in Minutes', 'Arrival Delay in Minutes',
    'satisfaction'
]

# Rename the columns of the DataFrame
test.columns = new_column_names


In [4]:
test = pd.read_csv('data/test.csv')
test.head()

Unnamed: 0.1,Unnamed: 0,id,Gender,Customer Type,Age,Type of Travel,Class,Flight Distance,Inflight wifi service,Departure/Arrival time convenient,...,Inflight entertainment,On-board service,Leg room service,Baggage handling,Checkin service,Inflight service,Cleanliness,Departure Delay in Minutes,Arrival Delay in Minutes,satisfaction
0,0,19556,Female,Loyal Customer,52,Business travel,Eco,160,5,4,...,5,5,5,5,2,5,5,50,44.0,satisfied
1,1,90035,Female,Loyal Customer,36,Business travel,Business,2863,1,1,...,4,4,4,4,3,4,5,0,0.0,satisfied
2,2,12360,Male,disloyal Customer,20,Business travel,Eco,192,2,0,...,2,4,1,3,2,2,2,0,0.0,neutral or dissatisfied
3,3,77959,Male,Loyal Customer,44,Business travel,Business,3377,0,0,...,1,1,1,1,3,1,4,0,6.0,satisfied
4,4,36875,Female,Loyal Customer,49,Business travel,Eco,1182,2,3,...,2,2,2,2,4,2,4,0,20.0,satisfied


In [5]:
train = pd.read_csv('data/train.csv')
train.head()

Unnamed: 0.1,Unnamed: 0,id,Gender,Customer Type,Age,Type of Travel,Class,Flight Distance,Inflight wifi service,Departure/Arrival time convenient,...,Inflight entertainment,On-board service,Leg room service,Baggage handling,Checkin service,Inflight service,Cleanliness,Departure Delay in Minutes,Arrival Delay in Minutes,satisfaction
0,0,70172,Male,Loyal Customer,13,Personal Travel,Eco Plus,460,3,4,...,5,4,3,4,4,5,5,25,18.0,neutral or dissatisfied
1,1,5047,Male,disloyal Customer,25,Business travel,Business,235,3,2,...,1,1,5,3,1,4,1,1,6.0,neutral or dissatisfied
2,2,110028,Female,Loyal Customer,26,Business travel,Business,1142,2,2,...,5,4,3,4,4,4,5,0,0.0,satisfied
3,3,24026,Female,Loyal Customer,25,Business travel,Business,562,2,5,...,2,2,5,3,1,4,2,11,9.0,neutral or dissatisfied
4,4,119299,Male,Loyal Customer,61,Business travel,Business,214,3,3,...,3,3,4,4,3,3,3,0,0.0,satisfied


In [6]:
# drop the first column
train.drop('Unnamed: 0', axis=1, inplace=True)
test.drop('Unnamed: 0', axis=1, inplace=True)
train.head()

Unnamed: 0,id,Gender,Customer Type,Age,Type of Travel,Class,Flight Distance,Inflight wifi service,Departure/Arrival time convenient,Ease of Online booking,...,Inflight entertainment,On-board service,Leg room service,Baggage handling,Checkin service,Inflight service,Cleanliness,Departure Delay in Minutes,Arrival Delay in Minutes,satisfaction
0,70172,Male,Loyal Customer,13,Personal Travel,Eco Plus,460,3,4,3,...,5,4,3,4,4,5,5,25,18.0,neutral or dissatisfied
1,5047,Male,disloyal Customer,25,Business travel,Business,235,3,2,3,...,1,1,5,3,1,4,1,1,6.0,neutral or dissatisfied
2,110028,Female,Loyal Customer,26,Business travel,Business,1142,2,2,2,...,5,4,3,4,4,4,5,0,0.0,satisfied
3,24026,Female,Loyal Customer,25,Business travel,Business,562,2,5,5,...,2,2,5,3,1,4,2,11,9.0,neutral or dissatisfied
4,119299,Male,Loyal Customer,61,Business travel,Business,214,3,3,3,...,3,3,4,4,3,3,3,0,0.0,satisfied


In [7]:
# compare the columns of the two DataFrames
print('Columns in train: ', train.columns)
print('Columns in test: ', test.columns)


Columns in train:  Index(['id', 'Gender', 'Customer Type', 'Age', 'Type of Travel', 'Class',
       'Flight Distance', 'Inflight wifi service',
       'Departure/Arrival time convenient', 'Ease of Online booking',
       'Gate location', 'Food and drink', 'Online boarding', 'Seat comfort',
       'Inflight entertainment', 'On-board service', 'Leg room service',
       'Baggage handling', 'Checkin service', 'Inflight service',
       'Cleanliness', 'Departure Delay in Minutes', 'Arrival Delay in Minutes',
       'satisfaction'],
      dtype='object')
Columns in test:  Index(['id', 'Gender', 'Customer Type', 'Age', 'Type of Travel', 'Class',
       'Flight Distance', 'Inflight wifi service',
       'Departure/Arrival time convenient', 'Ease of Online booking',
       'Gate location', 'Food and drink', 'Online boarding', 'Seat comfort',
       'Inflight entertainment', 'On-board service', 'Leg room service',
       'Baggage handling', 'Checkin service', 'Inflight service',
       'Cleanli

In [8]:
# check the shape of the two DataFrames
print('Shape of train: ', train.shape)
print('Shape of test: ', test.shape)



Shape of train:  (103904, 24)
Shape of test:  (25976, 24)


In [9]:
# data preprocessing:
# 1. check for missing values

# check for missing values in the train DataFrame
missing_values_train = train.isnull().sum()
print('Missing values in train: ')
print(missing_values_train[missing_values_train > 0])

# check for missing values in the test DataFrame
missing_values_test = test.isnull().sum()
print('Missing values in test: ')
print(missing_values_test[missing_values_test > 0])

# 2. check for duplicates
# check for duplicates in the train DataFrame
duplicates_train = train.duplicated().sum()
print('Number of duplicates in train: ', duplicates_train)

# check for duplicates in the test DataFrame
duplicates_test = test.duplicated().sum()
print('Number of duplicates in test: ', duplicates_test)

# 3. check for unique values
# check for unique values in the train DataFrame
unique_values_train = train.nunique()
print('Unique values in train: ')
print(unique_values_train)

# check for unique values in the test DataFrame
unique_values_test = test.nunique()
print('Unique values in test: ')
print(unique_values_test)

# 4. check for data types
# check the data types of the train DataFrame
data_types_train = train.dtypes
print('Data types in train: ')
print(data_types_train)



Missing values in train: 
Arrival Delay in Minutes    310
dtype: int64
Missing values in test: 
Arrival Delay in Minutes    83
dtype: int64
Number of duplicates in train:  0
Number of duplicates in test:  0
Unique values in train: 
id                                   103904
Gender                                    2
Customer Type                             2
Age                                      75
Type of Travel                            2
Class                                     3
Flight Distance                        3802
Inflight wifi service                     6
Departure/Arrival time convenient         6
Ease of Online booking                    6
Gate location                             6
Food and drink                            6
Online boarding                           6
Seat comfort                              6
Inflight entertainment                    6
On-board service                          6
Leg room service                          6
Baggage handling    

In [10]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.impute import SimpleImputer

# Assuming 'train' and 'test' are your pandas DataFrames

# 5. Handle Missing Values
# Impute missing values with the median for 'Arrival Delay in Minutes'
imputer = SimpleImputer(strategy='median')
train['Arrival Delay in Minutes'] = imputer.fit_transform(train[['Arrival Delay in Minutes']])
test['Arrival Delay in Minutes'] = imputer.transform(test[['Arrival Delay in Minutes']])

# 6. Convert Categorical Variables into One-Hot Encoding
categorical_vars = ['Gender', 'Customer Type', 'Type of Travel', 'Class', 'satisfaction']
onehot_encoder = OneHotEncoder(sparse_output=False, drop='first') # drop='first' to avoid dummy variable trap

# Fit_transform on train, transform on test
train_encoded = onehot_encoder.fit_transform(train[categorical_vars])
test_encoded = onehot_encoder.transform(test[categorical_vars])

# Convert encoded data into DataFrame
train_encoded_df = pd.DataFrame(train_encoded, columns=onehot_encoder.get_feature_names_out(categorical_vars))
test_encoded_df = pd.DataFrame(test_encoded, columns=onehot_encoder.get_feature_names_out(categorical_vars))

# Drop original categorical columns and concatenate the new one-hot encoded columns
train = train.drop(categorical_vars, axis=1).reset_index(drop=True)
test = test.drop(categorical_vars, axis=1).reset_index(drop=True)
train = pd.concat([train, train_encoded_df], axis=1)
test = pd.concat([test, test_encoded_df], axis=1)
train.head()


Unnamed: 0,id,Age,Flight Distance,Inflight wifi service,Departure/Arrival time convenient,Ease of Online booking,Gate location,Food and drink,Online boarding,Seat comfort,...,Inflight service,Cleanliness,Departure Delay in Minutes,Arrival Delay in Minutes,Gender_Male,Customer Type_disloyal Customer,Type of Travel_Personal Travel,Class_Eco,Class_Eco Plus,satisfaction_satisfied
0,70172,13,460,3,4,3,1,5,3,5,...,5,5,25,18.0,1.0,0.0,1.0,0.0,1.0,0.0
1,5047,25,235,3,2,3,3,1,3,1,...,4,1,1,6.0,1.0,1.0,0.0,0.0,0.0,0.0
2,110028,26,1142,2,2,2,2,5,5,5,...,4,5,0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
3,24026,25,562,2,5,5,5,2,2,2,...,4,2,11,9.0,0.0,0.0,0.0,0.0,0.0,0.0
4,119299,61,214,3,3,3,3,4,5,5,...,3,3,0,0.0,1.0,0.0,0.0,0.0,0.0,1.0


In [11]:
train.columns

Index(['id', 'Age', 'Flight Distance', 'Inflight wifi service',
       'Departure/Arrival time convenient', 'Ease of Online booking',
       'Gate location', 'Food and drink', 'Online boarding', 'Seat comfort',
       'Inflight entertainment', 'On-board service', 'Leg room service',
       'Baggage handling', 'Checkin service', 'Inflight service',
       'Cleanliness', 'Departure Delay in Minutes', 'Arrival Delay in Minutes',
       'Gender_Male', 'Customer Type_disloyal Customer',
       'Type of Travel_Personal Travel', 'Class_Eco', 'Class_Eco Plus',
       'satisfaction_satisfied'],
      dtype='object')

In [12]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

# Assuming 'train' and 'test' are your pandas DataFrames after preprocessing steps 1 to 6

# Normalize Numerical Features (excluding 'id' and one-hot encoded features)
numerical_vars = ['Age', 'Flight Distance', 'Departure Delay in Minutes', 'Arrival Delay in Minutes']
scaler = StandardScaler()

# Apply the transformation to both train and test sets
train[numerical_vars] = scaler.fit_transform(train[numerical_vars])
test[numerical_vars] = scaler.transform(test[numerical_vars])

# Split the Data - Adjusting for the actual target variable name
X = train.drop(['id', 'satisfaction_satisfied'], axis=1)  # Drop 'id' and the target variable
y = train['satisfaction_satisfied']  # Correct target variable

# Splitting the dataset into training and validation sets
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.20, random_state=42)

# Now, the test set should also exclude 'id' but will not have the target variable 'satisfaction_satisfied' to drop
test = test.drop(['id'], axis=1)  # Only drop 'id' as the test set doesn't include the target variable



In [13]:
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import accuracy_score

# 2. Create the KNN model
knn = KNeighborsClassifier(n_neighbors=5)

# 3. Train the model on the training set
knn.fit(X_train, y_train)

# 4. Predict on the validation set
y_pred = knn.predict(X_val)

# Evaluate the model
accuracy = accuracy_score(y_val, y_pred)
print(f"Validation Accuracy: {accuracy:.4f}")


Validation Accuracy: 0.9271


In [14]:
# save the model to disk
import pickle
model_filename = 'knn_model.pkl'
pickle.dump(knn, open(model_filename, 'wb'))

# save the test set to disk
test_filename = 'test_normalized.csv'
test.to_csv(test_filename, index=False)

In [15]:
# convert the test set to a JSON object
test_json = test.to_json(orient='records')

# save the JSON object to disk
test_json_filename = 'test_normalized.json'
with open(test_json_filename, 'w') as file:
    file.write(test_json)

    

In [16]:
test_pred = test.drop(['satisfaction_satisfied'], axis=1)
test_pred_json = test_pred.to_json(orient='records')

# save the JSON object to disk
test_pred_json_filename = 'test_pred.json'
with open(test_pred_json_filename, 'w') as file:
    file.write(test_pred_json)

In [17]:
test.head()

Unnamed: 0,Age,Flight Distance,Inflight wifi service,Departure/Arrival time convenient,Ease of Online booking,Gate location,Food and drink,Online boarding,Seat comfort,Inflight entertainment,...,Inflight service,Cleanliness,Departure Delay in Minutes,Arrival Delay in Minutes,Gender_Male,Customer Type_disloyal Customer,Type of Travel_Personal Travel,Class_Eco,Class_Eco Plus,satisfaction_satisfied
0,0.834958,-1.032398,5,4,3,4,3,4,3,5,...,5,5,0.920317,0.74688,0.0,0.0,0.0,1.0,0.0,1.0
1,-0.223601,1.678348,1,1,3,1,5,4,5,4,...,4,5,-0.387532,-0.391554,0.0,0.0,0.0,0.0,0.0,1.0
2,-1.28216,-1.000307,2,0,2,4,2,2,2,2,...,2,2,-0.387532,-0.391554,1.0,1.0,0.0,1.0,0.0,0.0
3,0.305678,2.193821,0,0,0,2,3,4,4,1,...,1,4,-0.387532,-0.236313,1.0,0.0,0.0,0.0,0.0,1.0
4,0.636478,-0.00747,2,3,4,3,4,1,2,2,...,2,4,-0.387532,0.125916,0.0,0.0,0.0,1.0,0.0,1.0
