# Importing Necessary Modules

In [1]:
import pandas as pd
from sklearn.preprocessing import LabelEncoder, MinMaxScaler, StandardScaler
from sklearn.metrics import accuracy_score, confusion_matrix
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier
from sklearn.linear_model import LogisticRegression

# Loading Data

In [2]:
# Read the training set from train.csv (path is relative to the working directory).
df = pd.read_csv('train.csv')
# Bare last expression: the notebook renders the frame as the cell output.
df

Unnamed: 0,PassengerId,HomePlanet,CryoSleep,Cabin,Destination,Age,VIP,RoomService,FoodCourt,ShoppingMall,Spa,VRDeck,Name,Transported
0,0001_01,Europa,False,B/0/P,TRAPPIST-1e,39.0,False,0.0,0.0,0.0,0.0,0.0,Maham Ofracculy,False
1,0002_01,Earth,False,F/0/S,TRAPPIST-1e,24.0,False,109.0,9.0,25.0,549.0,44.0,Juanna Vines,True
2,0003_01,Europa,False,A/0/S,TRAPPIST-1e,58.0,True,43.0,3576.0,0.0,6715.0,49.0,Altark Susent,False
3,0003_02,Europa,False,A/0/S,TRAPPIST-1e,33.0,False,0.0,1283.0,371.0,3329.0,193.0,Solam Susent,False
4,0004_01,Earth,False,F/1/S,TRAPPIST-1e,16.0,False,303.0,70.0,151.0,565.0,2.0,Willy Santantines,True
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
8688,9276_01,Europa,False,A/98/P,55 Cancri e,41.0,True,0.0,6819.0,0.0,1643.0,74.0,Gravior Noxnuther,False
8689,9278_01,Earth,True,G/1499/S,PSO J318.5-22,18.0,False,0.0,0.0,0.0,0.0,0.0,Kurta Mondalley,False
8690,9279_01,Earth,False,G/1500/S,TRAPPIST-1e,26.0,False,0.0,0.0,1872.0,1.0,0.0,Fayey Connon,True
8691,9280_01,Europa,False,E/608/S,55 Cancri e,32.0,False,0.0,1049.0,0.0,353.0,3235.0,Celeon Hontichre,False


In [3]:
# Print per-column dtypes and non-null counts for the training frame.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 8693 entries, 0 to 8692
Data columns (total 14 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   PassengerId   8693 non-null   object 
 1   HomePlanet    8492 non-null   object 
 2   CryoSleep     8476 non-null   object 
 3   Cabin         8494 non-null   object 
 4   Destination   8511 non-null   object 
 5   Age           8514 non-null   float64
 6   VIP           8490 non-null   object 
 7   RoomService   8512 non-null   float64
 8   FoodCourt     8510 non-null   float64
 9   ShoppingMall  8485 non-null   float64
 10  Spa           8510 non-null   float64
 11  VRDeck        8505 non-null   float64
 12  Name          8493 non-null   object 
 13  Transported   8693 non-null   bool   
dtypes: bool(1), float64(6), object(7)
memory usage: 891.5+ KB


# Preprocessing 

In [4]:
def Dropper(df):
    """Return a copy of ``df`` without the ``PassengerId`` column.

    When the column is absent the result is simply a copy of the input.
    The caller's frame is never modified.
    """
    # True for every column that should survive; all True if the id is absent.
    keep_mask = df.columns != 'PassengerId'
    return df.loc[:, keep_mask].copy()


def Imputer(df):
    """Return a copy of ``df`` with missing values filled column by column.

    Object, string and categorical columns are filled with their most
    frequent value; every other column is filled with its mean.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to impute. It is not modified.

    Returns
    -------
    pandas.DataFrame
        Copy of ``df`` with the fills applied. A label-like column that has
        no observed values is returned unchanged because it has no mode.
    """
    imputed_df = df.copy()
    for col in imputed_df.columns:
        series = imputed_df[col]
        dtype = series.dtype
        # Comparing against 'object' alone misses pandas' dedicated string
        # dtype and categoricals, which would then reach .mean() and raise.
        is_label_like = (
            pd.api.types.is_object_dtype(dtype)
            or pd.api.types.is_string_dtype(dtype)
            or isinstance(dtype, pd.CategoricalDtype)
        )
        if is_label_like:
            modes = series.mode()
            if modes.empty:
                # Nothing observed in this column, so there is no mode to use.
                continue
            fill_value = modes.iloc[0]
        else:
            fill_value = series.mean()
        imputed_df[col] = series.fillna(fill_value)

    return imputed_df


def Encoder(df):
    """Return a copy of ``df`` with non-numeric columns label-encoded.

    Object, boolean, string and categorical columns are each replaced by the
    integer codes produced by a scikit-learn ``LabelEncoder`` fitted on that
    column. Other columns are passed through unchanged.

    The encoders are fitted on the frame passed in and then discarded, so the
    codes from one call are not guaranteed to match those from another call
    on a different frame.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to encode. It is not modified.

    Returns
    -------
    pandas.DataFrame
        Copy of ``df`` with the selected columns replaced by integer codes.
    """
    labeled_df = df.copy()
    for col in labeled_df.columns:
        dtype = labeled_df[col].dtype
        # A plain comparison with 'object'/'bool' skips pandas' string and
        # categorical dtypes, leaving raw labels in the output frame.
        needs_encoding = (
            pd.api.types.is_object_dtype(dtype)
            or pd.api.types.is_bool_dtype(dtype)
            or pd.api.types.is_string_dtype(dtype)
            or isinstance(dtype, pd.CategoricalDtype)
        )
        if needs_encoding:
            # A fresh encoder per column keeps each mapping independent.
            labeled_df[col] = LabelEncoder().fit_transform(labeled_df[col])

    return labeled_df


def Normalizer(df):
    """Return ``df`` rescaled with a freshly fitted ``MinMaxScaler``.

    The scaler is created with default settings, fitted on ``df`` itself and
    then discarded, so every call scales relative to the frame it receives.

    Parameters
    ----------
    df : pandas.DataFrame
        All-numeric frame to scale.

    Returns
    -------
    pandas.DataFrame
        Scaled values carrying the same column labels and row index as ``df``.
    """
    scaler = MinMaxScaler()
    # The scaler's output is re-wrapped here, so both axes are restored.
    # Leaving out the index would renumber rows 0..n-1 and break label
    # alignment for any frame that does not already use a default index.
    scaled_df = pd.DataFrame(
        scaler.fit_transform(df), columns=df.columns, index=df.index
    )

    return scaled_df


def SNormalizer(df):
    """Return ``df`` standardised with a freshly fitted ``StandardScaler``.

    The scaler is created with default settings, fitted on ``df`` itself and
    then discarded, so every call scales relative to the frame it receives.

    Parameters
    ----------
    df : pandas.DataFrame
        All-numeric frame to scale.

    Returns
    -------
    pandas.DataFrame
        Scaled values carrying the same column labels and row index as ``df``.
    """
    scaler = StandardScaler()
    # The scaler's output is re-wrapped here, so both axes are restored.
    # Leaving out the index would renumber rows 0..n-1 and break label
    # alignment for any frame that does not already use a default index.
    scaled_df = pd.DataFrame(
        scaler.fit_transform(df), columns=df.columns, index=df.index
    )

    return scaled_df

In [5]:
# Split the training frame into features (X) and the target column (Y).
X = df.drop(columns= ['Transported'])
Y = df['Transported']
# Run the preprocessing helpers in order: drop the id column, fill missing
# values, label-encode non-numeric columns, then min-max scale.
dr = Dropper(X)
im = Imputer(dr)
en = Encoder(im)
nm = Normalizer(en)
# Display the fully preprocessed feature matrix.
nm

  imputed_df[i] = imputed_df[i].fillna(imputed_df[i].mode()[0])


Unnamed: 0,HomePlanet,CryoSleep,Cabin,Destination,Age,VIP,RoomService,FoodCourt,ShoppingMall,Spa,VRDeck,Name
0,0.5,0.0,0.022717,1.0,0.493671,0.0,0.000000,0.000000,0.000000,0.000000,0.000000,0.619924
1,0.0,0.0,0.332978,1.0,0.303797,0.0,0.007608,0.000302,0.001064,0.024500,0.001823,0.531398
2,0.5,0.0,0.000152,1.0,0.734177,1.0,0.003001,0.119948,0.000000,0.299670,0.002030,0.053942
3,0.5,0.0,0.000152,1.0,0.417722,0.0,0.000000,0.043035,0.015793,0.148563,0.007997,0.843839
4,0.0,0.0,0.333283,1.0,0.202532,0.0,0.021149,0.002348,0.006428,0.025214,0.000083,0.981941
...,...,...,...,...,...,...,...,...,...,...,...,...
8688,0.5,0.0,0.022259,0.0,0.518987,1.0,0.000000,0.228726,0.000000,0.073322,0.003066,0.415958
8689,0.0,1.0,0.805001,0.5,0.227848,0.0,0.000000,0.000000,0.000000,0.000000,0.000000,0.564212
8690,0.0,0.0,0.805763,1.0,0.329114,0.0,0.000000,0.000000,0.079687,0.000045,0.000000,0.354344
8691,0.5,0.0,0.324897,0.0,0.405063,0.0,0.000000,0.035186,0.000000,0.015753,0.134049,0.188385


# Training Model

In [6]:
# Logistic regression using the liblinear solver with C=0.1 and balanced
# class weights; random_state is pinned to 42.
LR = LogisticRegression(random_state= 42, solver='liblinear', C=0.1, class_weight='balanced')
# Fit on the scaled training features and the Transported target.
LR.fit(nm, Y)

In [7]:
# RFC = RandomForestClassifier(
#     n_estimators=200, 
#     max_depth=10, 
#     min_samples_split=5, 
#     min_samples_leaf=2, 
#     max_features="sqrt", 
#     bootstrap=True, 
#     random_state=42
# )
# RFC.fit(nm, Y)

# Calculating Training Accuracy

In [8]:
# train_pred = RFC.predict(nm)

In [9]:
# Predict on the same feature matrix the model was fitted on.
train_pred = LR.predict(nm)

In [10]:
# Accuracy of the predictions for the training rows against their own labels,
# so this measures fit rather than performance on unseen rows.
train_accu = accuracy_score(Y, train_pred)
train_accu

0.734844127458875

# Loading Test Data

In [11]:
# Read the test features from test.csv (path is relative to the working directory).
test_df = pd.read_csv('test.csv')
# Display the loaded frame.
test_df

Unnamed: 0,PassengerId,HomePlanet,CryoSleep,Cabin,Destination,Age,VIP,RoomService,FoodCourt,ShoppingMall,Spa,VRDeck,Name
0,0013_01,Earth,True,G/3/S,TRAPPIST-1e,27.0,False,0.0,0.0,0.0,0.0,0.0,Nelly Carsoning
1,0018_01,Earth,False,F/4/S,TRAPPIST-1e,19.0,False,0.0,9.0,0.0,2823.0,0.0,Lerome Peckers
2,0019_01,Europa,True,C/0/S,55 Cancri e,31.0,False,0.0,0.0,0.0,0.0,0.0,Sabih Unhearfus
3,0021_01,Europa,False,C/1/S,TRAPPIST-1e,38.0,False,0.0,6652.0,0.0,181.0,585.0,Meratz Caltilter
4,0023_01,Earth,False,F/5/S,TRAPPIST-1e,20.0,False,10.0,0.0,635.0,0.0,0.0,Brence Harperez
...,...,...,...,...,...,...,...,...,...,...,...,...,...
4272,9266_02,Earth,True,G/1496/S,TRAPPIST-1e,34.0,False,0.0,0.0,0.0,0.0,0.0,Jeron Peter
4273,9269_01,Earth,False,,TRAPPIST-1e,42.0,False,0.0,847.0,17.0,10.0,144.0,Matty Scheron
4274,9271_01,Mars,True,D/296/P,55 Cancri e,,False,0.0,0.0,0.0,0.0,0.0,Jayrin Pore
4275,9273_01,Europa,False,D/297/P,,,False,0.0,2680.0,0.0,0.0,523.0,Kitakan Conale


In [12]:
# Read PassengerId/Transported pairs from sample_submission.csv.
# NOTE(review): the file name and the all-False rows shown in the output
# suggest this is a submission template rather than ground-truth labels.
# Confirm before using it for evaluation.
test_y = pd.read_csv('sample_submission.csv')
test_y

Unnamed: 0,PassengerId,Transported
0,0013_01,False
1,0018_01,False
2,0019_01,False
3,0021_01,False
4,0023_01,False
...,...,...
4272,9266_02,False
4273,9269_01,False
4274,9271_01,False
4275,9273_01,False


In [13]:
# Join the test features with the frame read from sample_submission.csv on
# PassengerId. pd.merge defaults to an inner join, so an id missing from
# either side would be dropped from the result.
test_data = pd.merge(test_df, test_y, on= 'PassengerId')
test_data

Unnamed: 0,PassengerId,HomePlanet,CryoSleep,Cabin,Destination,Age,VIP,RoomService,FoodCourt,ShoppingMall,Spa,VRDeck,Name,Transported
0,0013_01,Earth,True,G/3/S,TRAPPIST-1e,27.0,False,0.0,0.0,0.0,0.0,0.0,Nelly Carsoning,False
1,0018_01,Earth,False,F/4/S,TRAPPIST-1e,19.0,False,0.0,9.0,0.0,2823.0,0.0,Lerome Peckers,False
2,0019_01,Europa,True,C/0/S,55 Cancri e,31.0,False,0.0,0.0,0.0,0.0,0.0,Sabih Unhearfus,False
3,0021_01,Europa,False,C/1/S,TRAPPIST-1e,38.0,False,0.0,6652.0,0.0,181.0,585.0,Meratz Caltilter,False
4,0023_01,Earth,False,F/5/S,TRAPPIST-1e,20.0,False,10.0,0.0,635.0,0.0,0.0,Brence Harperez,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4272,9266_02,Earth,True,G/1496/S,TRAPPIST-1e,34.0,False,0.0,0.0,0.0,0.0,0.0,Jeron Peter,False
4273,9269_01,Earth,False,,TRAPPIST-1e,42.0,False,0.0,847.0,17.0,10.0,144.0,Matty Scheron,False
4274,9271_01,Mars,True,D/296/P,55 Cancri e,,False,0.0,0.0,0.0,0.0,0.0,Jayrin Pore,False
4275,9273_01,Europa,False,D/297/P,,,False,0.0,2680.0,0.0,0.0,523.0,Kitakan Conale,False


# Preprocessing Test Data

In [14]:
# Split the merged test frame into features and the Transported column.
test_X = test_data.drop(columns= 'Transported')
test_Y = test_data['Transported']

In [15]:
# Apply the same helper chain used for the training features:
# Dropper -> Imputer -> Encoder -> Normalizer.
# NOTE(review): Encoder and Normalizer fit new transformers on whatever frame
# they receive, so the test set is encoded and scaled from its own values
# rather than with the mappings/ranges learned from the training frame.
# Consider fitting once on the training data and reusing those transformers.
test_nm = Normalizer(Encoder((Imputer(Dropper(test_X)))))
test_nm

  imputed_df[i] = imputed_df[i].fillna(imputed_df[i].mode()[0])


Unnamed: 0,HomePlanet,CryoSleep,Cabin,Destination,Age,VIP,RoomService,FoodCourt,ShoppingMall,Spa,VRDeck,Name
0,0.0,1.0,0.852941,1.0,0.341772,0.0,0.000000,0.000000,0.00000,0.000000,0.000000,0.697485
1,0.0,0.0,0.571998,1.0,0.240506,0.0,0.000000,0.000356,0.00000,0.142260,0.000000,0.576287
2,0.5,1.0,0.078738,0.0,0.392405,0.0,0.000000,0.000000,0.00000,0.000000,0.000000,0.808623
3,0.5,0.0,0.079350,1.0,0.481013,0.0,0.000000,0.263206,0.00000,0.009121,0.026266,0.649341
4,0.0,0.0,0.594363,1.0,0.253165,0.0,0.000865,0.000000,0.07658,0.000000,0.000000,0.160000
...,...,...,...,...,...,...,...,...,...,...,...,...
4272,0.0,1.0,0.820772,1.0,0.430380,0.0,0.000000,0.000000,0.00000,0.000000,0.000000,0.504192
4273,0.0,0.0,0.824449,1.0,0.531646,0.0,0.000000,0.033514,0.00205,0.000504,0.006466,0.635689
4274,1.0,1.0,0.184436,0.0,0.362761,0.0,0.000000,0.000000,0.00000,0.000000,0.000000,0.499401
4275,0.5,0.0,0.184743,1.0,0.362761,0.0,0.000000,0.106042,0.00000,0.000000,0.023482,0.557365


# Predicting on Test Data

In [16]:
# test_pred = RFC.predict(test_nm)

In [None]:
# Predict Transported for the preprocessed test features with the fitted model.
test_pred = LR.predict(test_nm)

0        True
1       False
2        True
3       False
4       False
        ...  
4272     True
4273    False
4274     True
4275    False
4276     True
Length: 4277, dtype: bool

# Calculating Test Accuracy

In [18]:
# Compare the test predictions with test_Y, which was taken from the
# sample_submission.csv merge.
# NOTE(review): if that file is only a placeholder template, this number is
# agreement with the placeholder and not a real test accuracy.
# Confirm the source of the labels.
test_accu = accuracy_score(test_Y, test_pred)
test_accu

0.6104746317512275

In [27]:
# Build the submission frame: each test PassengerId paired with its prediction.
# pd.Series(test_pred) gets a fresh 0..n-1 index and is matched to the
# PassengerId column by index label when the frame is constructed.
data = pd.DataFrame({'PassengerId': test_X['PassengerId'], 'Transported': pd.Series(test_pred)})
data

Unnamed: 0,PassengerId,Transported
0,0013_01,True
1,0018_01,False
2,0019_01,True
3,0021_01,False
4,0023_01,False
...,...,...
4272,9266_02,True
4273,9269_01,False
4274,9271_01,True
4275,9273_01,False


In [28]:
# Write the submission frame to sample_test.csv without the index column.
# newline='' stops the file object from translating line endings, leaving
# them to the CSV writer.
with open('sample_test.csv', 'w', newline='') as file:
    data.to_csv(file, index= False)