In [49]:
import pandas as pd
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import Normalizer, OneHotEncoder, StandardScaler

In [50]:
# Read the training split from the notebook-relative raw-data folder.
df_train = pd.read_csv(filepath_or_buffer="raw_data/train.csv")

In [51]:
# Name and Cabin are not used as model inputs, so remove them up front.
# errors="ignore" keeps this cell re-runnable: df_train is rebound here, so a
# second execution would otherwise raise KeyError on the already-removed columns.
df_train = df_train.drop(columns=["Name", "Cabin"], errors="ignore")

# Row-wise total of the five spend columns. skipna=False keeps the total NaN
# whenever any component is missing, which is what chained `+` produces.
amenity_columns = ["RoomService", "FoodCourt", "ShoppingMall", "Spa", "VRDeck"]
df_train["amenities"] = df_train[amenity_columns].sum(axis=1, skipna=False)

In [52]:
# Display the number of distinct values in each column of the training frame.
df_train.nunique()

PassengerId     8693
HomePlanet         3
CryoSleep          2
Destination        3
Age               80
VIP                2
RoomService     1273
FoodCourt       1507
ShoppingMall    1115
Spa             1327
VRDeck          1306
Transported        2
amenities       2116
dtype: int64

In [53]:
# Display the column labels currently present on the training frame.
df_train.columns

Index(['PassengerId', 'HomePlanet', 'CryoSleep', 'Destination', 'Age', 'VIP',
       'RoomService', 'FoodCourt', 'ShoppingMall', 'Spa', 'VRDeck',
       'Transported', 'amenities'],
      dtype='object')

In [54]:
# Column groups consumed by the preprocessing ColumnTransformer.
# Both stay plain lists so they can be extended or concatenated later.
categorical_features = [
    "HomePlanet",
    "CryoSleep",
    "Destination",
    "VIP",
]
numerical_features = [
    "Age",
    "RoomService",
    "FoodCourt",
    "ShoppingMall",
    "Spa",
    "VRDeck",
]

In [55]:
# Cast the target column to integers so the classifier is trained on int labels.
df_train = df_train.assign(
    Transported=lambda frame: frame["Transported"].astype(int)
)

In [56]:
# Split the frame into the target vector and the feature matrix.
# The identifier, the target itself and the derived "amenities" total are
# excluded from the model inputs.
y = df_train['Transported']
non_feature_columns = ["PassengerId", "Transported", "amenities"]
X = df_train.drop(columns=non_feature_columns)

In [57]:
# Numeric branch: fill missing values with the column median, then standardise
# each column to zero mean and unit variance.
# StandardScaler replaces Normalizer here: Normalizer rescales every *row* to
# unit norm, which blends Age with the spend columns instead of putting each
# feature on a comparable scale for the linear model.
num_pipeline = Pipeline(
    [
        ('num_imputer', SimpleImputer(strategy='median')),
        ('num_pipeline', StandardScaler())
    ]
)

In [58]:
# Categorical branch: fill missing values with the most frequent category,
# then one-hot encode into a dense array.
cat_pipeline = Pipeline(
    [
        ('cat_imputer', SimpleImputer(strategy="most_frequent")),
        # handle_unknown="ignore" encodes a category that was not seen during
        # fit as all zeros instead of raising when the fitted encoder is later
        # applied to new data.
        # NOTE(review): scikit-learn >= 1.2 renames `sparse` to `sparse_output`
        # (and 1.4 removes `sparse`); switch the keyword when upgrading.
        ('cat_pipeline', OneHotEncoder(sparse=False, handle_unknown="ignore"))
    ]
)

In [59]:
# Route each feature group through its own sub-pipeline and concatenate the
# results, numeric block first, categorical block second.
numeric_branch = ('n_pipe', num_pipeline, numerical_features)
categorical_branch = ('c_pipe', cat_pipeline, categorical_features)
preprocessing = ColumnTransformer(transformers=[numeric_branch, categorical_branch])

In [60]:
# Learn imputation/encoding parameters from the training features and apply
# them in the same pass.
X_transformed = preprocessing.fit_transform(X=X)

In [61]:
#numerical_features.extend(preprocessing.transformers_[1][1].get_feature_names().tolist()) # RETRIEVING A LIST OF COLUMNS

In [62]:
# Train a logistic regression with library-default settings on the
# preprocessed training matrix; fit() returns the estimator itself.
model = LogisticRegression().fit(X_transformed, y)
# Trailing expression so the notebook still shows the fitted estimator.
model

LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
                   intercept_scaling=1, l1_ratio=None, max_iter=100,
                   multi_class='auto', n_jobs=None, penalty='l2',
                   random_state=None, solver='lbfgs', tol=0.0001, verbose=0,
                   warm_start=False)

In [63]:
# Load the test split. test_file keeps every column (PassengerId is needed for
# the submission frame); df_test is the copy handed to the preprocessing step.
test_file = pd.read_csv("raw_data/test.csv")
columns_not_fed_to_model = ["PassengerId", "Name", "Cabin"]
df_test = test_file.drop(columns=columns_not_fed_to_model)

In [64]:
# Apply the already-fitted preprocessing (no refit) to the test features,
# then predict a class label for each row.
test = preprocessing.transform(X=df_test)
prediction = model.predict(X=test)

In [65]:
# Build the submission frame: one PassengerId column plus the predicted label
# converted from the model's integer output back to booleans.
test_df = test_file[["PassengerId"]].assign(
    Transported=prediction.astype(bool)
)

In [66]:
#test_df.to_csv("raw_data/submission.csv", index=False)