In [1]:
import pandas as pd
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import Normalizer, OneHotEncoder, StandardScaler

In [2]:
# Read the training split into a DataFrame; the path is relative to the working directory.
df_train = pd.read_csv(filepath_or_buffer="raw_data/train.csv")

## Defining feature types

In [3]:
# Display the column labels of the training frame before sorting them into feature groups.
df_train.columns

Index(['PassengerId', 'HomePlanet', 'CryoSleep', 'Cabin', 'Destination', 'Age',
       'VIP', 'RoomService', 'FoodCourt', 'ShoppingMall', 'Spa', 'VRDeck',
       'Name', 'Transported'],
      dtype='object')

In [4]:
# Columns handled by the categorical branch of the preprocessing.
categorical_features = [
    "HomePlanet",
    "CryoSleep",
    "Destination",
    "VIP",
]

# Columns handled by the numerical branch of the preprocessing.
numerical_features = [
    "Age",
    "RoomService",
    "FoodCourt",
    "ShoppingMall",
    "Spa",
    "VRDeck",
]

In [5]:
# Convert the target column to integers; every other column keeps its dtype.
df_train = df_train.astype({"Transported": int})

## Defining X and y, and building the pipelines

In [6]:
# Model inputs: everything except the target and the ID, name, and cabin columns.
X = df_train.drop(
    labels=["PassengerId", "Transported", "Name", "Cabin"],
    axis="columns",
)
# Target vector.
y = df_train.loc[:, "Transported"]

In [7]:
# Numerical branch: fill missing values with the column median, then standardize
# each feature to zero mean and unit variance.
# StandardScaler works per column. Normalizer would instead rescale every *row*
# to unit norm, mixing unrelated features (e.g. Age and the spending amounts),
# which is not a feature-scaling step.
num_pipeline = Pipeline(
    steps=[
        ('num_imputer', SimpleImputer(strategy='median')),
        # Step name kept as-is so lookups by step name keep working.
        ('num_pipeline', StandardScaler()),
    ]
)

In [8]:
# Categorical branch: fill missing values with the most frequent category, then
# one-hot encode into a dense array.
# handle_unknown="ignore" encodes a category that was not seen during fit as all
# zeros instead of raising when the fitted encoder is applied to other data.
try:
    # scikit-learn >= 1.2 names the dense-output flag `sparse_output`
    # (the old `sparse` argument was removed in 1.4).
    one_hot = OneHotEncoder(sparse_output=False, handle_unknown="ignore")
except TypeError:
    # Older releases only accept `sparse`.
    one_hot = OneHotEncoder(sparse=False, handle_unknown="ignore")

cat_pipeline = Pipeline(
    steps=[
        ('cat_imputer', SimpleImputer(strategy="most_frequent")),
        ('cat_pipeline', one_hot),
    ]
)

In [9]:
# Apply num_pipeline to the numerical columns and cat_pipeline to the categorical columns.
numerical_branch = ('n_pipe', num_pipeline, numerical_features)
categorical_branch = ('c_pipe', cat_pipeline, categorical_features)

preprocessing = ColumnTransformer(transformers=[numerical_branch, categorical_branch])

In [10]:
# Fit the preprocessing on the training features and transform them in the same call.
X_transformed = preprocessing.fit_transform(X=X)

In [11]:
#numerical_features.extend(preprocessing.transformers_[1][1].get_feature_names().tolist()) # RETRIEVING A LIST OF COLUMNS

## Training logistic regression, transforming the test dataset, and getting predictions

In [12]:
# Logistic regression with default hyperparameters, fitted on the preprocessed training data.
# The fit call stays as the last expression so the cell displays the fitted estimator.
model = LogisticRegression()
model.fit(X=X_transformed, y=y)

LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
                   intercept_scaling=1, l1_ratio=None, max_iter=100,
                   multi_class='auto', n_jobs=None, penalty='l2',
                   random_state=None, solver='lbfgs', tol=0.0001, verbose=0,
                   warm_start=False)

In [13]:
# Keep the raw test frame (its PassengerId column is used for the output table)
# and derive a copy without the ID, name, and cabin columns.
test_file = pd.read_csv(filepath_or_buffer="raw_data/test.csv")
df_test = test_file.drop(labels=["PassengerId", "Name", "Cabin"], axis="columns")

In [14]:
# Reuse the already-fitted preprocessing (transform only, no refit),
# then predict a label for every test row.
test = preprocessing.transform(X=df_test)
prediction = model.predict(X=test)

In [15]:
# Pair each passenger ID with its predicted label, cast to booleans.
test_df = pd.DataFrame(
    {
        "PassengerId": test_file["PassengerId"],
        "Transported": prediction.astype(bool),
    }
)

In [16]:
#test_df.to_csv("raw_data/submission.csv", index=False)