In [None]:
import pandas as pd

# read the Titanic training set (path is relative to the working directory)
csv_path = "titanic_train.csv"
df = pd.read_csv(csv_path)
df


In [None]:
# drop the columns that are outright not useful
unused_columns = ['PassengerId', 'Name', 'Ticket', 'Fare', 'Cabin']
df = df.drop(columns = unused_columns)

# check to see if the columns are removed
df


In [None]:
# number of missing values in each column
df.isnull().sum()

In [None]:
# drop all rows with NaN, then re-index the dataframe from 0
df = df.dropna().reset_index(drop=True)
df


In [None]:
from sklearn import preprocessing

# initialize label encoder
label_encoder = preprocessing.LabelEncoder()

# convert Sex and Embarked features to numeric; the same encoder is refitted
# for each column, giving these codes:
#   Sex:      0 = female, 1 = male
#   Embarked: 0 = C, 1 = Q, 2 = S
for feature in ("Sex", "Embarked"):
    df[feature] = label_encoder.fit_transform(df[feature])

df


In [None]:
# Alone = 1 when the passenger has no parents/children and no siblings/spouse
# aboard (Parch + SibSp == 0), otherwise 0
relatives_aboard = df['Parch'] + df['SibSp']
df['Alone'] = (relatives_aboard == 0).astype('int64')
df


In [None]:
# SibSp and Parch are now summarised by Alone, so drop them
df = df.drop(columns=['SibSp', 'Parch'])

# check to see if the columns are removed
df


In [None]:
from scipy.stats import chi2_contingency

# chi-square test of independence between each categorical feature and Survived
results = []
for column in ('Pclass', 'Sex', 'Embarked', 'Alone'):
    # observed counts: feature value (rows) x survival outcome (columns)
    observed = pd.crosstab(index=df[column], columns=df['Survived'])

    # only the statistic and the p-value are kept
    statistic, p_value = chi2_contingency(observed)[:2]
    results.append([column, statistic, p_value])

pd.DataFrame(results, columns = ['column','chi2','p'])


In [None]:
# absolute Pearson correlations against Survived, strongest first
pearson = df[['Age','Survived']].corr(method='pearson')
pearson['Survived'].abs().sort_values(ascending=False)


In [None]:
# make fields categorical
for field in ("Sex", "Embarked", "Survived", "Pclass", "Alone"):
    df[field] = df[field].astype("category")

# examine the datatypes for each feature
print(df.dtypes)


In [None]:
from sklearn.model_selection import train_test_split

features = df.iloc[:, 1:]   # features (all except Survived)
labels = df.iloc[:, 0]      # label (Survived)

# 80/20 split with a fixed seed, stratified on the labels
X_train, X_test, y_train, y_test = train_test_split(
    features,
    labels,
    test_size = 0.20,
    random_state = 1,
    stratify = labels)


In [None]:
from sklearn import linear_model

# baseline: logistic regression with default hyperparameters,
# trained on the training split
log_regress = linear_model.LogisticRegression()
log_regress.fit(X_train, y_train)


In [None]:
from sklearn import metrics

# score the baseline model on the held-out test split
y_pred = log_regress.predict(X_test)
test_accuracy = metrics.accuracy_score(y_true = y_test, y_pred = y_pred)
print("Accuracy:", test_accuracy)


In [None]:
from sklearn.model_selection import GridSearchCV
import numpy as np
import warnings

# NOTE: this is a blanket filter - it hides every warning (including
# failed-fit warnings from a grid search) for the rest of the session
warnings.filterwarnings('ignore')

# parameter grid
# Given as a list of sub-grids so that only valid penalty/solver pairs are
# searched: 'newton-cg' and 'lbfgs' support only the 'l2' penalty, while
# 'liblinear' supports both 'l1' and 'l2'. A single dict would also generate
# l1 + newton-cg / l1 + lbfgs candidates, whose fits fail.
c_values = np.logspace(-3,3,7)      # 0.001, 0.01, ..., 1000
parameters = [
    {
        'penalty' : ['l2'],
        'C'       : c_values,
        'solver'  : ['newton-cg', 'lbfgs'],
    },
    {
        'penalty' : ['l1','l2'],
        'C'       : c_values,
        'solver'  : ['liblinear'],
    },
]


In [None]:
from sklearn.linear_model import LogisticRegression

# grid search over `parameters` with 10-fold cross-validation,
# ranking the candidates by accuracy
logreg = LogisticRegression()
clf = GridSearchCV(
    estimator = logreg,
    param_grid = parameters,
    scoring = 'accuracy',
    cv = 10,
)


In [None]:
# run the grid search on the training split
clf.fit(X = X_train, y = y_train)

In [None]:
# best_score_ is the mean cross-validated score of the best candidate
# (computed on the held-out folds of the training split), not the accuracy
# on the training set itself
print("Tuned Hyperparameters :", clf.best_params_)
print("Best cross-validation accuracy:", clf.best_score_)
print("Accuracy of testing set:", clf.score(X_test, y_test))


In [None]:
from sklearn.model_selection import GridSearchCV
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.tree import DecisionTreeClassifier

# define a dictionary of classifiers and their corresponding parameter grids
#   key   : display name of the algorithm
#   value : (unfitted estimator, param_grid accepted by GridSearchCV)
# Estimators that use randomness are seeded so that the comparison is
# reproducible from run to run.
classifiers = {
    'LogisticRegression': (
        LogisticRegression(),
        # list of sub-grids: 'newton-cg' and 'lbfgs' support only the 'l2'
        # penalty, 'liblinear' supports 'l1' and 'l2'; a single dict would
        # also generate invalid l1 + newton-cg / l1 + lbfgs candidates
        [
            {
                'penalty' : ['l2'],
                'C'       : np.logspace(-3,3,7),
                'solver'  : ['newton-cg', 'lbfgs'],
            },
            {
                'penalty' : ['l1','l2'],
                'C'       : np.logspace(-3,3,7),
                'solver'  : ['liblinear'],
            },
        ]
    ),
    'SVM': (
        SVC(),
        {
            'kernel' : ('linear', 'rbf'),
            'C':[1, 10]
        }
    ),
    'KNeighbors': (
        KNeighborsClassifier(), {
            'n_neighbors': [3, 5, 7],
            'weights' : ['uniform', 'distance']
        }
    ),
    'RandomForest': (
        RandomForestClassifier(random_state = 1), {
            'n_estimators': [10, 50, 100]
        }
    ),
    'GradientBoosting': (
        GradientBoostingClassifier(random_state = 1), {
            'learning_rate': [0.1, 0.05, 0.01],
            'n_estimators': [100, 200, 300]
        }
    ),
    'NaiveBayes': (
        GaussianNB(), {}    # nothing to tune: one candidate with defaults
    ),
    'DecisionTreeClassifier':(
        DecisionTreeClassifier(random_state = 1),
        {
            'criterion': ['gini', 'entropy'],
            'max_depth': [2, 3, 4, 5]
        }
    )
}


In [None]:
results = []

# evaluate each classifier using GridSearchCV
# (the loop variable is named `estimator`, not `clf`, so that the fitted
# grid search stored in `clf` by an earlier cell is not overwritten)
for clf_name, (estimator, param_grid) in classifiers.items():
    grid_search = GridSearchCV(estimator, param_grid, scoring = 'accuracy', cv = 5)
    grid_search.fit(X_train, y_train)
    # mean cross-validated accuracy of the best candidate (5 folds of the
    # training split) - reported under "Training Set Accuracy"
    cv_accuracy = grid_search.best_score_
    # accuracy of the best candidate, refitted on the whole training split,
    # on the held-out test split
    testing_accuracy = grid_search.score(X_test, y_test)
    results.append([clf_name,
                    cv_accuracy,
                    testing_accuracy,
                    param_grid,                  # grid that was searched
                    grid_search.best_params_])   # winning combination

df_result = pd.DataFrame(results, columns = ["Algorithm",
                                             "Training Set Accuracy",
                                             "Testing Set Accuracy",
                                             "Parameters",
                                             "Best Parameters"])
display(df_result)


In [None]:
# rank the algorithms by their accuracy on the test split, best first
df_result.sort_values('Testing Set Accuracy', ascending = False)

In [None]:
# refit logistic regression on the full dataset with fixed hyperparameters
y = df.iloc[:, 0]     # Survived
X = df.iloc[:, 1:]    # features (all except Survived)

final_params = {'C': 0.1, 'penalty': 'l2', 'solver': 'newton-cg'}
log_regress = linear_model.LogisticRegression(**final_params)
log_regress.fit(X, y)


In [None]:
from sklearn.model_selection import cross_val_score

# use cross-validation to score the model: mean score over 10 folds
fold_scores = cross_val_score(estimator = log_regress, X = X, y = y, cv = 10)
fold_scores.mean()


In [None]:
!pip install gradio

In [None]:
def make_prediction(pclass,sex,age,embarked,alone):
    """Predict the survival of one passenger with the fitted `log_regress` model.

    Parameters (each is converted with int()/float() before use):
        pclass   -- passenger class (1, 2 or 3)
        sex      -- encoded sex (0 = female, 1 = male)
        age      -- age as a number
        embarked -- encoded port of embarkation (0 = C, 1 = Q, 2 = S)
        alone    -- 1 if the passenger travelled alone, else 0

    Returns:
        'Alive' if the model predicts a truthy label, otherwise 'Dead'.
    """
    row = [[int(pclass),int(sex),float(age),int(embarked),int(alone)]]

    # The model was fitted on a DataFrame, so hand it a DataFrame carrying the
    # same column names; this lets scikit-learn check the features by name
    # rather than warn about a bare list. Estimators that did not record the
    # names (older scikit-learn, or fitted on an array) get the plain list.
    feature_names = getattr(log_regress, 'feature_names_in_', None)
    if feature_names is not None:
        row = pd.DataFrame(row, columns = list(feature_names))

    survived = log_regress.predict(row)[0]
    return 'Alive' if survived else 'Dead'


In [None]:
import gradio as gr

# input widgets for the prediction form; the radios declared with
# type = "index" are meant to yield the numeric codes noted beside them

pclass = gr.Radio(choices = [1,2,3],
                  value = 1,
                  type = "value",
                  label = "Passenger Class")

# 0 = female, 1 = male
sex = gr.Radio(choices = ['Female','Male'],
               value = "Female",
               type = "index",
               label = "Sex")

age = gr.Slider(minimum = 1,
                maximum = 100,
                value = 20,
                label = "Age")

# 0 = C, 1 = Q, 2 = S
embarked = gr.Radio(choices = ['Cherbourg','Queenstown','Southampton'],
                    value = "Cherbourg",
                    type = "index",
                    label = "Embarked")

# 0 = no, 1 = yes
alone = gr.Radio(choices = ['No','Yes'],
                 value = "No",
                 type = "index",
                 label = "Alone")


In [None]:
# bind the input widgets to make_prediction, show its return value as text,
# then start the app
app = gr.Interface(
    fn = make_prediction,
    inputs = [pclass, sex, age, embarked, alone],
    outputs = 'text',
    title = 'Prediction',
)
app.launch()
