# Competition

<img src="https://raw.githubusercontent.com/afocoelho/titanic_competition/main/img/titanic-sinks-the-lobsters-in-the-kitchen.png"  >

## Titanic 

## Schedule
 - Treat missing values (15 min)
 - Exploratory Data Analysis (30 min; generate between 1 and 3 graphs that you think are important)
 - Feature engineering (30 min)
 - Modeling (45 min)
 - Extra (15 min to prepare the presentation)


In [2]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import os
import warnings
from sklearn.model_selection import train_test_split, GridSearchCV

from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier,AdaBoostClassifier, GradientBoostingClassifier
from sklearn import metrics, preprocessing
warnings.filterwarnings(action='ignore')                  # Turn off the warnings.
%matplotlib inline

#### 1.1. Read in data:

The data and explanation can be found [here](https://www.kaggle.com/c/titanic/data) (requires sign in).

In [None]:
# Download the competition CSV into the current working directory.
# `--no-clobber` makes wget skip the download when the file already exists.
!wget --no-clobber https://raw.githubusercontent.com/afocoelho/titanic_competition/main/data/data_titanic.csv

In [None]:
# Load the downloaded CSV; the column names are inferred from the first row.
df = pd.read_csv(filepath_or_buffer='data_titanic.csv', header='infer')

In [None]:
# Dimensions of the loaded data as (number of rows, number of columns).
df.shape

In [None]:
# Preview the first three rows.
df.head(n=3)

#### 1.2. Missing value processing: 

In [None]:
# Count the missing values in each column.
df.isna().sum(axis='index')

In [None]:
# Scratch cell: draw one random float from the uniform distribution on [30, 60).
np.random.uniform(low=30, high=60)

In [None]:
# Scratch cell: np.random.rand() is uniform on [0, 1), so this scales and
# shifts it into a random float in [30, 60).
30+np.random.rand()*30

In [None]:
# Fill the missing values in the Age variable with a randomly drawn age.
# Rows whose Age is already known are kept unchanged.
Age = []                                   # A temporary list, one entry per row in row order.
for age, name in zip(df.Age, df.Name):
    if not np.isnan(age):
        Age.append(age)
    elif 'Mr' in name:
        # 'Mr' is a substring of 'Mrs', so this single test covers both titles.
        # Treated as an adult: draw the age uniformly from [30, 60).
        Age.append(np.random.uniform(low=30, high=60))
    else:
        # No Mr/Mrs in the name, so likely a child: draw uniformly from [5, 20).
        Age.append(np.random.uniform(low=5, high=20))
# Build the Series on df's own index: column assignment aligns on index labels,
# so a fresh 0..n-1 index would inject NaN whenever df is not 0..n-1 indexed.
df.Age = pd.Series(Age, index=df.index)

In [None]:
# Drop the columns you don't want to use for now. Don't try to treat every column: you don't have time for that.


In [None]:
# Check for any remaining missing values, then drop or fill whatever is left.

#### 1.3. Exploratory data analysis:

In [None]:
# Draw some amazing graphs.
# Replace both '???' placeholders with column names from df before running.
# NOTE: `ci` is deprecated since seaborn 0.12 in favour of `errorbar=None`;
# it is kept here so the cell still runs on older seaborn versions.
sns.barplot(
    data=df,
    x='???',
    y='???',
    ci=None,
)
plt.show()

#### 1.4. Feature engineering:

In [None]:
# One-hot encoding: convert a categorical column into dummy variables, then
# drop the original column. Example for Pclass (the first level is dropped).
pclass_dummies = pd.get_dummies(df.Pclass, drop_first=True, prefix='Pclass')
df = pclass_dummies.join(df.drop(columns=['Pclass']))


In [None]:
## Save to an external file.
#df.to_csv('data_titanic_2.csv',index=False)

In [None]:
# Persist the treated data (without the index column) so it can be reloaded later.
df.to_csv(path_or_buf='treated_titanic_data.csv', index=False)

#### 1.5. Create a train and test set:

In [None]:
# Target: the Survived column. Features: every other column of df.
Y = df['Survived']
X = df.drop(columns='Survived')

In [None]:
# Hold out 30% of the rows for testing; the fixed seed makes the split reproducible.
X_train, X_test, Y_train, Y_test = train_test_split(
    X,
    Y,
    test_size=0.3,
    random_state=1234,
)

In [None]:
# Print the shape of each piece of the split, in the order
# X_train, X_test, Y_train, Y_test.
for part in (X_train, X_test, Y_train, Y_test):
    print(part.shape)

In [None]:
## Use a simple algorithm as a baseline WITHOUT any grid search

In [None]:
## Evaluate
# Y_test comes from the train/test split; store the baseline model's
# predictions for X_test in Y_pred before running this cell.
metrics.accuracy_score(Y_test, Y_pred)

In [None]:
## Try a new algorithm that you think is suitable for the use case and run a grid search.

In [None]:
# Fit a k-NN classifier for every k from 1 to 99 and record the accuracy of
# each one on the test set; accs[j] is the accuracy for k = k_grid[j].
accs = []
k_grid = range(1, 100)
for k in k_grid:
    knn = KNeighborsClassifier(n_neighbors=k).fit(X_train, Y_train)
    Y_pred = knn.predict(X_test)
    accs.append(metrics.accuracy_score(Y_test, Y_pred))

In [None]:
# Define your grid.
parameter_you_want_to_try = np.arange(1, 51, 1)  # Range of values that you want to try (1..50).
# Use the range defined on the line above, so this cell does not depend on a
# variable created in another cell.
parameters = {'n_neighbors': parameter_you_want_to_try}

In [None]:
# Define your grid.
# Use a small grid: you don't have a lot of time.
parameter_you_want_to_try = np.arange(1, 51, 1)  # Range of values that you want to try (1..50).
# Search over the range defined on the line above.
parameters = {'n_neighbors': parameter_you_want_to_try}

# Optimize the parameters.
# Replace the placeholder string with an estimator object that accepts
# `n_neighbors`; the cell fails until you do.
# Choose `cv` carefully: every extra fold costs time.
# "n_jobs = -1" means "use all the CPU cores".
# Optionally pass scoring='accuracy' as well.
gridCV = GridSearchCV("<< put the algorithm object here>> ", parameters, cv=5, n_jobs=-1)
gridCV.fit(X_train, Y_train)
best_k = gridCV.best_params_['n_neighbors']


# Get the best parameters.
print(f"Best k : {best_k}")

In [None]:
## Train the best algorithm with the best parameters and do any extra analysis you think might be important.

In [None]:
## Evaluate your best algorithm
# Y_test comes from the train/test split; store the best model's predictions
# for X_test in Y_pred before running this cell.
metrics.accuracy_score(Y_test, Y_pred)

<img src="https://raw.githubusercontent.com/afocoelho/titanic_competition/main/img/titanic-sinks-the-lobsters-in-the-kitchen.png"  >