# Hi! This is my first notebook
## And also my first Kaggle... and "Machine Learning" project 😱😨🙀

## Let's start 😵

In [None]:
# import modules
import pandas as pd
import numpy as np

### Load train and test data

In [None]:

# Read both CSV files; keep the test passenger ids aside for later use
train, test = (pd.read_csv(path) for path in ('train.csv', 'test.csv'))
test_id = test['PassengerId']

train.head()

### Let's look at some info about the data and its correlations first

In [None]:
# Print dtypes and non-null counts for both frames
for frame in (train, test):
    frame.info()

In both tables, **Age** and **Cabin** have null values.<br/>
**Fare** has nulls only in the test data and **Embarked** has nulls only in the train data.

In [None]:
# check correlation (Pearson, the default method) on the numeric columns only:
# the frame still holds text columns (Name, Sex, Ticket, ...) at this point and
# DataFrame.corr() raises on non-numeric data in pandas >= 2.0
corr = train.select_dtypes(include='number').corr()
corr.style.background_gradient(cmap='RdYlGn')

In [None]:
# Kendall rank correlation, again restricted to numeric columns so the text
# columns still present in the frame do not make DataFrame.corr() raise
corr = train.select_dtypes(include='number').corr(method="kendall")
corr.style.background_gradient(cmap='RdYlGn')

Next step is **dropping** the columns that the models will not use.<br/>I have decided to drop the **'PassengerId'**, **'Name'**, **'Ticket'** and **'Cabin'** columns.<br/>
After that we fill **Age**'s null values with the median and **Embarked**'s with 'U' (for "Unknown").

Here I've made reusable code for dropping columns and replacing nulls with the mean, the median, or any string we want...

In [None]:
# create reusable function for column dropping and null value replacing

def drop_cols(data, drop_cols):
    """Return a new frame equal to ``data`` minus the columns in ``drop_cols``."""
    return data.drop(columns=drop_cols)

def na_median(data, na_cols):
    """Replace nulls in each listed column with that column's median.

    Args:
        data: DataFrame to clean; the listed columns are updated on this object.
        na_cols: iterable of column names to fill.

    Returns:
        The same ``data`` object, so the result can be reassigned or chained.
    """
    for col in na_cols:
        # Assign the filled column back instead of calling
        # fillna(inplace=True) on data[col]: the chained form works on a
        # temporary object and stops updating `data` under copy-on-write.
        data[col] = data[col].fillna(data[col].median())
    return data

def na_mean(data, na_cols):
    """Replace nulls in each listed column with that column's mean.

    Args:
        data: DataFrame to clean; the listed columns are updated on this object.
        na_cols: iterable of column names to fill.

    Returns:
        The same ``data`` object, so the result can be reassigned or chained.
    """
    for col in na_cols:
        # Assign back rather than using the chained fillna(inplace=True),
        # which modifies a temporary and is a no-op under copy-on-write.
        data[col] = data[col].fillna(data[col].mean())
    return data

def na_string(data, na_cols, fillwith):
    """Replace nulls in each listed column with the text form of ``fillwith``.

    Args:
        data: DataFrame to clean; the listed columns are updated on this object.
        na_cols: iterable of column names to fill.
        fillwith: value whose string formatting is used as the replacement.

    Returns:
        The same ``data`` object, so the result can be reassigned or chained.
    """
    filler = '{}'.format(fillwith)  # formatted once, reused for every column
    for col in na_cols:
        # Assign back rather than using the chained fillna(inplace=True),
        # which modifies a temporary and is a no-op under copy-on-write.
        data[col] = data[col].fillna(filler)
    return data

In [None]:
# use self-built function to clean 😀
        

unused_cols = ['PassengerId', 'Name', 'Ticket', 'Cabin']

# train: drop the unused columns, then fill Age (median) and Embarked ('U')
train = (
    train
    .pipe(drop_cols, unused_cols)
    .pipe(na_median, ['Age'])
    .pipe(na_string, ['Embarked'], 'U')
)

# test: drop the same columns, then fill Age (median) and Fare (mean)
test = (
    test
    .pipe(drop_cols, unused_cols)
    .pipe(na_median, ['Age'])
    .pipe(na_mean, ['Fare'])
)
    

#### check if it's solved

In [None]:
train.info()
test.info()

🙄

### Change 'Sex' and 'Embarked' columns to numerical value
#### The Sex and Embarked columns have the object datatype, so we should convert them to numbers so that they fit the "machine learning model" 🤠

In [None]:
from sklearn import preprocessing

le = preprocessing.LabelEncoder()
columns = ["Sex", "Embarked"]

for col in columns:
    # Learn the label -> integer mapping on the train column, then apply that
    # same mapping to both frames
    le.fit(train[col])
    train[col] = le.transform(train[col])
    test[col] = le.transform(test[col])
    print(le.classes_)

train.head(5)

**'Sex'** and **'Embarked'** have changed from object to int datatype<br/><br/>
Their new numbers are assigned alphabetically, starting from 0 (0 is **F**emale, 1 is **M**ale;
0 is **C**, 1 is **Q**, 2 is **S** and so on)

Now, lets split our training datasets for the model building. Also before that,<br/>we must define first our "**x-and-y**" (independent and dependent variables)

In [None]:
from sklearn.model_selection import train_test_split

# Target: the Survived column; predictors: every remaining column
y = train['Survived']
x = train.drop(columns="Survived")

# Hold out 20% of the rows for validation, with a fixed seed
splits = train_test_split(x, y, test_size=0.2, random_state=42)
x_train, x_val, y_train, y_val = splits

Then, import classifiers from **sklearn**🤔 to build then train our models💃 of "Machine Learning" 😱🤖

In [None]:
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import GaussianNB
from sklearn import tree
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier # you may need to install this classifier first
# run in the code cell "pip install xgboost" and restart the kernel, voila!

Then import those below to score our classifiers. Accuracy is accuracy 😬 and cross_val_score is cross validation score 😬😬😬

Well, accuracy measures how often your model predicts the correct y-value.
For example:<br/>
y_predict = [0,1,1,0]<br/>
y_actual  = [0,0,1,1]<br/>
Then your model's accuracy is 0.5 (the first and third predictions match, so 2 out of 4).

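For cross validation: the training data is split into k parts (5 here); the model is trained on k-1 of them and scored on the remaining part, once per part. Google may explain it better than me, I guess 😬😬😬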

In [None]:
from sklearn.metrics import accuracy_score
from sklearn.model_selection import cross_val_score

After that, the show goes on 🥳

Train our models and let's see which one produces the best cross validation score 👀

#### Logistic Regression

In [None]:
# 5-fold CV with a default-configured model, then a fit on the training split
cv_lr = cross_val_score(LogisticRegression(), x_train, y_train, cv=5)
clf_lr = LogisticRegression(random_state=0, max_iter=100)
clf_lr = clf_lr.fit(x_train, y_train)

# Accuracy of the fitted model on the validation split
y_val_pred = clf_lr.predict(x_val)
acc_lr = accuracy_score(y_val, y_val_pred)

for value in (cv_lr, cv_lr.mean(), acc_lr):
    print(value)

#### Decision Tree

In [None]:
# Use the same seeded configuration for the fitted model and for
# cross-validation, so the CV scores are reproducible between runs and
# describe the model that is actually kept.
clf_tree = tree.DecisionTreeClassifier(random_state=42).fit(x_train, y_train)
cv_tree = cross_val_score(tree.DecisionTreeClassifier(random_state=42), x_train, y_train, cv=5)

# Accuracy of the fitted tree on the validation split
y_val_tree = clf_tree.predict(x_val)
acc_tree = accuracy_score(y_val, y_val_tree)

print(cv_tree)
print(cv_tree.mean())
print(acc_tree)

#### K-Nearest Neighbors

In [None]:
# Default k-nearest-neighbours model: fit on the training split and
# cross-validate a fresh default instance with 5 folds
clf_knn = KNeighborsClassifier()
clf_knn = clf_knn.fit(x_train, y_train)
cv_knn = cross_val_score(KNeighborsClassifier(), x_train, y_train, cv=5)

# Accuracy on the validation split
y_val_knn = clf_knn.predict(x_val)
acc_knn = accuracy_score(y_val, y_val_knn)

for value in (cv_knn, cv_knn.mean(), acc_knn):
    print(value)

#### Random Forest

In [None]:
# Both the fitted forest and the cross-validated one use random_state=1
rdf_params = {"random_state": 1}

clf_rdf = RandomForestClassifier(**rdf_params).fit(x_train, y_train)
cv_rdf = cross_val_score(RandomForestClassifier(**rdf_params), x_train, y_train, cv=5)

# Accuracy on the validation split
y_val_rdf = clf_rdf.predict(x_val)
acc_rdf = accuracy_score(y_val, y_val_rdf)

for value in (cv_rdf, cv_rdf.mean(), acc_rdf):
    print(value)

#### Extreme Gradient Boosting

In [None]:
# Both the fitted booster and the cross-validated one use random_state=1
xgb_params = {"random_state": 1}

clf_xgb = XGBClassifier(**xgb_params).fit(x_train, y_train)
cv_xgb = cross_val_score(XGBClassifier(**xgb_params), x_train, y_train, cv=5)

# Accuracy on the validation split
y_val_xgb = clf_xgb.predict(x_val)
acc_xgb = accuracy_score(y_val, y_val_xgb)

for value in (cv_xgb, cv_xgb.mean(), acc_xgb):
    print(value)

#### Support Vector Classifier

In [None]:
# Default support vector classifier: fit on the training split and
# cross-validate a fresh default instance with 5 folds
clf_svc = SVC()
clf_svc = clf_svc.fit(x_train, y_train)
cv_svc = cross_val_score(SVC(), x_train, y_train, cv=5)

# Accuracy on the validation split
y_val_svc = clf_svc.predict(x_val)
acc_svc = accuracy_score(y_val, y_val_svc)

for value in (cv_svc, cv_svc.mean(), acc_svc):
    print(value)

### Suddenly I feel like I want to apply my bite-size knowledge about 'Classes and Objects', so...
#### First, I made it to store every result I made
#### Then I realized that the whole process in the cells above can be done here. So everything below is not just a "storage" anymore. Instead, it acts as the "factory" 😓
As you can see, the *cvsscore* and *accuracy* functions inside the class do not read the stored values; they calculate the results again...

In [None]:
class MLclfs:
    """Holder for one model's stored results plus helpers that recompute them.

    The recomputing helpers read the notebook-level ``x_train``, ``y_train``
    and ``y_val`` variables.
    """

    def __init__(self):
        """Create an empty holder; ``set`` fills in the attributes afterwards."""

    def set(self, cvstype, clfstype, accuracy, modelname, newclfs):
        """Store the given scores, fitted model, accuracy, name and estimator."""
        self.cvs, self.clfs, self.acc = cvstype, clfstype, accuracy
        self.name, self.clfs_new = modelname, newclfs

    def cvsmean(self):
        """Return the mean of the stored cross-validation scores."""
        return self.cvs.mean()

    def cvsscore(self):
        """Run 5-fold cross-validation of ``clfs_new`` on the training split."""
        return cross_val_score(self.clfs_new, x_train, y_train, cv=5)

    def y_result(self, testdata):
        """Fit ``clfs_new`` on the training split and predict ``testdata``."""
        fitted = self.clfs_new.fit(x_train, y_train)
        return fitted.predict(testdata)

    def accuracy(self):
        """Fit ``clfs_new`` on the training split and score it on ``x_val``."""
        fitted = self.clfs_new.fit(x_train, y_train)
        val_pred = fitted.predict(x_val)
        return accuracy_score(y_val, val_pred)

# One MLclfs holder per model; every list below follows the same model order
lr, dtree, knn, rdf, xgb, suppvc = (MLclfs() for _ in range(6))

name = [lr, dtree, knn, rdf, xgb, suppvc]
cvs = [cv_lr, cv_tree, cv_knn, cv_rdf, cv_xgb, cv_svc]
clfs = [clf_lr, clf_tree, clf_knn, clf_rdf, clf_xgb, clf_svc]
acc = [acc_lr, acc_tree, acc_knn, acc_rdf, acc_xgb, acc_svc]
modelname = [
    "Logistic Regression",
    "Decision Tree",
    "K-Nearest Neighbor",
    "Random Forest",
    "Extreme Gradient Boost",
    "SVC",
]
clfs_new = [
    LogisticRegression(random_state=1, max_iter=1000),
    tree.DecisionTreeClassifier(),
    KNeighborsClassifier(),
    RandomForestClassifier(random_state=1),
    XGBClassifier(random_state=1),
    SVC(),
]

# Hand each holder its stored results and its unfitted estimator
for holder, cv_scores, fitted, val_acc, label, fresh in zip(name, cvs, clfs, acc, modelname, clfs_new):
    holder.set(cv_scores, fitted, val_acc, label, fresh)

mean_array = []

def get_all_mean():
    """Print every model's mean cross-validation score and collect the values.

    ``mean_array`` is emptied before it is refilled, so calling this function
    more than once does not leave duplicate entries in the list.

    Returns:
        tuple: ``(None, mean_array)``. The first slot is kept for backward
        compatibility; it previously held the return value of ``print``.
    """
    print("\nMean :")
    mean_array.clear()  # refill the same module-level list in place
    for label, model in zip(modelname, name):
        mean_value = model.cvsmean()
        print("{} = ".format(label), mean_value)
        mean_array.append(mean_value)
    return None, mean_array


def get_all_score():
    """Print the 5-fold cross-validation scores recomputed for every model.

    Returns:
        None: the scores are only printed. The function also no longer fails
        with ``UnboundLocalError`` when the model list is empty.
    """
    print("\nScore of 5 Cross Validation :")
    for label, model in zip(modelname, name):
        print("{} = ".format(label), model.cvsscore())
    return None

def get_all_accuracy():
    """Print the validation accuracy recomputed for every model.

    Returns:
        None: the accuracies are only printed. The function also no longer
        fails with ``UnboundLocalError`` when the model list is empty.
    """
    print("\nAccuracy :")
    for label, model in zip(modelname, name):
        print("{} = ".format(label), model.accuracy())
    return None

def max_mean():
    """Print the highest mean cross-validation score among the models.

    Uses the values already collected in ``mean_array`` when there are any.
    Otherwise the means are computed directly from the model holders, so the
    call does not depend on ``get_all_mean`` having been run first.

    Returns:
        None: the maximum is only printed.
    """
    means = mean_array if mean_array else [model.cvsmean() for model in name]
    print("\nMax mean = ", max(means))
    return None

# lr.cvsmean
# get_all_mean()
# get_all_accuracy()
# get_all_score()
# max_mean()
# dtree.y_result(x_train)

### Should we try on normalized/standardized "Fare" and "Age" ?.....
#### *should've created the training of models as a function before 😪

In [None]:
# Box plot of every column in one wide figure
train.plot.box(figsize=(20, 6))

In [None]:


#train['NormalizedFare'] = (train.Fare - train.Fare.min()) / (train.Fare.max()- train.Fare.min())
#train.head()


### OK but maybe just later for the next "version" 🌈
Later, some improvements can be applied, such as **training with normalized/standardized Fare and Age.**<br/>
Or **adding other independent variables** such as the **title of the passenger** (with some regex🥴),<br/>
or something we can determine from the **Ticket** and **Cabin** columns maybe...

Just like Mark Twain says, it really got me like 🙂

<img style="float: left;" src="https://user-images.githubusercontent.com/111634631/188610723-dfb26c25-1e9c-42d0-a672-027b700b67ce.jpeg">

So, until next time!

I wanna try submitting to Kaggle with **Random Forest** as the classifier, let's see how bad a score I'll get😬😩

In [None]:
# Random Forest holder: fit on the training split, then predict the test frame
test_preds = rdf.y_result(testdata=test)

In [None]:
# Passenger ids next to the predicted Survived labels
submission_cols = {'PassengerId': test_id.values, 'Survived': test_preds}
df = pd.DataFrame(submission_cols)

In [None]:
# Write the submission without the index column
df.to_csv(path_or_buf="Kagglesubmission.csv", index=False)