In [1]:
import numpy as np
import pandas as pd 
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import GridSearchCV, cross_val_score
from sklearn import svm
from sklearn.impute import SimpleImputer
from sklearn.decomposition import PCA
from sklearn.model_selection import train_test_split
import matplotlib.pyplot as plt
from sklearn.metrics import accuracy_score

In [2]:
##show logs about the dataset, such as nan values
show_data_info = False

In [3]:
##remove useless data columns like passengerid, name, cabin (just for semplicty) and ticket number
def remove_extra_features(data):
    return data.drop(["PassengerId", "Name", "Cabin", "Ticket"], axis=1)

def map_data(data):
    ##map every string in the dataset, we can only have numbers
    sex_class_mapping = {label: idx for idx, label in enumerate(np.unique(data["Sex"]))}
    ##the final astype ensures the object is treated like a string
    embarked_class_mapping = {label: idx for idx, label in enumerate(np.unique(data["Embarked"].astype(str)))}
    data["Sex"] = data["Sex"].map(sex_class_mapping)
    data["Embarked"] = data["Embarked"].map(embarked_class_mapping)
    data['Fare'] = data['Fare'].astype(int)
    return data

def manage_nan_values(data):
    ##manage NaN values
    imr = SimpleImputer(missing_values=np.nan, strategy='mean')
    imr = imr.fit(data)
    return pd.DataFrame(imr.transform(data.values))

def plot_data(x, y, title):
    #perform PCA
    pca = PCA(n_components=2)
    pca.fit(x)  
    x_pca = pca.transform(x)
    #plot PCA
    colors = ["r", "b", "g"]
    markers = ["s", "x", "o"]
    for l, c, m in zip(np.unique(y), colors, markers):
        plt.scatter(x_pca[y==l, 0], x_pca[y==l, 1], c=c, marker=m, label=l)
    plt.title(title)
    plt.show()

In [74]:
data = pd.read_csv("data/train.csv") 

y = data.Survived
X = data.drop("Survived", axis=1)
X = remove_extra_features(X)
labels = X.columns[1:]
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.20, random_state=42)

print(X_train.head())

extra_dataframe = pd.DataFrame(columns=['Pclass', 'Sex', 'Age', 'SibSp', 'Parch', 'Fare', 'Embarked'])
extra_dataframe = extra_dataframe.append(pd.Series([3, "female", 92.0, 0, 0, 1.0, "S"], index=extra_dataframe.columns ), ignore_index=True)
extra_dataframe = extra_dataframe.append(pd.Series([3, "male", 22.0, 0, 0, 12.0, "C"], index=extra_dataframe.columns ), ignore_index=True)
extra_dataframe = extra_dataframe.append(pd.Series([3, "female", 47.0, 1, 0, 4.0, "C"], index=extra_dataframe.columns ), ignore_index=True)
extra_dataframe = extra_dataframe.append(pd.Series([2, "female", 21.0, 1, 0, 17.0, "C"], index=extra_dataframe.columns ), ignore_index=True)
extra_dataframe = extra_dataframe.append(pd.Series([2, "male", 29.0, 1, 0, 17.0, "C"], index=extra_dataframe.columns ), ignore_index=True)
extra_dataframe = extra_dataframe.append(pd.Series([3, "female", 22.0, 0, 0, 7.0, "C"], index=extra_dataframe.columns ), ignore_index=True)


if show_data_info:
    ##show percentage of missing data that can mislead our prediction
    print("Percentage of missing data in the train_set\r\n ", X_train.isnull().sum() * 100 / X_train.shape[0])
    print("Percentage of missing data in the test_set\r\n ", X_test.isnull().sum() * 100 / X_test.shape[0])

##map string values in the dataset
X_train = map_data(X_train)
X_test = map_data(X_test)
extra_dataframe = map_data(extra_dataframe)

print(extra_dataframe)
##manage NaN values
X_train = manage_nan_values(X_train)
X_test = manage_nan_values(X_test)
extra_dataframe = manage_nan_values(extra_dataframe)


print(extra_dataframe)

##scale data
scaler = StandardScaler()
X_train = pd.DataFrame(scaler.fit_transform(X_train))
X_test = pd.DataFrame(scaler.transform(X_test))
extra_dataframe = pd.DataFrame(scaler.transform(extra_dataframe))


print(extra_dataframe)

#plot_data(X_train, y_train, "train data")
#plot_data(X_test, y_test, "test data")

     Pclass     Sex   Age  SibSp  Parch     Fare Embarked
331       1    male  45.5      0      0  28.5000        S
733       2    male  23.0      0      0  13.0000        S
382       3    male  32.0      0      0   7.9250        S
704       3    male  26.0      1      0   7.8542        S
813       3  female   6.0      4      2  31.2750        S
  Pclass  Sex   Age SibSp Parch  Fare  Embarked
0      3    0  92.0     0     0     1         1
1      3    1  22.0     0     0    12         0
2      3    0  47.0     1     0     4         0
3      2    0  21.0     1     0    17         0
4      2    1  29.0     1     0    17         0
5      3    0  22.0     0     0     7         0
     0    1     2    3    4     5    6
0  3.0  0.0  92.0  0.0  0.0   1.0  1.0
1  3.0  1.0  22.0  0.0  0.0  12.0  0.0
2  3.0  0.0  47.0  1.0  0.0   4.0  0.0
3  2.0  0.0  21.0  1.0  0.0  17.0  0.0
4  2.0  1.0  29.0  1.0  0.0  17.0  0.0
5  3.0  0.0  22.0  0.0  0.0   7.0  0.0
          0         1         2         3  

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  # Remove the CWD from sys.path while we load stuff.
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  # This is added back by InteractiveShellApp.init_path()
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  if sys.path[0] == '':


In [75]:
##tuning parameters
# this requires A LOT of CPU power
## Best params  {'criterion': 'entropy', 'min_samples_leaf': 1, 'min_samples_split': 16, 'n_estimators': 700}
#param_grid = { "criterion" : ["gini", "entropy"], "min_samples_leaf" : [1, 5, 10, 25, 50, 70], "min_samples_split" : [2, 4, 10, 12, 16, 18, 25, 35], "n_estimators": [100, 400, 700, 1000, 1500]}
#rf = RandomForestClassifier(n_estimators=100, max_features='auto', oob_score=True, random_state=1, n_jobs=-1)
#clf = GridSearchCV(estimator=rf, param_grid=param_grid, n_jobs=-1)
#clf.fit(X_train, y_train)
#print("Best params ",clf.best_params_)

##n_jobs = -1 to use all processors available
forest = RandomForestClassifier(n_estimators=1600, random_state=0, n_jobs=-1)
forest.fit(X_train, y_train)
##checking most important features in our dataset
if show_data_info:
    importances_dict = dict(zip(labels, np.around(forest.feature_importances_, decimals=2))) 
    print(sorted(importances_dict.items(), key=lambda x: x[1], reverse=True))


forest_prediction = forest.predict(X_test)

print("Forest accuracy %2f" % accuracy_score(y_test, forest_prediction))

people_prediction = forest.predict(extra_dataframe)
for result in enumerate(people_prediction):
    print("%d) %s" % (result[0] , "Dead" if result[1] == 0 else "Survived"))


Forest accuracy 0.826816
0) Dead
1) Dead
2) Dead
3) Survived
4) Dead
5) Survived
