In [1]:
import pandas as pd 
%matplotlib inline
import matplotlib.pyplot as plt
import numpy as np
import matplotlib.patches as mpatches
from sklearn.feature_extraction import DictVectorizer 
from sklearn.preprocessing import StandardScaler
from sklearn.neighbors import KNeighborsRegressor 
from sklearn.neighbors import KNeighborsClassifier 
from sklearn.pipeline import Pipeline
from sklearn.model_selection import cross_val_score 
from sklearn.metrics import f1_score
from sklearn.metrics import accuracy_score
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error, r2_score
import scipy.stats as stats
import copy

In [2]:
train = pd.read_csv("/Users/ajaypatel21/Downloads/train.csv")
test = pd.read_csv("/Users/ajaypatel21/Downloads/test.csv")

In [3]:
train.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


In [4]:
test.head()

Unnamed: 0,PassengerId,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,892,3,"Kelly, Mr. James",male,34.5,0,0,330911,7.8292,,Q
1,893,3,"Wilkes, Mrs. James (Ellen Needs)",female,47.0,1,0,363272,7.0,,S
2,894,2,"Myles, Mr. Thomas Francis",male,62.0,0,0,240276,9.6875,,Q
3,895,3,"Wirz, Mr. Albert",male,27.0,0,0,315154,8.6625,,S
4,896,3,"Hirvonen, Mrs. Alexander (Helga E Lindqvist)",female,22.0,1,1,3101298,12.2875,,S


**Data Cleaning**

In [5]:
train["Age"] = train["Age"].fillna(train["Age"].mean())
train["Cabin"] = train["Cabin"].fillna(train["Cabin"].value_counts().idxmax())
train["Ticket"] = train["Ticket"].fillna(train["Ticket"].value_counts().idxmax())
train = train.fillna(0)

**Random Foresting**

In [6]:
# temp = train.drop(["Cabin", "Ticket"], axis=1)
# temp = temp.dropna()
# X = temp.drop("Survived", axis=1)
# y = temp["Survived"]

X = train.drop("Survived", axis=1)
y = train["Survived"]

predictions = []
f1_scores = []
yvals = []
mses = []

for i in range(30):
    Xtrain, Xval, ytrain, yval = train_test_split(X,y,test_size=0.10,shuffle=True)

    Xtrain_dict = Xtrain.to_dict(orient="records")
    Xval_dict = Xval.to_dict(orient="records")

    vec = DictVectorizer(sparse=False) 
    vec.fit(Xtrain_dict)
    Xtrain = vec.transform(Xtrain_dict)
    Xval = vec.transform(Xval_dict)

    scaler = StandardScaler()
    scaler.fit(Xtrain)
    Xtrain_sc = scaler.transform(Xtrain)
    Xval_sc = scaler.transform(Xval)
    
    rf = RandomForestClassifier(n_estimators=500)
    model = rf.fit(Xtrain_sc, ytrain)
    
    predictions.append(model.predict(Xval_sc))
    yvals.append(yval)
    mses.append(mean_squared_error(yval, model.predict(Xval_sc)))
    f1_scores.append(f1_score(yval, model.predict(Xval_sc)))

In [7]:
rf_df = pd.DataFrame(mses, columns=["RF mses"])
rf_df.describe()

Unnamed: 0,RF mses
count,30.0
mean,0.171111
std,0.042546
min,0.088889
25%,0.136111
50%,0.166667
75%,0.2
max,0.255556


In [8]:
f1_df = pd.DataFrame(f1_scores, columns=["RF f1s"])
f1_df.describe()

Unnamed: 0,RF f1s
count,30.0
mean,0.75566
std,0.063986
min,0.588235
25%,0.721539
50%,0.762644
75%,0.803893
max,0.862069


**RF Testing**

In [9]:
test["Age"] = test["Age"].fillna(test["Age"].mean())
test["Cabin"] = test["Cabin"].fillna(test["Cabin"].value_counts().idxmax())
test["Ticket"] = test["Ticket"].fillna(test["Ticket"].value_counts().idxmax())
test = test.fillna(0)

In [10]:
Xtrain = X
ytrain = y
Xtest = test

Xtrain = Xtrain.to_dict(orient="records")
Xtest = Xtest.to_dict(orient="records")

vec = DictVectorizer(sparse=False) 
vec.fit(Xtrain)
Xtrain = vec.transform(Xtrain)
Xtest = vec.transform(Xtest)

scaler = StandardScaler()
scaler.fit(Xtrain)
Xtrain_sc = scaler.transform(Xtrain)
Xtest_sc = scaler.transform(Xtest)

rf = RandomForestClassifier(n_estimators=500)
model1 = rf.fit(Xtrain_sc, ytrain)

In [11]:
predictions = model1.predict(Xtest_sc)

In [12]:
submission = pd.DataFrame(test["PassengerId"]).join(pd.DataFrame(predictions, columns=["Survived"]))

In [13]:
submission.to_csv("submission1", header=True, index=False)

**Gradient Boosting**

In [14]:
from sklearn import ensemble
params = {'n_estimators': 500, 'max_depth': 2, 'min_samples_split': 2,
          'learning_rate': 0.01}
gb_1 = ensemble.GradientBoostingClassifier(**params)

params = {'n_estimators': 500, 'max_depth': 5, 'min_samples_split': 2,
          'learning_rate': 0.01}
gb_2 = ensemble.GradientBoostingClassifier(**params)

In [15]:
X = train.drop("Survived", axis=1)
y = train["Survived"]
Xtrain, Xval, ytrain, yval = train_test_split(X,y,test_size=0.10,shuffle=True)

Xtrain_dict = Xtrain.to_dict(orient="records")
Xval_dict = Xval.to_dict(orient="records")

vec = DictVectorizer(sparse=False) 
vec.fit(Xtrain_dict)
Xtrain = vec.transform(Xtrain_dict)
Xval = vec.transform(Xval_dict)

m = gb_1.fit(Xtrain, ytrain)

In [16]:
X = train.drop("Survived", axis=1)
y = train["Survived"]

predictions = []
f1_scores = []
yvals = []
mses = []

for i in range(30):
    Xtrain, Xval, ytrain, yval = train_test_split(X,y,test_size=0.10,shuffle=True)

    Xtrain_dict = Xtrain.to_dict(orient="records")
    Xval_dict = Xval.to_dict(orient="records")

    vec = DictVectorizer(sparse=False) 
    vec.fit(Xtrain_dict)
    Xtrain = vec.transform(Xtrain_dict)
    Xval = vec.transform(Xval_dict)

    scaler = StandardScaler()
    scaler.fit(Xtrain)
    Xtrain_sc = scaler.transform(Xtrain)
    Xval_sc = scaler.transform(Xval)
    
    for model in [gb_1, gb_2]:
        m = model.fit(Xtrain_sc, ytrain)
        predictions.append(m.predict(Xval_sc))
        yvals.append(yval)
        mses.append(mean_squared_error(yval, m.predict(Xval_sc)))
        f1_scores.append(f1_score(yval, m.predict(Xval_sc)))

In [17]:
def make_dfs(first_num, mses, model_name):
    temp = []
    for i in range(first_num, 60, 2):
        temp.append(mses[i])
        df = pd.DataFrame(temp, columns=[model_name])
    return df

In [18]:
gb1_mse = make_dfs(0, mses, "gb1_mses")
gb1_f1 = make_dfs(0, f1_scores, "gb1_f1s")
gb2_mse = make_dfs(1, mses, "gb2_mses")
gb2_f1 = make_dfs(1, f1_scores, "gb2_f1s")

In [19]:
errors = gb1_mse.join(gb1_f1, how="left").join(gb2_mse, how="left").join(gb2_f1, how="left")
errors.describe()

Unnamed: 0,gb1_mses,gb1_f1s,gb2_mses,gb2_f1s
count,30.0,30.0,30.0,30.0
mean,0.192222,0.723244,0.182593,0.741523
std,0.048926,0.064044,0.042554,0.059439
min,0.077778,0.610169,0.133333,0.633333
25%,0.155556,0.686161,0.144444,0.703239
50%,0.188889,0.72,0.172222,0.743849
75%,0.222222,0.764439,0.211111,0.785824
max,0.277778,0.898551,0.266667,0.839506


**Gradient Boosting Testing**

In [21]:
Xtrain = X
ytrain = y
Xtest = test

Xtrain = Xtrain.to_dict(orient="records")
Xtest = Xtest.to_dict(orient="records")

vec = DictVectorizer(sparse=False) 
vec.fit(Xtrain)
Xtrain = vec.transform(Xtrain)
Xtest = vec.transform(Xtest)

scaler = StandardScaler()
scaler.fit(Xtrain)
Xtrain_sc = scaler.transform(Xtrain)
Xtest_sc = scaler.transform(Xtest)

gb2 = ensemble.GradientBoostingClassifier(**params)

In [22]:
model2 = gb2.fit(Xtrain_sc, ytrain)

In [23]:
model2.predict(Xtest_sc) - model1.predict(Xtest_sc)

array([ 0, -1,  0,  0,  0,  1,  0, -1,  0,  0,  0,  0,  0,  0,  0,  0,  0,
        0,  0, -1, -1,  1,  0, -1,  0,  0,  0,  0,  1,  0,  0,  0,  0, -1,
       -1,  0, -1, -1,  0,  0,  0,  1,  0,  0,  0,  0,  0,  0,  0,  0,  0,
        0,  0,  0,  0,  1,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
        1,  0,  0,  0,  0,  1,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
        0,  0, -1, -1,  0,  0,  0,  0,  0, -1,  0,  0,  0,  0,  0,  0,  0,
        0,  0,  0,  0,  0,  0,  0,  0,  0, -1,  0,  0,  0,  0,  0,  0, -1,
        0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
        0,  0, -1,  0,  0,  0, -1,  0,  1,  0,  0,  0,  1,  0,  0,  0,  0,
        0,  1,  0,  0,  0,  1,  0, -1,  0,  0,  0,  0,  0, -1,  0,  0,  0,
        0,  0,  0,  0,  0,  0,  0, -1,  0,  0,  0, -1,  0,  0,  0,  0,  0,
        1,  0,  0,  0,  0,  1,  0,  0,  0,  0,  0,  0, -1,  0,  0, -1,  0,
        0,  0, -1,  0,  0,  0,  0,  0,  1,  0, -1,  0, -1,  0,  0,  0,  0,
        0,  0,  0,  0,  0

**Ensemble RF and GB**

In [24]:
from sklearn.ensemble import VotingClassifier
from sklearn import model_selection

Xtrain = X
ytrain = y

Xtrain = Xtrain.to_dict(orient="records")
# Xtest = Xtest.to_dict(orient="records")

vec = DictVectorizer(sparse=False) 
vec.fit(Xtrain)
Xtrain = vec.transform(Xtrain)
# Xtest = vec.transform(Xtest)

scaler = StandardScaler()
scaler.fit(Xtrain)
Xtrain_sc = scaler.transform(Xtrain)
# Xtest_sc = scaler.transform(Xtest)

seed = 7
kfold = model_selection.KFold(n_splits=10, random_state=seed)

# create the sub models
estimators = []
estimators.append(('rf', model1))
estimators.append(('gb', model2))

# create the ensemble model
ensemble = VotingClassifier(estimators)
results = model_selection.cross_val_score(ensemble, Xtrain_sc, ytrain, cv=kfold)
print(results.mean())

0.8204993757802745


In [25]:
Xtest = test
Xtest = Xtest.to_dict(orient="records")
Xtest = vec.transform(Xtest)
Xtest_sc = scaler.transform(Xtest)

model3 = ensemble.fit(Xtrain_sc, ytrain)
model3.predict(Xtest_sc)

array([0, 0, 0, 0, 1, 0, 1, 0, 1, 0, 0, 0, 1, 0, 1, 1, 0, 0, 1, 0, 0, 0,
       1, 0, 1, 0, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1,
       1, 0, 0, 0, 1, 1, 0, 0, 1, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 1, 1,
       1, 0, 0, 1, 1, 0, 1, 1, 1, 0, 0, 1, 0, 1, 1, 0, 0, 0, 0, 0, 1, 0,
       0, 1, 1, 0, 1, 0, 0, 0, 1, 0, 1, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0,
       0, 0, 1, 1, 1, 0, 0, 1, 0, 1, 1, 0, 1, 0, 0, 1, 0, 1, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1,
       0, 0, 1, 1, 0, 1, 0, 1, 1, 0, 0, 1, 0, 0, 1, 1, 0, 0, 0, 0, 0, 1,
       1, 0, 1, 1, 0, 0, 1, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 1, 1,
       0, 0, 1, 1, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0,
       1, 0, 1, 0, 1, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1,
       0, 0, 0, 0, 1, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 1,
       0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 1, 0, 0, 0, 0, 0, 1, 0, 1, 1, 0,
       0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0,

In [26]:
predictions = model3.predict(Xtest_sc)
submission = pd.DataFrame(test["PassengerId"]).join(pd.DataFrame(predictions, columns=["Survived"]))
submission.to_csv("submission2", header=True, index=False)

**Logit**

In [29]:
from sklearn.linear_model import LogisticRegression
import warnings
warnings.simplefilter(action='ignore', category=FutureWarning)

X = train.drop("Survived", axis=1)
y = train["Survived"]

predictions = []
f1_scores = []
yvals = []
mses = []

for i in range(30):
    Xtrain, Xval, ytrain, yval = train_test_split(X,y,test_size=0.10,shuffle=True)

    Xtrain_dict = Xtrain.to_dict(orient="records")
    Xval_dict = Xval.to_dict(orient="records")

    vec = DictVectorizer(sparse=False) 
    vec.fit(Xtrain_dict)
    Xtrain = vec.transform(Xtrain_dict)
    Xval = vec.transform(Xval_dict)

    scaler = StandardScaler()
    scaler.fit(Xtrain)
    Xtrain_sc = scaler.transform(Xtrain)
    Xval_sc = scaler.transform(Xval)
      
    log_reg = LogisticRegression()
    model = log_reg.fit(Xtrain_sc, ytrain)
    
    predictions.append(model.predict(Xval_sc))
    yvals.append(yval)
    mses.append(mean_squared_error(yval, model.predict(Xval_sc)))
    f1_scores.append(f1_score(yval, model.predict(Xval_sc)))

In [30]:
rf_df = pd.DataFrame(mses, columns=["Logit mses"])
rf_df.describe()

Unnamed: 0,Logit mses
count,30.0
mean,0.189259
std,0.036734
min,0.1
25%,0.169444
50%,0.2
75%,0.211111
max,0.244444


In [31]:
f1_df = pd.DataFrame(f1_scores, columns=["Logit f1s"])
f1_df.describe()

Unnamed: 0,Logit f1s
count,30.0
mean,0.741731
std,0.059377
min,0.62069
25%,0.707414
50%,0.753008
75%,0.777363
max,0.861538


**Logit Testing**

In [32]:
Xtrain = X
ytrain = y
Xtest = test

Xtrain = Xtrain.to_dict(orient="records")
Xtest = Xtest.to_dict(orient="records")

vec = DictVectorizer(sparse=False) 
vec.fit(Xtrain)
Xtrain = vec.transform(Xtrain)
Xtest = vec.transform(Xtest)

scaler = StandardScaler()
scaler.fit(Xtrain)
Xtrain_sc = scaler.transform(Xtrain)
Xtest_sc = scaler.transform(Xtest)

logit = LogisticRegression()

In [33]:
model4 = logit.fit(Xtrain_sc, ytrain)

**Ensemble RF, GB, Logit**

In [35]:
Xtrain = X
ytrain = y

Xtrain = Xtrain.to_dict(orient="records")
# Xtest = Xtest.to_dict(orient="records")

vec = DictVectorizer(sparse=False) 
vec.fit(Xtrain)
Xtrain = vec.transform(Xtrain)
# Xtest = vec.transform(Xtest)

scaler = StandardScaler()
scaler.fit(Xtrain)
Xtrain_sc = scaler.transform(Xtrain)
# Xtest_sc = scaler.transform(Xtest)

seed = 7
kfold = model_selection.KFold(n_splits=10, random_state=seed)

# create the sub models
estimators = []
estimators.append(('rf', model1))
estimators.append(('gb', model2))
estimators.append(('lg', model4))

# create the ensemble model
ensemble = VotingClassifier(estimators, voting="hard")
results = model_selection.cross_val_score(ensemble, Xtrain_sc, ytrain, cv=kfold)
print(results.mean())

0.8238202247191012


In [36]:
Xtest = test
Xtest = Xtest.to_dict(orient="records")
Xtest = vec.transform(Xtest)
Xtest_sc = scaler.transform(Xtest)

model5 = ensemble.fit(Xtrain_sc, ytrain)
model5.predict(Xtest_sc)

array([0, 1, 0, 0, 1, 0, 1, 0, 1, 0, 0, 0, 1, 0, 1, 1, 0, 0, 1, 1, 1, 0,
       1, 0, 1, 0, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 1, 0, 1, 0, 0, 0, 1,
       1, 0, 0, 0, 1, 1, 0, 0, 1, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 1, 1,
       1, 0, 1, 1, 1, 0, 1, 1, 1, 0, 0, 1, 0, 1, 1, 0, 0, 0, 0, 0, 1, 1,
       1, 1, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0,
       0, 1, 1, 1, 1, 0, 0, 1, 0, 1, 1, 0, 1, 0, 0, 1, 0, 1, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 1, 0, 0, 0, 0, 0, 1, 0, 1, 0, 0, 1,
       1, 0, 1, 1, 0, 1, 1, 1, 1, 0, 0, 1, 0, 0, 1, 1, 0, 0, 0, 0, 0, 1,
       1, 0, 1, 1, 0, 1, 1, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 1, 1,
       0, 1, 1, 1, 1, 1, 0, 0, 1, 0, 1, 0, 0, 0, 0, 1, 1, 0, 1, 0, 1, 0,
       1, 0, 1, 0, 1, 1, 0, 1, 0, 0, 1, 1, 0, 0, 1, 0, 1, 0, 1, 1, 1, 1,
       1, 0, 0, 0, 1, 0, 1, 1, 1, 0, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 1,
       0, 0, 0, 0, 1, 0, 0, 0, 1, 1, 0, 1, 0, 0, 0, 0, 1, 1, 1, 1, 1, 0,
       0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0,

In [37]:
predictions = model5.predict(Xtest_sc)
submission = pd.DataFrame(test["PassengerId"]).join(pd.DataFrame(predictions, columns=["Survived"]))
submission.to_csv("submission3", header=True, index=False)