In [None]:
import pandas as pd 
import numpy as np 
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.metrics import  accuracy_score, recall_score
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import AdaBoostClassifier,BaggingClassifier,ExtraTreesClassifier,GradientBoostingClassifier,RandomForestClassifier,IsolationForest,VotingClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.naive_bayes import GaussianNB
from xgboost import XGBClassifier
from lightgbm import LGBMClassifier
from catboost import CatBoostClassifier


In [2]:
train = pd.read_csv("Desktop/train.csv")
test = pd.read_csv("Desktop/test.csv")

In [3]:
train.head(2)

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C


In [4]:
test.head(2)

Unnamed: 0,PassengerId,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,892,3,"Kelly, Mr. James",male,34.5,0,0,330911,7.8292,,Q
1,893,3,"Wilkes, Mrs. James (Ellen Needs)",female,47.0,1,0,363272,7.0,,S


In [5]:
testid = test["PassengerId"]

In [6]:
train = train.drop(["PassengerId"],axis=1)
test = test.drop(["PassengerId"],axis=1)

In [7]:
train_len = len(train)

In [8]:
fulldata = pd.concat([train,test],axis=0).reset_index(drop=True)

In [9]:
fulldata.isnull().sum()

Survived     418
Pclass         0
Name           0
Sex            0
Age          263
SibSp          0
Parch          0
Ticket         0
Fare           1
Cabin       1014
Embarked       2
dtype: int64

In [10]:
fulldata.Age.fillna(fulldata.Age.mean(),inplace=True)
fulldata.Fare.fillna(fulldata.Fare.mean(),inplace=True)
fulldata.Cabin.fillna(fulldata.Cabin.mode()[0],inplace=True)
fulldata.Embarked.fillna(fulldata.Embarked.mode()[0],inplace=True)
fulldata["family"]=fulldata["SibSp"]+fulldata["Parch"]+1

In [11]:
fulldata.family.value_counts()

1     790
2     235
3     159
4      43
6      25
5      22
7      16
11     11
8       8
Name: family, dtype: int64

In [12]:
fulldata["Single"] = fulldata["family"].map(lambda x : 1 if x==1 else 0)
fulldata["Small"] = fulldata["family"].map(lambda x : 1 if x==2 else 0)
fulldata["Medium"] = fulldata["family"].map(lambda x : 1 if 3<=x<=4 else 0)
fulldata["Large"] = fulldata["family"].map(lambda x : 1 if x>=5 else 0)

In [13]:
fulldata.head(7)

Unnamed: 0,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked,family,Single,Small,Medium,Large
0,0.0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,C23 C25 C27,S,2,0,1,0,0
1,1.0,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C,2,0,1,0,0
2,1.0,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,C23 C25 C27,S,1,1,0,0,0
3,1.0,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S,2,0,1,0,0
4,0.0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,C23 C25 C27,S,1,1,0,0,0
5,0.0,3,"Moran, Mr. James",male,29.881138,0,0,330877,8.4583,C23 C25 C27,Q,1,1,0,0,0
6,0.0,1,"McCarthy, Mr. Timothy J",male,54.0,0,0,17463,51.8625,E46,S,1,1,0,0,0


In [14]:
fulldata["Sex"]=fulldata["Sex"].map(lambda x : 1 if x == "male" else 0 )

In [15]:
fulldata["Embarked"]=fulldata["Embarked"].map({"S":0,"C":1,"Q":2})

In [16]:
fulldata_title = [i.split(",")[1].split(".")[0].strip() for i in fulldata["Name"]]
fulldata["Title"]=pd.Series(fulldata_title)
fulldata.head(5)

Unnamed: 0,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked,family,Single,Small,Medium,Large,Title
0,0.0,3,"Braund, Mr. Owen Harris",1,22.0,1,0,A/5 21171,7.25,C23 C25 C27,0,2,0,1,0,0,Mr
1,1.0,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",0,38.0,1,0,PC 17599,71.2833,C85,1,2,0,1,0,0,Mrs
2,1.0,3,"Heikkinen, Miss. Laina",0,26.0,0,0,STON/O2. 3101282,7.925,C23 C25 C27,0,1,1,0,0,0,Miss
3,1.0,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",0,35.0,1,0,113803,53.1,C123,0,2,0,1,0,0,Mrs
4,0.0,3,"Allen, Mr. William Henry",1,35.0,0,0,373450,8.05,C23 C25 C27,0,1,1,0,0,0,Mr


In [17]:
fulldata.drop(["Name","Ticket"],axis=1,inplace=True)
fulldata.Title.value_counts()

Mr              757
Miss            260
Mrs             197
Master           61
Dr                8
Rev               8
Col               4
Ms                2
Mlle              2
Major             2
Mme               1
Capt              1
the Countess      1
Don               1
Dona              1
Jonkheer          1
Sir               1
Lady              1
Name: Title, dtype: int64

In [18]:
fulldata["Title"] = fulldata["Title"].replace(['Lady', 'the Countess','Countess','Capt', 'Col','Don', 'Dr', 'Major', 'Rev', 'Sir', 'Jonkheer', 'Dona'], 'Rare')
fulldata["Title"] = fulldata["Title"].map({"Master":0, "Miss":1, "Ms" : 1 , "Mme":1, "Mlle":1, "Mrs":1, "Mr":2, "Rare":3})
fulldata["Title"] = fulldata["Title"].astype(int)

In [19]:
fulldata.dtypes

Survived    float64
Pclass        int64
Sex           int64
Age         float64
SibSp         int64
Parch         int64
Fare        float64
Cabin        object
Embarked      int64
family        int64
Single        int64
Small         int64
Medium        int64
Large         int64
Title         int32
dtype: object

In [20]:
fulldata["Pclass"] = fulldata["Pclass"].astype("category")
fulldata = pd.get_dummies(fulldata, columns = ["Pclass"],prefix="Pc")


In [21]:
fulldata.head(4)

Unnamed: 0,Survived,Sex,Age,SibSp,Parch,Fare,Cabin,Embarked,family,Single,Small,Medium,Large,Title,Pc_1,Pc_2,Pc_3
0,0.0,1,22.0,1,0,7.25,C23 C25 C27,0,2,0,1,0,0,2,0,0,1
1,1.0,0,38.0,1,0,71.2833,C85,1,2,0,1,0,0,1,1,0,0
2,1.0,0,26.0,0,0,7.925,C23 C25 C27,0,1,1,0,0,0,1,0,0,1
3,1.0,0,35.0,1,0,53.1,C123,0,2,0,1,0,0,1,1,0,0


In [22]:
fulldata.dtypes

Survived    float64
Sex           int64
Age         float64
SibSp         int64
Parch         int64
Fare        float64
Cabin        object
Embarked      int64
family        int64
Single        int64
Small         int64
Medium        int64
Large         int64
Title         int32
Pc_1          uint8
Pc_2          uint8
Pc_3          uint8
dtype: object

In [23]:
fulldata.drop(["Cabin"],axis=1,inplace=True)

In [24]:
train = fulldata[:train_len]
test = fulldata[train_len:]
test.drop(labels=["Survived"],axis = 1,inplace=True)


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  errors=errors,


In [25]:
y = train.Survived.values
x = train.drop(["Survived"],axis=1)

In [26]:
x_train,x_test,y_train,y_test = train_test_split(x,y,test_size=0.20)

In [27]:
xgb = XGBClassifier().fit(x_train,y_train)
lightgbm = LGBMClassifier().fit(x_train,y_train)
extratree = ExtraTreesClassifier().fit(x_train,y_train)
catboost = CatBoostClassifier().fit(x_train,y_train)
ada = AdaBoostClassifier().fit(x_train,y_train)
grad = GradientBoostingClassifier().fit(x_train,y_train)
log = LogisticRegression().fit(x_train,y_train)
gauss = GaussianNB().fit(x_train,y_train)
rf = RandomForestClassifier().fit(x_train,y_train)
neural = MLPClassifier().fit(x_train,y_train)
bagging = BaggingClassifier().fit(x_train,y_train)
modeller = [xgb,lightgbm,extratree,catboost,ada,grad,log,gauss,rf,neural,bagging]




Learning rate set to 0.008911
0:	learn: 0.6863458	total: 58.4ms	remaining: 58.4s
1:	learn: 0.6813640	total: 60.8ms	remaining: 30.3s
2:	learn: 0.6752856	total: 63ms	remaining: 20.9s
3:	learn: 0.6685849	total: 65.4ms	remaining: 16.3s
4:	learn: 0.6618261	total: 67.6ms	remaining: 13.4s
5:	learn: 0.6551895	total: 69.8ms	remaining: 11.6s
6:	learn: 0.6495070	total: 72.3ms	remaining: 10.3s
7:	learn: 0.6433626	total: 74.5ms	remaining: 9.24s
8:	learn: 0.6374791	total: 77.4ms	remaining: 8.52s
9:	learn: 0.6321251	total: 80.1ms	remaining: 7.93s
10:	learn: 0.6283303	total: 82.7ms	remaining: 7.44s
11:	learn: 0.6236214	total: 86ms	remaining: 7.08s
12:	learn: 0.6181702	total: 88.5ms	remaining: 6.72s
13:	learn: 0.6135448	total: 90.8ms	remaining: 6.39s
14:	learn: 0.6086632	total: 92.8ms	remaining: 6.09s
15:	learn: 0.6032083	total: 95.4ms	remaining: 5.87s
16:	learn: 0.5982976	total: 98ms	remaining: 5.67s
17:	learn: 0.5937047	total: 101ms	remaining: 5.5s
18:	learn: 0.5907252	total: 103ms	remaining: 5.32s
1

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html.
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression


In [31]:
for model in modeller:
    isimler = model.__class__.__name__
    pred = model.predict(x_test)
    acc = accuracy_score(pred,y_test)
    print(isimler + "---> " + "accuracy : {:.2%}".format(acc) )


XGBClassifier---> accuracy : 78.21%
LGBMClassifier---> accuracy : 78.21%
ExtraTreesClassifier---> accuracy : 75.98%
CatBoostClassifier---> accuracy : 79.33%
AdaBoostClassifier---> accuracy : 78.77%
GradientBoostingClassifier---> accuracy : 80.45%
LogisticRegression---> accuracy : 78.77%
GaussianNB---> accuracy : 77.09%
RandomForestClassifier---> accuracy : 77.65%
MLPClassifier---> accuracy : 77.09%
BaggingClassifier---> accuracy : 78.21%


Gradient Boosting

In [49]:
Grad = GradientBoostingClassifier().fit(x_train,y_train)

In [50]:
test_Survived = pd.Series(Grad.predict(test), name="Survived")

results = pd.concat([testid,test_Survived],axis=1)



In [51]:
results.dtypes

PassengerId      int64
Survived       float64
dtype: object

In [52]:
results.Survived = results.Survived.astype("int64")

In [53]:
results.to_csv("submission2.csv",index=False)
