## Importing the libraries

In [1]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np
from time import time
from sklearn.impute import SimpleImputer
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import train_test_split
from sklearn.metrics import make_scorer
from sklearn.metrics import fbeta_score,accuracy_score
from sklearn.model_selection import GridSearchCV
from sklearn.ensemble import GradientBoostingClassifier,AdaBoostClassifier,RandomForestClassifier

from xgboost import XGBClassifier
from sklearn.metrics import roc_auc_score
from sklearn.metrics import mean_squared_error
import lightgbm as lgb

## Importing the dataset

In [2]:
dataset = pd.read_csv("train.csv")
test = pd.read_csv("test.csv")
X = dataset.drop(columns=['Survived'])
Y = dataset.iloc[:,1]

## Check if there any missing values 

In [3]:
dataset.isnull().sum()

PassengerId      0
Survived         0
Pclass           0
Name             0
Sex              0
Age            177
SibSp            0
Parch            0
Ticket           0
Fare             0
Cabin          687
Embarked         2
dtype: int64

In [4]:
test.isnull().sum()

PassengerId      0
Pclass           0
Name             0
Sex              0
Age             86
SibSp            0
Parch            0
Ticket           0
Fare             1
Cabin          327
Embarked         0
dtype: int64

## Handling Missing Values

In [5]:
print(X.shape)
print(test.shape)

(891, 11)
(418, 11)


In [6]:
new_X=pd.concat([X, test])
print(new_X.shape)

(1309, 11)


In [7]:
new_X.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 1309 entries, 0 to 417
Data columns (total 11 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   PassengerId  1309 non-null   int64  
 1   Pclass       1309 non-null   int64  
 2   Name         1309 non-null   object 
 3   Sex          1309 non-null   object 
 4   Age          1046 non-null   float64
 5   SibSp        1309 non-null   int64  
 6   Parch        1309 non-null   int64  
 7   Ticket       1309 non-null   object 
 8   Fare         1308 non-null   float64
 9   Cabin        295 non-null    object 
 10  Embarked     1307 non-null   object 
dtypes: float64(2), int64(4), object(5)
memory usage: 122.7+ KB


In [9]:
num=new_X.select_dtypes(include=['int64','float64'])
num1=num.columns
print(num.columns)

Index(['PassengerId', 'Pclass', 'Age', 'SibSp', 'Parch', 'Fare'], dtype='object')


In [10]:
cat=new_X.select_dtypes(include='object')
cat1=cat.columns
print(cat.columns)

Index(['Name', 'Sex', 'Ticket', 'Cabin', 'Embarked'], dtype='object')


In [11]:
imputer1 = SimpleImputer(missing_values=np.nan, strategy='median') 
imputer1.fit(new_X[num1])
new_X[num1] = imputer1.transform(new_X[num1])
imputer2 = SimpleImputer(missing_values=np.nan, strategy='most_frequent') 
imputer2.fit(new_X[cat1])
new_X[cat1] = imputer2.transform(new_X[cat1])

In [12]:
new_X.isnull().sum()

PassengerId    0
Pclass         0
Name           0
Sex            0
Age            0
SibSp          0
Parch          0
Ticket         0
Fare           0
Cabin          0
Embarked       0
dtype: int64

## Scaling & encoding 

In [14]:
""" sc = MinMaxScaler()
new_X[num1]=sc.fit_transform(new_X[num1])
new_X.head() """ 

' sc = MinMaxScaler()\nnew_X[num1]=sc.fit_transform(new_X[num1])\nnew_X.head() '

In [16]:
new_X.drop(columns=['PassengerId','Name','Cabin'],inplace=True)

In [17]:
new_X.head()

Unnamed: 0,Pclass,Sex,Age,SibSp,Parch,Ticket,Fare,Embarked,TravelBuds,TravelAlone
0,3.0,male,22.0,1.0,0.0,A/5 21171,7.25,S,1.0,0
1,1.0,female,38.0,1.0,0.0,PC 17599,71.2833,C,1.0,0
2,3.0,female,26.0,0.0,0.0,STON/O2. 3101282,7.925,S,0.0,1
3,1.0,female,35.0,1.0,0.0,113803,53.1,S,1.0,0
4,3.0,male,35.0,0.0,0.0,373450,8.05,S,0.0,1


In [18]:
new_X= pd.get_dummies(new_X)

## Splitting the dataset into the Training set and Test set

In [19]:
X=new_X.iloc[:dataset.shape[0]]
test=new_X.iloc[dataset.shape[0]:]

In [20]:
print(X.shape)
print(test.shape)

(891, 941)
(418, 941)


In [21]:
X_train, X_test, y_train, y_test = train_test_split(X, Y, test_size = 0.2, random_state = 0)

In [22]:
model=XGBClassifier(tree_method='gpu_hist',use_rmm=True)
model.fit(X_train,y_train)

pred = model.predict(X_test)

print("Unoptimized model\n------")
print("Accuracy score on testing data: {:.4f}".format(accuracy_score(y_test, pred)))


Unoptimized model
------
Accuracy score on testing data: 0.8492


In [23]:
model=AdaBoostClassifier()
model.fit(X_train,y_train)

pred = model.predict(X_test)

print("Unoptimized model\n------")
print("Accuracy score on testing data: {:.4f}".format(accuracy_score(y_test, pred)))

Unoptimized model
------
Accuracy score on testing data: 0.8212


In [24]:
model=GradientBoostingClassifier()
model.fit(X_train,y_train)

pred = model.predict(X_test)

print("Unoptimized model\n------")
print("Accuracy score on testing data: {:.4f}".format(accuracy_score(y_test, pred)))

Unoptimized model
------
Accuracy score on testing data: 0.8380


In [25]:
model = lgb.LGBMClassifier(device_type='gpu')
model.fit(X_train,y_train)

pred = model.predict(X_test)

print("Unoptimized model\n------")
print("Accuracy score on testing data: {:.4f}".format(accuracy_score(y_test, pred)))

Unoptimized model
------
Accuracy score on testing data: 0.8659


In [26]:
model = RandomForestClassifier(n_estimators=100, max_depth=3, random_state=2)
model.fit(X_train,y_train)

pred = model.predict(X_test)

print("Unoptimized model\n------")
print("Accuracy score on testing data: {:.4f}".format(accuracy_score(y_test, pred)))

Unoptimized model
------
Accuracy score on testing data: 0.7933


# Hyperparameter Tuning For XGBClassifier

In [33]:
clf=XGBClassifier(tree_method='gpu_hist',use_rmm=True)
#parameters = { 'n_estimators':range(10,200,10),'max_depth':range(1,5,1),'min_child_weight':range(1,5,1)}
parameters = { 'n_estimators':[20],'max_depth':[4],'min_child_weight':[1],'subsample':[0.1,0.2,0.3,0.4,0.5,0.6,0.7,0.8,0.9,1],'learning_rate':[0.1,0.2,0.3,0.4,0.5,0.6,0.7,0.8,0.9,1]}
# TODO: Make an fbeta_score scoring object using make_scorer()
scorer = make_scorer(accuracy_score)

# TODO: Perform grid search on the classifier using 'scorer' as the scoring method using GridSearchCV()
grid_obj = GridSearchCV(estimator=clf,param_grid=parameters,scoring=scorer,n_jobs=-1,cv=5)

# TODO: Fit the grid search object to the training data and find the optimal parameters using fit()
grid_fit = grid_obj.fit(X_train,y_train)

# Get the estimator
best_clf = grid_fit.best_estimator_

# Make predictions using the unoptimized and model
predictions = (clf.fit(X_train, y_train)).predict(X_test)
best_predictions = best_clf.predict(X_test)

# Report the before-and-afterscores
print("Unoptimized model\n------")
print("Accuracy score on testing data: {:.4f}".format(accuracy_score(y_test, predictions)))
#print("ROC-AUC score on testing data: {:.4f}".format(roc_auc_score(y_test, predictions)))
#print("F-score on testing data: {:.4f}".format(fbeta_score(y_test, predictions, beta = 0.5)))
print("\nOptimized Model\n------")
print("Final accuracy score on the testing data: {:.4f}".format(accuracy_score(y_test, best_predictions)))
#print("Final ROC-AUC score on the testing data: {:.4f}".format(roc_auc_score(y_test, best_predictions)))
#print("Final F-score on the testing data: {:.4f}".format(fbeta_score(y_test, best_predictions, beta = 0.5)))

Unoptimized model
------
Accuracy score on testing data: 0.8492

Optimized Model
------
Final accuracy score on the testing data: 0.8492


In [35]:
best_clf.get_xgb_params()

{'objective': 'binary:logistic',
 'base_score': 0.5,
 'booster': 'gbtree',
 'colsample_bylevel': 1,
 'colsample_bynode': 1,
 'colsample_bytree': 1,
 'eval_metric': None,
 'gamma': 0,
 'gpu_id': 0,
 'grow_policy': 'depthwise',
 'interaction_constraints': '',
 'learning_rate': 0.3,
 'max_bin': 256,
 'max_cat_to_onehot': 4,
 'max_delta_step': 0,
 'max_depth': 4,
 'max_leaves': 0,
 'min_child_weight': 1,
 'monotone_constraints': '()',
 'n_jobs': 0,
 'num_parallel_tree': 1,
 'predictor': 'auto',
 'random_state': 0,
 'reg_alpha': 0,
 'reg_lambda': 1,
 'sampling_method': 'uniform',
 'scale_pos_weight': 1,
 'subsample': 1,
 'tree_method': 'gpu_hist',
 'validate_parameters': 1,
 'verbosity': None,
 'use_rmm': True}

# Hyperparameter Tuning For RandomForest

In [36]:
clf=RandomForestClassifier(n_jobs=-1)
parameters = {'n_estimators': range(10,200,10),'max_depth':range(1,10,1),'min_samples_split':range(1,5,1),'min_samples_leaf':range(1,5,1)}
scorer = make_scorer(accuracy_score)

# TODO: Perform grid search on the classifier using 'scorer' as the scoring method using GridSearchCV()
grid_obj = GridSearchCV(estimator=clf,param_grid=parameters,scoring=scorer,n_jobs=-1,cv=5)

# TODO: Fit the grid search object to the training data and find the optimal parameters using fit()
grid_fit = grid_obj.fit(X_train,y_train)

# Get the estimator
best_clf = grid_fit.best_estimator_

# Make predictions using the unoptimized and model
predictions = (clf.fit(X_train, y_train)).predict(X_test)
best_predictions = best_clf.predict(X_test)

# Report the before-and-afterscores
print("Unoptimized model\n------")
print("Accuracy score on testing data: {:.4f}".format(accuracy_score(y_test, predictions)))
#print("ROC-AUC score on testing data: {:.4f}".format(roc_auc_score(y_test, predictions)))
#print("F-score on testing data: {:.4f}".format(fbeta_score(y_test, predictions, beta = 0.5)))
print("\nOptimized Model\n------")
print("Final accuracy score on the testing data: {:.4f}".format(accuracy_score(y_test, best_predictions)))
#print("Final ROC-AUC score on the testing data: {:.4f}".format(roc_auc_score(y_test, best_predictions)))
#print("Final F-score on the testing data: {:.4f}".format(fbeta_score(y_test, best_predictions, beta = 0.5)))

3420 fits failed out of a total of 13680.
The score on these train-test partitions for these parameters will be set to nan.
If these failures are not expected, you can try to debug them by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
3420 fits failed with the following error:
Traceback (most recent call last):
  File "c:\Users\Ruby\anaconda3\lib\site-packages\sklearn\model_selection\_validation.py", line 686, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "c:\Users\Ruby\anaconda3\lib\site-packages\sklearn\ensemble\_forest.py", line 476, in fit
    trees = Parallel(
  File "c:\Users\Ruby\anaconda3\lib\site-packages\joblib\parallel.py", line 1061, in __call__
    self.retrieve()
  File "c:\Users\Ruby\anaconda3\lib\site-packages\joblib\parallel.py", line 938, in retrieve
    self._output.extend(job.get(timeout=self.timeout))
  File "c:\Users\Ruby\anacond

Unoptimized model
------
Accuracy score on testing data: 0.8659

Optimized Model
------
Final accuracy score on the testing data: 0.8268


In [37]:
best_clf

In [90]:
#model=XGBClassifier(tree_method='gpu_hist',use_rmm=True,n_estimators=20,max_depth=4,min_child_weight=1,subsample=1,learning_rate=0.3)
#model=RandomForestClassifier(max_depth=8, min_samples_leaf=3, min_samples_split=4,n_jobs=-1)
model=RandomForestClassifier(max_depth=9, n_estimators=170, n_jobs=-1,random_state=42)
model.fit(X,Y)

In [91]:
pred=model.predict(test)

In [94]:
sub=pd.read_csv('gender_submission.csv')

In [95]:
sub.Survived=pred

In [96]:
sub.head()

Unnamed: 0,PassengerId,Survived
0,892,0
1,893,0
2,894,0
3,895,0
4,896,1


In [97]:
sub.to_csv("RandomForestClassifier.csv",index=False)