## 1️⃣ **Importing Libraries and Dataset**

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.preprocessing import OneHotEncoder
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
import warnings

warnings.filterwarnings("ignore")

## 2️⃣ **Reading Data**

In [None]:
# Load the Kaggle Titanic train/test CSVs and stack them so that every
# preprocessing step below is applied to both sets at once.
data_train = pd.read_csv('/kaggle/input/titanic/train.csv')
data_test = pd.read_csv('/kaggle/input/titanic/test.csv')
# ignore_index=True gives the combined frame a unique 0..n-1 index. Without it
# the test rows reuse the train rows' labels (0, 1, 2, ...), which makes any
# label-based alignment on `data` ambiguous.
data = pd.concat([data_train, data_test], axis=0, ignore_index=True)
data.head()

## 3️⃣ **Data PreProcessing**  

In [None]:
# Pull the honorific out of each name: the first run of letters that is
# immediately followed by a period.
data['Title'] = data['Name'].str.extract(r'([A-Za-z]+)\.')

# Rank -> titles. Inverted below into the title -> rank lookup used by map().
title_groups = {
    5: ('Don', 'Sir', 'Lady', 'Countess', 'Jonkheer'),           # Nobility
    4: ('Major', 'Col', 'Capt'),                                  # Military
    3: ('Rev',),                                                  # Clergy
    2: ('Dr',),                                                   # Professional
    1: ('Mr', 'Mrs', 'Miss', 'Master', 'Ms', 'Mme', 'Mlle'),      # Commoner
}
title_class_numeric = {
    title: rank
    for rank, titles in title_groups.items()
    for title in titles
}

# Titles that are not in the lookup come back as NaN from map() and become 0.
data['TitleClass'] = data['Title'].map(title_class_numeric).fillna(0).astype(float)

print(data[['Title', 'TitleClass']])


#### **3.1 Coding for plot**

In [None]:
def graph(i):
    """Plot column ``i`` of the module-level ``data`` frame in two figures.

    The first figure is a histogram (with KDE overlay) of ``data[i]``; the
    second is a bar plot of ``Survived`` against ``i``, labelled
    'Survival Rate'. Each figure is shown immediately.

    Args:
        i: Name of the column in ``data`` to plot.
    """
    sns.histplot(data[i], kde=True, color='skyblue')
    plt.xlabel(i)
    plt.xticks(rotation=90)
    plt.ylabel('Frequency')
    # No plt.legend() here: nothing drawn above carries a label, so the call
    # produced no legend, only a "no artists with labels" warning.
    plt.show()

    sns.barplot(x=i, y='Survived', data=data, color='skyblue')
    plt.xlabel(i)
    plt.ylabel('Survival Rate')
    plt.xticks(rotation=90)
    plt.show()

#### **3.2 Checking relation between Survival and Title**

In [None]:
graph('Title')

In [None]:
def _ticket_digits(ticket):
    """Return only the digit characters of a ticket string ('0' for non-strings)."""
    if not isinstance(ticket, str):
        return '0'
    return ''.join(ch for ch in ticket if ch.isdigit())


ticket_digits = data['Ticket'].apply(_ticket_digits)
# Tickets without any digit yield '', which cannot be cast to float, so those
# are turned into NaN before the conversion.
data['TicketNumber'] = ticket_digits.replace('', np.nan).astype(float)

print(data[['Ticket', 'TicketNumber']])


In [None]:
# The raw name is no longer needed once Title has been extracted from it.
data = data.drop('Name', axis=1)

In [None]:
# Siblings/spouses + parents/children + the passenger themself.
data['FamilySize'] = data['SibSp'].add(data['Parch']).add(1)

#### **3.3 Checking relation between Survival and FamilySize**

In [None]:
graph('FamilySize')

In [None]:
# 1 when the passenger has no family aboard (FamilySize of exactly 1), else 0.
data['IsAlone'] = data['FamilySize'].eq(1).astype(int)
# 1 when a cabin value is present, 0 when it is missing.
data['CabinKnown'] = data['Cabin'].notna().astype(int)

In [None]:
data.head()

In [None]:
# Fill missing ages with the median age of the combined train+test frame.
median_age = data["Age"].median()
data["Age"] = data["Age"].fillna(median_age)

In [None]:
def better_age_group(age):
    """Map a numeric age to a coarse life-stage label.

    Missing ages give 'Unknown'. Otherwise the label belongs to the first
    inclusive upper bound that the age does not exceed: 12 'Child',
    17 'Teenager', 25 'YoungAdult', 40 'Adult', 59 'MidAge'. Anything above
    59 is 'Senior'.
    """
    if pd.isnull(age):
        return 'Unknown'

    # (inclusive upper bound, label), in ascending order of the bound.
    brackets = (
        (12, 'Child'),
        (17, 'Teenager'),
        (25, 'YoungAdult'),
        (40, 'Adult'),
        (59, 'MidAge'),
    )
    for upper_bound, label in brackets:
        if age <= upper_bound:
            return label
    return 'Senior'

# Label every passenger's age with its life-stage bucket.
data['AgeGroup'] = data['Age'].map(better_age_group)


#### **3.4 Checking relation between Survival and AgeGroup**

In [None]:
graph('AgeGroup')

In [None]:
data.isnull().sum()

In [None]:
# Binary encoding: 1 for 'male', 0 for every other value.
data['Sex'] = data['Sex'].eq('male').astype(int)

In [None]:
data.Embarked.nunique()

In [None]:
# Deck is the first character of the cabin code; passengers without a cabin
# value get 0.
data['Deck'] = data['Cabin'].str.get(0).fillna(0)

In [None]:
# Keep only the first run of digits from the cabin code; rows with no cabin
# value or no digits become 0.
# The pattern is a raw string: '\d' inside a normal string literal is an
# invalid escape sequence, which Python warns about and plans to reject.
data['Cabin'] = data['Cabin'].str.extract(r'(\d+)', expand=False)
data['Cabin'] = data['Cabin'].fillna(0)

In [None]:
def _ticket_prefix(ticket):
    """Return the alphabetic characters of a ticket, or '0' when it has none."""
    letters = ''.join(ch for ch in ticket if ch.isalpha())
    return letters or '0'


data['Ticket'] = data['Ticket'].apply(_ticket_prefix)

In [None]:
data.head()

In [None]:
# Replace every remaining missing value, in every column, with 0.
# NOTE(review): this includes `Survived` for the rows that came from test.csv.
data = data.fillna(value=0)

In [None]:

columns_to_encode = ['Deck', 'Ticket','Embarked','Title','AgeGroup']
encoder = OneHotEncoder(sparse_output=False)

# The encoded frames below get a fresh 0..n-1 index, so `data` must carry the
# same index for the column-wise concat to line rows up.
data = data.reset_index(drop=True)

for column in columns_to_encode:
    # Cast to str so the 0 fill value and the real labels are all categories
    # of one type.
    data[column] = data[column].astype(str)

    dummies = pd.DataFrame(
        encoder.fit_transform(data[[column]]),
        columns=encoder.get_feature_names_out([column]),
    )
    # Swap the original column for its indicator columns (appended at the end).
    data = pd.concat([data.drop(column, axis=1), dummies], axis=1)


In [None]:
data.head()

In [None]:
data = data.astype(float)

In [None]:
data.PassengerId

In [None]:
# Split the combined frame back into its two sources. The boundary comes from
# data_train instead of a hard-coded 891, so it follows the file that was
# actually loaded.
n_train = len(data_train)
# .copy() makes both pieces independent frames, so later column assignments
# on them never target a slice of `data`.
data_to_test = data.iloc[n_train:].copy()
df = data.iloc[:n_train].copy()

In [None]:
df.columns

## 4️⃣ **Analysing Data** 

In [None]:
# Fare by passenger class, on the training rows.
sns.barplot(data=df, x='Pclass', y='Fare')
plt.show()

In [None]:
# One histogram (with KDE overlay) per raw training column.
for column in ['Embarked','Pclass','Age','SibSp','Parch','Fare']:
    sns.histplot(data_train[column], kde=True, color='skyblue')
    plt.xlabel(column)
    plt.ylabel('Frequency')
    # No plt.legend() here: nothing drawn above carries a label, so the call
    # produced no legend, only a "no artists with labels" warning.
    plt.show()

In [None]:
# Passenger count per sex, taken from the untouched training file.
sex_column = data_train['Sex']
sns.countplot(x=sex_column)
plt.xlabel('Sex')
plt.ylabel('count')
plt.show()

In [None]:
# Pairwise correlation of the target with a handful of numeric features.
heatmap_columns = ['Survived', 'Pclass', 'Sex', 'Age', 'SibSp', 'Parch']
correlations = df[heatmap_columns].corr()
sns.heatmap(correlations, annot=True, cmap="coolwarm", fmt=".2f")
plt.title("Correlation Heatmap")
plt.show()

In [None]:
# Bar plot of Survived against each categorical/count feature of the raw
# training file.
survival_features = ['Embarked','Pclass','Sex','SibSp','Parch']
for feature in survival_features:
    sns.barplot(data=data_train, x=feature, y='Survived', color='skyblue')
    plt.xlabel(feature)
    plt.ylabel('Survival Rate')
    plt.xticks(rotation=90)
    plt.show()

In [None]:
(df['Survived'].value_counts())


In [None]:
# Target vector and feature matrix (every remaining column of df).
# NOTE(review): PassengerId is still among the features here; confirm that
# is intended.
y = df['Survived']
X = df.drop('Survived', axis=1)

## 5️⃣ **Train Test Split** 

In [None]:
# Hold out 20% of the labelled rows for validation; fixed seed for repeatability.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

In [None]:

# Standardise the continuous features. The scaler is fitted on the training
# split only, and that same fitted transform is reused for the validation
# split and for the Kaggle test rows.
features_to_scale = ['Age', 'Fare', 'Cabin', 'TicketNumber']

scaler = StandardScaler()

X_trains = X_train.copy()
X_tests = X_test.copy()

scaler.fit(X_trains[features_to_scale])

X_trains[features_to_scale] = scaler.transform(X_trains[features_to_scale])
X_tests[features_to_scale] = scaler.transform(X_tests[features_to_scale])

# A second scaler used to be fitted on the test rows here but was never used
# for any transform, so it has been removed. Copy before assigning so the
# write does not target a slice of `data`.
data_to_test = data_to_test.copy()
data_to_test[features_to_scale] = scaler.transform(data_to_test[features_to_scale])



In [None]:
(y_train.value_counts())


In [None]:
from imblearn.over_sampling import SMOTE

# Oversample the training split only. The seed is fixed (same value as the
# train/test split) so the synthetic samples, and every score computed from
# them, are repeatable across notebook runs.
smote = SMOTE(random_state=42)
X_trains, y_train = smote.fit_resample(X_trains, y_train)

In [None]:
(y_train.value_counts())


In [None]:
df.columns

## 6️⃣ **Analysis Result** 

***After analysis we can come into following results :***

1. There were more commoner men (Title Mr) in the ship but they had the lowest survival rate .

   Women and high class people were more likely to survive.

2. There were more people who didn't have their family with them .

   But people with FamilySize 4 had more survival rate . Alone persons were quite unlikely to survive

3. Most of the people were adults and lowest were seniors . 
   
   Children had the most survival rate and seniors had the lowest .

4. Most of the people Embarked from S but people embarked from C were most likely to survive .

5. Most of the people were from 3rd class . But they had the lowest survival rate .
 
   Whereas first-class people survived the most .

6. Most people had no siblings, spouses, parents or children with them on board . 
   But people with 1 sibling/spouse and people with 3 parents/children had the highest survival rate .

7. There were more males on the ship than females, but the females had more than double the survival rate of the males .

8. In the correlation map we can see the relation between different features .


## 7️⃣ **Checking different Model Results**


In [None]:
from sklearn.model_selection import cross_val_score
import xgboost as xgb
from xgboost import XGBClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.svm import SVC
from lightgbm import LGBMClassifier
from catboost import CatBoostClassifier

In [None]:
# Baseline classifiers, default settings unless stated otherwise.
model1 = DecisionTreeClassifier()
model2 = RandomForestClassifier()
model3 = KNeighborsClassifier(n_neighbors=5)
model4 = XGBClassifier(
    use_label_encoder=False,
    eval_metric='logloss',
)
model5 = LogisticRegression(max_iter=1000)
model6 = GradientBoostingClassifier()
model7 = SVC()

In [None]:
model1.fit(X_trains,y_train)


In [None]:
model2.fit(X_trains,y_train)

In [None]:
model3.fit(X_trains,y_train)


In [None]:
model4.fit(X_trains, y_train)

In [None]:
model5.fit(X_trains, y_train)

In [None]:
model6.fit(X_trains, y_train)

In [None]:
model7.fit(X_trains, y_train)

In [None]:
y_pred=model1.predict(X_tests)
accuracy = accuracy_score(y_test, y_pred)
accuracy

In [None]:
y_pred=model2.predict(X_tests)
accuracy = accuracy_score(y_test, y_pred)
accuracy

In [None]:
y_pred=model3.predict(X_tests)
accuracy = accuracy_score(y_test, y_pred)
accuracy

In [None]:
y_pred=model4.predict(X_tests)
accuracy = accuracy_score(y_test, y_pred)
accuracy

In [None]:
y_pred=model5.predict(X_tests)
accuracy = accuracy_score(y_test, y_pred)
accuracy

In [None]:
y_pred=model6.predict(X_tests)
accuracy = accuracy_score(y_test, y_pred)
accuracy

In [None]:
y_pred=model7.predict(X_tests)
accuracy = accuracy_score(y_test, y_pred)
accuracy

In [None]:
# Manual grid over C and max_iter for a liblinear logistic regression.
# Each setting prints the 5-fold CV accuracy on X/y, then the hold-out accuracy.
solver = 'liblinear'
for reg_strength in [0.01, 0.1, 1, 10]:
    for iterations in [200, 500, 800,1000]:
        model51 = LogisticRegression(C=reg_strength, solver=solver, max_iter=iterations)
        model51.fit(X_trains, y_train)
        y_pred = model51.predict(X_tests)
        accuracy = accuracy_score(y_test, y_pred)

        scores = cross_val_score(model51, X, y, cv=5)
        print("Average CV accuracy:", scores.mean())
        print(accuracy)

In [None]:
# Manual grid over max_depth, n_estimators and min_samples_split for a
# random forest. Each setting prints the 5-fold CV accuracy on X/y and the
# hold-out accuracy. (The unused default model built before the loop is gone.)
for max_depth in [5, 10, 12, 15]:
    for n_estimators in [500, 800, 1000, 1500, 2000]:
        for min_split in [2, 4, 6, 8]:
            model21 = RandomForestClassifier(
                max_depth=max_depth,
                n_estimators=n_estimators,
                min_samples_split=min_split,
            )
            model21.fit(X_trains, y_train)
            y_pred = model21.predict(X_tests)
            accuracy = accuracy_score(y_test, y_pred)

            scores = cross_val_score(model21, X, y, cv=5)
            print(f'dep={max_depth} ; est={n_estimators} ; min_split={min_split}')
            print("Average CV accuracy:", scores.mean())
            print("Test accuracy:", accuracy)

In [None]:
# Manual grid over learning_rate, n_estimators and max_depth for gradient
# boosting. Each setting prints the 5-fold CV accuracy on X/y and the
# hold-out accuracy. (The unused default model built before the loop is gone.)
for learning_rate in [1, 0.1]:
    for n_estimators in [500, 800, 1000]:
        for max_depth in [5, 6, 7, 8]:
            model61 = GradientBoostingClassifier(
                learning_rate=learning_rate,
                n_estimators=n_estimators,
                max_depth=max_depth,
            )
            model61.fit(X_trains, y_train)
            y_pred = model61.predict(X_tests)
            accuracy = accuracy_score(y_test, y_pred)

            scores = cross_val_score(model61, X, y, cv=5)
            print(f'L={learning_rate} ; est={n_estimators} ; dep={max_depth}')
            print("Average CV accuracy:", scores.mean())
            print(accuracy)

In [None]:
# Refit the same gradient-boosting configuration five times and print the
# 5-fold CV accuracy and hold-out accuracy of each run.
# The loop counter is unused; the throwaway default model and the manual
# `i = i + 1` from the earlier version had no effect and were dropped.
for _ in range(5):
    model02 = GradientBoostingClassifier(learning_rate=1, n_estimators=800, max_depth=7)
    model02.fit(X_trains, y_train)
    y_pred = model02.predict(X_tests)
    accuracy = accuracy_score(y_test, y_pred)
    scores = cross_val_score(model02, X, y, cv=5)
    print("Average CV accuracy:", scores.mean())
    print(accuracy)

In [None]:
# Refit the same gradient-boosting configuration five times and print the
# 5-fold CV accuracy and hold-out accuracy of each run.
# The loop counter is unused; the throwaway default model and the manual
# `i = i + 1` from the earlier version had no effect and were dropped.
for _ in range(5):
    model04 = GradientBoostingClassifier(learning_rate=0.1, n_estimators=1000, max_depth=8)
    model04.fit(X_trains, y_train)
    y_pred = model04.predict(X_tests)
    accuracy = accuracy_score(y_test, y_pred)
    scores = cross_val_score(model04, X, y, cv=5)
    print("Average CV accuracy:", scores.mean())
    print(accuracy)

In [None]:
# Refit the same random-forest configuration five times and print the
# 5-fold CV accuracy and hold-out accuracy of each run.
# The loop counter is unused; the throwaway default model and the manual
# `i = i + 1` from the earlier version had no effect and were dropped.
for _ in range(5):
    model01 = RandomForestClassifier(max_depth=12, n_estimators=500, min_samples_split=2)
    model01.fit(X_trains, y_train)
    y_pred = model01.predict(X_tests)
    accuracy = accuracy_score(y_test, y_pred)
    scores = cross_val_score(model01, X, y, cv=5)
    print("Average CV accuracy:", scores.mean())
    print(accuracy)

In [None]:
# Refit the same random-forest configuration five times and print the
# 5-fold CV accuracy and hold-out accuracy of each run.
# The loop counter is unused; the throwaway default model and the manual
# `i = i + 1` from the earlier version had no effect and were dropped.
for _ in range(5):
    model03 = RandomForestClassifier(max_depth=15, n_estimators=1000, min_samples_split=2)
    model03.fit(X_trains, y_train)
    y_pred = model03.predict(X_tests)
    accuracy = accuracy_score(y_test, y_pred)
    scores = cross_val_score(model03, X, y, cv=5)
    print("Average CV accuracy:", scores.mean())
    print(accuracy)

In [None]:
# Refit the same random-forest configuration five times and print the
# 5-fold CV accuracy and hold-out accuracy of each run.
for _ in range(5):
    model05 = RandomForestClassifier(max_depth=15, n_estimators=2000, min_samples_split=2)
    model05.fit(X_trains, y_train)
    # Predict with the model fitted in this iteration. This used to call
    # model03.predict, so the printed hold-out accuracy belonged to a
    # different model left over from an earlier cell.
    y_pred = model05.predict(X_tests)
    accuracy = accuracy_score(y_test, y_pred)
    scores = cross_val_score(model05, X, y, cv=5)
    print("Average CV accuracy:", scores.mean())
    print(accuracy)

In [None]:
# Refit the same random-forest configuration five times and print the
# 5-fold CV accuracy and hold-out accuracy of each run.
# The loop counter is unused; the throwaway default model and the manual
# `i = i + 1` from the earlier version had no effect and were dropped.
for _ in range(5):
    model06 = RandomForestClassifier(max_depth=20, n_estimators=1000, min_samples_split=2)
    model06.fit(X_trains, y_train)
    y_pred = model06.predict(X_tests)
    accuracy = accuracy_score(y_test, y_pred)
    scores = cross_val_score(model06, X, y, cv=5)
    print("Average CV accuracy:", scores.mean())
    print(accuracy)

In [None]:
# Refit the same random-forest configuration five times and print the
# 5-fold CV accuracy and hold-out accuracy of each run.
# The loop counter is unused; the throwaway default model and the manual
# `i = i + 1` from the earlier version had no effect and were dropped.
for _ in range(5):
    model07 = RandomForestClassifier(max_depth=15, n_estimators=800, min_samples_split=2)
    model07.fit(X_trains, y_train)
    y_pred = model07.predict(X_tests)
    accuracy = accuracy_score(y_test, y_pred)
    scores = cross_val_score(model07, X, y, cv=5)
    print("Average CV accuracy:", scores.mean())
    print(accuracy)

In [None]:
# Refit the same random-forest configuration ten times and print the
# 5-fold CV accuracy and hold-out accuracy of each run.
# The loop counter is unused; the throwaway default model and the manual
# `i = i + 1` from the earlier version had no effect and were dropped.
for _ in range(10):
    model03 = RandomForestClassifier(max_depth=15, n_estimators=1000, min_samples_split=2)
    model03.fit(X_trains, y_train)
    y_pred = model03.predict(X_tests)
    accuracy = accuracy_score(y_test, y_pred)
    scores = cross_val_score(model03, X, y, cv=5)
    print("Average CV accuracy:", scores.mean())
    print(accuracy)

In [None]:
# Refit the same random-forest configuration ten times and print the
# 5-fold CV accuracy and hold-out accuracy of each run.
# The loop counter is unused; the throwaway default model and the manual
# `i = i + 1` from the earlier version had no effect and were dropped.
for _ in range(10):
    model06 = RandomForestClassifier(max_depth=12, n_estimators=500, min_samples_split=2)
    model06.fit(X_trains, y_train)
    y_pred = model06.predict(X_tests)
    accuracy = accuracy_score(y_test, y_pred)
    scores = cross_val_score(model06, X, y, cv=5)
    print("Average CV accuracy:", scores.mean())
    print(accuracy)

#### **We can see that model06 gives the most consistent results, so we will use it for our submission.**

In [None]:
# The Kaggle test rows must not carry the target column into predict().
data_to_test = data_to_test.drop('Survived', axis=1)

In [None]:
data_to_test

In [None]:
# Final model for the submission: a random forest with max_depth=12,
# n_estimators=500, min_samples_split=2. The throwaway default instance that
# used to be built first was overwritten immediately, so it has been removed.
model = RandomForestClassifier(max_depth=12, n_estimators=500, min_samples_split=2)
model.fit(X_trains, y_train)
y_pred1 = model.predict(data_to_test)

In [None]:
# Wrap the predictions in a one-column frame (note: this rebinds `df`).
df = pd.DataFrame({'Survived': y_pred1})

In [None]:
# Put the predictions side by side with the original test rows.
data_to = pd.concat([df, data_test], axis='columns')

In [None]:
data_to

In [None]:
# Keep only the two columns the submission file needs.
submit_columns = ["Survived", "PassengerId"]
data_to_submit = data_to.loc[:, submit_columns]

In [None]:
data_to_submit

## **Submission**

In [None]:
data_to_submit=data_to_submit.astype(int)

In [None]:
data_to_submit = data_to_submit[['PassengerId', 'Survived']] 

In [None]:
data_to_submit

In [None]:
data_to_submit.to_csv("submissionn.csv",index=False)

In [None]:
import os
print(os.listdir())

In [None]:
print(data_to_submit.shape)

In [None]:
gg=pd.read_csv('/kaggle/working/submissionn.csv')

In [None]:
gg