In [827]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score
%matplotlib inline

In [828]:
data = pd.read_csv('titanic.csv')
data.head(5)

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


In [829]:
# حذف ستون‌های غیرضروری
data = data.drop(['PassengerId','Ticket', 'Cabin'], axis=1)

In [830]:
fill_values = {
    'Embarked': data['Embarked'].mode()[0] if not data['Embarked'].mode().empty else 'S',
    'Age': data['Age'].median(),
    'Fare': data['Fare'].median()
}
data.fillna(fill_values, inplace=True)

In [831]:
data['Sex'] = data['Sex'].map({'male' : 0, 'female' : 1})
data['Embarked'] = data['Embarked'].map({'S' : 0, 'C' : 1, 'Q' : 2})

In [832]:
#جداکردن عناوین از اسم(Mr, Miss ....)
data['Title'] = data['Name'].str.extract(r' ([A-Za-z]+)\.', expand=False)

In [833]:
# جایگزینی عناوین نادر با 'Rare'
rare_titles = ['Lady', 'Countess', 'Rev', 'Dr', 'Major', 'Col', 'Sir']
data['Title'] = data['Title'].replace(rare_titles, 'Rare')
data['Title'] = data['Title'].replace({'Mlle': 'Miss', 'Ms': 'Miss', 'Mme': 'Mrs'})

In [834]:
# تبدیل عنوان به عدد
title_mapping = {"Mr": 0, "Miss": 1, "Mrs": 2, "Master": 3, "Rare": 4}
data['Title'] = data['Title'].map(title_mapping)

In [835]:
# تبدیل به عدد
bins = [0, 5, 12, 18, 30, 50, 100]
labels = ['Infant', 'Child', 'Teen', 'YoungAdult', 'Adult', 'Senior']
data['AgeGroup'] = pd.cut(data['Age'], bins=bins, labels=labels)
data['AgeGroup'] = data['AgeGroup'].cat.codes

In [836]:
# ترکیب Title و AgeGroup
data['TitleAge'] = data['Title'].astype(str) + "_" + data['AgeGroup'].astype(str)
data['TitleAge'] = data['TitleAge'].astype('category').cat.codes 

In [837]:
# تعامل بین کلاس خدمات و جنسیت
data['Pclass_Sex'] = data['Pclass'].astype(str) + "_" + data['Sex'].astype(str)
data['Pclass_Sex'] = data['Pclass_Sex'].astype('category').cat.codes

In [838]:
data['FamilySize'] = data['SibSp'] + data['Parch']
data['IsAlone'] = (data['FamilySize'] == 0).astype(int)

In [839]:
# ویژگی‌های نهایی پس از بهبود
features = ['Pclass', 'Sex', 'TitleAge', 'Fare', 'IsAlone', 'Pclass_Sex']
X = data[features]
y = data['Survived']

In [840]:
from sklearn.feature_selection import SelectKBest, chi2

selector = SelectKBest(chi2, k=5)
X_new = selector.fit_transform(X, y)
selected_features = X.columns[selector.get_support()]
print("ویژگی‌های منتخب:", selected_features)

ویژگی‌های منتخب: Index(['Pclass', 'Sex', 'TitleAge', 'Fare', 'Pclass_Sex'], dtype='object')


In [841]:
# تقسیم داده
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [842]:
from sklearn.preprocessing import StandardScaler

scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train, ['Fare'])
X_test_scaled = scaler.transform(X_test, ['Fare'])

In [843]:
#بهترین پارامترها
from sklearn.model_selection import GridSearchCV

param_grid = {
    'max_depth': [3, 5, 7, 10],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4],
    'criterion': ['gini', 'entropy']
}

grid_search = GridSearchCV(DecisionTreeClassifier(), param_grid, cv=8, scoring='accuracy')
grid_search.fit(X_train, y_train)

best_model = grid_search.best_estimator_
print("بهترین پارامترها:", grid_search.best_params_)


بهترین پارامترها: {'criterion': 'gini', 'max_depth': 10, 'min_samples_leaf': 2, 'min_samples_split': 5}


In [844]:
# آموزش مدل
model = DecisionTreeClassifier(
    max_depth=10,
    min_samples_split=5,
    criterion='gini',
    class_weight='balanced',
    random_state=42
)
model.fit(X_train, y_train)

In [845]:
# ارزیابی
y_pred = model.predict(X_test)
print("دقت نهایی:", accuracy_score(y_test, y_pred))

دقت نهایی: 0.8603351955307262


In [846]:
from sklearn.model_selection import GridSearchCV
from sklearn.ensemble import RandomForestClassifier
param_grid = {
    'n_etimators':[50, 100, 200],
    'max_depth': [3, 5, 7, 10],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4],
    'criterion': ['gini', 'entropy']
}

In [847]:

model = RandomForestClassifier(random_state=42)
gride_search = GridSearchCV(
    estimator = model,
    param_grid = param_grid,
    cv = 7 ,
    scoring='accuracy'
)

In [848]:
#بهترین پارامتر
best_params = grid_search.best_params_
print('Best Parameters:', best_params)
#بهترین مدل
best_model = grid_search.best_estimator_
#مدل نهایی
y_pred = best_model.predict(X_test)

Best Parameters: {'criterion': 'gini', 'max_depth': 10, 'min_samples_leaf': 2, 'min_samples_split': 5}


In [894]:
final_model = RandomForestClassifier(
     n_estimators = 100,
     max_depth = 8,
     min_samples_split = 5,
     criterion = 'gini',
     random_state = 42,
)
final_model.fit(X_train, y_train)
y_pred = final_model.predict(X_test)
print("دقت نهایی:", accuracy_score(y_test, y_pred))

دقت نهایی: 0.8659217877094972
