In [124]:
# Let's import the standard modules...

import numpy as np
import matplotlib.pyplot as plt
import pandas as pd

In [125]:
# Let's view our data. 

dataset  = pd.read_csv('data.csv')

# Calculate the sum of null values for each column
null_counts = dataset.isnull().sum()

# Display the null value counts for each column
print(null_counts)



PassengerId       0
Survived          0
Pclass            0
Name              0
Sex               0
Age             263
SibSp             0
Parch             0
Ticket            0
Fare              1
Cabin          1014
Embarked          2
dtype: int64


In [126]:


# Replace missing 'Age' values with the median age
median_age = dataset['Age'].median()
dataset['Age'].fillna(median_age, inplace=True)

# Drop the 'Cabin' column
dataset.drop(columns=['Cabin'], inplace=True)

# Drop rows with missing values in the 'Embarked' column
dataset.dropna(subset=['Fare'], inplace=True)
dataset.dropna(subset=['Embarked'], inplace=True)

# Check for null values after making these changes
null_counts = dataset.isnull().sum()

# Display the null value counts for each column
print(null_counts)


PassengerId    0
Survived       0
Pclass         0
Name           0
Sex            0
Age            0
SibSp          0
Parch          0
Ticket         0
Fare           0
Embarked       0
dtype: int64


In [127]:
print

<function print>

In [128]:
X = dataset.drop(columns=['Survived', 'Name', 'PassengerId'])

# Extract the 'Survived' column into y
y = dataset['Survived']

# Assuming you have already created X as the feature matrix
print(X.head)



<bound method NDFrame.head of       Pclass     Sex   Age  SibSp  Parch              Ticket      Fare  \
0          3    male  22.0      1      0           A/5 21171    7.2500   
1          1  female  38.0      1      0            PC 17599   71.2833   
2          3  female  26.0      0      0    STON/O2. 3101282    7.9250   
3          1  female  35.0      1      0              113803   53.1000   
4          3    male  35.0      0      0              373450    8.0500   
...      ...     ...   ...    ...    ...                 ...       ...   
1304       3    male  28.0      0      0           A.5. 3236    8.0500   
1305       1  female  39.0      0      0            PC 17758  108.9000   
1306       3    male  38.5      0      0  SOTON/O.Q. 3101262    7.2500   
1307       3    male  28.0      0      0              359309    8.0500   
1308       3    male  28.0      1      1                2668   22.3583   

     Embarked  
0           S  
1           C  
2           S  
3           S  
4

In [129]:
import pandas as pd

# Assuming you have X as your feature matrix
# Extract all columns with less than 10 unique values
X_with_few_uniques = X.loc[:, X.nunique() < 10]

# Get value counts for each selected column
for column in X_with_few_uniques.columns:
    counts = X_with_few_uniques[column].value_counts()
    print(f"Value counts for column '{column}':\n{counts}\n")


Value counts for column 'Pclass':
Pclass
3    708
1    321
2    277
Name: count, dtype: int64

Value counts for column 'Sex':
Sex
male      842
female    464
Name: count, dtype: int64

Value counts for column 'SibSp':
SibSp
0    888
1    319
2     42
4     22
3     20
8      9
5      6
Name: count, dtype: int64

Value counts for column 'Parch':
Parch
0    999
1    170
2    113
3      8
5      6
4      6
6      2
9      2
Name: count, dtype: int64

Value counts for column 'Embarked':
Embarked
S    913
C    270
Q    123
Name: count, dtype: int64



In [130]:
categorical_columns = ['Pclass', 'Sex', 'SibSp', 'Parch', 'Embarked']

# Categorically encode the selected columns
for column in categorical_columns:
    X[column] = X[column].astype('category')
# Assuming 'Embarked' is a categorical column in your 'X' DataFrame
X = pd.get_dummies(X, columns=['Embarked'], prefix=['Embarked'], drop_first=True)

# Assuming 'Embarked' is a categorical column in your 'X' DataFrame
X = pd.get_dummies(X, columns=['Sex'], prefix=['Sex'], drop_first=True)


print(X.dtypes)

Pclass        category
Age            float64
SibSp         category
Parch         category
Ticket          object
Fare           float64
Embarked_Q        bool
Embarked_S        bool
Sex_male          bool
dtype: object


In [131]:
from sklearn.preprocessing import StandardScaler

# Create a StandardScaler instance
scaler = StandardScaler()

# Scale the 'Age' and 'Fare' columns
X[['Age', 'Fare']] = scaler.fit_transform(X[['Age', 'Fare']])


In [132]:
unique_tickets = X['Ticket'].unique()
print("Unique values in the 'Ticket' column:")
for ticket in unique_tickets:
    print(ticket)

Unique values in the 'Ticket' column:
A/5 21171
PC 17599
STON/O2. 3101282
113803
373450
330877
17463
349909
347742
237736
PP 9549
113783
A/5. 2151
347082
350406
248706
382652
244373
345763
2649
239865
248698
330923
113788
347077
2631
19950
330959
349216
PC 17601
PC 17569
335677
C.A. 24579
PC 17604
113789
2677
A./5. 2152
345764
2651
7546
11668
349253
SC/Paris 2123
330958
S.C./A.4. 23567
370371
14311
2662
349237
3101295
A/4. 39886
PC 17572
2926
113509
19947
C.A. 31026
2697
C.A. 34651
CA 2144
2669
36973
347088
PC 17605
2661
C.A. 29395
S.P. 3464
3101281
315151
C.A. 33111
S.O.C. 14879
2680
1601
348123
349208
374746
248738
364516
345767
345779
330932
113059
SO/C 14885
3101278
W./C. 6608
SOTON/OQ 392086
343275
343276
347466
W.E.P. 5734
C.A. 2315
364500
374910
PC 17754
PC 17759
231919
244367
349245
349215
35281
7540
3101276
349207
343120
312991
349249
371110
110465
2665
324669
4136
2627
STON/O 2. 3101294
370369
PC 17558
A4. 54510
27267
370372
C 17369
2668
347061
349241
SOTON/O.Q. 3101307
A/5. 

In [133]:
# Drop that shit

X.drop(columns=['Ticket'], inplace=True)
print(X.head)


<bound method NDFrame.head of      Pclass       Age SibSp Parch      Fare  Embarked_Q  Embarked_S  Sex_male
0         3 -0.579484     1     0 -0.501949       False        True      True
1         1  0.665369     1     0  0.735501       False       False     False
2         3 -0.268271     0     0 -0.488905       False        True     False
3         1  0.431959     1     0  0.384107       False        True     False
4         3  0.431959     0     0 -0.486489       False        True      True
...     ...       ...   ...   ...       ...         ...         ...       ...
1304      3 -0.112664     0     0 -0.486489       False        True      True
1305      1  0.743172     0     0  1.462447       False       False     False
1306      3  0.704271     0     0 -0.501949       False        True      True
1307      3 -0.112664     0     0 -0.486489       False        True      True
1308      3 -0.112664     1     1 -0.209980       False       False      True

[1306 rows x 8 columns]>


In [134]:
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score

# Split your dataset into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Create a Random Forest Regressor model
rf_regressor = RandomForestRegressor(n_estimators=100, random_state=42)  # You can adjust the number of estimators as needed

# Fit the model to the training data
rf_regressor.fit(X_train, y_train)

# Make predictions on the test data
y_pred = rf_regressor.predict(X_test)


# Round the predicted values to 1 or 0 using a threshold (e.g., 0.5)
y_pred_rounded = (y_pred > 0.5).astype(int)

# Calculate classification accuracy
accuracy = accuracy_score(y_test, y_pred_rounded)

# Convert accuracy to percentage
accuracy_percentage = accuracy * 100

print("Classification Accuracy:", accuracy_percentage, "%")



Classification Accuracy: 84.7328244274809 %


In [137]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import RandomizedSearchCV
import numpy as np

# Define the range of hyperparameters to sample from
param_dist = {
    'n_estimators': [100, 200, 300],
    'max_depth': [None, 10, 20, 30],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4],
}

# Create a Random Forest model
rf_model = RandomForestClassifier(random_state=42)

# Create a RandomizedSearchCV instance
random_search = RandomizedSearchCV(rf_model, param_distributions=param_dist, n_iter=10, cv=5, scoring='accuracy', random_state=42)

# Perform the randomized search
random_search.fit(X_train, y_train)

# Get the best hyperparameters
best_params = random_search.best_params_
best_estimator = random_search.best_estimator_

# Fit the model with the best hyperparameters
best_estimator.fit(X_train, y_train)

# Make predictions on the test data
y_pred = best_estimator.predict(X_test)

# Set the threshold for rounding predictions (0.5 for rounding to 1 or 0)
threshold = 0.5
y_pred_rounded = (y_pred >= threshold).astype(int)

# Evaluate the model
accuracy = (y_pred_rounded == y_test).mean()

print("Best Hyperparameters:", best_params)
print("Test Accuracy with Best Hyperparameters:", accuracy*100)


Best Hyperparameters: {'n_estimators': 300, 'min_samples_split': 10, 'min_samples_leaf': 1, 'max_depth': 20}
Test Accuracy with Best Hyperparameters: 88.16793893129771
