## Import data

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

# Download titanic data
df_titanic = sns.load_dataset("titanic")
df_titanic.info()
display(df_titanic)

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 15 columns):
 #   Column       Non-Null Count  Dtype   
---  ------       --------------  -----   
 0   survived     891 non-null    int64   
 1   pclass       891 non-null    int64   
 2   sex          891 non-null    object  
 3   age          714 non-null    float64 
 4   sibsp        891 non-null    int64   
 5   parch        891 non-null    int64   
 6   fare         891 non-null    float64 
 7   embarked     889 non-null    object  
 8   class        891 non-null    category
 9   who          891 non-null    object  
 10  adult_male   891 non-null    bool    
 11  deck         203 non-null    category
 12  embark_town  889 non-null    object  
 13  alive        891 non-null    object  
 14  alone        891 non-null    bool    
dtypes: bool(2), category(2), float64(2), int64(4), object(5)
memory usage: 80.7+ KB


Unnamed: 0,survived,pclass,sex,age,sibsp,parch,fare,embarked,class,who,adult_male,deck,embark_town,alive,alone
0,0,3,male,22.0,1,0,7.2500,S,Third,man,True,,Southampton,no,False
1,1,1,female,38.0,1,0,71.2833,C,First,woman,False,C,Cherbourg,yes,False
2,1,3,female,26.0,0,0,7.9250,S,Third,woman,False,,Southampton,yes,True
3,1,1,female,35.0,1,0,53.1000,S,First,woman,False,C,Southampton,yes,False
4,0,3,male,35.0,0,0,8.0500,S,Third,man,True,,Southampton,no,True
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
886,0,2,male,27.0,0,0,13.0000,S,Second,man,True,,Southampton,no,True
887,1,1,female,19.0,0,0,30.0000,S,First,woman,False,B,Southampton,yes,True
888,0,3,female,,1,2,23.4500,S,Third,woman,False,,Southampton,no,False
889,1,1,male,26.0,0,0,30.0000,C,First,man,True,C,Cherbourg,yes,True


## Create the database 

### put update_data copy to database

In [2]:
import sqlite3

# --- Database Setup ---
db_file = "titanic_data.db"
conn = sqlite3.connect(db_file)
cursor = conn.cursor()
cursor.execute('''
    CREATE TABLE IF NOT EXISTS titanic_info (
        survived char(2) PRIMARY KEY,
        pclass char(2),
        sex varchar(10),
        age varchar(20),
        sibsp char(2),
        parch char(2),
        fare varchar(20),
        embarked char(2),
        class varvhar(10),
        adult_male varchar(10),
        deck varchar(10),
        embark_town varchar(20),
        alive varchar(5),
        alone varchar(10)
    )
''')
conn.commit()

df_titanic.to_sql('titanic_info', con=conn, index=False, if_exists='replace')
statement = "SELECT * FROM titanic_info"
rd_titanic = pd.read_sql_query(statement, conn)
rd_titanic

Unnamed: 0,survived,pclass,sex,age,sibsp,parch,fare,embarked,class,who,adult_male,deck,embark_town,alive,alone
0,0,3,male,22.0,1,0,7.2500,S,Third,man,1,,Southampton,no,0
1,1,1,female,38.0,1,0,71.2833,C,First,woman,0,C,Cherbourg,yes,0
2,1,3,female,26.0,0,0,7.9250,S,Third,woman,0,,Southampton,yes,1
3,1,1,female,35.0,1,0,53.1000,S,First,woman,0,C,Southampton,yes,0
4,0,3,male,35.0,0,0,8.0500,S,Third,man,1,,Southampton,no,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
886,0,2,male,27.0,0,0,13.0000,S,Second,man,1,,Southampton,no,1
887,1,1,female,19.0,0,0,30.0000,S,First,woman,0,B,Southampton,yes,1
888,0,3,female,,1,2,23.4500,S,Third,woman,0,,Southampton,no,0
889,1,1,male,26.0,0,0,30.0000,C,First,man,1,C,Cherbourg,yes,1


## Data change type

In [3]:
# The data had some missing so using average of a set of numbers to fill the missing data

mf_map = {'male': 1, 'female': 0}
ef_map = {'S': 0, 'C': 1, 'Q': 2}

rd_titanic['who'] = rd_titanic['who'].astype('string')
rd_titanic['embark_town'] = rd_titanic['embark_town'].astype('string')
rd_titanic['sex'] = rd_titanic['sex'].map(mf_map)
rd_titanic['embarked'] = rd_titanic['embarked'].map(ef_map)
rd_titanic = rd_titanic.fillna({'age' : rd_titanic['age'].mean()})
update_data = rd_titanic.copy()
update_data.info()
display(update_data)

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 15 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   survived     891 non-null    int64  
 1   pclass       891 non-null    int64  
 2   sex          891 non-null    int64  
 3   age          891 non-null    float64
 4   sibsp        891 non-null    int64  
 5   parch        891 non-null    int64  
 6   fare         891 non-null    float64
 7   embarked     889 non-null    float64
 8   class        891 non-null    object 
 9   who          891 non-null    string 
 10  adult_male   891 non-null    int64  
 11  deck         203 non-null    object 
 12  embark_town  889 non-null    string 
 13  alive        891 non-null    object 
 14  alone        891 non-null    int64  
dtypes: float64(3), int64(7), object(3), string(2)
memory usage: 104.5+ KB


Unnamed: 0,survived,pclass,sex,age,sibsp,parch,fare,embarked,class,who,adult_male,deck,embark_town,alive,alone
0,0,3,1,22.000000,1,0,7.2500,0.0,Third,man,1,,Southampton,no,0
1,1,1,0,38.000000,1,0,71.2833,1.0,First,woman,0,C,Cherbourg,yes,0
2,1,3,0,26.000000,0,0,7.9250,0.0,Third,woman,0,,Southampton,yes,1
3,1,1,0,35.000000,1,0,53.1000,0.0,First,woman,0,C,Southampton,yes,0
4,0,3,1,35.000000,0,0,8.0500,0.0,Third,man,1,,Southampton,no,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
886,0,2,1,27.000000,0,0,13.0000,0.0,Second,man,1,,Southampton,no,1
887,1,1,0,19.000000,0,0,30.0000,0.0,First,woman,0,B,Southampton,yes,1
888,0,3,0,29.699118,1,2,23.4500,0.0,Third,woman,0,,Southampton,no,0
889,1,1,1,26.000000,0,0,30.0000,1.0,First,man,1,C,Cherbourg,yes,1


### Using Three catagory of Age, Fare and Sex to predict can survived or die

In [4]:
Selected_data = update_data[["age", "fare", "sex", "survived"]]
Selected_data

Unnamed: 0,age,fare,sex,survived
0,22.000000,7.2500,1,0
1,38.000000,71.2833,0,1
2,26.000000,7.9250,0,1
3,35.000000,53.1000,0,1
4,35.000000,8.0500,1,0
...,...,...,...,...
886,27.000000,13.0000,1,0
887,19.000000,30.0000,0,1
888,29.699118,23.4500,0,0
889,26.000000,30.0000,1,1


### Use Sklearn of Standard Scaler

In [5]:
from sklearn.preprocessing import StandardScaler
stdsc = StandardScaler()
Selected_data[['age','fare']] = stdsc.fit_transform(Selected_data[['age','fare']])

Selected_data.head()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self[col] = igetitem(value, i)


Unnamed: 0,age,fare,sex,survived
0,-0.592481,-0.502445,1,0
1,0.638789,0.786845,0,1
2,-0.284663,-0.488854,0,1
3,0.407926,0.42073,0,1
4,0.407926,-0.486337,1,0


In [6]:
from sklearn.neighbors import KNeighborsClassifier

df_x = Selected_data[['age', 'sex']]
df_y = Selected_data['survived']

# Data split
from sklearn.model_selection import train_test_split

x_train, x_test, y_train, y_test = train_test_split(df_x, df_y, test_size = 0.2)

In [7]:
k = 1
knn = KNeighborsClassifier(n_neighbors = k)

knn.fit(x_train, y_train)

print(knn.score(x_test, y_test))

0.5921787709497207


In [8]:
from sklearn.neighbors import KNeighborsClassifier

df_X = Selected_data[['sex','fare']]
df_Y = Selected_data['survived']

# Data split
from sklearn.model_selection import train_test_split

X_train, X_test, Y_train, Y_test = train_test_split(df_X, df_Y, test_size = 0.2)

# Train for 100 times
for i in range(1, 101):
    k = i
    knn = KNeighborsClassifier(n_neighbors=k)
    knn.fit(X_train, Y_train)
    print('No.', k, 'time training: ',knn.score(X_train, Y_train))

No. 1 time training:  0.8918539325842697
No. 2 time training:  0.8609550561797753
No. 3 time training:  0.8651685393258427
No. 4 time training:  0.8398876404494382
No. 5 time training:  0.8384831460674157
No. 6 time training:  0.8174157303370787
No. 7 time training:  0.8132022471910112
No. 8 time training:  0.8146067415730337
No. 9 time training:  0.8075842696629213
No. 10 time training:  0.800561797752809
No. 11 time training:  0.8019662921348315
No. 12 time training:  0.8019662921348315
No. 13 time training:  0.8089887640449438
No. 14 time training:  0.7935393258426966
No. 15 time training:  0.8047752808988764
No. 16 time training:  0.797752808988764
No. 17 time training:  0.7907303370786517
No. 18 time training:  0.7808988764044944
No. 19 time training:  0.7808988764044944
No. 20 time training:  0.7823033707865169
No. 21 time training:  0.7921348314606742
No. 22 time training:  0.7879213483146067
No. 23 time training:  0.7865168539325843
No. 24 time training:  0.7879213483146067
No.

In [9]:
from sklearn.metrics import accuracy_score

pred = knn.predict(X_test)
accuracy_score(Y_test, pred)

0.7597765363128491

In [10]:
from sklearn.metrics import confusion_matrix
confusion_matrix(Y_test, pred)

array([[82, 21],
       [22, 54]], dtype=int64)

In [11]:
from sklearn.metrics import classification_report

print(classification_report(Y_test, pred))

              precision    recall  f1-score   support

           0       0.79      0.80      0.79       103
           1       0.72      0.71      0.72        76

    accuracy                           0.76       179
   macro avg       0.75      0.75      0.75       179
weighted avg       0.76      0.76      0.76       179



In [12]:
from sklearn.model_selection import cross_val_score
scores = cross_val_score(knn, df_X, df_Y, cv = 10, scoring='accuracy')
print(scores)
print(scores.mean())

[0.8        0.78651685 0.76404494 0.84269663 0.7752809  0.75280899
 0.75280899 0.74157303 0.82022472 0.7752809 ]
0.7811235955056179


### Create some sample data to predict 

In [18]:
Rose = [[0, 1]]
Jack = [[1, 1]]
Roy = [[1, 2]]
Apple = [[0, 2]]
Peter = [[1, 3]]

In [14]:
result = knn.predict(Rose)[0]
if result == 1:
    print("Survived")
else:
    print("Die")

Survived


In [15]:
result = knn.predict(Jack)[0]
if result == 1:
    print("Survived")
else:
    print("Die")

Die


In [16]:
result = knn.predict(Roy)[0]
if result == 1:
    print("Survived")
else:
    print("Die")

Survived


In [17]:
result = knn.predict(Apple)[0]
if result == 1:
    print("Survived")
else:
    print("Die")

Survived


In [19]:
result = knn.predict(Peter)[0]
if result == 1:
    print("Survived")
else:
    print("Die")

Survived
