# 1. Data Preparation

# import packages

In [3]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import os #packages yang digunakan untuk mengambil data data yang d butuhkan 

#import sklearn librarries

from sklearn.preprocessing import StandardScaler, MinMaxScaler
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.cluster import KMeans


# Data Overview

In [4]:
data_path = os.path.join("Titanic", "train.csv")
print(data_path)

Titanic\train.csv


In [5]:
# data = pd.read_csv("nama_data.csv")
data = pd.read_csv(data_path)

In [6]:
# check jumlah baris (records) dan jumlah column
data.shape #outputnya adalah baris dan colomn 

(891, 12)

### Keterangan data:
1. PassengerId : Id dari records/penumpang
2. Survived : Catatan bahwa dia selamat atau meninggal (1=Selamat, 0=Meninggal)
3. Pclass : Kelas dari penumpang
4. Name : Nama penumpang
5. Sex : Jenis Kelamin penumpang
6. Age : Usia penumpang
7. SibSp : Jumlah Siblings/Spouse (saudara atau pasangan)
8. Parch : Jumlah Parent/Childern (Orang tua atau anak)
9. Ticket : Nomor Tiket
10. Fare : Harga Tiket
11. Cabin : Posisi duduk/kabin penumpang
12. Embarked : Pelabuhan tempat penumpang berangkat

In [7]:
#meliahat overview top 5 teratas
data.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


In [8]:
#melihat overview top 5 terbawah
data.tail()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
886,887,0,2,"Montvila, Rev. Juozas",male,27.0,0,0,211536,13.0,,S
887,888,1,1,"Graham, Miss. Margaret Edith",female,19.0,0,0,112053,30.0,B42,S
888,889,0,3,"Johnston, Miss. Catherine Helen ""Carrie""",female,,1,2,W./C. 6607,23.45,,S
889,890,1,1,"Behr, Mr. Karl Howell",male,26.0,0,0,111369,30.0,C148,C
890,891,0,3,"Dooley, Mr. Patrick",male,32.0,0,0,370376,7.75,,Q


In [9]:
#melihat overview random 5 data
data.sample(5, random_state=91)

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
535,536,1,2,"Hart, Miss. Eva Miriam",female,7.0,0,2,F.C.C. 13529,26.25,,S
561,562,0,3,"Sivic, Mr. Husein",male,40.0,0,0,349251,7.8958,,S
377,378,0,1,"Widener, Mr. Harry Elkins",male,27.0,0,2,113503,211.5,C82,C
636,637,0,3,"Leinonen, Mr. Antti Gustaf",male,32.0,0,0,STON/O 2. 3101292,7.925,,S
34,35,0,1,"Meyer, Mr. Edgar Joseph",male,28.0,1,0,PC 17604,82.1708,,C


In [10]:
#melihat tipe data dari masing2 kolom dan jumlah baris atau record yang kosong
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 12 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   PassengerId  891 non-null    int64  
 1   Survived     891 non-null    int64  
 2   Pclass       891 non-null    int64  
 3   Name         891 non-null    object 
 4   Sex          891 non-null    object 
 5   Age          714 non-null    float64
 6   SibSp        891 non-null    int64  
 7   Parch        891 non-null    int64  
 8   Ticket       891 non-null    object 
 9   Fare         891 non-null    float64
 10  Cabin        204 non-null    object 
 11  Embarked     889 non-null    object 
dtypes: float64(2), int64(5), object(5)
memory usage: 83.7+ KB


# Data Quality Check

In [11]:
#cek nilai nilai yang ada di suatu kolom 
#data['nama kolom'].unique()
data["Embarked"].unique()

array(['S', 'C', 'Q', nan], dtype=object)

In [12]:
#melihat persebaran data (min, max,quantile)
#data.describe() #untuk data numerik 

data.describe(include=['O']) #untuk data categorik

Unnamed: 0,Name,Sex,Ticket,Cabin,Embarked
count,891,891,891,204,889
unique,891,2,681,147,3
top,"Braund, Mr. Owen Harris",male,347082,B96 B98,S
freq,1,577,7,4,644


In [13]:
#melihat apakah ada data yang nilainya masih kosong 
data.isnull().sum() #meliat jumlah value yang kosong 

PassengerId      0
Survived         0
Pclass           0
Name             0
Sex              0
Age            177
SibSp            0
Parch            0
Ticket           0
Fare             0
Cabin          687
Embarked         2
dtype: int64

In [14]:
#persentasi baris yang kosong dari keseluruhan 
data.isna().mean()*100

PassengerId     0.000000
Survived        0.000000
Pclass          0.000000
Name            0.000000
Sex             0.000000
Age            19.865320
SibSp           0.000000
Parch           0.000000
Ticket          0.000000
Fare            0.000000
Cabin          77.104377
Embarked        0.224467
dtype: float64

In [15]:
#drop record atau baris
data_dropped = data.dropna()
data_dropped.isnull().sum()

PassengerId    0
Survived       0
Pclass         0
Name           0
Sex            0
Age            0
SibSp          0
Parch          0
Ticket         0
Fare           0
Cabin          0
Embarked       0
dtype: int64

In [16]:
data.shape, data_dropped.shape

((891, 12), (183, 12))

In [17]:
#drop variabel atau colom 
data_dropped = data.drop(columns=['Cabin'], axis= 1)
data_dropped.isnull().sum()

PassengerId      0
Survived         0
Pclass           0
Name             0
Sex              0
Age            177
SibSp            0
Parch            0
Ticket           0
Fare             0
Embarked         2
dtype: int64

In [18]:
data.shape, data_dropped.shape

((891, 12), (891, 11))

In [19]:
#mengisi record atau baris

#data_dropped = data.fillna(0)
data_dropped['Embarked_filled'] = data_dropped['Embarked'].fillna('Kosong') #atau missing 


In [20]:
data_dropped.loc[data_dropped['Embarked'].isnull()]

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Embarked,Embarked_filled
61,62,1,1,"Icard, Miss. Amelie",female,38.0,0,0,113572,80.0,,Kosong
829,830,1,1,"Stone, Mrs. George Nelson (Martha Evelyn)",female,62.0,0,0,113572,80.0,,Kosong


In [26]:
data_dropped['Age'] = data_dropped['Age'].fillna(method='ffill')
data_dropped['Age'] = data_dropped['Age'].fillna(method='bfill')

data_dropped['Embarked']=data_dropped['Embarked'].fillna(data_dropped['Embarked'].value_counts().idxmax())


In [25]:
data_dropped.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 12 columns):
 #   Column           Non-Null Count  Dtype  
---  ------           --------------  -----  
 0   PassengerId      891 non-null    int64  
 1   Survived         891 non-null    int64  
 2   Pclass           891 non-null    int64  
 3   Name             891 non-null    object 
 4   Sex              891 non-null    object 
 5   Age              891 non-null    float64
 6   SibSp            891 non-null    int64  
 7   Parch            891 non-null    int64  
 8   Ticket           891 non-null    object 
 9   Fare             891 non-null    float64
 10  Embarked         891 non-null    object 
 11  Embarked_filled  891 non-null    object 
dtypes: float64(2), int64(5), object(5)
memory usage: 83.7+ KB


# Handling Duplicate Column

In [21]:
#sebelumnya karena data nya sudah tidak ada yang dupplicate, kita tambahkan 1 date yang sama contohnya data name

data = data.append(data.loc[1],ignore_index=True)

In [22]:
#untuk mengecek baris duplicate 
data.loc[data['Name'].duplicated()]

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
891,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C


In [23]:
#membuang data yang duplicate semuanya 

data.drop_duplicates(keep=False)

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.2500,,S
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.9250,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1000,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.0500,,S
5,6,0,3,"Moran, Mr. James",male,,0,0,330877,8.4583,,Q
...,...,...,...,...,...,...,...,...,...,...,...,...
886,887,0,2,"Montvila, Rev. Juozas",male,27.0,0,0,211536,13.0000,,S
887,888,1,1,"Graham, Miss. Margaret Edith",female,19.0,0,0,112053,30.0000,B42,S
888,889,0,3,"Johnston, Miss. Catherine Helen ""Carrie""",female,,1,2,W./C. 6607,23.4500,,S
889,890,1,1,"Behr, Mr. Karl Howell",male,26.0,0,0,111369,30.0000,C148,C


In [24]:
#membuang baris duplicate semua, Kecuali paling atas 
data.drop_duplicates(keep='first')

#membuang baris duplicate semua, kecuali paling bawah 
data.drop_duplicates(keep='last')

#membuang baris duplicate semua, berdasarkan beberapa Column 
data.drop_duplicates(subset=['Name','Sex'])

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.2500,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.9250,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1000,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.0500,,S
...,...,...,...,...,...,...,...,...,...,...,...,...
886,887,0,2,"Montvila, Rev. Juozas",male,27.0,0,0,211536,13.0000,,S
887,888,1,1,"Graham, Miss. Margaret Edith",female,19.0,0,0,112053,30.0000,B42,S
888,889,0,3,"Johnston, Miss. Catherine Helen ""Carrie""",female,,1,2,W./C. 6607,23.4500,,S
889,890,1,1,"Behr, Mr. Karl Howell",male,26.0,0,0,111369,30.0000,C148,C


# 2. finding data insight

In [25]:
#agregat data berdasarkan suatu kolom
#data.groupby('kelas').mean()[['nilai']] contohnya

#data.groupby('Embarked').mean()[['Age']]

data.groupby('Sex').mean()[['Age']]


Unnamed: 0_level_0,Age
Sex,Unnamed: 1_level_1
female,27.954198
male,30.726645


In [26]:
#menghitung hubungan antara dua variable
#pd.crosstab(data('kelas'), data['ikut bimble'])

pd.crosstab(data['Sex'], data['Embarked'], normalize = True) #nomalize True / value di ubah ke persentase
#pd.crosstab(data['Sex'], data['Embarked']) #value data terlalu membingungkan 

Embarked,C,Q,S
Sex,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
female,0.083146,0.040449,0.22809
male,0.106742,0.046067,0.495506


# 3 Data Manipulation

## Binning

In [27]:
#membagi data numerik sesuai quantilnya 
pd.qcut(data['Age'],5)

0       (19.0, 25.0]
1       (32.0, 41.0]
2       (25.0, 32.0]
3       (32.0, 41.0]
4       (32.0, 41.0]
           ...      
887    (0.419, 19.0]
888              NaN
889     (25.0, 32.0]
890     (25.0, 32.0]
891     (32.0, 41.0]
Name: Age, Length: 892, dtype: category
Categories (5, interval[float64, right]): [(0.419, 19.0] < (19.0, 25.0] < (25.0, 32.0] < (32.0, 41.0] < (41.0, 80.0]]

In [28]:
pd.qcut(data['Age'],5).value_counts()

(0.419, 19.0]    164
(25.0, 32.0]     145
(41.0, 80.0]     142
(19.0, 25.0]     137
(32.0, 41.0]     127
Name: Age, dtype: int64

In [29]:
#salah satu cara merubah numerik menjadi katagori
#membagi data secara custom, misalkan nilai <75 adalah nilai tidak lulus


data_dropped['is_children'] = np.where(data_dropped['Age']<17,'Yes','No')

In [30]:
#untuk mengeceknya

data_dropped.loc[data_dropped['is_children']=='Yes']

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Embarked,Embarked_filled,is_children
7,8,0,3,"Palsson, Master. Gosta Leonard",male,2.0,3,1,349909,21.0750,S,S,Yes
9,10,1,2,"Nasser, Mrs. Nicholas (Adele Achem)",female,14.0,1,0,237736,30.0708,C,C,Yes
10,11,1,3,"Sandstrom, Miss. Marguerite Rut",female,4.0,1,1,PP 9549,16.7000,S,S,Yes
14,15,0,3,"Vestrom, Miss. Hulda Amanda Adolfina",female,14.0,0,0,350406,7.8542,S,S,Yes
16,17,0,3,"Rice, Master. Eugene",male,2.0,4,1,382652,29.1250,Q,Q,Yes
...,...,...,...,...,...,...,...,...,...,...,...,...,...
850,851,0,3,"Andersson, Master. Sigvard Harald Elias",male,4.0,4,2,347082,31.2750,S,S,Yes
852,853,0,3,"Boulos, Miss. Nourelain",female,9.0,1,1,2678,15.2458,C,C,Yes
853,854,1,1,"Lines, Miss. Mary Conover",female,16.0,0,1,PC 17592,39.4000,S,S,Yes
869,870,1,3,"Johnson, Master. Harold Theodor",male,4.0,1,1,347742,11.1333,S,S,Yes


## Transform

In [31]:
#menambah kolom jumlah dari dua nilai 

data_dropped['family_size'] = (data_dropped['SibSp'] + data_dropped['Parch'])

In [32]:
#mengubah data ke bentuk lain 

data_dropped['sqrd_fare'] = data_dropped['Fare']**2
data_dropped['Log_fare'] = np.log(data_dropped['Fare'])

  result = getattr(ufunc, method)(*inputs, **kwargs)


In [33]:
data_dropped.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Embarked,Embarked_filled,is_children,family_size,sqrd_fare,Log_fare
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,S,S,No,1,52.5625,1.981001
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C,C,No,1,5081.308859,4.266662
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,S,S,No,0,62.805625,2.070022
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,S,S,No,1,2819.61,3.972177
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,S,S,No,0,64.8025,2.085672


## Clipping 

In [34]:
#membatasi nilai dari 1-100

data_dropped['Age'] = data_dropped['Age'].clip(1,100)

In [35]:
data_dropped['Age'].describe()

count    891.000000
mean      29.584175
std       14.549419
min        1.000000
25%       20.000000
50%       28.000000
75%       38.000000
max       80.000000
Name: Age, dtype: float64

In [36]:
#clipping menggunakan boxplot

Q1 = data_dropped['Fare'].quantile(0.25)
Q3 = data_dropped['Fare'].quantile(0.75)
IQR = Q3 - Q1
Lwhishker = Q1 - 1.5 * IQR
Uwhishker = Q3 + 1.5 * IQR
data_dropped['clipped_fare'] = data_dropped['Fare'].clip(Lwhishker,Uwhishker)

In [37]:
data_dropped['clipped_fare'].describe()

count    891.000000
mean      24.046813
std       20.481625
min        0.000000
25%        7.910400
50%       14.454200
75%       31.000000
max       65.634400
Name: clipped_fare, dtype: float64

In [38]:
data_dropped['Fare'].describe()

count    891.000000
mean      32.204208
std       49.693429
min        0.000000
25%        7.910400
50%       14.454200
75%       31.000000
max      512.329200
Name: Fare, dtype: float64

## scalling

In [39]:
#import sklearn library

from sklearn.preprocessing import StandardScaler, MinMaxScaler

#StandardScaler 
scaler1 = StandardScaler()
std_scaler = scaler1.fit_transform(data_dropped[['clipped_fare']])



In [40]:
#MinMaxScaler
scaler2 = MinMaxScaler()
mm_scaler = scaler2.fit_transform(data_dropped[['clipped_fare']])

In [41]:
data_dropped['std_fare'] = std_scaler
data_dropped['mm_fare'] = mm_scaler

In [42]:
data_dropped[['clipped_fare', 'std_fare', 'mm_fare']].sample(10, random_state=91)

Unnamed: 0,clipped_fare,std_fare,mm_fare
535,26.25,0.107629,0.399943
561,7.8958,-0.789004,0.1203
377,65.6344,2.031623,1.0
636,7.925,-0.787578,0.120745
34,65.6344,2.031623,1.0
79,12.475,-0.565302,0.190068
430,26.55,0.122285,0.404513
890,7.75,-0.796127,0.118078
729,7.925,-0.787578,0.120745
287,7.8958,-0.789004,0.1203


In [43]:
data_dropped[['clipped_fare', 'std_fare', 'mm_fare']].describe()

Unnamed: 0,clipped_fare,std_fare,mm_fare
count,891.0,891.0,891.0
mean,24.046813,7.999587000000001e-17,0.366375
std,20.481625,1.000562,0.312056
min,0.0,-1.174727,0.0
25%,7.9104,-0.7882908,0.120522
50%,14.4542,-0.4686152,0.220223
75%,31.0,0.3396748,0.472313
max,65.6344,2.031623,1.0


## Endcoding

In [44]:
data_dropped.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 19 columns):
 #   Column           Non-Null Count  Dtype  
---  ------           --------------  -----  
 0   PassengerId      891 non-null    int64  
 1   Survived         891 non-null    int64  
 2   Pclass           891 non-null    int64  
 3   Name             891 non-null    object 
 4   Sex              891 non-null    object 
 5   Age              891 non-null    float64
 6   SibSp            891 non-null    int64  
 7   Parch            891 non-null    int64  
 8   Ticket           891 non-null    object 
 9   Fare             891 non-null    float64
 10  Embarked         891 non-null    object 
 11  Embarked_filled  891 non-null    object 
 12  is_children      891 non-null    object 
 13  family_size      891 non-null    int64  
 14  sqrd_fare        891 non-null    float64
 15  Log_fare         891 non-null    float64
 16  clipped_fare     891 non-null    float64
 17  std_fare        

In [45]:
col = sorted(data_dropped['Sex'].unique().tolist()) + sorted(data_dropped['Embarked'].unique().tolist()) + sorted(data_dropped['is_children'].unique().tolist())

In [46]:
from sklearn.preprocessing import OneHotEncoder

# Membuat variabel enc yang berisi class OneHotEncoder
enc = OneHotEncoder(handle_unknown='ignore')

# mengubah value 'Sex' menjadi encoder 1-0
enc_df = pd.DataFrame(enc.fit_transform(data_dropped[['Sex', 'Embarked', 'is_children']]).toarray(), 
                      columns=col) # 

# merge with main df bridge_df on key values
hasil_df = data_dropped[['Sex', 'Embarked', 'is_children']].join(enc_df) # 
hasil_df

Unnamed: 0,Sex,Embarked,is_children,female,male,C,Q,S,No,Yes
0,male,S,No,0.0,1.0,0.0,0.0,1.0,1.0,0.0
1,female,C,No,1.0,0.0,1.0,0.0,0.0,1.0,0.0
2,female,S,No,1.0,0.0,0.0,0.0,1.0,1.0,0.0
3,female,S,No,1.0,0.0,0.0,0.0,1.0,1.0,0.0
4,male,S,No,0.0,1.0,0.0,0.0,1.0,1.0,0.0
...,...,...,...,...,...,...,...,...,...,...
886,male,S,No,0.0,1.0,0.0,0.0,1.0,1.0,0.0
887,female,S,No,1.0,0.0,0.0,0.0,1.0,1.0,0.0
888,female,S,No,1.0,0.0,0.0,0.0,1.0,1.0,0.0
889,male,C,No,0.0,1.0,1.0,0.0,0.0,1.0,0.0


In [47]:
# pandas dummies
dummies = pd.get_dummies(data_dropped[['Sex', 'Embarked', 'is_children']], drop_first=True) # 
final_data = data_dropped.join(dummies)
dummies.head()

Unnamed: 0,Sex_male,Embarked_Q,Embarked_S,is_children_Yes
0,1,0,1,0
1,0,0,0,0
2,0,0,1,0
3,0,0,1,0
4,1,0,1,0


# 4 Modeling Preparation

In [48]:
final_data.columns

Index(['PassengerId', 'Survived', 'Pclass', 'Name', 'Sex', 'Age', 'SibSp',
       'Parch', 'Ticket', 'Fare', 'Embarked', 'Embarked_filled', 'is_children',
       'family_size', 'sqrd_fare', 'Log_fare', 'clipped_fare', 'std_fare',
       'mm_fare', 'Sex_male', 'Embarked_Q', 'Embarked_S', 'is_children_Yes'],
      dtype='object')

In [49]:
final_data = final_data.drop(['PassengerId','Name','Sex','Ticket','Embarked','Embarked_filled','is_children'], 1)

  final_data = final_data.drop(['PassengerId','Name','Sex','Ticket','Embarked','Embarked_filled','is_children'], 1)


In [50]:
#import library

from sklearn.model_selection import train_test_split

train, test = train_test_split(final_data, test_size = 0.3, random_state = 2021)

# 5 Modeling

In [51]:
train.columns

Index(['Survived', 'Pclass', 'Age', 'SibSp', 'Parch', 'Fare', 'family_size',
       'sqrd_fare', 'Log_fare', 'clipped_fare', 'std_fare', 'mm_fare',
       'Sex_male', 'Embarked_Q', 'Embarked_S', 'is_children_Yes'],
      dtype='object')

In [70]:
from sklearn.linear_model import LinearRegression 
# Prediktor  = X , Target = Y

X_train = train.drop(['Fare', 'sqrd_fare', 'Log_fare', 'clipped_fare', 'std_fare', 'mm_fare'], 1)
y_train = train['Fare']
X_test = test.drop(['Fare', 'sqrd_fare', 'Log_fare', 'clipped_fare', 'std_fare', 'mm_fare'], 1)
y_test = test['Fare']

lr = LinearRegression(normalize=True)
lr.fit(X_train, y_train)

  X_train = train.drop(['Fare', 'sqrd_fare', 'Log_fare', 'clipped_fare', 'std_fare', 'mm_fare'], 1)
  X_test = test.drop(['Fare', 'sqrd_fare', 'Log_fare', 'clipped_fare', 'std_fare', 'mm_fare'], 1)


LinearRegression(normalize=True)

In [53]:
?LinearRegression 
#merubah ubah hyper parameter

In [54]:
X_test[2:3]

Unnamed: 0,Survived,Pclass,Age,SibSp,Parch,family_size,Sex_male,Embarked_Q,Embarked_S,is_children_Yes
666,0,2,25.0,0,0,0,1,0,1,0


In [55]:
lr.predict(X_test[2:3])

array([30.91559742])

In [56]:
train.columns


Index(['Survived', 'Pclass', 'Age', 'SibSp', 'Parch', 'Fare', 'family_size',
       'sqrd_fare', 'Log_fare', 'clipped_fare', 'std_fare', 'mm_fare',
       'Sex_male', 'Embarked_Q', 'Embarked_S', 'is_children_Yes'],
      dtype='object')

In [72]:
from sklearn.tree import DecisionTreeClassifier
#predik X, target y

X_train = train.drop(['Survived', 'Fare', 'sqrd_fare', 'Log_fare', 'std_fare'],1)
y_train = train['Survived']
X_test = test.drop(['Survived', 'Fare', 'sqrd_fare', 'Log_fare', 'std_fare'],1)
y_test = test['Survived']

dt = DecisionTreeClassifier(max_depth=10, min_samples_split=5, random_state=2021)
dt.fit(X_train, y_train)

  X_train = train.drop(['Survived', 'Fare', 'sqrd_fare', 'Log_fare', 'std_fare'],1)
  X_test = test.drop(['Survived', 'Fare', 'sqrd_fare', 'Log_fare', 'std_fare'],1)


DecisionTreeClassifier(max_depth=10, min_samples_split=5, random_state=2021)

In [58]:
?DecisionTreeClassifier 
#merubah ubah hyper paramater 

In [59]:
X_test[2:3]

Unnamed: 0,Pclass,Age,SibSp,Parch,family_size,clipped_fare,mm_fare,Sex_male,Embarked_Q,Embarked_S,is_children_Yes
666,2,25.0,0,0,0,13.0,0.198067,1,0,1,0


In [60]:
dt.predict(X_test[2:3])

array([0], dtype=int64)

In [61]:
train.columns

Index(['Survived', 'Pclass', 'Age', 'SibSp', 'Parch', 'Fare', 'family_size',
       'sqrd_fare', 'Log_fare', 'clipped_fare', 'std_fare', 'mm_fare',
       'Sex_male', 'Embarked_Q', 'Embarked_S', 'is_children_Yes'],
      dtype='object')

In [74]:
from sklearn.cluster import KMeans
#prediktor X

X_train = train.drop(['Survived', 'Log_fare', 'std_fare'], 1)
X_test = test.drop(['Survived', 'Log_fare', 'std_fare'], 1)

Kmeans = KMeans(n_clusters=5,max_iter=100)
Kmeans.fit(X_train)

  X_train = train.drop(['Survived', 'Log_fare', 'std_fare'], 1)
  X_test = test.drop(['Survived', 'Log_fare', 'std_fare'], 1)


KMeans(max_iter=100, n_clusters=5)

In [None]:
?KMeans
#merubah ubah hyper parameter

In [63]:
X_train[2:3]

Unnamed: 0,Pclass,Age,SibSp,Parch,Fare,family_size,sqrd_fare,clipped_fare,mm_fare,Sex_male,Embarked_Q,Embarked_S,is_children_Yes
883,2,28.0,0,0,10.5,0,110.25,10.5,0.159977,1,0,1,0


In [64]:
Kmeans.predict(X_train[2:3])

array([0])

# 6 Data Evaluation

In [68]:
#regression menggunakan mean_squared_error
#classification menggunakan accuracy_score, precision_score, recall_score
#KMeans menggunakan silhouette_score
from sklearn.metrics import mean_squared_error, accuracy_score, precision_score, recall_score, silhouette_score



In [71]:
#regresion
y_pred = lr.predict(X_test)
mean_squared_error(y_test, y_pred)

1442.4150849173348

In [73]:
#classification
y_pred = dt.predict(X_test)
acc = accuracy_score(y_test, y_pred)
prec = precision_score(y_test, y_pred)
recall = recall_score(y_test, y_pred)
print(acc, prec, recall)

0.7873134328358209 0.7875 0.6116504854368932


In [77]:
#KMeans / Clustering
silhouette_score(X_train, Kmeans.labels_)

0.8943843094223066