In [1]:
import pandas as pd
import numpy as np
from mlxtend.frequent_patterns import apriori, association_rules

# SERIES

In [2]:
# Membuat series

angka1= [1,2,3]
nama_angka1= ['satu', 'dua', 'tiga']

# Series dengan indeks default, 0 1 2 ...
series1= pd.Series(data=angka1)

# Series dengan indeks yang dimodifikasi, jadi kaya dictionary
series2= pd.Series(data=angka1, index=nama_angka1)

# Bisa juga series1.index= nama_angka1

In [3]:
series1

0    1
1    2
2    3
dtype: int64

In [4]:
series2

satu    1
dua     2
tiga    3
dtype: int64

In [5]:
# Membuat series dari dictionary
dict1= {'empat':4, 'lima':5, 'enam':6}
pd.Series(dict1)

empat    4
lima     5
enam     6
dtype: int64

In [6]:
# Mengubah series menjadi array, cukup menambahkan .values
series1_to_array= series1.values
series1_to_array

array([1, 2, 3], dtype=int64)

# DATA FRAME

In [7]:
# Membuat data frame
# Data digenerate oleh numpy random, dengan arg 2 dan 3 adalah nama baris dan kolom
df1= pd.DataFrame(np.random.rand(3,4), ['A', 'B', 'C'], ['Col 1', 'Col 2', 'Col 3', 'Col 4'])

In [8]:
df1

Unnamed: 0,Col 1,Col 2,Col 3,Col 4
A,0.034172,0.589131,0.15981,0.777023
B,0.791108,0.082328,0.811851,0.317126
C,0.166927,0.279908,0.780668,0.999174


In [9]:
# Menambah kolom baru berdasarkan hasil pertambahan col 3 dan col 2
df1['Col baru']= df1['Col 3']+df1['Col 2']

In [10]:
df1

Unnamed: 0,Col 1,Col 2,Col 3,Col 4,Col baru
A,0.034172,0.589131,0.15981,0.777023,0.748941
B,0.791108,0.082328,0.811851,0.317126,0.894179
C,0.166927,0.279908,0.780668,0.999174,1.060577


In [11]:
# Menghapus kolom atau baris

# Menghapus baris B
df1= df1.drop('B')
# Menghapus kolom Col 2
df1.drop('Col 2', axis=1, inplace=True)
df1

Unnamed: 0,Col 1,Col 3,Col 4,Col baru
A,0.034172,0.15981,0.777023,0.748941
C,0.166927,0.780668,0.999174,1.060577


In [12]:
# Membuat tabel boolean jika angka lebih besar dari 0.5
df1>0.5

Unnamed: 0,Col 1,Col 3,Col 4,Col baru
A,False,False,True,True
C,False,True,True,True


In [13]:
# Menghilangkan data yang nilainya lebih kecil dari 0.5
df1[df1>0.5]

Unnamed: 0,Col 1,Col 3,Col 4,Col baru
A,,,0.777023,0.748941
C,,0.780668,0.999174,1.060577


In [14]:
# Hanya menampilkan kolom tertentu yang memiliki data bernilai < 0.5
df1= pd.DataFrame(np.random.rand(3,4), ['A', 'B', 'C'], ['Col 1', 'Col 2', 'Col 3', 'Col 4'])
df1[df1['Col 2']<0.5]

Unnamed: 0,Col 1,Col 2,Col 3,Col 4


# DATA SET TITANIC

In [15]:
df_titanic= pd.read_csv('titanic_data_set.csv')

# Memilah kolom apa yang akan digunakan
df_titanic= df_titanic[['survived', 'sex', 'age']]

df_titanic

Unnamed: 0,survived,sex,age
0,0,male,22.0
1,1,female,38.0
2,1,female,26.0
3,1,female,35.0
4,0,male,35.0
...,...,...,...
886,0,male,27.0
887,1,female,19.0
888,0,female,
889,1,male,26.0


In [16]:
# Mengecek setiap data df_titanic bernilai NaN atau tidak
df_titanic.isna()

Unnamed: 0,survived,sex,age
0,False,False,False
1,False,False,False
2,False,False,False
3,False,False,False
4,False,False,False
...,...,...,...
886,False,False,False
887,False,False,False
888,False,False,True
889,False,False,False


In [17]:
# Mengecek kolom df_titanic satu2 apakah mempunyai NaN
df_titanic.isna().sum()

survived      0
sex           0
age         177
dtype: int64

In [18]:
# Cara melihat jumlah nilai tidak NaN suatu kolom tertentu, misal age
age= df_titanic['age']
age.count(), age.nunique(dropna=True) # <-- menghitung berapa data unik

(714, 88)

In [19]:
# Untuk data lebih lengkapnya
age.describe()

count    714.000000
mean      29.699118
std       14.526497
min        0.420000
25%       20.125000
50%       28.000000
75%       38.000000
max       80.000000
Name: age, dtype: float64

In [20]:
# Memangkas baris yang bernilai NaN, menyisakan baris yang mempunyai nilai
age_cleaned= age.dropna()
age_cleaned

0      22.0
1      38.0
2      26.0
3      35.0
4      35.0
       ... 
885    39.0
886    27.0
887    19.0
889    26.0
890    32.0
Name: age, Length: 714, dtype: float64

In [21]:
# Menghapus satu data pada suatu baris berarti menghapus keseluruhan baris
# Misal, menghapus baris yang punya data NaN pada df_titanic
df_titanic_dropna= df_titanic.dropna()
df_titanic_dropna

Unnamed: 0,survived,sex,age
0,0,male,22.0
1,1,female,38.0
2,1,female,26.0
3,1,female,35.0
4,0,male,35.0
...,...,...,...
885,0,female,39.0
886,0,male,27.0
887,1,female,19.0
889,1,male,26.0


In [22]:
# Tapi menghapus keseluruhan baris menimbulkan kerugian karena ada data yg hilang, hanya karena satu nilai kolom NaN
# Cara mengatasinya adalah mengisinya dengan rata-rata
df_titanic_fillna= df_titanic
df_titanic_fillna.age= df_titanic_fillna.age.fillna(value=df_titanic_fillna.age.mean())
df_titanic_fillna

Unnamed: 0,survived,sex,age
0,0,male,22.000000
1,1,female,38.000000
2,1,female,26.000000
3,1,female,35.000000
4,0,male,35.000000
...,...,...,...
886,0,male,27.000000
887,1,female,19.000000
888,0,female,29.699118
889,1,male,26.000000


In [41]:
# Mengecek jika terdapat null
df_titanic_fillna.isna().sum()

survived    0
sex         0
age         0
dtype: int64

In [25]:
df_titanic_fillna.groupby(['sex', 'survived'])['age'].sum()

sex     survived
female  0            2107.885000
        1            6752.168235
male    0           14590.004706
        1            3011.855882
Name: age, dtype: float64

In [42]:
# Mengubah semua age menjadi float
df_titanic_fillna['age']= df_titanic_fillna['age'].astype(float)

### -- Tes Arules --

In [27]:
temp= df_titanic_fillna['age']

In [43]:
# Membuat fungsi encode
def hot_encode(x):
    if(x<=12):
        return 0
    else:
        return 1

def sex_encode(x):
    if(x=='female'):
        return 0
    else:
        return 1

In [29]:
df_titanic_fillna_age_encode= df_titanic_fillna[['age']].applymap(hot_encode)

In [30]:
df_titanic_fillna['age']= df_titanic_fillna_age_encode


In [31]:
df_titanic_fillna['age']= df_titanic_fillna['age'].astype('str')
df_titanic_fillna

Unnamed: 0,survived,sex,age
0,0,male,1
1,1,female,1
2,1,female,1
3,1,female,1
4,0,male,1
...,...,...,...
886,0,male,1
887,1,female,1
888,0,female,1
889,1,male,1


In [32]:
df_titanic_fillna['sex']= df_titanic_fillna[['sex']].applymap(sex_encode)

In [33]:
df_titanic_fillna['age']= df_titanic_fillna['sex'].astype('int')

In [34]:
df_titanic_fillna

Unnamed: 0,survived,sex,age
0,0,1,1
1,1,0,0
2,1,0,0
3,1,0,0
4,0,1,1
...,...,...,...
886,0,1,1
887,1,0,0
888,0,0,0
889,1,1,1


In [35]:
df_rule= df_titanic_fillna.groupby(['sex','age'])['survived'].sum().unstack()

In [36]:
df_rule

age,0,1
sex,Unnamed: 1_level_1,Unnamed: 2_level_1
0,233.0,
1,,109.0


In [37]:
frq_items= apriori(df_titanic_fillna, min_support=0.05, use_colnames=True)

# Mengumpulkan aturan yang disimpulkan dalam data frame
arules= association_rules(frq_items, metric='lift', min_threshold=1)
arules.sort_values(['confidence','lift'], ascending=[False,False])

Unnamed: 0,antecedents,consequents,antecedent support,consequent support,support,confidence,lift,leverage,conviction
0,(age),(sex),0.647587,0.647587,0.647587,1.0,1.544194,0.228218,inf
1,(sex),(age),0.647587,0.647587,0.647587,1.0,1.544194,0.228218,inf
2,"(age, survived)",(sex),0.122334,0.647587,0.122334,1.0,1.544194,0.043112,inf
3,"(sex, survived)",(age),0.122334,0.647587,0.122334,1.0,1.544194,0.043112,inf
4,(age),"(sex, survived)",0.647587,0.122334,0.122334,0.188908,1.544194,0.043112,1.082079
5,(sex),"(age, survived)",0.647587,0.122334,0.122334,0.188908,1.544194,0.043112,1.082079
