In [3]:
import pandas as pd
import numpy as np

string_data = pd.Series(['aardvark', 'artichoke', np.nan, 'avocado'])

In [4]:
string_data

0     aardvark
1    artichoke
2          NaN
3      avocado
dtype: object

In [5]:
string_data.isnull()

0    False
1    False
2     True
3    False
dtype: bool

In [6]:
string_data[0] = None

In [7]:
string_data.isnull()

0     True
1    False
2     True
3    False
dtype: bool

In [8]:
# Filtering Out Missing Data

from numpy import nan as NA

In [9]:
data = pd.Series([1, NA, 3.5, NA, 7])

In [10]:
data.dropna()

0    1.0
2    3.5
4    7.0
dtype: float64

In [11]:
data[data.notnull()]

0    1.0
2    3.5
4    7.0
dtype: float64

In [12]:
data = pd.DataFrame([[1., 6.5, 3.], [1., NA, NA],
                    [NA, NA, NA], [NA, 6.5, 3]])

In [13]:
cleaned = data.dropna()

In [14]:
data

Unnamed: 0,0,1,2
0,1.0,6.5,3.0
1,1.0,,
2,,,
3,,6.5,3.0


In [15]:
cleaned

Unnamed: 0,0,1,2
0,1.0,6.5,3.0


In [16]:
# Passing how='all' will ONLY DROP rows that are ALL NA
data.dropna(how='all')

Unnamed: 0,0,1,2
0,1.0,6.5,3.0
1,1.0,,
3,,6.5,3.0


In [17]:
# To drop columns in the same way, pass axis=1
data[4] = NA

In [18]:
data

Unnamed: 0,0,1,2,4
0,1.0,6.5,3.0,
1,1.0,,,
2,,,,
3,,6.5,3.0,


In [19]:
data.dropna(axis=1, how='all')

Unnamed: 0,0,1,2
0,1.0,6.5,3.0
1,1.0,,
2,,,
3,,6.5,3.0


In [20]:
# This keeps only rows containing a certain number of observations

df = pd.DataFrame(np.random.randn(7,3))

In [21]:
df.iloc[:4, 1] = NA

In [22]:
df.iloc[:2, 2] = NA

In [23]:
df

Unnamed: 0,0,1,2
0,-0.03832,,
1,1.854232,,
2,-0.570343,,1.641539
3,-0.567114,,-1.295064
4,-0.585076,-0.038645,-0.23436
5,-0.728179,0.105967,0.310833
6,0.426283,0.732694,0.858008


In [24]:
df.dropna()

Unnamed: 0,0,1,2
4,-0.585076,-0.038645,-0.23436
5,-0.728179,0.105967,0.310833
6,0.426283,0.732694,0.858008


In [25]:
# This thresh=2 argument means that beyond 2 NaN, the rows is rejected

df.dropna(thresh=2)

Unnamed: 0,0,1,2
2,-0.570343,,1.641539
3,-0.567114,,-1.295064
4,-0.585076,-0.038645,-0.23436
5,-0.728179,0.105967,0.310833
6,0.426283,0.732694,0.858008


In [26]:
# Filling In Missing Data

df.fillna(0)

Unnamed: 0,0,1,2
0,-0.03832,0.0,0.0
1,1.854232,0.0,0.0
2,-0.570343,0.0,1.641539
3,-0.567114,0.0,-1.295064
4,-0.585076,-0.038645,-0.23436
5,-0.728179,0.105967,0.310833
6,0.426283,0.732694,0.858008


In [27]:
df.fillna({1: 0.5, 2: 0})

Unnamed: 0,0,1,2
0,-0.03832,0.5,0.0
1,1.854232,0.5,0.0
2,-0.570343,0.5,1.641539
3,-0.567114,0.5,-1.295064
4,-0.585076,-0.038645,-0.23436
5,-0.728179,0.105967,0.310833
6,0.426283,0.732694,0.858008


In [28]:
_ = df.fillna(0, inplace=True)

In [29]:
df

Unnamed: 0,0,1,2
0,-0.03832,0.0,0.0
1,1.854232,0.0,0.0
2,-0.570343,0.0,1.641539
3,-0.567114,0.0,-1.295064
4,-0.585076,-0.038645,-0.23436
5,-0.728179,0.105967,0.310833
6,0.426283,0.732694,0.858008


In [30]:
# Interpolation methods available for reindexing can be used with fillna

df = pd.DataFrame(np.random.randn(6, 3))

In [31]:
df.iloc[2:, 1] = NA

In [32]:
df.iloc[4:, 2] = NA

In [33]:
df

Unnamed: 0,0,1,2
0,-0.464757,0.134573,-0.048866
1,-0.812865,-0.382658,0.622528
2,-0.34886,,-0.009935
3,-2.046832,,1.654908
4,-1.562331,,
5,0.269263,,


In [34]:
df.fillna(method='ffill')

# this method will copy the last row's data into remaining NaN

Unnamed: 0,0,1,2
0,-0.464757,0.134573,-0.048866
1,-0.812865,-0.382658,0.622528
2,-0.34886,-0.382658,-0.009935
3,-2.046832,-0.382658,1.654908
4,-1.562331,-0.382658,1.654908
5,0.269263,-0.382658,1.654908


In [36]:
df.fillna(method='ffill', limit=2)

# this will limit just 2 cells that are copied the last row's data

Unnamed: 0,0,1,2
0,-0.464757,0.134573,-0.048866
1,-0.812865,-0.382658,0.622528
2,-0.34886,-0.382658,-0.009935
3,-2.046832,-0.382658,1.654908
4,-1.562331,,1.654908
5,0.269263,,1.654908


In [37]:
# with fillna, you can also customize it
# maybe assign it with mean or median value?

data = pd.Series([1., NA, 3.5, NA, 7])

In [38]:
data.fillna(data.mean())

0    1.000000
1    3.833333
2    3.500000
3    3.833333
4    7.000000
dtype: float64

In [39]:
# DATA TRANSFORMATION
# Removing Duplicates

data = pd.DataFrame({'k1': ['one', 'two'] * 3 + ['two'],
                    'k2': [1,1,2,3,3,4,4]})

In [40]:
data

Unnamed: 0,k1,k2
0,one,1
1,two,1
2,one,2
3,two,3
4,one,3
5,two,4
6,two,4


In [41]:
data.duplicated()

# DataFrame method duplicated returns Series boolean yang ngestate apakah suatu row itu duplikat (telah ada datanya di baris sebelumnya) apa belum
# Maksudnya duplikat itu semua selnya duplikat, bukan cuma 1

0    False
1    False
2    False
3    False
4    False
5    False
6     True
dtype: bool

In [42]:
data.drop_duplicates()

# drop_duplicates() returns DataFrame ketika duplicated array is False
# atau ngembaliin data yang nggak duplikat

Unnamed: 0,k1,k2
0,one,1
1,two,1
2,one,2
3,two,3
4,one,3
5,two,4


In [43]:
data['v1'] = range(7)

# dapat specify subsetnya untuk mendeteksi duplikat

In [46]:
data.drop_duplicates(['k1'])

# duplicated() dan drop_duplicates() by default akan nge-keep first observed data-combination

Unnamed: 0,k1,k2,v1
0,one,1,0
1,two,1,1


In [47]:
# pengen ngekeep yg akhir? Bisa pake argumen keep='last'

data.drop_duplicates(['k1', 'k2'], keep='last')

Unnamed: 0,k1,k2,v1
0,one,1,0
1,two,1,1
2,one,2,2
3,two,3,3
4,one,3,4
6,two,4,6


In [49]:
# Transforming Data Using a Function or Mapping

data = pd.DataFrame({'food': ['bacon', 'pulled pork', 'bacon', 'Pastrami', 'corned beef', 'Bacon', 'pastrami', 'honey ham', 'nova lox'],
                    'ounces': [4,3,12,6,7.5,8,3,5,6]})

In [50]:
data

Unnamed: 0,food,ounces
0,bacon,4.0
1,pulled pork,3.0
2,bacon,12.0
3,Pastrami,6.0
4,corned beef,7.5
5,Bacon,8.0
6,pastrami,3.0
7,honey ham,5.0
8,nova lox,6.0


In [51]:
meat_to_animal = {
    'bacon': 'pig',
    'pulled pork': 'pig',
    'pastrami': 'cow',
    'corned beef': 'cow',
    'honey ham': 'pig',
    'nova lox': 'salmon'
}

In [52]:
lowercased = data['food'].str.lower()

In [53]:
lowercased

0          bacon
1    pulled pork
2          bacon
3       pastrami
4    corned beef
5          bacon
6       pastrami
7      honey ham
8       nova lox
Name: food, dtype: object

In [54]:
data['animal'] = lowercased.map(meat_to_animal)

In [55]:
data

Unnamed: 0,food,ounces,animal
0,bacon,4.0,pig
1,pulled pork,3.0,pig
2,bacon,12.0,pig
3,Pastrami,6.0,cow
4,corned beef,7.5,cow
5,Bacon,8.0,pig
6,pastrami,3.0,cow
7,honey ham,5.0,pig
8,nova lox,6.0,salmon


In [56]:
data['food'].map(lambda x: meat_to_animal[x.lower()])

0       pig
1       pig
2       pig
3       cow
4       cow
5       pig
6       cow
7       pig
8    salmon
Name: food, dtype: object

In [57]:
# Replacing Values

# replace method is simple method to be used for replacing
data = pd.Series([1., -999., 2., -999., -1000., 3.])

In [58]:
data

0       1.0
1    -999.0
2       2.0
3    -999.0
4   -1000.0
5       3.0
dtype: float64

In [59]:
# replacing -999 with NaN

data.replace(-999, np.nan)

0       1.0
1       NaN
2       2.0
3       NaN
4   -1000.0
5       3.0
dtype: float64

In [60]:
# replacing -999 and -1000 (multiple values) with NaN

data.replace([-999, -1000], np.nan)

0    1.0
1    NaN
2    2.0
3    NaN
4    NaN
5    3.0
dtype: float64

In [61]:
# want different placements for each value? pass a list of subtitutes

data.replace([-999, -1000], [np.nan, 0])

0    1.0
1    NaN
2    2.0
3    NaN
4    0.0
5    3.0
dtype: float64

In [62]:
# atau juga assigningnya bisa pake dict

data.replace({-999: np.nan, -1000: 0})

0    1.0
1    NaN
2    2.0
3    NaN
4    0.0
5    3.0
dtype: float64

In [4]:
# Renaming Axis Indexes

# axis labels dapat diubah dengan fungsi dan mapping
import pandas as pd
import numpy as np

data = pd.DataFrame(np.arange(12).reshape((3,4)),
                   index=['Ohio', 'Colorado', 'New York'],
                   columns=['one', 'two', 'three', 'four'])

In [14]:
# index axis punya method map

transform = lambda x: x[:4].upper()

data

Unnamed: 0,one,two,three,four
ohio,0,1,2,3
colo,4,5,6,7
new,8,9,10,11


In [15]:
data.index.map(transform)

Index(['OHIO', 'COLO', 'NEW '], dtype='object')

In [16]:
# bisa assign ke index, modifikasi DataFrame

data.index = data.index.map(transform)

In [17]:
data

Unnamed: 0,one,two,three,four
OHIO,0,1,2,3
COLO,4,5,6,7
NEW,8,9,10,11


In [18]:
# bisa buat versi transform dataset tanpa modifikasi original
# pake rename method

data.rename(index=str.title, columns=str.upper)

Unnamed: 0,ONE,TWO,THREE,FOUR
Ohio,0,1,2,3
Colo,4,5,6,7
New,8,9,10,11


In [19]:
data.rename(index={'OHIO': 'INDIANA'},
           columns={'three': 'peekaboo'})

# rename method bisa pake dict-like object untuk assigning new values for a subset of axis labels

Unnamed: 0,one,two,peekaboo,four
INDIANA,0,1,2,3
COLO,4,5,6,7
NEW,8,9,10,11


In [22]:
data.rename(index={'OHIO': 'INDIANA'}, inplace=True)

# rename saves you from the chore of copying DataFrame manually dan assigning ke atribut index dan kolom manually

data

Unnamed: 0,one,two,three,four
INDIANA,0,1,2,3
COLO,4,5,6,7
NEW,8,9,10,11


In [23]:
# Discretization and Binning

# Data yg kontinu biasanya didiskritkan atau dikategorikan untuk analisis
# misal data umur

ages = [20, 22, 25, 27, 21, 23, 37, 31, 61, 45, 41, 32]

In [24]:
bins = [18, 25, 35, 60, 100]

In [25]:
cats = pd.cut(ages, bins)

In [26]:
cats

[(18, 25], (18, 25], (18, 25], (25, 35], (18, 25], ..., (25, 35], (60, 100], (35, 60], (35, 60], (25, 35]]
Length: 12
Categories (4, interval[int64]): [(18, 25] < (25, 35] < (35, 60] < (60, 100]]

In [27]:
# like mathematical notation, () means exclusive, and [] means inclusive
# right=False will change it to left-inclusive

pd.cut(ages, [18, 26, 36, 61, 100], right=False)

[[18, 26), [18, 26), [18, 26), [26, 36), [18, 26), ..., [26, 36), [61, 100), [36, 61), [36, 61), [26, 36)]
Length: 12
Categories (4, interval[int64]): [[18, 26) < [26, 36) < [36, 61) < [61, 100)]

In [28]:
# you can assign bin names by using a list or array to the labels

group_names = ['Youth', 'YoungAdult', 'MiddleAged', 'Senior']

In [29]:
pd.cut(ages, bins, labels=group_names)

[Youth, Youth, Youth, YoungAdult, Youth, ..., YoungAdult, Senior, MiddleAged, MiddleAged, YoungAdult]
Length: 12
Categories (4, object): [Youth < YoungAdult < MiddleAged < Senior]

In [30]:
# Kalo dikasihnya nomor daripada pinggiran data untuk ngecut
# hasilnya adalah ___ bins equal-length dari nilai minimum dan maksimum data

data = np.random.rand(20)

In [33]:
data

array([0.9526273 , 0.3286487 , 0.82889313, 0.08868529, 0.77140837,
       0.35862749, 0.43122823, 0.99907072, 0.45593272, 0.64122102,
       0.50611549, 0.07296104, 0.20407032, 0.6212408 , 0.38423502,
       0.18623672, 0.51850604, 0.6018634 , 0.14479717, 0.69008331])

In [32]:
pd.cut(data, 4, precision=2)

# precision itu berapa desimal

[(0.77, 1.0], (0.3, 0.54], (0.77, 1.0], (0.072, 0.3], (0.77, 1.0], ..., (0.072, 0.3], (0.3, 0.54], (0.54, 0.77], (0.072, 0.3], (0.54, 0.77]]
Length: 20
Categories (4, interval[float64]): [(0.072, 0.3] < (0.3, 0.54] < (0.54, 0.77] < (0.77, 1.0]]

In [34]:
# qcut function bakal bins data berdasarkan kuantil sampel
# bins bakal equal-sized

data = np.random.randn(1000) # random, normal distributed

In [35]:
cats = pd.cut(data, 4) # cut into quartiles

In [36]:
cats

[(0.0968, 1.692], (-3.1, -1.498], (-1.498, 0.0968], (0.0968, 1.692], (-1.498, 0.0968], ..., (0.0968, 1.692], (-1.498, 0.0968], (0.0968, 1.692], (-1.498, 0.0968], (-3.1, -1.498]]
Length: 1000
Categories (4, interval[float64]): [(-3.1, -1.498] < (-1.498, 0.0968] < (0.0968, 1.692] < (1.692, 3.287]]

In [37]:
pd.value_counts(cats)

(-1.498, 0.0968]    465
(0.0968, 1.692]     399
(1.692, 3.287]       71
(-3.1, -1.498]       65
dtype: int64

In [38]:
# you can assign your own quantiles

pd.qcut(data, [0, 0.1, 0.5, 0.9, 1.])

[(0.0355, 1.495], (-3.094, -1.293], (-1.293, 0.0355], (0.0355, 1.495], (-1.293, 0.0355], ..., (0.0355, 1.495], (-1.293, 0.0355], (0.0355, 1.495], (-1.293, 0.0355], (-3.094, -1.293]]
Length: 1000
Categories (4, interval[float64]): [(-3.094, -1.293] < (-1.293, 0.0355] < (0.0355, 1.495] < (1.495, 3.287]]

In [43]:
# Detecting and Filtering Outliers

data = pd.DataFrame(np.random.randn(1000, 4))

# 4 kolom, 1000 nomor, distribusi normal

data

Unnamed: 0,0,1,2,3
0,-0.926973,0.207367,0.549086,1.884033
1,0.718042,-0.089185,-0.800020,-0.232111
2,0.691299,-2.068669,1.102383,0.818748
3,0.832482,1.525594,-1.031306,0.952033
4,0.015755,-1.234702,1.493978,-1.700837
...,...,...,...,...
995,0.963936,0.196278,-1.970391,-0.471650
996,-0.648963,1.062454,-0.760653,-0.223032
997,0.241496,0.502883,0.192990,0.627125
998,0.248785,0.663965,-0.756765,-2.180937


In [40]:
data.describe()

Unnamed: 0,0,1,2,3
count,1000.0,1000.0,1000.0,1000.0
mean,-0.02487,0.059652,0.042599,-0.002779
std,1.002692,1.004919,1.024832,1.051562
min,-3.378271,-3.148188,-3.338094,-3.154163
25%,-0.736319,-0.60165,-0.638333,-0.685201
50%,-0.037755,-0.00114,0.058122,0.01079
75%,0.721574,0.770641,0.755681,0.654587
max,3.261541,3.124021,3.297298,3.398647


In [41]:
col = data[2]

# choosing column 2

In [42]:
col[np.abs(col) > 3]

# find absolute values in column 2 above 3

113    3.249298
146   -3.338094
169    3.018844
475    3.297298
Name: 2, dtype: float64

In [44]:
data[(np.abs(data) > 3).any(1)]

# selecting all rows containing value exceeding 3 or -3

Unnamed: 0,0,1,2,3
40,-1.162077,-3.09569,-0.165641,0.226969
241,-3.453861,0.185269,0.902061,-1.193072
390,1.235325,3.07307,-0.5178,0.625478
437,3.616503,-0.540603,-0.796772,-0.003783
455,-0.120179,-0.876168,-3.075579,-0.75324
511,1.333852,-3.314655,-1.912709,1.165678
554,-0.519421,-4.274155,1.526827,0.718513
591,3.024579,0.961791,0.357967,0.907633
738,2.01447,0.029801,3.043716,-0.353981


In [49]:
data[np.abs(data) > 3] = np.sign(data) * 3

# np.sign(data) will result 1 and -1 values based on whether the values in data are positive or negative
# cap values outside the internal -3 to 3
# code ini bakal ngeganti semua values yg outside interval -3 sampe 3 jadi masing2 -3 sama 3

In [46]:
data.describe()

Unnamed: 0,0,1,2,3
count,1000.0,1000.0,1000.0,1000.0
mean,0.065516,0.018549,-0.030417,-0.063819
std,0.962365,0.973023,0.975324,1.021392
min,-3.0,-3.0,-3.0,-2.869363
25%,-0.576634,-0.609778,-0.68883,-0.774523
50%,0.029043,0.050062,-0.038552,-0.062535
75%,0.709983,0.684876,0.639152,0.641768
max,3.0,3.0,3.0,2.622594


In [51]:
np.sign(data).head()

# np.sign(data) will result 1 and -1 values based on whether the values in data are positive or negative

Unnamed: 0,0,1,2,3
0,-1.0,1.0,1.0,1.0
1,1.0,-1.0,-1.0,-1.0
2,1.0,-1.0,1.0,1.0
3,1.0,1.0,-1.0,1.0
4,1.0,-1.0,1.0,-1.0


In [52]:
# Permutation and Random Sampling

# Permuting (randomly reordering) a Series or the rows in a DataFrame mudah dilakukan dengan fungsi numpy.random.permutation

df = pd.DataFrame(np.arange(5 * 4).reshape((5, 4)))

In [53]:
sampler = np.random.permutation(5)

# random number yang merupakan indeks dari row di DataFrame ini yang akan menjadikan dasar 'randomisasi' row di DataFrame

In [54]:
sampler

array([4, 0, 3, 2, 1])

In [55]:
df

Unnamed: 0,0,1,2,3
0,0,1,2,3
1,4,5,6,7
2,8,9,10,11
3,12,13,14,15
4,16,17,18,19


In [56]:
# randomisasi baris di DataFrame dg assigning rows bds random row index generated by Numpy

df.take(sampler)

# atau juga bisa dilakukan iloc-based indexing selain fungsi take

Unnamed: 0,0,1,2,3
4,16,17,18,19
0,0,1,2,3
3,12,13,14,15
2,8,9,10,11
1,4,5,6,7


In [57]:
# selecting random subset without replacement -> sample method

df.sample(n=3)

Unnamed: 0,0,1,2,3
0,0,1,2,3
4,16,17,18,19
1,4,5,6,7


In [58]:
choices = pd.Series([5, 7, -1, 6, 4])

In [59]:
# generate a subset with replacement, pass replace=True to sample

draws = choices.sample(n=10, replace=True)

In [60]:
draws

0    5
0    5
2   -1
0    5
2   -1
1    7
0    5
2   -1
1    7
2   -1
dtype: int64

In [61]:
# Computing Indicator / Dummy Variables

# converting a categorical variable into a 'dummy' or 'indicator' matrix
# maksudnya apakah suatu data dengan indikator yg sama ada pada suatu row apa nggak

df = pd.DataFrame({'key': ['b', 'b', 'a', 'c', 'a', 'b'],
                  'data1': range(6)})

In [62]:
df

Unnamed: 0,key,data1
0,b,0
1,b,1
2,a,2
3,c,3
4,a,4
5,b,5


In [65]:
pd.get_dummies(df['key'])

Unnamed: 0,a,b,c
0,0,1,0
1,0,1,0
2,1,0,0
3,0,0,1
4,1,0,0
5,0,1,0


In [66]:
pd.get_dummies(df['data1'])

Unnamed: 0,0,1,2,3,4,5
0,1,0,0,0,0,0
1,0,1,0,0,0,0
2,0,0,1,0,0,0
3,0,0,0,1,0,0
4,0,0,0,0,1,0
5,0,0,0,0,0,1


In [67]:
# prefix untuk indeks row/column ada di argumen get_dummies

dummies = pd.get_dummies(df['key'], prefix='key')

In [68]:
dummies

Unnamed: 0,key_a,key_b,key_c
0,0,1,0
1,0,1,0
2,1,0,0
3,0,0,1
4,1,0,0
5,0,1,0


In [69]:
df_with_dummy = df[['data1']].join(dummies)

In [71]:
df_with_dummy

Unnamed: 0,data1,key_a,key_b,key_c
0,0,0,1,0
1,1,0,1,0
2,2,1,0,0
3,3,0,0,1
4,4,1,0,0
5,5,0,1,0


In [72]:
# example in this

mnames = ['movie_id', 'title', 'genres']

In [73]:
movies = pd.read_table('datasets/movies.dat', sep='::',
                      header=None, names=mnames)

  


In [74]:
movies[:10]

Unnamed: 0,movie_id,title,genres
0,1,Toy Story (1995),Animation|Children's|Comedy
1,2,Jumanji (1995),Adventure|Children's|Fantasy
2,3,Grumpier Old Men (1995),Comedy|Romance
3,4,Waiting to Exhale (1995),Comedy|Drama
4,5,Father of the Bride Part II (1995),Comedy
5,6,Heat (1995),Action|Crime|Thriller
6,7,Sabrina (1995),Comedy|Romance
7,8,Tom and Huck (1995),Adventure|Children's
8,9,Sudden Death (1995),Action
9,10,GoldenEye (1995),Action|Adventure|Thriller


In [75]:
# adding indicator variables for each genre

all_genres = []

In [76]:
for x in movies.genres:
    all_genres.extend(x.split('|'))

In [77]:
genres = pd.unique(all_genres)

In [78]:
genres

array(['Animation', "Children's", 'Comedy', 'Adventure', 'Fantasy',
       'Romance', 'Drama', 'Action', 'Crime', 'Thriller', 'Horror',
       'Sci-Fi', 'Documentary', 'War', 'Musical', 'Mystery', 'Film-Noir',
       'Western'], dtype=object)

In [79]:
# one way to construct indicator DataFrame is to start with a DataFrame of all zeros

zero_matrix = np.zeros((len(movies), len(genres)))

In [80]:
zero_matrix

array([[0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       ...,
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.]])

In [81]:
dummies = pd.DataFrame(zero_matrix, columns=genres)

In [82]:
dummies

Unnamed: 0,Animation,Children's,Comedy,Adventure,Fantasy,Romance,Drama,Action,Crime,Thriller,Horror,Sci-Fi,Documentary,War,Musical,Mystery,Film-Noir,Western
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3878,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3879,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3880,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3881,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [83]:
gen = movies.genres[0]

In [85]:
gen.split('|')

['Animation', "Children's", 'Comedy']

In [86]:
dummies.columns.get_indexer(gen.split('|'))

array([0, 1, 2], dtype=int64)

In [88]:
for i, gen in enumerate(movies.genres):
    indices = dummies.columns.get_indexer(gen.split('|'))
    dummies.iloc[i, indices] = 1

In [92]:
# combine this with movies

movies_windic = movies.join(dummies.add_prefix('Genre_'))

In [97]:
movies_windic.iloc[0]

# HARUSNYA INI BISA DILANJUTIN

movie_id                                       1
title                           Toy Story (1995)
genres               Animation|Children's|Comedy
Genre_Animation                                1
Genre_Children's                               1
Genre_Comedy                                   1
Genre_Adventure                                0
Genre_Fantasy                                  0
Genre_Romance                                  0
Genre_Drama                                    0
Genre_Action                                   0
Genre_Crime                                    0
Genre_Thriller                                 0
Genre_Horror                                   0
Genre_Sci-Fi                                   0
Genre_Documentary                              0
Genre_War                                      0
Genre_Musical                                  0
Genre_Mystery                                  0
Genre_Film-Noir                                0
Genre_Western       

In [98]:
# for statistical application, you can combine get_dummies with discrelization function like cut

np.random.seed(12345)

In [99]:
values = np.random.rand(10)

In [100]:
values

array([0.92961609, 0.31637555, 0.18391881, 0.20456028, 0.56772503,
       0.5955447 , 0.96451452, 0.6531771 , 0.74890664, 0.65356987])

In [101]:
bins = [0, 0.2, 0.4, 0.6, 0.8, 1]

In [102]:
pd.get_dummies(pd.cut(values, bins))

Unnamed: 0,"(0.0, 0.2]","(0.2, 0.4]","(0.4, 0.6]","(0.6, 0.8]","(0.8, 1.0]"
0,0,0,0,0,1
1,0,1,0,0,0
2,1,0,0,0,0
3,0,1,0,0,0
4,0,0,1,0,0
5,0,0,1,0,0
6,0,0,0,0,1
7,0,0,0,1,0
8,0,0,0,1,0
9,0,0,0,1,0


In [110]:
# STRING MANIPULATION

# String Object Methods
# built-in python string methods are sufficient for many applications

val = 'a,b,  guido'

In [119]:
val.split(',')

['a', 'b', '  guido']

In [120]:
# strip untuk trim whitespace (including line breaks)

pieces = [x.strip() for x in val.split(',')]

In [121]:
pieces

['a', 'b', 'guido']

In [122]:
# substrings dapat digabungkan dengan two-colon delimiter dengan adisi

first, second, third = pieces

In [123]:
first + '::' + second + '::' + third

'a::b::guido'

In [124]:
# atau juga dapat dijoin dengan string '::'

'::'.join(pieces)

'a::b::guido'

In [125]:
# in keyword is the best way to detect a substring
# index dan find juga bisa digunain

'guido' in val

True

In [126]:
val.index(',')

# index returns 1 jika ditemukan, tapi Error jika tidak

1

In [130]:
val.find(':')

# find returns 1 jika ditemukan, -1 jika tidak ditemukan

-1

In [131]:
val.count(',')

# count returns the number of occurences of a particular substring

2

In [132]:
val.replace(',', '::')

# untuk replace sesuatu dengan sesuatu

'a::b::  guido'

In [133]:
val.replace('  ', '')

'a,b,guido'

In [134]:
# ADAAA BANYAK METHODs

In [135]:
# Regular Expressions

# for search or match string patterns in a text

# re module is for applying regular expression

# 3 categories in re module: pattern matching, substitution, and splitting

import re

In [136]:
text = "foo    bar\t baz  \tqux"

In [137]:
re.split('\s+', text)

['foo', 'bar', 'baz', 'qux']

In [138]:
regex = re.compile('\s+')

# regex object can be reusable

In [140]:
regex.split(text)

# split will split the string into a list

['foo', 'bar', 'baz', 'qux']

In [141]:
regex.findall(text)

# to get list of all patterns matching the regex

# findall returns all matches in a string

['    ', '\t ', '  \t']

In [142]:
text = """Dave dave@google.com
Steve steve@gmail.com
Rob rob@gmail.com
Ryan ryan@yahoo.com
"""

In [143]:
pattern = r'[A-Z0-9._%+-]+@[A-Z0-9.-]+\.[A-Z]{2,4}'

In [144]:
# re.IGNORECASE makes the regex case-insensitive

regex = re.compile(pattern, flags=re.IGNORECASE)

In [145]:
regex.findall(text)

['dave@google.com', 'steve@gmail.com', 'rob@gmail.com', 'ryan@yahoo.com']

In [146]:
m = regex.search(text)

# search will return only the first match

In [147]:
m

<re.Match object; span=(5, 20), match='dave@google.com'>

In [148]:
text[m.start():m.end()]

'dave@google.com'

In [149]:
print(regex.match(text))

None


In [150]:
print(regex.sub('REDACTED', text))

# sub will return new string if certain patterns are met

Dave REDACTED
Steve REDACTED
Rob REDACTED
Ryan REDACTED



In [151]:
# suppose lu mau nemuin email addresses dan pisahin username, domain name, dan suffixnya

pattern = r'([A-Z0-9._%+-]+)@([A-Z0-9.-]+)\.([A-Z]{2,4})'

In [152]:
regex = re.compile(pattern, flags=re.IGNORECASE)

In [153]:
m = regex.match('wesm@bright.net')

In [154]:
m.groups()

('wesm', 'bright', 'net')

In [157]:
regex.findall(text)

[('dave', 'google', 'com'),
 ('steve', 'gmail', 'com'),
 ('rob', 'gmail', 'com'),
 ('ryan', 'yahoo', 'com')]

In [156]:
# regex punya akses ke grup untuk setiap match dengan simbol seperti \1 dan \2
# \1 -> first matched group, \2 to the second, dst

print(regex.sub(r'Username: \1, Domain: \2, Suffix: \3', text))

Dave Username: dave, Domain: google, Suffix: com
Steve Username: steve, Domain: gmail, Suffix: com
Rob Username: rob, Domain: gmail, Suffix: com
Ryan Username: ryan, Domain: yahoo, Suffix: com



In [158]:
# Vectorized String Functions in pandas

data = {'Dave': 'dave.google.com', 'Steve': 'steve@gmail.com',
       'Rob': 'rob@gmail.com', 'Wes':np.nan}

In [159]:
data = pd.Series(data)

In [160]:
data

Dave     dave.google.com
Steve    steve@gmail.com
Rob        rob@gmail.com
Wes                  NaN
dtype: object

In [161]:
data.isnull()

Dave     False
Steve    False
Rob      False
Wes       True
dtype: bool

In [162]:
data.str.contains('gmail')

Dave     False
Steve     True
Rob       True
Wes        NaN
dtype: object

In [163]:
pattern

'([A-Z0-9._%+-]+)@([A-Z0-9.-]+)\\.([A-Z]{2,4})'

In [164]:
data.str.findall(pattern, flags=re.IGNORECASE)

# find yang memenuhi pattern, dan ignore case

Dave                        []
Steve    [(steve, gmail, com)]
Rob        [(rob, gmail, com)]
Wes                        NaN
dtype: object

In [166]:
matches = data.str.findall(pattern, flags=re.IGNORECASE)

In [167]:
matches

Dave                        []
Steve    [(steve, gmail, com)]
Rob        [(rob, gmail, com)]
Wes                        NaN
dtype: object

In [168]:
# str.get() atau index ke str attribute untuk retrieve

matches.str.get(1)

Dave    NaN
Steve   NaN
Rob     NaN
Wes     NaN
dtype: float64

In [169]:
matches.str[0]

Dave                     NaN
Steve    (steve, gmail, com)
Rob        (rob, gmail, com)
Wes                      NaN
dtype: object

In [170]:
data.str[:5]

Dave     dave.
Steve    steve
Rob      rob@g
Wes        NaN
dtype: object