In [None]:
import numpy as np

# 행렬의 모든 요소는 같은 데이터 타입
matrix = np.array([['1','2','3','4','5'], [6,7,8,9,10]])
print(matrix)

[['1' '2' '3' '4' '5']
 ['6' '7' '8' '9' '10']]


In [None]:
import pandas as pd

# 데이터 프레임 생성
df = pd.DataFrame({
    'col1':['one','two','three', 'four', 'five'],
    'col2':[6,7,8,9,10]
})
print(df)

    col1  col2
0    one     6
1    two     7
2  three     8
3   four     9
4   five    10


In [None]:
# 데이터 타입 확인
print(df['col1'].dtype)
print(df['col2'].dtype)

object
int64


In [None]:
# 빈 데이터 프레임 만들기 예제

my_df = pd.DataFrame({
    '실수':pd.Series(dtype = 'float'),
    '정수':pd.Series(dtype = 'int'),
    '범주형':pd.Series(dtype = 'category'),
    '논리':pd.Series(dtype='bool'),
    '문자열':pd.Series(dtype='str')
})
print(my_df)
print(my_df.dtypes)

Empty DataFrame
Columns: [실수, 정수, 범주형, 논리, 문자열]
Index: []
실수      float64
정수        int64
범주형    category
논리         bool
문자열      object
dtype: object


In [None]:
# 채워진 데이터 프레임 생성
my_df = pd.DataFrame({
    'name' : ['issac', 'bomi'],
    'birthmonth' : [5,4]
})
print(my_df)
print(my_df.dtypes)

    name  birthmonth
0  issac           5
1   bomi           4
name          object
birthmonth     int64
dtype: object


In [None]:
# URL을 사용하여 바로 읽어오기
url = "https://raw.githubusercontent.com/YoungjinBD/data/main/examscore.csv"
mydata = pd.read_csv(url)

# 데이터의 위쪽 행들을 확인
print(mydata.head())

# 데이터 프레임의 차원(행과 열 개수)을 출력
print(mydata.shape)

   student_id gender  midterm  final
0           1      F       38     46
1           2      M       42     67
2           3      F       53     56
3           4      M       48     54
4           5      M       46     39
(30, 4)


In [None]:
my_df['name']

Unnamed: 0,name
0,issac
1,bomi


In [None]:
my_df[['name', 'birthmonth']]

Unnamed: 0,name,birthmonth
0,issac,5
1,bomi,4


In [None]:
# iloc[]을 이용한 필터링 => Integer Location

# 첫 번째 열에 접근
my_df.iloc[:,0]

Unnamed: 0,name
0,issac
1,bomi


In [None]:
# 두 번째 행, 첫 번째 열에 접근
my_df.iloc[1,0]

'bomi'

In [None]:
# loc[]를 이용한 필터링

mydata[mydata['midterm'] <= 15]

Unnamed: 0,student_id,gender,midterm,final
19,20,M,9,33
21,22,M,15,12


In [None]:
# loc 활용
mydata.loc[mydata['midterm'] <= 15]  # 위와 결과 같음 (특정 열을 지정하지 않을 경우 전체 열이 선택됩니다)

Unnamed: 0,student_id,gender,midterm,final
19,20,M,9,33
21,22,M,15,12


In [None]:
mydata.loc[mydata['midterm'] <= 15, :]

Unnamed: 0,student_id,gender,midterm,final
19,20,M,9,33
21,22,M,15,12


In [None]:
mydata.loc[mydata['midterm']<=15, ['student_id', 'final']]

Unnamed: 0,student_id,final
19,20,33
21,22,12


In [None]:
mydata[mydata['midterm'].isin([28,38,52])].head()

Unnamed: 0,student_id,gender,midterm,final
0,1,F,38,46
8,9,M,28,25
9,10,M,38,59
23,24,M,28,55
27,28,F,52,66


In [None]:
mydata.loc[mydata['midterm'].isin([28,38,52]), ['student_id', 'final']].head()

Unnamed: 0,student_id,final
0,1,46
8,9,25
9,10,59
23,24,55
27,28,66


In [None]:
# isin() 앞에 ~을 추가하면 특정 값을 제외할 수 있습니다.

mydata.loc[~mydata['midterm'].isin([28,38,52])].head()

Unnamed: 0,student_id,gender,midterm,final
1,2,M,42,67
2,3,F,53,56
3,4,M,48,54
4,5,M,46,39
5,6,M,51,74


In [None]:
import pandas as pd
import numpy as np

# 예제 데이터 프레임 생성

mydata = pd.DataFrame({
    'student_id': [1,2,3,4,5],
    'gender': ['F', 'M', 'F', 'M', 'M'],
    'midterm': [38,42,53,48,46],
    'final': [46,67,56,54,39]
})

# 일부 데이터를 NA로 설정
mydata.iloc[0,1] = np.nan
mydata.iloc[4,0] = np.nan
print(mydata.head())

   student_id gender  midterm  final
0         1.0    NaN       38     46
1         2.0      M       42     67
2         3.0      F       53     56
3         4.0      M       48     54
4         NaN      M       46     39


In [None]:
# isna() 함수를 활용하여 NA가 있는 열의 개수 찾기

print("gender 열의 빈칸 개수:", mydata['gender'].isna().sum())
print("student_id 열의 빈칸 개수:", mydata['student_id'].isna().sum())

gender 열의 빈칸 개수: 1
student_id 열의 빈칸 개수: 1


In [None]:
complete_row = mydata.dropna()
print("완전한 행의 수:", len(complete_row))
print(complete_row)

완전한 행의 수: 3
   student_id gender  midterm  final
1         2.0      M       42     67
2         3.0      F       53     56
3         4.0      M       48     54


In [None]:
mydata['total'] = mydata['midterm'] + mydata['final']
print(mydata.iloc[0:3, [3,4]])

   final  total
0     46     84
1     67    109
2     56    109


In [None]:
# pd.concat() 함수를 활용하여 열을 추가하자

mydata = pd.concat([mydata, (mydata['total'] / 2).rename('average')], axis= 1)
print(mydata.head())

   student_id gender  midterm  final  total  average
0         1.0    NaN       38     46     84     42.0
1         2.0      M       42     67    109     54.5
2         3.0      F       53     56    109     54.5
3         4.0      M       48     54    102     51.0
4         NaN      M       46     39     85     42.5


In [None]:
mydata.rename(columns = {'average':'my_average'}, inplace=True)
print(mydata.head())

   student_id gender  midterm  final  total  my_average
0         1.0    NaN       38     46     84        42.0
1         2.0      M       42     67    109        54.5
2         3.0      F       53     56    109        54.5
3         4.0      M       48     54    102        51.0
4         NaN      M       46     39     85        42.5


In [None]:
del mydata['gender']
print(mydata.head())

   student_id  midterm  final  total  my_average
0         1.0       38     46     84        42.0
1         2.0       42     67    109        54.5
2         3.0       53     56    109        54.5
3         4.0       48     54    102        51.0
4         NaN       46     39     85        42.5


In [None]:
import pandas as pd

df1 = pd.DataFrame({
    'A': ['A0', 'A1', 'A2'],
    'B': ['B0', 'B1', 'B2']
})

df2 = pd.DataFrame({
    'A': ['A3', 'A4', 'A5'],
    'B': ['B3', 'B4', 'B5']
})

result = pd.concat([df1, df2])
print(result)

    A   B
0  A0  B0
1  A1  B1
2  A2  B2
0  A3  B3
1  A4  B4
2  A5  B5


In [None]:
df3 = pd.DataFrame({
    'C': ['C0', 'C1', 'C2'],
    'D': ['D0', 'D1', 'D2']
})

result = pd.concat([df1, df3], axis=1)
print(result)

    A   B   C   D
0  A0  B0  C0  D0
1  A1  B1  C1  D1
2  A2  B2  C2  D2


In [None]:
result = pd.concat([df1, df2])
print(result)

result = pd.concat([df1, df2], ignore_index = True)
print(result)

    A   B
0  A0  B0
1  A1  B1
2  A2  B2
0  A3  B3
1  A4  B4
2  A5  B5
    A   B
0  A0  B0
1  A1  B1
2  A2  B2
3  A3  B3
4  A4  B4
5  A5  B5


In [None]:
df4 = pd.DataFrame({
    'A':['A2','A3','A4'],
    'B':['B2','B3','B4'],
    'C':['C2','C3','C4']
})
print(df1)
print(df4)

    A   B
0  A0  B0
1  A1  B1
2  A2  B2
    A   B   C
0  A2  B2  C2
1  A3  B3  C3
2  A4  B4  C4


In [None]:
# 공통된 열만 합치기
result = pd.concat([df1, df4], join = 'inner')
print(result)

    A   B
0  A0  B0
1  A1  B1
2  A2  B2
0  A2  B2
1  A3  B3
2  A4  B4


In [None]:
# 모든 열 합치기

import pandas as pd

# 데이터 프레임 생성
df1 = pd.DataFrame({
    'A':['A0', 'A1', 'A2'],
    'B':['B0', 'B1', 'B2']
})
df4 = pd.DataFrame({
    'A':['A2', 'A3', 'A4'],
    'B':['B2', 'B3', 'B4'],
    'C':['C2', 'C3', 'C4']
})

# 데이터 프레임 출력
print(df1)
print(df4)

# 두 데이터 프레임을 외부 결합하여 연결
result = pd.concat([df1,df4], join='outer')
print(result)

    A   B
0  A0  B0
1  A1  B1
2  A2  B2
    A   B   C
0  A2  B2  C2
1  A3  B3  C3
2  A4  B4  C4
    A   B    C
0  A0  B0  NaN
1  A1  B1  NaN
2  A2  B2  NaN
0  A2  B2   C2
1  A3  B3   C3
2  A4  B4   C4


In [None]:
print(df1)
print(df2)

    A   B
0  A0  B0
1  A1  B1
2  A2  B2
    A   B
0  A3  B3
1  A4  B4
2  A5  B5


In [None]:
result = pd.concat([df1,df2], keys = ['key1', 'key2'])
print(result)

         A   B
key1 0  A0  B0
     1  A1  B1
     2  A2  B2
key2 0  A3  B3
     1  A4  B4
     2  A5  B5


In [None]:
# key2에 해당하는 데이터 프레임의 2번째 3번째 행을 선택

df1_rows = result.loc['key2'].iloc[1:3]
print(df1_rows)

    A   B
1  A4  B4
2  A5  B5


In [2]:
import pandas as pd
# 팔머 펭귄 데이터 불러오기
df = pd.read_csv('https://raw.githubusercontent.com/YoungjinBD/data/main/penguins.csv')

In [None]:
print(df.head())

  species     island  bill_length_mm  bill_depth_mm  flipper_length_mm  \
0  Adelie  Torgersen            39.1           18.7              181.0   
1  Adelie  Torgersen            39.5           17.4              186.0   
2  Adelie  Torgersen            40.3           18.0              195.0   
3  Adelie  Torgersen             NaN            NaN                NaN   
4  Adelie  Torgersen            36.7           19.3              193.0   

   body_mass_g     sex  
0       3750.0    Male  
1       3800.0  Female  
2       3250.0  Female  
3          NaN     NaN  
4       3450.0  Female  


In [None]:
print(df.tail())

    species  island  bill_length_mm  bill_depth_mm  flipper_length_mm  \
339  Gentoo  Biscoe             NaN            NaN                NaN   
340  Gentoo  Biscoe            46.8           14.3              215.0   
341  Gentoo  Biscoe            50.4           15.7              222.0   
342  Gentoo  Biscoe            45.2           14.8              212.0   
343  Gentoo  Biscoe            49.9           16.1              213.0   

     body_mass_g     sex  
339          NaN     NaN  
340       4850.0  Female  
341       5750.0    Male  
342       5200.0  Female  
343       5400.0    Male  


In [None]:
print(df.describe())

       bill_length_mm  bill_depth_mm  flipper_length_mm  body_mass_g
count      342.000000     342.000000         342.000000   342.000000
mean        43.921930      17.151170         200.915205  4201.754386
std          5.459584       1.974793          14.061714   801.954536
min         32.100000      13.100000         172.000000  2700.000000
25%         39.225000      15.600000         190.000000  3550.000000
50%         44.450000      17.300000         197.000000  4050.000000
75%         48.500000      18.700000         213.000000  4750.000000
max         59.600000      21.500000         231.000000  6300.000000


In [None]:
print(df.info())

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 344 entries, 0 to 343
Data columns (total 7 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   species            344 non-null    object 
 1   island             344 non-null    object 
 2   bill_length_mm     342 non-null    float64
 3   bill_depth_mm      342 non-null    float64
 4   flipper_length_mm  342 non-null    float64
 5   body_mass_g        342 non-null    float64
 6   sex                333 non-null    object 
dtypes: float64(4), object(3)
memory usage: 18.9+ KB
None


In [None]:
sorted_df = df.sort_values(by='bill_length_mm', ascending = False)
print(sorted_df.head())

       species  island  bill_length_mm  bill_depth_mm  flipper_length_mm  \
253     Gentoo  Biscoe            59.6           17.0              230.0   
169  Chinstrap   Dream            58.0           17.8              181.0   
321     Gentoo  Biscoe            55.9           17.0              228.0   
215  Chinstrap   Dream            55.8           19.8              207.0   
335     Gentoo  Biscoe            55.1           16.0              230.0   

     body_mass_g     sex  
253       6050.0    Male  
169       3700.0  Female  
321       5600.0    Male  
215       4000.0    Male  
335       5850.0    Male  


In [None]:
# 'bill_length_mm'를 내림차순, 'bill_depth_mm'를 오름차순 정렬
sorted_df = df.sort_values(
    by = ['bill_length_mm', 'bill_depth_mm'],     #같은 length에서 depth를 오름차순 정렬
    ascending = [False, True]
)
print(sorted_df.head())

       species  island  bill_length_mm  bill_depth_mm  flipper_length_mm  \
253     Gentoo  Biscoe            59.6           17.0              230.0   
169  Chinstrap   Dream            58.0           17.8              181.0   
321     Gentoo  Biscoe            55.9           17.0              228.0   
215  Chinstrap   Dream            55.8           19.8              207.0   
335     Gentoo  Biscoe            55.1           16.0              230.0   

     body_mass_g     sex  
253       6050.0    Male  
169       3700.0  Female  
321       5600.0    Male  
215       4000.0    Male  
335       5850.0    Male  


In [None]:
# 'bill_length_mm' 열에서 최댓값을 가지는 행의 인덱스를 찾기

max_idx = df['bill_length_mm'].idxmax()
print(f"Maximum bill length index: {max_idx}")

Maximum bill length index: 253


In [None]:
print(df.loc[max_idx])

species              Gentoo
island               Biscoe
bill_length_mm         59.6
bill_depth_mm          17.0
flipper_length_mm     230.0
body_mass_g          6050.0
sex                    Male
Name: 253, dtype: object


In [None]:
# 'bill_length_mm' 열에서 최솟값을 가지는 행의 인덱스를 찾기

min_idx = df['bill_length_mm'].idxmin()
print(f'Minimum bill length index: {min_idx}')
print(df.loc[min_idx])

Minimum bill length index: 142
species              Adelie
island                Dream
bill_length_mm         32.1
bill_depth_mm          15.5
flipper_length_mm     188.0
body_mass_g          3050.0
sex                  Female
Name: 142, dtype: object


In [None]:
# 'bill_length_mm' 열을 기준으로 데이터 프레임 정렬
sorted_df = df.sort_values(by = 'bill_length_mm', ascending=False)
print("Sorted DataFrame by bill_length_mm (Descending):")

# 정렬된 데이터 프레임의 첫 번째 행과 idxmax()로 찾은 행이 같은지 확인
sorted_max_value_row = sorted_df.iloc[0]
print("\nFirst row of the sorted DataFrame:")
print(sorted_max_value_row)

Sorted DataFrame by bill_length_mm (Descending):

First row of the sorted DataFrame:
species              Gentoo
island               Biscoe
bill_length_mm         59.6
bill_depth_mm          17.0
flipper_length_mm     230.0
body_mass_g          6050.0
sex                    Male
Name: 253, dtype: object


In [None]:
# 'bill_length_mm' 열에서 최댓값을 가지는 행의 인덱스를 찾기
max_idx = df['bill_length_mm'].idxmax()
max_value_row = df.loc[max_idx]
print(f"\nMaximum bill length index using idxmax(): {max_idx}")
print(max_value_row)


Maximum bill length index using idxmax(): 253
species              Gentoo
island               Biscoe
bill_length_mm         59.6
bill_depth_mm          17.0
flipper_length_mm     230.0
body_mass_g          6050.0
sex                    Male
Name: 253, dtype: object


In [None]:
# 최종 비교
comparison = max_value_row.equals(sorted_max_value_row)
print(f"Do the max vlaue rows match? {comparison}")

Do the max vlaue rows match? True


In [None]:
# 'species' 열을 기준으로 그룹화하여 평균 계산(수치형 열만 선택)
grouped_df = df.groupby('species').mean(numeric_only=True)
print(grouped_df)

           bill_length_mm  bill_depth_mm  flipper_length_mm  body_mass_g
species                                                                 
Adelie          38.791391      18.346358         189.953642  3700.662252
Chinstrap       48.833824      18.420588         195.823529  3733.088235
Gentoo          47.504878      14.982114         217.186992  5076.016260


In [3]:
# 데이터 프레임의 각 열의 평균 계산
print(df.mean(numeric_only=True))

bill_length_mm         43.921930
bill_depth_mm          17.151170
flipper_length_mm     200.915205
body_mass_g          4201.754386
dtype: float64


In [4]:
# 데이터 프레임의 각 열의 합계 계산
print(df.sum(numeric_only=True))

bill_length_mm         15021.3
bill_depth_mm           5865.7
flipper_length_mm      68713.0
body_mass_g          1437000.0
dtype: float64


In [5]:
# island별로 그룹화고 flipper_length_mm의 합계를 계산
grouped_df = df.groupby('island')['flipper_length_mm'].sum()
print("Grouped by island and summed fliiper_length_mm:")
print(grouped_df)

Grouped by island and summed fliiper_length_mm:
island
Biscoe       35021.0
Dream        23941.0
Torgersen     9751.0
Name: flipper_length_mm, dtype: float64


In [7]:
# flipper_length_mm 합계가 가장 큰 island 찾기
max_flipper_island = grouped_df.idxmax()
print(f"Island with maximum total fliiper length: {max_flipper_island}")
print(grouped_df.loc[max_flipper_island])

Island with maximum total fliiper length: Biscoe
35021.0


In [8]:
# island별로 그룹화하고 flipper_length_mm의 합계를 계산한 데이터 프레임 생성
grouped_sum_df = df.groupby('island', as_index = False)['flipper_length_mm'].sum()
print(grouped_sum_df)

      island  flipper_length_mm
0     Biscoe            35021.0
1      Dream            23941.0
2  Torgersen             9751.0


In [10]:
# flipper_length_mm 합계를 기준으로 내림차순 정렬
sorted_grouped_sum_df = grouped_sum_df.sort_values(by = 'flipper_length_mm', ascending = False)
print("Grouped and sorted DataFrame by total flipper_length_mm:\n")
print(sorted_grouped_sum_df)

Grouped and sorted DataFrame by total flipper_length_mm:

      island  flipper_length_mm
0     Biscoe            35021.0
1      Dream            23941.0
2  Torgersen             9751.0


In [12]:
# 예제 데이터 프레임 생성
df1 = pd.DataFrame({'key': ['A','B','C'], 'value':[1,2,3]})
df2 = pd.DataFrame({'key': ['A','B','D'], 'value':[4,5,6]})

# 두 데이터 프레임 병합
merged_df = pd.merge(df1, df2, on='key', how='inner')
print(merged_df)

  key  value_x  value_y
0   A        1        4
1   B        2        5


In [13]:
# 두 데이터 프레임을 outer join으로 통합
merged_df_outer = pd.merge(df1, df2, on='key', how='outer')
print(merged_df_outer)

  key  value_x  value_y
0   A      1.0      4.0
1   B      2.0      5.0
2   C      3.0      NaN
3   D      NaN      6.0


In [14]:
# 예제 데이터 프레임 생성
import pandas as pd
data = {
    'Date': ['2024-07-01', '2024-07-02', '2024-07-03', '2024-07-03'],
    'Temperature': [10,20,25,20],
    'Humidity': [60,65,70,21]
}
df = pd.DataFrame(data)
print(df)

         Date  Temperature  Humidity
0  2024-07-01           10        60
1  2024-07-02           20        65
2  2024-07-03           25        70
3  2024-07-03           20        21


In [15]:
df_melted = pd.melt(df,
                    id_vars = ['Date'],
                    value_vars = ['Temperature', 'Humidity'],
                    var_name = 'Variable',
                    value_name = 'Value')
print(df_melted)

         Date     Variable  Value
0  2024-07-01  Temperature     10
1  2024-07-02  Temperature     20
2  2024-07-03  Temperature     25
3  2024-07-03  Temperature     20
4  2024-07-01     Humidity     60
5  2024-07-02     Humidity     65
6  2024-07-03     Humidity     70
7  2024-07-03     Humidity     21


In [19]:
# pivot() 메서드는 각 행을 고유하게 식별할 수 있는 인덱스가 필요함. 'Date' 열은 중복 값 발생, 오류 발생
df_pivoted = df_melted.pivot(index ='Date',
                             columns = 'Variable',
                             values = 'Value').reset_index()
print(df_pivoted)

ValueError: Index contains duplicate entries, cannot reshape

In [20]:
df_melted2 = pd.melt(df.reset_index(),
                     id_vars = ['index'],
                     value_vars = ['Temperature', 'Humidity'],
                     var_name = 'Variable',
                     value_name = 'Value')
print(df_melted2)

   index     Variable  Value
0      0  Temperature     10
1      1  Temperature     20
2      2  Temperature     25
3      3  Temperature     20
4      0     Humidity     60
5      1     Humidity     65
6      2     Humidity     70
7      3     Humidity     21


In [22]:
# 넓은 형식으로 재정리(오류 해결)
pf_pivoted = df_melted2.pivot(index = 'index',
                              columns = 'Variable',
                              values = 'Value')
print(pf_pivoted)

Variable  Humidity  Temperature
index                          
0               60           10
1               65           20
2               70           25
3               21           20


In [31]:
# df_melted dataframe으로 pivot_table 만들기
df_pivot_table = df_melted.pivot_table(index = 'Date',
                                       columns = 'Variable',
                                       values = 'Value').reset_index()
print(df_pivot_table)

Variable        Date  Humidity  Temperature
0         2024-07-01      60.0         10.0
1         2024-07-02      65.0         20.0
2         2024-07-03      45.5         22.5


In [32]:
import pandas as pd
# 학생 성적 데이터 불러오기
df = pd.read_csv('https://raw.githubusercontent.com/YoungjinBD/data/main/dat.csv')

In [33]:
print(df.info())

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 366 entries, 0 to 365
Data columns (total 11 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   school    366 non-null    object 
 1   sex       366 non-null    object 
 2   paid      366 non-null    object 
 3   famrel    366 non-null    int64  
 4   freetime  366 non-null    int64  
 5   goout     356 non-null    float64
 6   Dalc      366 non-null    int64  
 7   Walc      366 non-null    int64  
 8   health    366 non-null    int64  
 9   absences  366 non-null    int64  
 10  grade     366 non-null    int64  
dtypes: float64(1), int64(7), object(3)
memory usage: 31.6+ KB
None


In [34]:
df = df.rename(columns = {'Dalc':'dalc', 'Walc':'walc'})
print(df.columns)

Index(['school', 'sex', 'paid', 'famrel', 'freetime', 'goout', 'dalc', 'walc',
       'health', 'absences', 'grade'],
      dtype='object')


In [35]:
# astype()을 활용하여 데이터 타입 바꾸기
print(df.astype({'famrel':'object', 'dalc':'float64'}).info())

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 366 entries, 0 to 365
Data columns (total 11 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   school    366 non-null    object 
 1   sex       366 non-null    object 
 2   paid      366 non-null    object 
 3   famrel    366 non-null    object 
 4   freetime  366 non-null    int64  
 5   goout     356 non-null    float64
 6   dalc      366 non-null    float64
 7   walc      366 non-null    int64  
 8   health    366 non-null    int64  
 9   absences  366 non-null    int64  
 10  grade     366 non-null    int64  
dtypes: float64(2), int64(5), object(4)
memory usage: 31.6+ KB
None


In [37]:
def classify_famrel(famrel):
  if famrel <= 2:
    return 'Low'
  elif famrel <=4 :
    return 'Medium'
  else:
    return 'High'

In [40]:
df1 = df.copy()
df1 = df1.assign(famrel_quality = df1['famrel'].apply(classify_famrel))

print(df1[['famrel', 'famrel_quality']].head())

   famrel famrel_quality
0       4         Medium
1       5           High
2       4         Medium
3       3         Medium
4       4         Medium


In [41]:
df2 = df.copy()
df2 = df2.assign(famrel = df2['famrel'].apply(classify_famrel))

print(df2[['famrel']].head())

   famrel
0  Medium
1    High
2  Medium
3  Medium
4  Medium


In [42]:
df3 = df.copy()
df3['famrel'] = df3['famrel'].apply(classify_famrel)

print(df3[['famrel']].head())

   famrel
0  Medium
1    High
2  Medium
3  Medium
4  Medium


In [44]:
print(df.select_dtypes('number').head(2))

   famrel  freetime  goout  dalc  walc  health  absences  grade
0       4         3    4.0     1     1       3         6      1
1       5         3    3.0     1     1       3         4      1


In [45]:
print(df.select_dtypes('object').head(2))

  school sex paid
0     GP   F   no
1     GP   F   no


In [47]:
# 임의의 함수를 정의하고 수치형 칼럼에 적용
import numpy as np

def standardize(x):
  return (x - np.nanmean(x)/np.std(x))

df.select_dtypes('number').apply(standardize).head(2)

Unnamed: 0,famrel,freetime,goout,dalc,walc,health,absences,grade
0,-0.41557,-0.242302,1.192457,-0.677095,-0.789321,0.408977,5.310415,-0.639516
1,0.58443,-0.242302,0.192457,-0.677095,-0.789321,0.408977,3.310415,-0.639516


In [48]:
print(df.columns)

Index(['school', 'sex', 'paid', 'famrel', 'freetime', 'goout', 'dalc', 'walc',
       'health', 'absences', 'grade'],
      dtype='object')


In [51]:
# 칼럼명 패턴을 활용하여 특정 칼럼 선택하기

print(df.columns.str.startswith('f'))

[False False False  True  True False False False False False False]


In [54]:
print(df.loc[:, df.columns.str.startswith('f')].head())

   famrel  freetime
0       4         3
1       5         3
2       4         3
3       3         2
4       4         3


In [56]:
print(df.columns.str.endswith('c'))

[False False False False False False  True  True False False False]


In [57]:
print(df.loc[:, df.columns.str.endswith('c')].head())

   dalc  walc
0     1     1
1     1     1
2     2     3
3     1     1
4     1     2


In [58]:
print(df.columns.str.contains('f'))

[False False False  True  True False False False False False False]


In [62]:
print(df.loc[:,df.columns.str.contains('f')].head())

   famrel  freetime
0       4         3
1       5         3
2       4         3
3       3         2
4       4         3


In [63]:
print(df.loc[:,df.columns.str.contains('f')].apply(standardize).head())

    famrel  freetime
0 -0.41557 -0.242302
1  0.58443 -0.242302
2 -0.41557 -0.242302
3 -1.41557 -1.242302
4 -0.41557 -0.242302
